kun yu 77e1ddd81b branch-0.4.0
Former-commit-id: a4df63653202df32d0b983de27f5c969905d17ac
2019-07-30 10:23:34 +08:00

185 lines
6.3 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef ARROW_UTIL_UTF8_H
#define ARROW_UTIL_UTF8_H
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include "arrow/status.h"
#include "arrow/util/macros.h"
#include "arrow/util/string_view.h"
#include "arrow/util/visibility.h"
namespace arrow {
namespace util {
// Convert a UTF8 string to a wstring (either UTF16 or UTF32, depending
// on the wchar_t width).
//
// `source` is the UTF8-encoded input; the converted text is written to `*out`.
// The outcome of the conversion is reported through the returned Status.
// NOTE(review): which Status is returned for malformed input is decided by
// the implementation, which is not visible in this header -- confirm there.
ARROW_EXPORT Status UTF8ToWideString(const std::string& source, std::wstring* out);
// Similarly, convert a wstring to a UTF8 string.
//
// `source` is the wide-character input; the UTF8 result is written to `*out`.
// NOTE(review): error reporting for unrepresentable input is likewise defined
// by the implementation -- confirm there.
ARROW_EXPORT Status WideStringToUTF8(const std::wstring& source, std::string* out);
namespace internal {
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
// A compact state table allowing UTF8 decoding using two dependent
// lookups per byte. The first lookup determines the character class
// and the second lookup reads the next state.
// In this table states are multiples of 12.
//
// Layout, as used by DecodeOneUTF8Byte(): entries [0, 256) map a byte value
// to its character class, and the entries from offset 256 onwards hold the
// transitions, indexed as `256 + state + class`. The declared size leaves
// room for 9 rows of 12 entries after the 256 class entries.
ARROW_EXPORT extern const uint8_t utf8_small_table[256 + 9 * 12];
// Success / reject states when looked up in the small table
// (i.e. values of the `state` argument / return value of DecodeOneUTF8Byte()).
static constexpr uint8_t kUTF8DecodeAccept = 0;
static constexpr uint8_t kUTF8DecodeReject = 12;
// An expanded state table allowing transitions using a single lookup
// at the expense of a larger memory footprint (but on non-random data,
// not all the table will end up accessed and cached).
// In this table states are multiples of 256.
//
// ValidateOneUTF8Byte() indexes it as `state + byte`, i.e. one row of 256
// entries per state; the declared size holds 9 such rows.
// NOTE(review): unlike the small table, this one is not declared const --
// presumably it is filled in at runtime by InitializeUTF8(); confirm in the
// implementation file.
ARROW_EXPORT extern uint16_t utf8_large_table[9 * 256];
// Success / reject states when looked up in the large table
// (i.e. values of the `state` argument / return value of ValidateOneUTF8Byte()).
static constexpr uint16_t kUTF8ValidateAccept = 0;
static constexpr uint16_t kUTF8ValidateReject = 256;
// Advance the small-table decoder by one input byte.
//
// `state` is the decoder state before this byte and `*codep` the code point
// bits accumulated so far. When `state` is kUTF8DecodeAccept, `*codep` is
// restarted from the payload bits of `byte`; otherwise the low 6 bits of
// `byte` are appended to the existing `*codep`.
// Returns the state after consuming `byte`.
static inline uint8_t DecodeOneUTF8Byte(uint8_t byte, uint8_t state, uint32_t* codep) {
  // First lookup: character class of this byte
  const uint8_t char_class = utf8_small_table[byte];

  if (state == kUTF8DecodeAccept) {
    // Not inside a sequence: the class tells how many high bits to mask off
    *codep = (0xffu >> char_class) & byte;
  } else {
    // Inside a sequence: shift in the 6 payload bits of this byte
    *codep = (byte & 0x3fu) | (*codep << 6);
  }

  // Second lookup: transitions live after the 256 class entries
  return utf8_small_table[256 + state + char_class];
}
// Advance the large-table validator by one input byte.
//
// `state` selects a row of the table (rows start at multiples of 256) and
// `byte` selects the entry within that row; the entry is the next state.
static inline uint16_t ValidateOneUTF8Byte(uint8_t byte, uint16_t state) {
  const uint16_t* transitions = utf8_large_table + state;
  return transitions[byte];
}
// Debug-only hook: declared only when NDEBUG is not defined, and called at
// the top of ValidateUTF8() in such builds.
// NOTE(review): presumably verifies that InitializeUTF8() has been called --
// confirm in the implementation file.
#ifndef NDEBUG
ARROW_EXPORT void CheckUTF8Initialized();
#endif
} // namespace internal
// This function needs to be called before doing UTF8 validation.
// NOTE(review): presumably this populates internal::utf8_large_table, which
// ValidateUTF8() reads; whether repeated or concurrent calls are safe cannot
// be determined from this header -- confirm in the implementation file.
ARROW_EXPORT void InitializeUTF8();
// Check whether the `size` bytes starting at `data` are valid UTF8.
//
// Returns true if the validation state machine is in its accept state after
// the last byte (which includes the case where no byte is processed), and
// false otherwise. InitializeUTF8() needs to have been called beforehand;
// builds without NDEBUG call internal::CheckUTF8Initialized() first.
inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
  static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL;
  // For some reason, defining this variable outside the loop helps clang
  uint64_t word;

#ifndef NDEBUG
  internal::CheckUTF8Initialized();
#endif

  while (size >= 8) {
    // XXX This is doing an unaligned access. Contemporary architectures
    // (x86-64, AArch64, PPC64) support it natively and often have good
    // performance nevertheless.
    memcpy(&word, data, 8);
    if (ARROW_PREDICT_TRUE((word & high_bits_64) == 0)) {
      // No byte has its high bit set: 8 bytes of pure ASCII, move forward
      data += 8;
      size -= 8;
      continue;
    }

    // Non-ASCII run detected.
    // The first 5 bytes are always run through the state machine before
    // looking at the state, to avoid too many spurious 64-bit reads in case
    // the non-ASCII bytes are at the end of the tested 64-bit word.
    // Rejection is only checked at the very end since that state is stable
    // (once in reject state, we always remain in reject state).
    // size >= 8 is guaranteed here, so data[0] .. data[7] can all be read
    // without further size checks.
    uint16_t run_state =
        internal::ValidateOneUTF8Byte(data[0], internal::kUTF8ValidateAccept);
    run_state = internal::ValidateOneUTF8Byte(data[1], run_state);
    run_state = internal::ValidateOneUTF8Byte(data[2], run_state);
    run_state = internal::ValidateOneUTF8Byte(data[3], run_state);
    run_state = internal::ValidateOneUTF8Byte(data[4], run_state);
    if (run_state == internal::kUTF8ValidateAccept) {
      // Got full char after 5 bytes, switch back to ASCII detection
      data += 5;
      size -= 5;
      continue;
    }

    run_state = internal::ValidateOneUTF8Byte(data[5], run_state);
    if (run_state == internal::kUTF8ValidateAccept) {
      // Got full char after 6 bytes, switch back to ASCII detection
      data += 6;
      size -= 6;
      continue;
    }

    run_state = internal::ValidateOneUTF8Byte(data[6], run_state);
    if (run_state == internal::kUTF8ValidateAccept) {
      // Got full char after 7 bytes, switch back to ASCII detection
      data += 7;
      size -= 7;
      continue;
    }

    run_state = internal::ValidateOneUTF8Byte(data[7], run_state);
    if (run_state == internal::kUTF8ValidateAccept) {
      // Got full char after 8 bytes, switch back to ASCII detection
      data += 8;
      size -= 8;
      continue;
    }

    // kUTF8ValidateAccept not reached along the last 4 transitions
    // (bytes 4 to 7) has to mean a rejection
    assert(run_state == internal::kUTF8ValidateReject);
    return false;
  }

  // Validate string tail one byte at a time.
  // The main loop is only ever left on a character boundary, hence the
  // fresh accept state. As the reject state is sticky, it is enough to
  // inspect the state once after the last byte (we don't gain much by
  // short-circuiting here).
  uint16_t tail_state = internal::kUTF8ValidateAccept;
  for (; size > 0; --size) {
    tail_state = internal::ValidateOneUTF8Byte(*data++, tail_state);
  }
  return ARROW_PREDICT_TRUE(tail_state == internal::kUTF8ValidateAccept);
}
// Convenience overload: validate the bytes viewed by `str` by forwarding
// its data pointer and length to the pointer/size overload.
inline bool ValidateUTF8(const util::string_view& str) {
  return ValidateUTF8(reinterpret_cast<const uint8_t*>(str.data()),
                      static_cast<int64_t>(str.size()));
}
// Skip UTF8 byte order mark, if any.
//
// `data` / `size` describe the input buffer; the resulting position is
// written to `*out`.
// NOTE(review): presumably `*out` equals `data` when no BOM is present and
// points just past the BOM otherwise; the Status returned for a truncated
// BOM is not visible in this header -- confirm in the implementation file.
ARROW_EXPORT
Status SkipUTF8BOM(const uint8_t* data, int64_t size, const uint8_t** out);
} // namespace util
} // namespace arrow
#endif