// Mirror of https://gitee.com/milvus-io/milvus.git (synced 2026-01-01 08:28:10 +08:00)
// 1493 lines, 51 KiB, C++
// Copyright 2016 Daniel Parker
|
|
// Distributed under the Boost license, Version 1.0.
|
|
// (See accompanying file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
// See https://github.com/danielaparker/unicode_traits for latest version
|
|
|
|
/*
|
|
* Includes code derived from Unicode, Inc decomposition code in ConvertUTF.h and ConvertUTF.c
|
|
* http://www.unicode.org/
|
|
*
|
|
* "Unicode, Inc. hereby grants the right to freely use the information
|
|
* supplied in this file in the creation of products supporting the
|
|
* Unicode Standard."
|
|
*/
|
|
|
|
#ifndef JSONCONS_UNICONS_UNICODE_TRAITS_HPP
|
|
#define JSONCONS_UNICONS_UNICODE_TRAITS_HPP
|
|
|
|
// UNICONS_FALLTHROUGH marks an intentional fall-through between two switch
// cases. It is written as a statement: `UNICONS_FALLTHROUGH;`.
// The standard C++17 attribute is preferred when the language level allows
// it; otherwise a compiler-specific spelling is used, and compilers without
// any such annotation get an empty expansion.
#if defined(__cplusplus) && __cplusplus >= 201703L
#  define UNICONS_FALLTHROUGH [[fallthrough]]
#elif defined(_MSVC_LANG) && _MSVC_LANG >= 201703L
#  define UNICONS_FALLTHROUGH [[fallthrough]]
#elif defined(__clang__)
#  define UNICONS_FALLTHROUGH [[clang::fallthrough]]
#elif defined(__GNUC__) && (__GNUC__ >= 7)
#  define UNICONS_FALLTHROUGH __attribute__((fallthrough))
#else
   // GCC before 7 and unknown compilers: no annotation available.
#  define UNICONS_FALLTHROUGH
#endif
|
|
|
|
// UNICONS_NOEXCEPT expands to `noexcept` where the compiler supports it and
// to nothing otherwise.
// This header already requires C++11 (scoped enums, alias templates), so on
// clang and GCC `noexcept` is always available. Using it directly avoids a
// dependency on the libstdc++-internal macro _GLIBCXX_USE_NOEXCEPT, which is
// undefined with other standard libraries and before any standard header has
// been included.
#if defined(__clang__) || defined(__GNUC__)
#  define UNICONS_NOEXCEPT noexcept
#elif defined(_MSC_VER)
#  if _MSC_VER >= 1900
#    define UNICONS_NOEXCEPT noexcept
#  else
     // Visual Studio before 2015 has no noexcept.
#    define UNICONS_NOEXCEPT
#  endif
#else
#  define UNICONS_NOEXCEPT
#endif
|
|
|
|
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <string>
#include <system_error>
#include <type_traits>
#include <utility>
|
|
|
|
namespace jsoncons { namespace unicons {
|
|
|
|
/*
 * Magic values subtracted from the accumulated value during UTF-8 decoding,
 * indexed by the number of trailing bytes in the sequence (0..5).
 * Source: ConvertUTF.c
 */
const uint32_t offsets_from_utf8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

/*
 * Mask OR-ed into the first byte of an encoded UTF-8 sequence, indexed by
 * the total number of bytes in the sequence. There is one entry per UTF-8
 * sequence type (one byte, two bytes, ...). Remember that sequences for
 * *legal* UTF-8 are 4 or fewer bytes in total. Source: ConvertUTF.c
 */
const uint8_t first_byte_mark[7] = {
    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
};

/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * trailing bytes that are supposed to follow it. *Legal* UTF-8 never has
 * 4 or 5 trailing bytes; those entries are kept for anyone who wants to
 * handle the longer forms allowed by earlier algorithms.
 * Source: ConvertUTF.c
 */
const uint8_t trailing_bytes_for_utf8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

// Some fundamental constants. Source: ConvertUTF.h
const uint32_t replacement_char = 0x0000FFFD;
const uint32_t max_bmp          = 0x0000FFFF;
const uint32_t max_utf16        = 0x0010FFFF;
const uint32_t max_utf32        = 0x7FFFFFFF;
const uint32_t max_legal_utf32  = 0x0010FFFF;

// Constants used to split a supplementary code point into, and rebuild it
// from, a UTF-16 surrogate pair.
const int      half_shift = 10;          // used for shifting by 10 bits
const uint32_t half_base  = 0x0010000UL;
const uint32_t half_mask  = 0x3FFUL;

// Inclusive bounds of the high and low surrogate ranges.
const uint16_t sur_high_start = 0xD800;
const uint16_t sur_high_end   = 0xDBFF;
const uint16_t sur_low_start  = 0xDC00;
const uint16_t sur_low_end    = 0xDFFF;
|
|
|
|
// Returns true when ch has the bit pattern 10xxxxxx, i.e. it is a UTF-8
// continuation (trailing) byte.
// Declared plain `inline` like the other predicates in this header; the
// former `static` only forced a separate internal copy per translation unit.
inline
bool is_continuation_byte(unsigned char ch)
{
    return (ch & 0xC0) == 0x80;
}
|
|
|
|
inline
|
|
bool is_high_surrogate(uint32_t ch) UNICONS_NOEXCEPT
|
|
{
|
|
return (ch >= sur_high_start && ch <= sur_high_end);
|
|
}
|
|
|
|
inline
|
|
bool is_low_surrogate(uint32_t ch) UNICONS_NOEXCEPT
|
|
{
|
|
return (ch >= sur_low_start && ch <= sur_low_end);
|
|
}
|
|
|
|
inline
|
|
bool is_surrogate(uint32_t ch) UNICONS_NOEXCEPT
|
|
{
|
|
return (ch >= sur_high_start && ch <= sur_low_end);
|
|
}
|
|
|
|
// Selects how the convert() overloads treat malformed input: strict stops
// with an error; lenient substitutes or passes the value through where the
// individual overload supports it.
enum class conv_flags
{
    strict = 0,
    lenient
};
|
|
|
|
// conv_errc
|
|
|
|
// Outcome codes reported by the conversion and validation functions.
// A value-initialised conv_errc() equals ok.
enum class conv_errc
{
    ok = 0,
    over_long_utf8_sequence = 1,  // over long utf8 sequence
    expected_continuation_byte,   // expected continuation byte
    unpaired_high_surrogate,      // unpaired high surrogate UTF-16
    illegal_surrogate_value,      // UTF-16 surrogate values are illegal in UTF-32
    source_exhausted,             // partial character in source, but hit end
    source_illegal                // source sequence is illegal/malformed
};
|
|
|
|
class Unicode_traits_error_category_impl_
|
|
: public std::error_category
|
|
{
|
|
public:
|
|
virtual const char* name() const UNICONS_NOEXCEPT
|
|
{
|
|
return "unicons conversion error";
|
|
}
|
|
virtual std::string message(int ev) const
|
|
{
|
|
switch (static_cast<conv_errc>(ev))
|
|
{
|
|
case conv_errc::over_long_utf8_sequence:
|
|
return "Over long utf8 sequence";
|
|
case conv_errc::expected_continuation_byte:
|
|
return "Expected continuation byte";
|
|
case conv_errc::unpaired_high_surrogate:
|
|
return "Unpaired high surrogate UTF-16";
|
|
case conv_errc::illegal_surrogate_value:
|
|
return "UTF-16 surrogate values are illegal in UTF-32";
|
|
case conv_errc::source_exhausted:
|
|
return "Partial character in source, but hit end";
|
|
case conv_errc::source_illegal:
|
|
return "Source sequence is illegal/malformed";
|
|
default:
|
|
return "";
|
|
break;
|
|
}
|
|
}
|
|
};
|
|
|
|
inline
|
|
const std::error_category& unicode_traits_error_category()
|
|
{
|
|
static Unicode_traits_error_category_impl_ instance;
|
|
return instance;
|
|
}
|
|
|
|
inline
|
|
std::error_code make_error_code(conv_errc result)
|
|
{
|
|
return std::error_code(static_cast<int>(result),unicode_traits_error_category());
|
|
}
|
|
|
|
// encoding_errc
|
|
|
|
// Error codes for an encoding that does not match the expected one.
// A value-initialised encoding_errc() equals ok.
enum class encoding_errc
{
    ok = 0,
    expected_u8_found_u16 = 1,  // expected UTF-8, found UTF-16
    expected_u8_found_u32,      // expected UTF-8, found UTF-32
    expected_u16_found_fffe,    // expected UTF-16, found non character
    expected_u32_found_fffe     // expected UTF-32, found non character
};
|
|
|
|
class Encoding_errc_impl_
|
|
: public std::error_category
|
|
{
|
|
public:
|
|
virtual const char* name() const UNICONS_NOEXCEPT
|
|
{
|
|
return "unicons encoding error";
|
|
}
|
|
virtual std::string message(int ev) const
|
|
{
|
|
switch (static_cast<encoding_errc>(ev))
|
|
{
|
|
case encoding_errc::expected_u8_found_u16:
|
|
return "Expected UTF-8, found UTF-16";
|
|
case encoding_errc::expected_u8_found_u32:
|
|
return "Expected UTF-8, found UTF-32";
|
|
case encoding_errc::expected_u16_found_fffe:
|
|
return "Expected UTF-16, found non character";
|
|
case encoding_errc::expected_u32_found_fffe:
|
|
return "Expected UTF-32, found non character";
|
|
default:
|
|
return "";
|
|
break;
|
|
}
|
|
}
|
|
};
|
|
|
|
inline
|
|
const std::error_category& encoding_error_category()
|
|
{
|
|
static Encoding_errc_impl_ instance;
|
|
return instance;
|
|
}
|
|
|
|
inline
|
|
std::error_code make_error_code(encoding_errc result)
|
|
{
|
|
return std::error_code(static_cast<int>(result),encoding_error_category());
|
|
}
|
|
|
|
// utf8
|
|
|
|
template <class Iterator>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<Iterator>::value_type>::value
|
|
&& sizeof(typename std::iterator_traits<Iterator>::value_type) == sizeof(uint8_t),
|
|
conv_errc >::type
|
|
is_legal_utf8(Iterator first, size_t length)
|
|
{
|
|
uint8_t a;
|
|
Iterator srcptr = first+length;
|
|
switch (length) {
|
|
default:
|
|
return conv_errc::over_long_utf8_sequence;
|
|
case 4:
|
|
if (((a = (*--srcptr))& 0xC0) != 0x80)
|
|
return conv_errc::expected_continuation_byte;
|
|
UNICONS_FALLTHROUGH;
|
|
case 3:
|
|
if (((a = (*--srcptr))& 0xC0) != 0x80)
|
|
return conv_errc::expected_continuation_byte;
|
|
UNICONS_FALLTHROUGH;
|
|
case 2:
|
|
if (((a = (*--srcptr))& 0xC0) != 0x80)
|
|
return conv_errc::expected_continuation_byte;
|
|
|
|
switch (static_cast<uint8_t>(*first))
|
|
{
|
|
/* no fall-through in this inner switch */
|
|
case 0xE0: if (a < 0xA0) return conv_errc::source_illegal; break;
|
|
case 0xED: if (a > 0x9F) return conv_errc::source_illegal; break;
|
|
case 0xF0: if (a < 0x90) return conv_errc::source_illegal; break;
|
|
case 0xF4: if (a > 0x8F) return conv_errc::source_illegal; break;
|
|
default: if (a < 0x80) return conv_errc::source_illegal;
|
|
}
|
|
|
|
UNICONS_FALLTHROUGH;
|
|
case 1:
|
|
if (static_cast<uint8_t>(*first) >= 0x80 && static_cast<uint8_t>(*first) < 0xC2)
|
|
return conv_errc::source_illegal;
|
|
break;
|
|
}
|
|
if (static_cast<uint8_t>(*first) > 0xF4)
|
|
return conv_errc::source_illegal;
|
|
|
|
return conv_errc();
|
|
}
|
|
|
|
// Helper behind void_t. Routing the alias through a class template makes
// every argument participate in substitution, so an ill-formed argument
// reliably removes a specialization from consideration. With the direct
// form `template <class...> using void_t = void;` older compilers may
// ignore unused alias arguments (CWG 1558).
template <class... Ts>
struct make_void
{
    typedef void type;
};

// Maps any list of well-formed types to void; used for SFINAE detection.
template <class... Ts> using void_t = typename make_void<Ts...>::type;

// Primary template: I is not detected as an output iterator for E.
template <class, class, class = void>
struct is_output_iterator : std::false_type {};

// Selected when std::iterator_traits<I> names an iterator_category and a
// value of type E can be assigned through a dereferenced I.
template <class I, class E>
struct is_output_iterator<I, E, void_t<
    typename std::iterator_traits<I>::iterator_category,
    decltype(*std::declval<I>() = std::declval<E>())>> : std::true_type {};
|
|
|
|
// is_same_size fixes issue with vs2013
|
|
|
|
// primary template
|
|
// Primary template, used when either type is void: value is false.
template <class T1, class T2, class Enable = void>
struct is_same_size : std::false_type
{
};

// Both types are non-void: value tells whether sizeof(T1) equals sizeof(T2).
template <class T1, class T2>
struct is_same_size<T1, T2,
                    typename std::enable_if<!std::is_void<T1>::value && !std::is_void<T2>::value>::type>
{
    static const bool value = (sizeof(T1) == sizeof(T2));
};
|
|
|
|
template<class OutputIt, class CharT, class Enable = void>
|
|
struct is_compatible_output_iterator : std::false_type {};
|
|
|
|
template<class OutputIt, class CharT>
|
|
struct is_compatible_output_iterator<OutputIt,CharT,
|
|
typename std::enable_if<is_output_iterator<OutputIt,CharT>::value
|
|
&& std::is_void<typename std::iterator_traits<OutputIt>::value_type>::value
|
|
&& std::is_integral<typename OutputIt::container_type::value_type>::value
|
|
&& !std::is_void<typename OutputIt::container_type::value_type>::value
|
|
&& is_same_size<typename OutputIt::container_type::value_type,CharT>::value>::type
|
|
> : std::true_type {};
|
|
|
|
template<class OutputIt, class CharT>
|
|
struct is_compatible_output_iterator<OutputIt,CharT,
|
|
typename std::enable_if<is_output_iterator<OutputIt,CharT>::value
|
|
&& std::is_integral<typename std::iterator_traits<OutputIt>::value_type>::value
|
|
&& is_same_size<typename std::iterator_traits<OutputIt>::value_type,CharT>::value>::type
|
|
> : std::true_type {};
|
|
|
|
template<class OutputIt, class CharT>
|
|
struct is_compatible_output_iterator<OutputIt,CharT,
|
|
typename std::enable_if<is_output_iterator<OutputIt,CharT>::value
|
|
&& std::is_void<typename std::iterator_traits<OutputIt>::value_type>::value
|
|
&& is_same_size<typename OutputIt::char_type,CharT>::value>::type
|
|
> : std::true_type {};
|
|
|
|
// convert
|
|
|
|
template <class Iterator>
|
|
struct convert_result
|
|
{
|
|
Iterator it;
|
|
conv_errc ec;
|
|
};
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint8_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint8_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last, OutputIt target, conv_flags flags=conv_flags::strict)
|
|
{
|
|
(void)flags;
|
|
|
|
conv_errc result = conv_errc();
|
|
while (first != last)
|
|
{
|
|
size_t length = trailing_bytes_for_utf8[static_cast<uint8_t>(*first)] + 1;
|
|
if (length > (size_t)(last - first))
|
|
{
|
|
return convert_result<InputIt>{first, conv_errc::source_exhausted};
|
|
}
|
|
if ((result=is_legal_utf8(first, length)) != conv_errc())
|
|
{
|
|
return convert_result<InputIt>{first,result};
|
|
}
|
|
|
|
switch (length) {
|
|
case 4: *target++ = (static_cast<uint8_t>(*first++));
|
|
UNICONS_FALLTHROUGH;
|
|
case 3: *target++ = (static_cast<uint8_t>(*first++));
|
|
UNICONS_FALLTHROUGH;
|
|
case 2: *target++ = (static_cast<uint8_t>(*first++));
|
|
UNICONS_FALLTHROUGH;
|
|
case 1: *target++ = (static_cast<uint8_t>(*first++));
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint8_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint16_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last,
|
|
OutputIt target,
|
|
conv_flags flags = conv_flags::strict)
|
|
{
|
|
conv_errc result = conv_errc();
|
|
|
|
while (first != last)
|
|
{
|
|
uint32_t ch = 0;
|
|
unsigned short extra_bytes_to_read = trailing_bytes_for_utf8[static_cast<uint8_t>(*first)];
|
|
if (extra_bytes_to_read >= last - first)
|
|
{
|
|
result = conv_errc::source_exhausted;
|
|
break;
|
|
}
|
|
/* Do this check whether lenient or strict */
|
|
if ((result=is_legal_utf8(first, extra_bytes_to_read+1)) != conv_errc())
|
|
{
|
|
break;
|
|
}
|
|
/*
|
|
* The cases all fall through. See "Note A" below.
|
|
*/
|
|
switch (extra_bytes_to_read) {
|
|
case 5: ch += static_cast<uint8_t>(*first++); ch <<= 6; /* remember, illegal UTF-8 */
|
|
case 4: ch += static_cast<uint8_t>(*first++); ch <<= 6; /* remember, illegal UTF-8 */
|
|
case 3: ch += static_cast<uint8_t>(*first++); ch <<= 6;
|
|
case 2: ch += static_cast<uint8_t>(*first++); ch <<= 6;
|
|
case 1: ch += static_cast<uint8_t>(*first++); ch <<= 6;
|
|
case 0: ch += static_cast<uint8_t>(*first++);
|
|
}
|
|
ch -= offsets_from_utf8[extra_bytes_to_read];
|
|
|
|
if (ch <= max_bmp) { /* Target is a character <= 0xFFFF */
|
|
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
if (is_surrogate(ch) ) {
|
|
if (flags == conv_flags::strict) {
|
|
first -= (extra_bytes_to_read+1); /* return to the illegal value itself */
|
|
result = conv_errc::source_illegal;
|
|
break;
|
|
} else {
|
|
*target++ = (replacement_char);
|
|
}
|
|
} else {
|
|
*target++ = ((uint16_t)ch); /* normal case */
|
|
}
|
|
} else if (ch > max_utf16) {
|
|
if (flags == conv_flags::strict) {
|
|
result = conv_errc::source_illegal;
|
|
first -= (extra_bytes_to_read+1); /* return to the start */
|
|
break; /* Bail out; shouldn't continue */
|
|
} else {
|
|
*target++ = (replacement_char);
|
|
}
|
|
} else {
|
|
/* target is a character in range 0xFFFF - 0x10FFFF. */
|
|
ch -= half_base;
|
|
*target++ = ((uint16_t)((ch >> half_shift) + sur_high_start));
|
|
*target++ = ((uint16_t)((ch & half_mask) + sur_low_start));
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint8_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint32_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last,
|
|
OutputIt target,
|
|
conv_flags flags = conv_flags::strict)
|
|
{
|
|
conv_errc result = conv_errc();
|
|
|
|
while (first < last)
|
|
{
|
|
uint32_t ch = 0;
|
|
unsigned short extra_bytes_to_read = trailing_bytes_for_utf8[static_cast<uint8_t>(*first)];
|
|
if (extra_bytes_to_read >= last - first)
|
|
{
|
|
result = conv_errc::source_exhausted;
|
|
break;
|
|
}
|
|
/* Do this check whether lenient or strict */
|
|
if ((result=is_legal_utf8(first, extra_bytes_to_read+1)) != conv_errc()) {
|
|
break;
|
|
}
|
|
/*
|
|
* The cases all fall through. See "Note A" below.
|
|
*/
|
|
switch (extra_bytes_to_read)
|
|
{
|
|
case 5:
|
|
ch += static_cast<uint8_t>(*first++);
|
|
ch <<= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 4:
|
|
ch += static_cast<uint8_t>(*first++);
|
|
ch <<= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 3:
|
|
ch += static_cast<uint8_t>(*first++);
|
|
ch <<= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 2:
|
|
ch += static_cast<uint8_t>(*first++);
|
|
ch <<= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 1:
|
|
ch += static_cast<uint8_t>(*first++);
|
|
ch <<= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 0:
|
|
ch += static_cast<uint8_t>(*first++);
|
|
break;
|
|
}
|
|
ch -= offsets_from_utf8[extra_bytes_to_read];
|
|
|
|
if (ch <= max_legal_utf32) {
|
|
/*
|
|
* UTF-16 surrogate values are illegal in UTF-32, and anything
|
|
* over Plane 17 (> 0x10FFFF) is illegal.
|
|
*/
|
|
if (is_surrogate(ch) ) {
|
|
if (flags == conv_flags::strict) {
|
|
first -= (extra_bytes_to_read+1); /* return to the illegal value itself */
|
|
result = conv_errc::source_illegal;
|
|
break;
|
|
} else {
|
|
*target++ = (replacement_char);
|
|
}
|
|
} else {
|
|
*target++ = (ch);
|
|
}
|
|
} else { /* i.e., ch > max_legal_utf32 */
|
|
result = conv_errc::source_illegal;
|
|
*target++ = (replacement_char);
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
// utf16
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint16_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint8_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last,
|
|
OutputIt target,
|
|
conv_flags flags = conv_flags::strict) {
|
|
conv_errc result = conv_errc();
|
|
while (first < last) {
|
|
unsigned short bytes_to_write = 0;
|
|
const uint32_t byteMask = 0xBF;
|
|
const uint32_t byteMark = 0x80;
|
|
uint32_t ch = *first++;
|
|
/* If we have a surrogate pair, convert to uint32_t first. */
|
|
if (is_high_surrogate(ch)) {
|
|
/* If the 16 bits following the high surrogate are in the first buffer... */
|
|
if (first < last) {
|
|
uint32_t ch2 = *first;
|
|
/* If it's a low surrogate, convert to uint32_t. */
|
|
if (ch2 >= sur_low_start && ch2 <= sur_low_end) {
|
|
ch = ((ch - sur_high_start) << half_shift)
|
|
+ (ch2 - sur_low_start) + half_base;
|
|
++first;
|
|
} else if (flags == conv_flags::strict) { /* it's an unpaired high surrogate */
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::unpaired_high_surrogate;
|
|
break;
|
|
}
|
|
} else { /* We don't have the 16 bits following the high surrogate. */
|
|
--first; /* return to the high surrogate */
|
|
result = conv_errc::source_exhausted;
|
|
break;
|
|
}
|
|
} else if (flags == conv_flags::strict) {
|
|
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
if (is_low_surrogate(ch)) {
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::source_illegal;
|
|
break;
|
|
}
|
|
}
|
|
/* Figure out how many bytes the result will require */
|
|
if (ch < (uint32_t)0x80) {
|
|
bytes_to_write = 1;
|
|
} else if (ch < (uint32_t)0x800) {
|
|
bytes_to_write = 2;
|
|
} else if (ch < (uint32_t)0x10000) {
|
|
bytes_to_write = 3;
|
|
} else if (ch < (uint32_t)0x110000) {
|
|
bytes_to_write = 4;
|
|
} else {
|
|
bytes_to_write = 3;
|
|
ch = replacement_char;
|
|
}
|
|
|
|
uint8_t byte1 = 0;
|
|
uint8_t byte2 = 0;
|
|
uint8_t byte3 = 0;
|
|
uint8_t byte4 = 0;
|
|
|
|
switch (bytes_to_write) { // note: everything falls through
|
|
case 4: byte4 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 3: byte3 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 2: byte2 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
case 1: byte1 = (uint8_t)(ch | first_byte_mark[bytes_to_write]);
|
|
}
|
|
switch (bytes_to_write)
|
|
{
|
|
case 4:
|
|
*target++ = (byte1);
|
|
*target++ = (byte2);
|
|
*target++ = (byte3);
|
|
*target++ = (byte4);
|
|
break;
|
|
case 3:
|
|
*target++ = (byte1);
|
|
*target++ = (byte2);
|
|
*target++ = (byte3);
|
|
break;
|
|
case 2:
|
|
*target++ = (byte1);
|
|
*target++ = (byte2);
|
|
break;
|
|
case 1:
|
|
*target++ = (byte1);
|
|
break;
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint16_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint16_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last,
|
|
OutputIt target,
|
|
conv_flags flags = conv_flags::strict)
|
|
{
|
|
conv_errc result = conv_errc();
|
|
|
|
while (first != last)
|
|
{
|
|
uint32_t ch = *first++;
|
|
/* If we have a surrogate pair, convert to uint32_t first. */
|
|
if (is_high_surrogate(ch))
|
|
{
|
|
/* If the 16 bits following the high surrogate are in the first buffer... */
|
|
if (first < last) {
|
|
uint32_t ch2 = *first;
|
|
/* If it's a low surrogate, */
|
|
if (ch2 >= sur_low_start && ch2 <= sur_low_end) {
|
|
*target++ = ((uint16_t)ch);
|
|
*target++ = ((uint16_t)ch2);
|
|
++first;
|
|
} else if (flags == conv_flags::strict) { /* it's an unpaired high surrogate */
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::unpaired_high_surrogate;
|
|
break;
|
|
}
|
|
} else { /* We don't have the 16 bits following the high surrogate. */
|
|
--first; /* return to the high surrogate */
|
|
result = conv_errc::source_exhausted;
|
|
break;
|
|
}
|
|
} else if (is_low_surrogate(ch))
|
|
{
|
|
// illegal leading low surrogate
|
|
if (flags == conv_flags::strict) {
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::source_illegal;
|
|
break;
|
|
}
|
|
else
|
|
{
|
|
*target++ = ((uint16_t)ch);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
*target++ = ((uint16_t)ch);
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint16_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint32_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last,
|
|
OutputIt target,
|
|
conv_flags flags = conv_flags::strict)
|
|
{
|
|
conv_errc result = conv_errc();
|
|
|
|
while (first != last)
|
|
{
|
|
uint32_t ch = *first++;
|
|
/* If we have a surrogate pair, convert to UTF32 first. */
|
|
if (is_high_surrogate(ch)) {
|
|
/* If the 16 bits following the high surrogate are in the first buffer... */
|
|
if (first < last) {
|
|
uint32_t ch2 = *first;
|
|
/* If it's a low surrogate, convert to UTF32. */
|
|
if (ch2 >= sur_low_start && ch2 <= sur_low_end ) {
|
|
ch = ((ch - sur_high_start) << half_shift)
|
|
+ (ch2 - sur_low_start) + half_base;
|
|
++first;
|
|
} else if (flags == conv_flags::strict) { /* it's an unpaired high surrogate */
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::source_illegal;
|
|
break;
|
|
}
|
|
} else { /* We don't have the 16 bits following the high surrogate. */
|
|
--first; /* return to the high surrogate */
|
|
result = conv_errc::source_exhausted;
|
|
break;
|
|
}
|
|
} else if (flags == conv_flags::strict) {
|
|
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
if (is_low_surrogate(ch) ) {
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::source_illegal;
|
|
break;
|
|
}
|
|
}
|
|
*target++ = (ch);
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
// utf32
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint32_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint8_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last,
|
|
OutputIt target,
|
|
conv_flags flags = conv_flags::strict)
|
|
{
|
|
conv_errc result = conv_errc();
|
|
while (first < last) {
|
|
unsigned short bytes_to_write = 0;
|
|
const uint32_t byteMask = 0xBF;
|
|
const uint32_t byteMark = 0x80;
|
|
uint32_t ch = *first++;
|
|
if (flags == conv_flags::strict ) {
|
|
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
if (is_surrogate(ch)) {
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::illegal_surrogate_value;
|
|
break;
|
|
}
|
|
}
|
|
/*
|
|
* Figure out how many bytes the result will require. Turn any
|
|
* illegally large UTF32 things (> Plane 17) into replacement chars.
|
|
*/
|
|
if (ch < (uint32_t)0x80) { bytes_to_write = 1;
|
|
} else if (ch < (uint32_t)0x800) { bytes_to_write = 2;
|
|
} else if (ch < (uint32_t)0x10000) { bytes_to_write = 3;
|
|
} else if (ch <= max_legal_utf32) { bytes_to_write = 4;
|
|
} else {
|
|
bytes_to_write = 3;
|
|
ch = replacement_char;
|
|
result = conv_errc::source_illegal;
|
|
}
|
|
|
|
uint8_t byte1 = 0;
|
|
uint8_t byte2 = 0;
|
|
uint8_t byte3 = 0;
|
|
uint8_t byte4 = 0;
|
|
|
|
switch (bytes_to_write) {
|
|
case 4:
|
|
byte4 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 3:
|
|
byte3 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 2:
|
|
byte2 = (uint8_t)((ch | byteMark) & byteMask); ch >>= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 1:
|
|
byte1 = (uint8_t) (ch | first_byte_mark[bytes_to_write]);
|
|
}
|
|
|
|
switch (bytes_to_write)
|
|
{
|
|
case 4:
|
|
*target++ = (byte1);
|
|
*target++ = (byte2);
|
|
*target++ = (byte3);
|
|
*target++ = (byte4);
|
|
break;
|
|
case 3:
|
|
*target++ = (byte1);
|
|
*target++ = (byte2);
|
|
*target++ = (byte3);
|
|
break;
|
|
case 2:
|
|
*target++ = (byte1);
|
|
*target++ = (byte2);
|
|
break;
|
|
case 1:
|
|
*target++ = (byte1);
|
|
break;
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint32_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint16_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last,
|
|
OutputIt target,
|
|
conv_flags flags = conv_flags::strict)
|
|
{
|
|
conv_errc result = conv_errc();
|
|
|
|
while (first != last)
|
|
{
|
|
uint32_t ch = *first++;
|
|
if (ch <= max_bmp) { /* Target is a character <= 0xFFFF */
|
|
/* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
|
|
if (is_surrogate(ch) ) {
|
|
if (flags == conv_flags::strict) {
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::source_illegal;
|
|
break;
|
|
} else {
|
|
*target++ = (replacement_char);
|
|
}
|
|
} else {
|
|
*target++ = ((uint16_t)ch); /* normal case */
|
|
}
|
|
} else if (ch > max_legal_utf32) {
|
|
if (flags == conv_flags::strict) {
|
|
result = conv_errc::source_illegal;
|
|
} else {
|
|
*target++ = (replacement_char);
|
|
}
|
|
} else {
|
|
/* target is a character in range 0xFFFF - 0x10FFFF. */
|
|
ch -= half_base;
|
|
*target++ = ((uint16_t)((ch >> half_shift) + sur_high_start));
|
|
*target++ = ((uint16_t)((ch & half_mask) + sur_low_start));
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
template <class InputIt,class OutputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint32_t)
|
|
&& is_compatible_output_iterator<OutputIt,uint32_t>::value,convert_result<InputIt>>::type
|
|
convert(InputIt first, InputIt last,
|
|
OutputIt target,
|
|
conv_flags flags = conv_flags::strict)
|
|
{
|
|
conv_errc result = conv_errc();
|
|
|
|
while (first != last)
|
|
{
|
|
uint32_t ch = *first++;
|
|
if (flags == conv_flags::strict ) {
|
|
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
if (is_surrogate(ch)) {
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::illegal_surrogate_value;
|
|
break;
|
|
}
|
|
}
|
|
if (ch <= max_legal_utf32)
|
|
{
|
|
*target++ = (ch);
|
|
}
|
|
else
|
|
{
|
|
*target++ = (replacement_char);
|
|
result = conv_errc::source_illegal;
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
// validate
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint8_t)
|
|
,convert_result<InputIt>>::type
|
|
validate(InputIt first, InputIt last) UNICONS_NOEXCEPT
|
|
{
|
|
conv_errc result = conv_errc();
|
|
while (first != last)
|
|
{
|
|
size_t length = trailing_bytes_for_utf8[static_cast<uint8_t>(*first)] + 1;
|
|
if (length > (size_t)(last - first))
|
|
{
|
|
return convert_result<InputIt>{first, conv_errc::source_exhausted};
|
|
}
|
|
if ((result=is_legal_utf8(first, length)) != conv_errc())
|
|
{
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
first += length;
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
// utf16
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint16_t)
|
|
,convert_result<InputIt>>::type
|
|
validate(InputIt first, InputIt last) UNICONS_NOEXCEPT
|
|
{
|
|
conv_errc result = conv_errc();
|
|
|
|
while (first != last)
|
|
{
|
|
uint32_t ch = *first++;
|
|
/* If we have a surrogate pair, validate to uint32_t first. */
|
|
if (is_high_surrogate(ch))
|
|
{
|
|
/* If the 16 bits following the high surrogate are in the first buffer... */
|
|
if (first < last) {
|
|
uint32_t ch2 = *first;
|
|
/* If it's a low surrogate, */
|
|
if (ch2 >= sur_low_start && ch2 <= sur_low_end) {
|
|
++first;
|
|
} else {
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::unpaired_high_surrogate;
|
|
break;
|
|
}
|
|
} else { /* We don't have the 16 bits following the high surrogate. */
|
|
--first; /* return to the high surrogate */
|
|
result = conv_errc::source_exhausted;
|
|
break;
|
|
}
|
|
} else if (is_low_surrogate(ch))
|
|
{
|
|
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::source_illegal;
|
|
break;
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
|
|
// utf32
|
|
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint32_t)
|
|
,convert_result<InputIt>>::type
|
|
validate(InputIt first, InputIt last) UNICONS_NOEXCEPT
|
|
{
|
|
conv_errc result = conv_errc();
|
|
|
|
while (first != last)
|
|
{
|
|
uint32_t ch = *first++;
|
|
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
if (is_surrogate(ch)) {
|
|
--first; /* return to the illegal value itself */
|
|
result = conv_errc::illegal_surrogate_value;
|
|
break;
|
|
}
|
|
if (!(ch <= max_legal_utf32))
|
|
{
|
|
result = conv_errc::source_illegal;
|
|
}
|
|
}
|
|
return convert_result<InputIt>{first,result} ;
|
|
}
|
|
|
|
// sequence
|
|
|
|
template <class Iterator>
|
|
class sequence
|
|
{
|
|
Iterator first_;
|
|
size_t length_;
|
|
public:
|
|
sequence(Iterator first, size_t length)
|
|
: first_(first), length_(length)
|
|
{
|
|
}
|
|
|
|
Iterator begin() const
|
|
{
|
|
return first_;
|
|
}
|
|
|
|
size_t length() const
|
|
{
|
|
return length_;
|
|
}
|
|
|
|
template <class CharT = typename std::iterator_traits<Iterator>::value_type>
|
|
typename std::enable_if<sizeof(CharT) == sizeof(uint8_t),uint32_t>::type
|
|
codepoint() const UNICONS_NOEXCEPT
|
|
{
|
|
uint32_t ch = 0;
|
|
Iterator it = first_;
|
|
switch (length_)
|
|
{
|
|
default:
|
|
return replacement_char;
|
|
break;
|
|
case 4:
|
|
ch += static_cast<uint8_t>(*it++); ch <<= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 3:
|
|
ch += static_cast<uint8_t>(*it++); ch <<= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 2:
|
|
ch += static_cast<uint8_t>(*it++); ch <<= 6;
|
|
UNICONS_FALLTHROUGH;
|
|
case 1:
|
|
ch += static_cast<uint8_t>(*it++);
|
|
ch -= offsets_from_utf8[length_ - 1];
|
|
break;
|
|
}
|
|
if (ch <= max_legal_utf32)
|
|
{
|
|
if (is_surrogate(ch))
|
|
{
|
|
ch = replacement_char;
|
|
}
|
|
}
|
|
else // ch > max_legal_utf32
|
|
{
|
|
ch = replacement_char;
|
|
}
|
|
return ch;
|
|
}
|
|
|
|
template <class CharT = typename std::iterator_traits<Iterator>::value_type>
|
|
typename std::enable_if<sizeof(CharT) == sizeof(uint16_t),uint32_t>::type
|
|
codepoint() const UNICONS_NOEXCEPT
|
|
{
|
|
if (length_ == 0)
|
|
{
|
|
return replacement_char;
|
|
}
|
|
if (length_ == 2)
|
|
{
|
|
uint32_t ch = *first_;
|
|
uint32_t ch2 = *(first_+ 1);
|
|
ch = ((ch - sur_high_start) << half_shift)
|
|
+ (ch2 - sur_low_start) + half_base;
|
|
return ch;
|
|
}
|
|
else
|
|
{
|
|
return *first_;
|
|
}
|
|
}
|
|
|
|
template <class CharT = typename std::iterator_traits<Iterator>::value_type>
|
|
typename std::enable_if<sizeof(CharT) == sizeof(uint32_t),uint32_t>::type
|
|
codepoint() const UNICONS_NOEXCEPT
|
|
{
|
|
if (length_ == 0)
|
|
{
|
|
return replacement_char;
|
|
}
|
|
return *(first_);
|
|
}
|
|
};
|
|
|
|
// sequence_generator
|
|
|
|
template <class Iterator>
|
|
class sequence_generator
|
|
{
|
|
Iterator begin_;
|
|
Iterator last_;
|
|
conv_flags flags_;
|
|
size_t length_;
|
|
conv_errc err_cd_;
|
|
public:
|
|
typedef sequence<Iterator> sequence_type;
|
|
|
|
sequence_generator(Iterator first, Iterator last,
|
|
conv_flags flags = conv_flags::strict) UNICONS_NOEXCEPT
|
|
: begin_(first), last_(last), flags_(flags),
|
|
length_(0), err_cd_(conv_errc())
|
|
{
|
|
next();
|
|
}
|
|
|
|
bool done() const UNICONS_NOEXCEPT
|
|
{
|
|
return err_cd_ != conv_errc() || begin_ == last_;
|
|
}
|
|
|
|
conv_errc status() const UNICONS_NOEXCEPT
|
|
{
|
|
return err_cd_;
|
|
}
|
|
|
|
sequence_type get() const UNICONS_NOEXCEPT
|
|
{
|
|
return sequence<Iterator>(begin_,length_);
|
|
}
|
|
|
|
template <class CharT = typename std::iterator_traits<Iterator>::value_type>
|
|
typename std::enable_if<sizeof(CharT) == sizeof(uint8_t)>::type
|
|
next() UNICONS_NOEXCEPT
|
|
{
|
|
begin_ += length_;
|
|
if (begin_ != last_)
|
|
{
|
|
size_t length = trailing_bytes_for_utf8[static_cast<uint8_t>(*begin_)] + 1;
|
|
if (length > (size_t)(last_ - begin_))
|
|
{
|
|
err_cd_ = conv_errc::source_exhausted;
|
|
}
|
|
else if ((err_cd_ = is_legal_utf8(begin_, length)) != conv_errc())
|
|
{
|
|
}
|
|
else
|
|
{
|
|
length_ = length;
|
|
}
|
|
}
|
|
}
|
|
|
|
template <class CharT = typename std::iterator_traits<Iterator>::value_type>
|
|
typename std::enable_if<sizeof(CharT) == sizeof(uint16_t)>::type
|
|
next() UNICONS_NOEXCEPT
|
|
{
|
|
begin_ += length_;
|
|
if (begin_ != last_)
|
|
{
|
|
if (begin_ != last_)
|
|
{
|
|
|
|
Iterator it = begin_;
|
|
|
|
uint32_t ch = *it++;
|
|
/* If we have a surrogate pair, validate to uint32_t it. */
|
|
if (is_high_surrogate(ch))
|
|
{
|
|
/* If the 16 bits following the high surrogate are in the it buffer... */
|
|
if (it < last_) {
|
|
uint32_t ch2 = *it;
|
|
/* If it's a low surrogate, */
|
|
if (ch2 >= sur_low_start && ch2 <= sur_low_end)
|
|
{
|
|
++it;
|
|
length_ = 2;
|
|
}
|
|
else
|
|
{
|
|
err_cd_ = conv_errc::unpaired_high_surrogate;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// We don't have the 16 bits following the high surrogate.
|
|
err_cd_ = conv_errc::source_exhausted;
|
|
}
|
|
}
|
|
else if (is_low_surrogate(ch))
|
|
{
|
|
/* leading low surrogate */
|
|
err_cd_ = conv_errc::source_illegal;
|
|
}
|
|
else
|
|
{
|
|
length_ = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <class CharT = typename std::iterator_traits<Iterator>::value_type>
|
|
typename std::enable_if<sizeof(CharT) == sizeof(uint32_t)>::type
|
|
next() UNICONS_NOEXCEPT
|
|
{
|
|
begin_ += length_;
|
|
length_ = 1;
|
|
}
|
|
};
|
|
|
|
template <class Iterator>
|
|
sequence_generator<Iterator> make_sequence_generator(Iterator first, Iterator last,
|
|
conv_flags flags = conv_flags::strict)
|
|
{
|
|
return sequence_generator<Iterator>(first, last, flags);
|
|
}
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value
|
|
&& (sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint8_t) || sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint16_t)),
|
|
sequence<InputIt>>::type
|
|
sequence_at(InputIt first, InputIt last, size_t index)
|
|
{
|
|
sequence_generator<InputIt> g(first, last, unicons::conv_flags::strict);
|
|
|
|
size_t count = 0;
|
|
while (!g.done() && count < index)
|
|
{
|
|
g.next();
|
|
++count;
|
|
}
|
|
return (!g.done() && count == index) ? g.get() : sequence<InputIt>(last,0);
|
|
}
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint32_t),
|
|
sequence<InputIt>>::type
|
|
sequence_at(InputIt first, InputIt last, size_t index)
|
|
{
|
|
size_t size = std::distance(first,last);
|
|
return index < size ? sequence<InputIt>(first+index,1) : sequence<InputIt>(last,0);
|
|
}
|
|
|
|
// u8_length
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint8_t),size_t>::type
|
|
u8_length(InputIt first, InputIt last) UNICONS_NOEXCEPT
|
|
{
|
|
return std::distance(first,last);
|
|
}
|
|
|
|
// utf16
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint16_t),size_t>::type
|
|
u8_length(InputIt first, InputIt last) UNICONS_NOEXCEPT
|
|
{
|
|
conv_flags flags = conv_flags::strict;
|
|
size_t count = 0;
|
|
for (InputIt p = first; p != last; ++p)
|
|
{
|
|
uint32_t ch = *p;
|
|
if (is_high_surrogate(ch)) {
|
|
/* If the 16 bits following the high surrogate are in the p buffer... */
|
|
if (p < last) {
|
|
uint32_t ch2 = *(++p);
|
|
/* If it's a low surrogate, convert to uint32_t. */
|
|
if (ch2 >= sur_low_start && ch2 <= sur_low_end) {
|
|
ch = ((ch - sur_high_start) << half_shift)
|
|
+ (ch2 - sur_low_start) + half_base;
|
|
|
|
} else if (flags == conv_flags::strict) { /* it's an unpaired high surrogate */
|
|
break;
|
|
}
|
|
} else { /* We don't have the 16 bits following the high surrogate. */
|
|
break;
|
|
}
|
|
} else if (flags == conv_flags::strict) {
|
|
/* UTF-16 surrogate values are illegal in UTF-32 */
|
|
if (is_low_surrogate(ch)) {
|
|
break;
|
|
}
|
|
}
|
|
if (ch < (uint32_t)0x80) {
|
|
++count;
|
|
} else if (ch < (uint32_t)0x800) {
|
|
count += 2;
|
|
} else if (ch < (uint32_t)0x10000) {
|
|
count += 3;
|
|
} else if (ch < (uint32_t)0x110000) {
|
|
count += 4;
|
|
} else {
|
|
count += 3;
|
|
}
|
|
}
|
|
return count;
|
|
}
|
|
|
|
|
|
// utf32
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint32_t),size_t>::type
|
|
u8_length(InputIt first, InputIt last) UNICONS_NOEXCEPT
|
|
{
|
|
size_t count = 0;
|
|
for (InputIt p = first; p < last; ++p)
|
|
{
|
|
uint32_t ch = *p;
|
|
if (ch < (uint32_t)0x80) {
|
|
++count;
|
|
} else if (ch < (uint32_t)0x800) {
|
|
count += 2;
|
|
} else if (ch < (uint32_t)0x10000) {
|
|
count += 3;
|
|
} else if (ch <= max_legal_utf32) {
|
|
count += 4;
|
|
} else {
|
|
count += 3;
|
|
}
|
|
}
|
|
return count;
|
|
}
|
|
|
|
// u32_length
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value
|
|
&& (sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint8_t) || sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint16_t)),
|
|
size_t>::type
|
|
u32_length(InputIt first, InputIt last) UNICONS_NOEXCEPT
|
|
{
|
|
sequence_generator<InputIt> g(first, last, unicons::conv_flags::strict);
|
|
|
|
size_t count = 0;
|
|
while (!g.done())
|
|
{
|
|
g.next();
|
|
++count;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
template <class InputIt>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<InputIt>::value_type>::value && sizeof(typename std::iterator_traits<InputIt>::value_type) == sizeof(uint32_t),
|
|
size_t>::type
|
|
u32_length(InputIt first, InputIt last) UNICONS_NOEXCEPT
|
|
{
|
|
return std::distance(first,last);
|
|
}
|
|
|
|
/// Encodings that detect_encoding can report; `undetected` means no decision was reached.
enum class encoding
{
    u8,
    u16le,
    u16be,
    u32le,
    u32be,
    undetected
};
|
|
|
|
template <class Iterator>
|
|
struct detect_encoding_result
|
|
{
|
|
Iterator it;
|
|
encoding ec;
|
|
};
|
|
|
|
template <class Iterator>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<Iterator>::value_type>::value && sizeof(typename std::iterator_traits<Iterator>::value_type) == sizeof(uint8_t),
|
|
detect_encoding_result<Iterator>>::type
|
|
detect_encoding(Iterator first, Iterator last) UNICONS_NOEXCEPT
|
|
{
|
|
Iterator it1 = first;
|
|
if (std::distance(first,last) < 4)
|
|
{
|
|
if (std::distance(first,last) == 3)
|
|
{
|
|
Iterator it2 = ++first;
|
|
Iterator it3 = ++first;
|
|
if (static_cast<uint8_t>(*it1) == 0xEF && static_cast<uint8_t>(*it2) == 0xBB && static_cast<uint8_t>(*it3) == 0xBF)
|
|
{
|
|
return detect_encoding_result<Iterator>{last,encoding::u8};
|
|
}
|
|
}
|
|
return detect_encoding_result<Iterator>{it1,encoding::undetected};
|
|
}
|
|
else
|
|
{
|
|
Iterator it2 = ++first;
|
|
Iterator it3 = ++first;
|
|
Iterator it4 = ++first;
|
|
|
|
uint32_t bom = static_cast<uint8_t>(*it1) | (static_cast<uint8_t>(*it2) << 8) | (static_cast<uint8_t>(*it3) << 16) | (static_cast<uint8_t>(*it4) << 24);
|
|
if (bom == 0xFFFE0000)
|
|
{
|
|
return detect_encoding_result<Iterator>{it4++,encoding::u32be};
|
|
}
|
|
else if (bom == 0x0000FEFF)
|
|
{
|
|
return detect_encoding_result<Iterator>{first,encoding::u32le};
|
|
}
|
|
else if ((bom & 0xFFFF) == 0xFFFE)
|
|
{
|
|
return detect_encoding_result<Iterator>{it3,encoding::u16be};
|
|
}
|
|
else if ((bom & 0xFFFF) == 0xFEFF)
|
|
{
|
|
return detect_encoding_result<Iterator>{it3,encoding::u16le};
|
|
}
|
|
else if ((bom & 0xFFFFFF) == 0xBFBBEF)
|
|
{
|
|
return detect_encoding_result<Iterator>{it4,encoding::u8};
|
|
}
|
|
else
|
|
{
|
|
uint32_t pattern = (static_cast<uint8_t>(*it1) ? 1 : 0) | (static_cast<uint8_t>(*it2) ? 2 : 0) | (static_cast<uint8_t>(*it3) ? 4 : 0) | (static_cast<uint8_t>(*it4) ? 8 : 0);
|
|
switch (pattern) {
|
|
case 0x08:
|
|
return detect_encoding_result<Iterator>{it1,encoding::u32be};
|
|
case 0x0A:
|
|
return detect_encoding_result<Iterator>{it1,encoding::u16be};
|
|
case 0x01:
|
|
return detect_encoding_result<Iterator>{it1,encoding::u32le};
|
|
case 0x05:
|
|
return detect_encoding_result<Iterator>{it1,encoding::u16le};
|
|
case 0x0F:
|
|
return detect_encoding_result<Iterator>{it1,encoding::u8};
|
|
default:
|
|
return detect_encoding_result<Iterator>{it1,encoding::undetected};
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
template <class Iterator>
|
|
struct skip_bom_result
|
|
{
|
|
Iterator it;
|
|
encoding_errc ec;
|
|
};
|
|
|
|
template <class Iterator>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<Iterator>::value_type>::value && sizeof(typename std::iterator_traits<Iterator>::value_type) == sizeof(uint8_t),
|
|
skip_bom_result<Iterator>>::type
|
|
skip_bom(Iterator first, Iterator last) UNICONS_NOEXCEPT
|
|
{
|
|
auto result = unicons::detect_encoding(first,last);
|
|
switch (result.ec)
|
|
{
|
|
case unicons::encoding::u8:
|
|
return skip_bom_result<Iterator>{result.it,encoding_errc()};
|
|
break;
|
|
case unicons::encoding::u16le:
|
|
case unicons::encoding::u16be:
|
|
return skip_bom_result<Iterator>{result.it,encoding_errc::expected_u8_found_u16};
|
|
break;
|
|
case unicons::encoding::u32le:
|
|
case unicons::encoding::u32be:
|
|
return skip_bom_result<Iterator>{result.it,encoding_errc::expected_u8_found_u32};
|
|
break;
|
|
default:
|
|
return skip_bom_result<Iterator>{result.it,encoding_errc()};
|
|
break;
|
|
}
|
|
}
|
|
|
|
template <class Iterator>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<Iterator>::value_type>::value && sizeof(typename std::iterator_traits<Iterator>::value_type) == sizeof(uint16_t),
|
|
skip_bom_result<Iterator>>::type
|
|
skip_bom(Iterator first, Iterator last) UNICONS_NOEXCEPT
|
|
{
|
|
if (first == last)
|
|
{
|
|
return skip_bom_result<Iterator>{first,encoding_errc()};
|
|
}
|
|
uint16_t bom = static_cast<uint16_t>(*first);
|
|
if (bom == 0xFEFF)
|
|
{
|
|
return skip_bom_result<Iterator>{++first,encoding_errc()};
|
|
}
|
|
else if (bom == 0xFFFE)
|
|
{
|
|
return skip_bom_result<Iterator>{last,encoding_errc::expected_u16_found_fffe};
|
|
}
|
|
else
|
|
{
|
|
return skip_bom_result<Iterator>{first,encoding_errc()};
|
|
}
|
|
}
|
|
|
|
template <class Iterator>
|
|
typename std::enable_if<std::is_integral<typename std::iterator_traits<Iterator>::value_type>::value && sizeof(typename std::iterator_traits<Iterator>::value_type) == sizeof(uint32_t),
|
|
skip_bom_result<Iterator>>::type
|
|
skip_bom(Iterator first, Iterator last) UNICONS_NOEXCEPT
|
|
{
|
|
if (first == last)
|
|
{
|
|
return skip_bom_result<Iterator>{first,encoding_errc()};
|
|
}
|
|
uint32_t bom = static_cast<uint32_t>(*first);
|
|
if (bom == 0xFEFF0000)
|
|
{
|
|
return skip_bom_result<Iterator>{++first,encoding_errc()};
|
|
}
|
|
else if (bom == 0xFFFE0000)
|
|
{
|
|
return skip_bom_result<Iterator>{last,encoding_errc::expected_u32_found_fffe};
|
|
}
|
|
else
|
|
{
|
|
return skip_bom_result<Iterator>{first,encoding_errc()};
|
|
}
|
|
}
|
|
|
|
} // unicons
|
|
} // jsoncons
|
|
|
|
namespace std {
|
|
template<>
|
|
struct is_error_code_enum<jsoncons::unicons::conv_errc> : public true_type
|
|
{
|
|
};
|
|
template<>
|
|
struct is_error_code_enum<jsoncons::unicons::encoding_errc> : public true_type
|
|
{
|
|
};
|
|
}
|
|
|
|
#endif
|
|
|