kun yu 77e1ddd81b branch-0.4.0
Former-commit-id: a4df63653202df32d0b983de27f5c969905d17ac
2019-07-30 10:23:34 +08:00

513 lines
17 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This is a private header for string-to-number parsing utilitiers
#ifndef ARROW_UTIL_PARSING_H
#define ARROW_UTIL_PARSING_H
#include <cassert>
#include <chrono>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>
#include <double-conversion/double-conversion.h>
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/config.h"
#include "arrow/vendored/datetime.h"
namespace arrow {
namespace internal {
/// \brief A class providing conversion from strings to some Arrow data types
///
/// Conversion is triggered by calling operator(). It returns true on
/// success, false on failure.
///
/// The class may have a non-trivial construction cost in some cases,
/// so it's recommended to use a single instance many times, if doing bulk
/// conversion. Instances of this class are not guaranteed to be thread-safe.
///
template <typename ARROW_TYPE, typename Enable = void>
class StringConverter;
template <>
class StringConverter<BooleanType> {
public:
explicit StringConverter(const std::shared_ptr<DataType>& = NULLPTR) {}
using value_type = bool;
bool operator()(const char* s, size_t length, value_type* out) {
if (length == 1) {
// "0" or "1"?
if (s[0] == '0') {
*out = false;
return true;
}
if (s[0] == '1') {
*out = true;
return true;
}
return false;
}
if (length == 4) {
// "true"?
*out = true;
return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
(s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
}
if (length == 5) {
// "false"?
*out = false;
return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
(s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
(s[4] == 'e' || s[4] == 'E'));
}
return false;
}
};
// Ideas for faster float parsing:
// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
// - https://github.com/google/double-conversion [used here]
// - https://github.com/achan001/dtoa-fast
template <class ARROW_TYPE>
class StringToFloatConverterMixin {
public:
using value_type = typename ARROW_TYPE::c_type;
explicit StringToFloatConverterMixin(const std::shared_ptr<DataType>& = NULLPTR)
: main_converter_(flags_, main_junk_value_, main_junk_value_, "inf", "nan"),
fallback_converter_(flags_, fallback_junk_value_, fallback_junk_value_, "inf",
"nan") {}
bool operator()(const char* s, size_t length, value_type* out) {
value_type v;
// double-conversion doesn't give us an error flag but signals parse
// errors with sentinel values. Since a sentinel value can appear as
// legitimate input, we fallback on a second converter with a different
// sentinel to eliminate false errors.
TryConvert(main_converter_, s, length, &v);
if (ARROW_PREDICT_FALSE(v == static_cast<value_type>(main_junk_value_))) {
TryConvert(fallback_converter_, s, length, &v);
if (ARROW_PREDICT_FALSE(v == static_cast<value_type>(fallback_junk_value_))) {
return false;
}
}
*out = v;
return true;
}
protected:
// This is only support in double-conversion 3.1+
#ifdef DOUBLE_CONVERSION_HAS_CASE_INSENSIBILITY
static const int flags_ =
double_conversion::StringToDoubleConverter::ALLOW_CASE_INSENSIBILITY;
#else
static const int flags_ = double_conversion::StringToDoubleConverter::NO_FLAGS;
#endif
// Two unlikely values to signal a parsing error
static constexpr double main_junk_value_ = 0.7066424364107089;
static constexpr double fallback_junk_value_ = 0.40088499148279166;
double_conversion::StringToDoubleConverter main_converter_;
double_conversion::StringToDoubleConverter fallback_converter_;
inline void TryConvert(double_conversion::StringToDoubleConverter& converter,
const char* s, size_t length, float* out) {
int processed_length;
*out = converter.StringToFloat(s, static_cast<int>(length), &processed_length);
}
inline void TryConvert(double_conversion::StringToDoubleConverter& converter,
const char* s, size_t length, double* out) {
int processed_length;
*out = converter.StringToDouble(s, static_cast<int>(length), &processed_length);
}
};
template <>
class StringConverter<FloatType> : public StringToFloatConverterMixin<FloatType> {
using StringToFloatConverterMixin<FloatType>::StringToFloatConverterMixin;
};
template <>
class StringConverter<DoubleType> : public StringToFloatConverterMixin<DoubleType> {
using StringToFloatConverterMixin<DoubleType>::StringToFloatConverterMixin;
};
// NOTE: HalfFloatType would require a half<->float conversion library
namespace detail {
inline uint8_t ParseDecimalDigit(char c) { return static_cast<uint8_t>(c - '0'); }
#define PARSE_UNSIGNED_ITERATION(C_TYPE) \
if (length > 0) { \
uint8_t digit = ParseDecimalDigit(*s++); \
result = static_cast<C_TYPE>(result * 10U); \
length--; \
if (ARROW_PREDICT_FALSE(digit > 9U)) { \
/* Non-digit */ \
return false; \
} \
result = static_cast<C_TYPE>(result + digit); \
}
#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE) \
if (length > 0) { \
if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \
/* Overflow */ \
return false; \
} \
uint8_t digit = ParseDecimalDigit(*s++); \
result = static_cast<C_TYPE>(result * 10U); \
C_TYPE new_result = static_cast<C_TYPE>(result + digit); \
if (ARROW_PREDICT_FALSE(--length > 0)) { \
/* Too many digits */ \
return false; \
} \
if (ARROW_PREDICT_FALSE(digit > 9U)) { \
/* Non-digit */ \
return false; \
} \
if (ARROW_PREDICT_FALSE(new_result < result)) { \
/* Overflow */ \
return false; \
} \
result = new_result; \
}
inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) {
uint8_t result = 0;
PARSE_UNSIGNED_ITERATION(uint8_t);
PARSE_UNSIGNED_ITERATION(uint8_t);
PARSE_UNSIGNED_ITERATION_LAST(uint8_t);
*out = result;
return true;
}
inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) {
uint16_t result = 0;
PARSE_UNSIGNED_ITERATION(uint16_t);
PARSE_UNSIGNED_ITERATION(uint16_t);
PARSE_UNSIGNED_ITERATION(uint16_t);
PARSE_UNSIGNED_ITERATION(uint16_t);
PARSE_UNSIGNED_ITERATION_LAST(uint16_t);
*out = result;
return true;
}
inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) {
uint32_t result = 0;
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION(uint32_t);
PARSE_UNSIGNED_ITERATION_LAST(uint32_t);
*out = result;
return true;
}
inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
uint64_t result = 0;
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION(uint64_t);
PARSE_UNSIGNED_ITERATION_LAST(uint64_t);
*out = result;
return true;
}
#undef PARSE_UNSIGNED_ITERATION
#undef PARSE_UNSIGNED_ITERATION_LAST
} // namespace detail
template <class ARROW_TYPE>
class StringToUnsignedIntConverterMixin {
public:
using value_type = typename ARROW_TYPE::c_type;
explicit StringToUnsignedIntConverterMixin(const std::shared_ptr<DataType>& = NULLPTR) {
}
bool operator()(const char* s, size_t length, value_type* out) {
if (ARROW_PREDICT_FALSE(length == 0)) {
return false;
}
// Skip leading zeros
while (length > 0 && *s == '0') {
length--;
s++;
}
return detail::ParseUnsigned(s, length, out);
}
};
template <>
class StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {
using StringToUnsignedIntConverterMixin<UInt8Type>::StringToUnsignedIntConverterMixin;
};
template <>
class StringConverter<UInt16Type> : public StringToUnsignedIntConverterMixin<UInt16Type> {
using StringToUnsignedIntConverterMixin<UInt16Type>::StringToUnsignedIntConverterMixin;
};
template <>
class StringConverter<UInt32Type> : public StringToUnsignedIntConverterMixin<UInt32Type> {
using StringToUnsignedIntConverterMixin<UInt32Type>::StringToUnsignedIntConverterMixin;
};
template <>
class StringConverter<UInt64Type> : public StringToUnsignedIntConverterMixin<UInt64Type> {
using StringToUnsignedIntConverterMixin<UInt64Type>::StringToUnsignedIntConverterMixin;
};
template <class ARROW_TYPE>
class StringToSignedIntConverterMixin {
public:
using value_type = typename ARROW_TYPE::c_type;
using unsigned_type = typename std::make_unsigned<value_type>::type;
explicit StringToSignedIntConverterMixin(const std::shared_ptr<DataType>& = NULLPTR) {}
bool operator()(const char* s, size_t length, value_type* out) {
static constexpr unsigned_type max_positive =
static_cast<unsigned_type>(std::numeric_limits<value_type>::max());
// Assuming two's complement
static constexpr unsigned_type max_negative = max_positive + 1;
bool negative = false;
unsigned_type unsigned_value = 0;
if (ARROW_PREDICT_FALSE(length == 0)) {
return false;
}
if (*s == '-') {
negative = true;
s++;
if (--length == 0) {
return false;
}
}
// Skip leading zeros
while (length > 0 && *s == '0') {
length--;
s++;
}
if (!ARROW_PREDICT_TRUE(detail::ParseUnsigned(s, length, &unsigned_value))) {
return false;
}
if (negative) {
if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) {
return false;
}
// To avoid both compiler warnings (with unsigned negation)
// and undefined behaviour (with signed negation overflow),
// use the expanded formula for 2's complement negation.
*out = static_cast<value_type>(~unsigned_value + 1);
} else {
if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) {
return false;
}
*out = static_cast<value_type>(unsigned_value);
}
return true;
}
};
template <>
class StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {
using StringToSignedIntConverterMixin<Int8Type>::StringToSignedIntConverterMixin;
};
template <>
class StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {
using StringToSignedIntConverterMixin<Int16Type>::StringToSignedIntConverterMixin;
};
template <>
class StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {
using StringToSignedIntConverterMixin<Int32Type>::StringToSignedIntConverterMixin;
};
template <>
class StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {
using StringToSignedIntConverterMixin<Int64Type>::StringToSignedIntConverterMixin;
};
template <>
class StringConverter<TimestampType> {
public:
using value_type = TimestampType::c_type;
explicit StringConverter(const std::shared_ptr<DataType>& type)
: unit_(checked_cast<TimestampType*>(type.get())->unit()) {}
bool operator()(const char* s, size_t length, value_type* out) {
// We allow the following formats:
// - "YYYY-MM-DD"
// - "YYYY-MM-DD[ T]hh:mm:ss"
// - "YYYY-MM-DD[ T]hh:mm:ssZ"
// UTC is always assumed, and the DataType's timezone is ignored.
arrow_vendored::date::year_month_day ymd;
if (ARROW_PREDICT_FALSE(length < 10)) {
return false;
}
if (length == 10) {
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
return ConvertTimePoint(arrow_vendored::date::sys_days(ymd), out);
}
if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
return false;
}
if (s[length - 1] == 'Z') {
--length;
}
if (length == 19) {
if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &ymd))) {
return false;
}
std::chrono::duration<value_type> seconds;
if (ARROW_PREDICT_FALSE(!ParseHH_MM_SS(s + 11, &seconds))) {
return false;
}
return ConvertTimePoint(arrow_vendored::date::sys_days(ymd) + seconds, out);
}
return false;
}
protected:
template <class TimePoint>
bool ConvertTimePoint(TimePoint tp, value_type* out) {
auto duration = tp.time_since_epoch();
switch (unit_) {
case TimeUnit::SECOND:
*out = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
return true;
case TimeUnit::MILLI:
*out = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
return true;
case TimeUnit::MICRO:
*out = std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
return true;
case TimeUnit::NANO:
*out = std::chrono::duration_cast<std::chrono::nanoseconds>(duration).count();
return true;
}
// Unreachable, but suppress compiler warning
assert(0);
*out = 0;
return true;
}
bool ParseYYYY_MM_DD(const char* s, arrow_vendored::date::year_month_day* out) {
uint16_t year;
uint8_t month, day;
if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
return false;
}
if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 4, &year))) {
return false;
}
if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 5, 2, &month))) {
return false;
}
if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 8, 2, &day))) {
return false;
}
*out = {arrow_vendored::date::year{year}, arrow_vendored::date::month{month},
arrow_vendored::date::day{day}};
return out->ok();
}
bool ParseHH_MM_SS(const char* s, std::chrono::duration<value_type>* out) {
uint8_t hours, minutes, seconds;
if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
return false;
}
if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 0, 2, &hours))) {
return false;
}
if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 3, 2, &minutes))) {
return false;
}
if (ARROW_PREDICT_FALSE(!detail::ParseUnsigned(s + 6, 2, &seconds))) {
return false;
}
if (ARROW_PREDICT_FALSE(hours >= 24)) {
return false;
}
if (ARROW_PREDICT_FALSE(minutes >= 60)) {
return false;
}
if (ARROW_PREDICT_FALSE(seconds >= 60)) {
return false;
}
*out = std::chrono::duration<value_type>(3600U * hours + 60U * minutes + seconds);
return true;
}
const TimeUnit::type unit_;
};
} // namespace internal
} // namespace arrow
#endif // ARROW_UTIL_PARSING_H