kun yu 77e1ddd81b branch-0.4.0
Former-commit-id: a4df63653202df32d0b983de27f5c969905d17ac
2019-07-30 10:23:34 +08:00

123 lines
3.9 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// From Apache Impala as of 2016-01-29. Pared down to a minimal set of
// functions needed for parquet-cpp
#pragma once
#include "arrow/util/macros.h"
#ifdef ARROW_USE_SIMD
// MSVC x86-64
#if (defined(_M_AMD64) || defined(_M_X64))
#define ARROW_HAVE_SSE2 1
#define ARROW_HAVE_SSE4_2 1
#include <intrin.h>
#endif
// gcc/clang (possibly others)
#if defined(__SSE2__)
#define ARROW_HAVE_SSE2 1
#include <emmintrin.h>
#endif
#if defined(__SSE4_2__)
#define ARROW_HAVE_SSE4_2 1
#include <nmmintrin.h>
#endif
#endif // ARROW_USE_SIMD
// MSVC x86-64
namespace arrow {
/// Constants that are handy when processing text with the SSE4.2 string intrinsics.
namespace SSEUtil {

/// How many one-byte characters fit in a 64-bit and in a 128-bit register; SSE can
/// load either 64 or 128 bits into a register in a single instruction.
static const int CHARS_PER_64_BIT_REGISTER = 8;
static const int CHARS_PER_128_BIT_REGISTER = 16;

/// Flags for the control byte taken by the SSE4.2 text-processing instructions; the
/// control byte selects part of the instruction's behavior. (These mirror GCC's
/// _SIDD_CMP_EQUAL_ANY and friends.)
static const int PCMPSTR_EQUAL_ANY = 0x00;     // strchr
static const int PCMPSTR_EQUAL_EACH = 0x08;    // strcmp
static const int PCMPSTR_UBYTE_OPS = 0x00;     // unsigned char (8-bits, rather than 16)
static const int PCMPSTR_NEG_POLARITY = 0x10;  // see Intel SDM chapter 4.1.4.

/// Control byte for strchr-like use: the text-processing functions return a mask of
/// every character that matched.
static const int STRCHR_MODE = PCMPSTR_UBYTE_OPS | PCMPSTR_EQUAL_ANY;

/// Control byte for strcmp-like use: the text-processing functions return the number
/// of bytes that match consecutively from the beginning.
static const int STRCMP_MODE =
    PCMPSTR_NEG_POLARITY | PCMPSTR_UBYTE_OPS | PCMPSTR_EQUAL_EACH;

/// Single-bit masks, one per character position of a 128-bit register
/// (entry i has only bit i set).
static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
    0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000,
};

}  // namespace SSEUtil
#ifdef ARROW_HAVE_SSE4_2
/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen
/// IR load time) that the processor supports SSE 4.2 before calling these. These are
/// defined outside the SSEUtil namespace because the IR w/ SSE 4.2 case needs to use
/// macros.
template <int MODE>
static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
return _mm_cmpestrm(str1, len1, str2, len2, MODE);
}
template <int MODE>
static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
return _mm_cmpestri(str1, len1, str2, len2, MODE);
}
static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
return _mm_crc32_u8(crc, v);
}
static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) {
return _mm_crc32_u16(crc, v);
}
static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
return _mm_crc32_u32(crc, v);
}
static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
#if ARROW_BITNESS == 32
return 0;
#else
return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
#endif
}
#endif // ARROW_HAVE_SSE4_2
} // namespace arrow