fix: support building tantivy index with low version (5) (#40822)

fix: https://github.com/milvus-io/milvus/issues/40823
To solve the problem described in the issue, we have to support building the
tantivy index with a lower version (5)
for query nodes that still run an older tantivy version.

This PR does two things:
1. refactor the IndexWriterWrapper code to make it more concise
2. enable IndexWriterWrapper to build the tantivy index with different tantivy
crates (see the sketch below)
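
For context, below is a minimal, self-contained sketch of the dispatch pattern this introduces. The stub types WriterV5 and WriterV7 are hypothetical stand-ins for the wrappers backed by the two vendored tantivy crates; the real implementations live in index_writer_v5 and index_writer_v7.

// Sketch of the version dispatch introduced by this PR (stub types, not the
// real tantivy-backed writers).
struct WriterV5;
struct WriterV7;

impl WriterV5 {
    fn add(&mut self, data: &str, offset: Option<i64>) {
        println!("v5 writer: {} @ {:?}", data, offset);
    }
}

impl WriterV7 {
    fn add(&mut self, data: &str, offset: Option<i64>) {
        println!("v7 writer: {} @ {:?}", data, offset);
    }
}

// One public wrapper; the variant decides which crate builds the index.
enum IndexWriterWrapper {
    V5(WriterV5),
    V7(WriterV7),
}

impl IndexWriterWrapper {
    fn new(tantivy_index_version: u32) -> Self {
        match tantivy_index_version {
            5 => IndexWriterWrapper::V5(WriterV5),
            _ => IndexWriterWrapper::V7(WriterV7), // 7 is the default/latest
        }
    }

    // Callers use one API regardless of the underlying index format.
    fn add(&mut self, data: &str, offset: Option<i64>) {
        match self {
            IndexWriterWrapper::V5(w) => w.add(data, offset),
            IndexWriterWrapper::V7(w) => w.add(data, offset),
        }
    }
}

fn main() {
    // Build a v5 index so a query node on an older milvus can still read it.
    let mut writer = IndexWriterWrapper::new(5);
    writer.add("hello", Some(0));
}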

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
Spade A 2025-04-02 18:46:20 +08:00 committed by GitHub
parent afa519b4c7
commit f552ec67dd
55 changed files with 2712 additions and 632 deletions

View File

@ -176,21 +176,21 @@ Driver::Next(std::shared_ptr<BlockingState>& blocking_state) {
return result;
}
#define CALL_OPERATOR(call_func, operator, method_name) \
try { \
call_func; \
} catch (std::exception & e) { \
std::string stack_trace = milvus::impl::EasyStackTrace(); \
auto err_msg = fmt::format( \
"Operator::{} failed for [Operator:{}, plan node id: " \
"{}] : {}\nStack trace: {}", \
method_name, \
operator->ToString() , \
operator->get_plannode_id(), \
e.what(), \
stack_trace); \
LOG_ERROR(err_msg); \
throw ExecOperatorException(err_msg); \
#define CALL_OPERATOR(call_func, operator, method_name) \
try { \
call_func; \
} catch (std::exception & e) { \
std::string stack_trace = milvus::impl::EasyStackTrace(); \
auto err_msg = fmt::format( \
"Operator::{} failed for [Operator:{}, plan node id: " \
"{}] : {}\nStack trace: {}", \
method_name, \
operator->ToString(), \
operator->get_plannode_id(), \
e.what(), \
stack_trace); \
LOG_ERROR(err_msg); \
throw ExecOperatorException(err_msg); \
}
StopReason

View File

@ -29,9 +29,11 @@ namespace index {
template <typename T>
HybridScalarIndex<T>::HybridScalarIndex(
uint32_t tantivy_index_version,
const storage::FileManagerContext& file_manager_context)
: ScalarIndex<T>(HYBRID_INDEX_TYPE),
is_built_(false),
tantivy_index_version_(tantivy_index_version),
bitmap_index_cardinality_limit_(
DEFAULT_HYBRID_INDEX_BITMAP_CARDINALITY_LIMIT),
file_manager_context_(file_manager_context) {
@ -191,8 +193,8 @@ HybridScalarIndex<T>::GetInternalIndex() {
internal_index_ =
std::make_shared<ScalarIndexSort<T>>(file_manager_context_);
} else if (internal_index_type_ == ScalarIndexType::INVERTED) {
internal_index_ =
std::make_shared<InvertedIndexTantivy<T>>(file_manager_context_);
internal_index_ = std::make_shared<InvertedIndexTantivy<T>>(
tantivy_index_version_, file_manager_context_);
} else {
PanicInfo(UnexpectedError,
"unknown index type when get internal index");
@ -215,7 +217,7 @@ HybridScalarIndex<std::string>::GetInternalIndex() {
std::make_shared<StringIndexMarisa>(file_manager_context_);
} else if (internal_index_type_ == ScalarIndexType::INVERTED) {
internal_index_ = std::make_shared<InvertedIndexTantivy<std::string>>(
file_manager_context_);
tantivy_index_version_, file_manager_context_);
} else {
PanicInfo(UnexpectedError,
"unknown index type when get internal index");

View File

@ -42,6 +42,7 @@ template <typename T>
class HybridScalarIndex : public ScalarIndex<T> {
public:
explicit HybridScalarIndex(
uint32_t tantivy_index_version,
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext());
@ -193,6 +194,13 @@ class HybridScalarIndex : public ScalarIndex<T> {
std::shared_ptr<ScalarIndex<T>> internal_index_{nullptr};
storage::FileManagerContext file_manager_context_;
std::shared_ptr<storage::MemFileManagerImpl> mem_file_manager_{nullptr};
// `tantivy_index_version_` is used to control which kind of tantivy index should be used.
// There can be cases where the milvus version of a read node is lower than that of the index builder node (and the read node
// may not be upgraded to a higher version in a predictable time), so a lower version of tantivy would be asked to read an index
// built by a higher version of tantivy, which is not supported.
// Therefore, we provide a way for a higher version of milvus to build the tantivy index with a lower version.
uint32_t tantivy_index_version_{0};
};
} // namespace index

View File

@ -46,8 +46,10 @@ IndexFactory::CreatePrimitiveScalarIndex(
const storage::FileManagerContext& file_manager_context) {
auto index_type = create_index_info.index_type;
if (index_type == INVERTED_INDEX_TYPE) {
assert(create_index_info.tantivy_index_version != 0);
// scalar_index_engine_version 0 means we should build the tantivy index within a single segment
return std::make_unique<InvertedIndexTantivy<T>>(
create_index_info.tantivy_index_version,
file_manager_context,
create_index_info.scalar_index_engine_version == 0);
}
@ -55,7 +57,8 @@ IndexFactory::CreatePrimitiveScalarIndex(
return std::make_unique<BitmapIndex<T>>(file_manager_context);
}
if (index_type == HYBRID_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<T>>(file_manager_context);
return std::make_unique<HybridScalarIndex<T>>(
create_index_info.tantivy_index_version, file_manager_context);
}
return CreateScalarIndexSort<T>(file_manager_context);
}
@ -75,8 +78,10 @@ IndexFactory::CreatePrimitiveScalarIndex<std::string>(
auto index_type = create_index_info.index_type;
#if defined(__linux__) || defined(__APPLE__)
if (index_type == INVERTED_INDEX_TYPE) {
assert(create_index_info.tantivy_index_version != 0);
// scalar_index_engine_version 0 means we should build the tantivy index within a single segment
return std::make_unique<InvertedIndexTantivy<std::string>>(
create_index_info.tantivy_index_version,
file_manager_context,
create_index_info.scalar_index_engine_version == 0);
}
@ -85,7 +90,7 @@ IndexFactory::CreatePrimitiveScalarIndex<std::string>(
}
if (index_type == HYBRID_INDEX_TYPE) {
return std::make_unique<HybridScalarIndex<std::string>>(
file_manager_context);
create_index_info.tantivy_index_version, file_manager_context);
}
return CreateStringIndexMarisa(file_manager_context);
#else

View File

@ -27,7 +27,8 @@ struct CreateIndexInfo {
IndexVersion index_engine_version;
std::string field_name;
int64_t dim;
int32_t scalar_index_engine_version;
int32_t scalar_index_engine_version{1};
uint32_t tantivy_index_version{7};
JsonCastType json_cast_type;
std::string json_path;
};

View File

@ -55,15 +55,22 @@ InvertedIndexTantivy<T>::InitForBuildIndex() {
"build inverted index temp dir:{} not empty",
path_);
}
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
wrapper_ =
std::make_shared<TantivyIndexWrapper>(field.c_str(),
d_type_,
path_.c_str(),
tantivy_index_version_,
inverted_index_single_segment_);
}
template <typename T>
InvertedIndexTantivy<T>::InvertedIndexTantivy(
const storage::FileManagerContext& ctx, bool inverted_index_single_segment)
uint32_t tantivy_index_version,
const storage::FileManagerContext& ctx,
bool inverted_index_single_segment)
: ScalarIndex<T>(INVERTED_INDEX_TYPE),
schema_(ctx.fieldDataMeta.field_schema),
tantivy_index_version_(tantivy_index_version),
inverted_index_single_segment_(inverted_index_single_segment) {
mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
@ -465,8 +472,16 @@ InvertedIndexTantivy<T>::BuildWithRawDataForUT(size_t n,
GetValueFromConfig<int32_t>(config,
milvus::index::SCALAR_INDEX_ENGINE_VERSION)
.value_or(1) == 0;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_);
tantivy_index_version_ =
GetValueFromConfig<int32_t>(config,
milvus::index::TANTIVY_INDEX_VERSION)
.value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION);
wrapper_ =
std::make_shared<TantivyIndexWrapper>(field.c_str(),
d_type_,
path_.c_str(),
tantivy_index_version_,
inverted_index_single_segment_);
if (!inverted_index_single_segment_) {
if (config.find("is_array") != config.end()) {
// only used in ut.

View File

@ -69,7 +69,9 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
InvertedIndexTantivy() : ScalarIndex<T>(INVERTED_INDEX_TYPE) {
}
explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx,
// By default, we build the tantivy index with version 7 (currently the newest version).
explicit InvertedIndexTantivy(uint32_t tantivy_index_version,
const storage::FileManagerContext& ctx,
bool inverted_index_single_segment = false);
~InvertedIndexTantivy();
@ -254,5 +256,12 @@ class InvertedIndexTantivy : public ScalarIndex<T> {
// new version while the query node is an older version. So we have this `inverted_index_single_segment_` to control the index
// building node to build a specific type of tantivy index.
bool inverted_index_single_segment_{false};
// `tantivy_index_version_` is used to control which kind of tantivy index should be used.
// There can be cases where the milvus version of a read node is lower than that of the index builder node (and the read node
// may not be upgraded to a higher version in a predictable time), so a lower version of tantivy would be asked to read an index
// built by a higher version of tantivy, which is not supported.
// Therefore, we provide a way for a higher version of milvus to build the tantivy index with a lower version.
uint32_t tantivy_index_version_{0};
};
} // namespace milvus::index

View File

@ -89,7 +89,10 @@ class JsonInvertedIndex : public index::InvertedIndexTantivy<T> {
std::string field_name = std::to_string(
this->disk_file_manager_->GetFieldDataMeta().field_id);
this->wrapper_ = std::make_shared<index::TantivyIndexWrapper>(
field_name.c_str(), this->d_type_, this->path_.c_str());
field_name.c_str(),
this->d_type_,
this->path_.c_str(),
TANTIVY_INDEX_LATEST_VERSION /* json index is not supported in old version */);
}
void

View File

@ -48,6 +48,8 @@ constexpr const char* BITMAP_INDEX_TYPE = "BITMAP";
constexpr const char* HYBRID_INDEX_TYPE = "HYBRID";
constexpr const char* SCALAR_INDEX_ENGINE_VERSION =
"scalar_index_engine_version";
constexpr const char* TANTIVY_INDEX_VERSION = "tantivy_index_version";
constexpr uint32_t TANTIVY_INDEX_LATEST_VERSION = 7;
// index meta
constexpr const char* COLLECTION_ID = "collection_id";
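
One consequence of pairing this constant with value_or in the C++ callers is that an absent tantivy_index_version key silently selects the latest format. A minimal sketch of that fallback rule, with a plain HashMap as a hypothetical stand-in for the parsed index params:

// Sketch of the fallback rule used throughout this PR: if the index params
// carry no "tantivy_index_version", the latest version (7) is assumed.
use std::collections::HashMap;

const TANTIVY_INDEX_LATEST_VERSION: u32 = 7;

fn tantivy_index_version(config: &HashMap<String, u32>) -> u32 {
    config
        .get("tantivy_index_version")
        .copied()
        .unwrap_or(TANTIVY_INDEX_LATEST_VERSION) // mirrors .value_or(...)
}

fn main() {
    let mut config = HashMap::new();
    assert_eq!(tantivy_index_version(&config), 7); // default: latest
    config.insert("tantivy_index_version".to_string(), 5);
    assert_eq!(tantivy_index_version(&config), 5); // explicit low version
}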

View File

@ -28,11 +28,18 @@ TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms,
last_commit_time_(stdclock::now()) {
d_type_ = TantivyDataType::Text;
wrapper_ = std::make_shared<TantivyIndexWrapper>(
unique_id, true, "", tokenizer_name, analyzer_params);
unique_id,
true,
"",
TANTIVY_INDEX_LATEST_VERSION, /* a growing segment has no reason to use an old index version */
tokenizer_name,
analyzer_params);
}
TextMatchIndex::TextMatchIndex(const std::string& path,
const char* unique_id,
uint32_t tantivy_index_version,
const char* tokenizer_name,
const char* analyzer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
@ -42,11 +49,16 @@ TextMatchIndex::TextMatchIndex(const std::string& path,
boost::filesystem::path sub_path = unique_id;
path_ = (prefix / sub_path).string();
boost::filesystem::create_directories(path_);
wrapper_ = std::make_shared<TantivyIndexWrapper>(
unique_id, false, path_.c_str(), tokenizer_name, analyzer_params);
wrapper_ = std::make_shared<TantivyIndexWrapper>(unique_id,
false,
path_.c_str(),
tantivy_index_version,
tokenizer_name,
analyzer_params);
}
TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
uint32_t tantivy_index_version,
const char* tokenizer_name,
const char* analyzer_params)
: commit_interval_in_ms_(std::numeric_limits<int64_t>::max()),
@ -65,6 +77,7 @@ TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx,
wrapper_ = std::make_shared<TantivyIndexWrapper>(field_name.c_str(),
false,
path_.c_str(),
tantivy_index_version,
tokenizer_name,
analyzer_params);
}

View File

@ -30,10 +30,12 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
// for sealed segment.
explicit TextMatchIndex(const std::string& path,
const char* unique_id,
uint32_t tantivy_index_version,
const char* tokenizer_name,
const char* analyzer_params);
// for building index.
explicit TextMatchIndex(const storage::FileManagerContext& ctx,
uint32_t tantivy_index_version,
const char* tokenizer_name,
const char* analyzer_params);
// for loading index

View File

@ -41,6 +41,11 @@ ScalarIndexCreator::ScalarIndexCreator(
config, milvus::index::SCALAR_INDEX_ENGINE_VERSION)
.value_or(1);
index_info.tantivy_index_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::TANTIVY_INDEX_VERSION)
.value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION);
index_info.field_type = dtype_;
index_info.index_type = index_type();
if (dtype == DataType::JSON) {

View File

@ -267,10 +267,16 @@ BuildTextIndex(ProtoLayoutInterface result,
milvus::storage::FileManagerContext fileManagerContext(
field_meta, index_meta, chunk_manager);
uint32_t tantivy_index_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::TANTIVY_INDEX_VERSION)
.value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION);
auto field_schema =
FieldMeta::ParseFrom(build_index_info->field_schema());
auto index = std::make_unique<index::TextMatchIndex>(
fileManagerContext,
tantivy_index_version,
"milvus_tokenizer",
field_schema.get_analyzer_params().c_str());
index->Build(config);

View File

@ -1476,6 +1476,8 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
unique_id.c_str(),
// todo: make it configurable
index::TANTIVY_INDEX_LATEST_VERSION,
"milvus_tokenizer",
field_meta.get_analyzer_params().c_str());
}

View File

@ -2079,6 +2079,8 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
index = std::make_unique<index::TextMatchIndex>(
cfg.GetMmapPath(),
unique_id.c_str(),
// todo: make it configurable
index::TANTIVY_INDEX_LATEST_VERSION,
"milvus_tokenizer",
field_meta.get_analyzer_params().c_str());
}

View File

@ -194,6 +194,21 @@ appendScalarIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) {
index_info.field_type = milvus::DataType(field_type);
index_info.index_type = index_params["index_type"];
auto config = milvus::index::ParseConfigFromIndexParams(
load_index_info->index_params);
// The config should contain a value for milvus::index::SCALAR_INDEX_ENGINE_VERSION in the production calling chain.
// value_or(1) is only a fallback for unit tests that do not set this value.
index_info.scalar_index_engine_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::SCALAR_INDEX_ENGINE_VERSION)
.value_or(1);
index_info.tantivy_index_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::TANTIVY_INDEX_VERSION)
.value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION);
load_index_info->index =
milvus::index::IndexFactory::GetInstance().CreateIndex(
index_info, milvus::storage::FileManagerContext());
@ -262,6 +277,21 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) {
index_info.field_type = load_index_info->field_type;
index_info.index_engine_version = engine_version;
auto config = milvus::index::ParseConfigFromIndexParams(
load_index_info->index_params);
// The config should contain a value for milvus::index::SCALAR_INDEX_ENGINE_VERSION in the production calling chain.
// value_or(1) is only a fallback for unit tests that do not set this value.
index_info.scalar_index_engine_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::SCALAR_INDEX_ENGINE_VERSION)
.value_or(1);
index_info.tantivy_index_version =
milvus::index::GetValueFromConfig<int32_t>(
config, milvus::index::TANTIVY_INDEX_VERSION)
.value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION);
auto ctx = milvus::tracer::TraceContext{
c_trace.traceID, c_trace.spanID, c_trace.traceFlags};
auto span = milvus::tracer::StartSpan("SegCoreLoadIndex", &ctx);
@ -303,8 +333,6 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) {
milvus::storage::RemoteChunkManagerSingleton::GetInstance()
.GetRemoteChunkManager();
auto config = milvus::index::ParseConfigFromIndexParams(
load_index_info->index_params);
config[milvus::index::INDEX_FILES] = load_index_info->index_files;
if (load_index_info->field_type == milvus::DataType::JSON) {

View File

@ -30,11 +30,13 @@ token_stream_get_token(CTokenStream token_stream) {
CToken
token_stream_get_detailed_token(CTokenStream token_stream) {
auto token= static_cast<milvus::tantivy::TokenStream*>(token_stream)
->get_detailed_token();
return CToken{
token.token, token.start_offset, token.end_offset, token.position, token.position_length
};
auto token = static_cast<milvus::tantivy::TokenStream*>(token_stream)
->get_detailed_token();
return CToken{token.token,
token.start_offset,
token.end_offset,
token.position,
token.position_length};
}
void

View File

@ -21,13 +21,13 @@ extern "C" {
#endif
typedef void* CTokenStream;
typedef struct CToken{
const char *token;
int64_t start_offset;
int64_t end_offset;
int64_t position;
int64_t position_length;
}CToken;
typedef struct CToken {
const char* token;
int64_t start_offset;
int64_t end_offset;
int64_t position;
int64_t position_length;
} CToken;
void free_token_stream(CTokenStream);

View File

@ -17,6 +17,18 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "ahash"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
dependencies = [
"cfg-if",
"once_cell",
"version_check",
"zerocopy",
]
[[package]]
name = "aho-corasick"
version = "1.1.3"
@ -129,7 +141,7 @@ version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"hermit-abi 0.1.19",
"libc",
"winapi",
]
@ -152,9 +164,15 @@ dependencies = [
"miniz_oxide",
"object",
"rustc-demangle",
"windows-targets",
"windows-targets 0.52.6",
]
[[package]]
name = "base64"
version = "0.21.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
[[package]]
name = "base64"
version = "0.22.1"
@ -182,6 +200,15 @@ version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
[[package]]
name = "bitpacking"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7"
dependencies = [
"crunchy",
]
[[package]]
name = "bitpacking"
version = "0.9.2"
@ -721,6 +748,16 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "fs4"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47"
dependencies = [
"rustix",
"windows-sys 0.48.0",
]
[[package]]
name = "fs4"
version = "0.8.4"
@ -849,7 +886,7 @@ dependencies = [
"cfg-if",
"libc",
"wasi 0.13.3+wasi-0.2.2",
"windows-targets",
"windows-targets 0.52.6",
]
[[package]]
@ -894,6 +931,10 @@ name = "hashbrown"
version = "0.14.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
dependencies = [
"ahash",
"allocator-api2",
]
[[package]]
name = "hashbrown"
@ -927,6 +968,12 @@ dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "htmlescape"
version = "0.3.1"
@ -1225,6 +1272,18 @@ dependencies = [
"hashbrown 0.15.2",
]
[[package]]
name = "instant"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
dependencies = [
"cfg-if",
"js-sys",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "ipnet"
version = "2.11.0"
@ -1237,6 +1296,15 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itertools"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.14.0"
@ -1394,7 +1462,7 @@ dependencies = [
"reqwest",
"serde",
"tar",
"thiserror",
"thiserror 2.0.11",
"yada",
]
@ -1468,6 +1536,15 @@ version = "0.4.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f"
[[package]]
name = "lru"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21"
dependencies = [
"hashbrown 0.14.5",
]
[[package]]
name = "lru"
version = "0.12.5"
@ -1483,6 +1560,16 @@ version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
[[package]]
name = "measure_time"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc"
dependencies = [
"instant",
"log",
]
[[package]]
name = "measure_time"
version = "0.9.0"
@ -1498,6 +1585,15 @@ version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "memmap2"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6"
dependencies = [
"libc",
]
[[package]]
name = "memmap2"
version = "0.9.5"
@ -1588,6 +1684,16 @@ dependencies = [
"libm",
]
[[package]]
name = "num_cpus"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
dependencies = [
"hermit-abi 0.3.9",
"libc",
]
[[package]]
name = "object"
version = "0.36.7"
@ -1659,6 +1765,14 @@ version = "6.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1"
[[package]]
name = "ownedbytes"
version = "0.6.0"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"stable_deref_trait",
]
[[package]]
name = "ownedbytes"
version = "0.7.0"
@ -1856,7 +1970,7 @@ dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
"regex-syntax 0.8.5",
]
[[package]]
@ -1867,9 +1981,15 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
"regex-syntax 0.8.5",
]
[[package]]
name = "regex-syntax"
version = "0.6.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "regex-syntax"
version = "0.8.5"
@ -1882,7 +2002,7 @@ version = "0.12.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da"
dependencies = [
"base64",
"base64 0.22.1",
"bytes",
"encoding_rs",
"futures-core",
@ -1951,6 +2071,12 @@ version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc-hash"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.1.1"
@ -2128,6 +2254,15 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d"
[[package]]
name = "sketches-ddsketch"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c"
dependencies = [
"serde",
]
[[package]]
name = "sketches-ddsketch"
version = "0.3.0"
@ -2277,6 +2412,61 @@ dependencies = [
"libc",
]
[[package]]
name = "tantivy"
version = "0.21.1"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"aho-corasick",
"arc-swap",
"async-channel",
"async-trait",
"base64 0.21.7",
"bitpacking 0.8.4",
"byteorder",
"census",
"crc32fast",
"crossbeam-channel",
"downcast-rs",
"fastdivide",
"fs4 0.6.6",
"htmlescape",
"itertools 0.11.0",
"lazy_static",
"levenshtein_automata",
"log",
"lru 0.11.1",
"lz4_flex",
"measure_time 0.8.3",
"memmap2 0.7.1",
"murmurhash32",
"num_cpus",
"once_cell",
"oneshot",
"rayon",
"regex",
"rust-stemmers",
"rustc-hash 1.1.0",
"serde",
"serde_json",
"sketches-ddsketch 0.2.2",
"smallvec",
"tantivy-bitpacker 0.5.0",
"tantivy-columnar 0.2.0",
"tantivy-common 0.6.0",
"tantivy-fst 0.4.0",
"tantivy-query-grammar 0.21.0",
"tantivy-stacker 0.2.0",
"tantivy-tokenizer-api 0.2.0",
"tempfile",
"thiserror 1.0.69",
"time",
"tokio",
"uuid",
"winapi",
"zstd-sys",
]
[[package]]
name = "tantivy"
version = "0.23.0"
@ -2285,8 +2475,8 @@ dependencies = [
"aho-corasick",
"arc-swap",
"async-channel",
"base64",
"bitpacking",
"base64 0.22.1",
"bitpacking 0.9.2",
"bon",
"byteorder",
"census",
@ -2295,36 +2485,36 @@ dependencies = [
"downcast-rs",
"fastdivide",
"fnv",
"fs4",
"fs4 0.8.4",
"htmlescape",
"hyperloglogplus",
"itertools",
"itertools 0.14.0",
"lazy_static",
"levenshtein_automata",
"log",
"lru",
"lru 0.12.5",
"lz4_flex",
"measure_time",
"memmap2",
"measure_time 0.9.0",
"memmap2 0.9.5",
"once_cell",
"oneshot",
"rayon",
"regex",
"rust-stemmers",
"rustc-hash",
"rustc-hash 2.1.1",
"serde",
"serde_json",
"sketches-ddsketch",
"sketches-ddsketch 0.3.0",
"smallvec",
"tantivy-bitpacker",
"tantivy-columnar",
"tantivy-common",
"tantivy-fst",
"tantivy-query-grammar",
"tantivy-stacker",
"tantivy-tokenizer-api",
"tantivy-bitpacker 0.6.0",
"tantivy-columnar 0.3.0",
"tantivy-common 0.7.0",
"tantivy-fst 0.5.0",
"tantivy-query-grammar 0.22.0",
"tantivy-stacker 0.3.0",
"tantivy-tokenizer-api 0.3.0",
"tempfile",
"thiserror",
"thiserror 2.0.11",
"time",
"tokio",
"uuid",
@ -2348,17 +2538,41 @@ dependencies = [
"regex",
"scopeguard",
"serde_json",
"tantivy",
"tantivy 0.21.1",
"tantivy 0.23.0",
"tempfile",
"zstd-sys",
]
[[package]]
name = "tantivy-bitpacker"
version = "0.5.0"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"bitpacking 0.8.4",
]
[[package]]
name = "tantivy-bitpacker"
version = "0.6.0"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"bitpacking",
"bitpacking 0.9.2",
]
[[package]]
name = "tantivy-columnar"
version = "0.2.0"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"fastdivide",
"fnv",
"itertools 0.11.0",
"serde",
"tantivy-bitpacker 0.5.0",
"tantivy-common 0.6.0",
"tantivy-sstable 0.2.0",
"tantivy-stacker 0.2.0",
]
[[package]]
@ -2368,12 +2582,24 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a
dependencies = [
"downcast-rs",
"fastdivide",
"itertools",
"itertools 0.14.0",
"serde",
"tantivy-bitpacker",
"tantivy-common",
"tantivy-sstable",
"tantivy-stacker",
"tantivy-bitpacker 0.6.0",
"tantivy-common 0.7.0",
"tantivy-sstable 0.3.0",
"tantivy-stacker 0.3.0",
]
[[package]]
name = "tantivy-common"
version = "0.6.0"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"async-trait",
"byteorder",
"ownedbytes 0.6.0",
"serde",
"time",
]
[[package]]
@ -2383,12 +2609,23 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a
dependencies = [
"async-trait",
"byteorder",
"ownedbytes",
"ownedbytes 0.7.0",
"serde",
"time",
"tokio",
]
[[package]]
name = "tantivy-fst"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc3c506b1a8443a3a65352df6382a1fb6a7afe1a02e871cee0d25e2c3d5f3944"
dependencies = [
"byteorder",
"regex-syntax 0.6.29",
"utf8-ranges",
]
[[package]]
name = "tantivy-fst"
version = "0.5.0"
@ -2396,10 +2633,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18"
dependencies = [
"byteorder",
"regex-syntax",
"regex-syntax 0.8.5",
"utf8-ranges",
]
[[package]]
name = "tantivy-query-grammar"
version = "0.21.0"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"nom",
]
[[package]]
name = "tantivy-query-grammar"
version = "0.22.0"
@ -2408,17 +2653,36 @@ dependencies = [
"nom",
]
[[package]]
name = "tantivy-sstable"
version = "0.2.0"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"tantivy-common 0.6.0",
"tantivy-fst 0.4.0",
"zstd 0.12.4",
]
[[package]]
name = "tantivy-sstable"
version = "0.3.0"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"futures-util",
"itertools",
"tantivy-bitpacker",
"tantivy-common",
"tantivy-fst",
"zstd",
"itertools 0.14.0",
"tantivy-bitpacker 0.6.0",
"tantivy-common 0.7.0",
"tantivy-fst 0.5.0",
"zstd 0.13.0",
]
[[package]]
name = "tantivy-stacker"
version = "0.2.0"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"murmurhash32",
"tantivy-common 0.6.0",
]
[[package]]
@ -2428,7 +2692,15 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a
dependencies = [
"murmurhash32",
"rand_distr",
"tantivy-common",
"tantivy-common 0.7.0",
]
[[package]]
name = "tantivy-tokenizer-api"
version = "0.2.0"
source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473"
dependencies = [
"serde",
]
[[package]]
@ -2479,13 +2751,33 @@ version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9"
[[package]]
name = "thiserror"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
dependencies = [
"thiserror-impl 1.0.69",
]
[[package]]
name = "thiserror"
version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
dependencies = [
"thiserror-impl",
"thiserror-impl 2.0.11",
]
[[package]]
name = "thiserror-impl"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.98",
]
[[package]]
@ -2766,6 +3058,12 @@ version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "want"
version = "0.3.1"
@ -2910,7 +3208,7 @@ checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0"
dependencies = [
"windows-result",
"windows-strings",
"windows-targets",
"windows-targets 0.52.6",
]
[[package]]
@ -2919,7 +3217,7 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e"
dependencies = [
"windows-targets",
"windows-targets 0.52.6",
]
[[package]]
@ -2929,7 +3227,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10"
dependencies = [
"windows-result",
"windows-targets",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.48.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
dependencies = [
"windows-targets 0.48.5",
]
[[package]]
@ -2938,7 +3245,7 @@ version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets",
"windows-targets 0.52.6",
]
[[package]]
@ -2947,7 +3254,22 @@ version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
"windows-targets 0.52.6",
]
[[package]]
name = "windows-targets"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
dependencies = [
"windows_aarch64_gnullvm 0.48.5",
"windows_aarch64_msvc 0.48.5",
"windows_i686_gnu 0.48.5",
"windows_i686_msvc 0.48.5",
"windows_x86_64_gnu 0.48.5",
"windows_x86_64_gnullvm 0.48.5",
"windows_x86_64_msvc 0.48.5",
]
[[package]]
@ -2956,28 +3278,46 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_aarch64_gnullvm 0.52.6",
"windows_aarch64_msvc 0.52.6",
"windows_i686_gnu 0.52.6",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
"windows_i686_msvc 0.52.6",
"windows_x86_64_gnu 0.52.6",
"windows_x86_64_gnullvm 0.52.6",
"windows_x86_64_msvc 0.52.6",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
@ -2990,24 +3330,48 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.48.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
@ -3146,13 +3510,32 @@ dependencies = [
"syn 2.0.98",
]
[[package]]
name = "zstd"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c"
dependencies = [
"zstd-safe 6.0.6",
]
[[package]]
name = "zstd"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110"
dependencies = [
"zstd-safe",
"zstd-safe 7.0.0",
]
[[package]]
name = "zstd-safe"
version = "6.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581"
dependencies = [
"libc",
"zstd-sys",
]
[[package]]

View File

@ -15,6 +15,7 @@ lindera-cc-cedict = ["lindera/cc-cedict"]
[dependencies]
tantivy = { git = "https://github.com/zilliztech/tantivy.git" }
tantivy-5 = { package = "tantivy", git = "https://github.com/milvus-io/tantivy.git", tag = "0.21.1-fix3" }
lindera = "0.40.1"
futures = "0.3.21"
libc = "0.2"

View File

@ -178,12 +178,14 @@ RustResult tantivy_register_tokenizer(void *ptr,
RustResult tantivy_create_index(const char *field_name,
TantivyDataType data_type,
const char *path,
uint32_t tantivy_index_version,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes);
RustResult tantivy_create_index_with_single_segment(const char *field_name,
TantivyDataType data_type,
const char *path);
const char *path,
uint32_t tantivy_index_version);
void tantivy_free_index_writer(void *ptr);
@ -334,6 +336,7 @@ RustResult tantivy_index_add_array_keywords_by_single_segment_writer(void *ptr,
RustResult tantivy_create_text_writer(const char *field_name,
const char *path,
uint32_t tantivy_index_version,
const char *tokenizer_name,
const char *analyzer_params,
uintptr_t num_threads,

View File

@ -6,6 +6,7 @@ lazy_static! {
static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}
#[allow(dead_code)]
#[derive(Clone)]
pub enum JiebaMode {
Exact,

View File

@ -1,15 +1,14 @@
use core::result::Result::Err;
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token::Token as LToken;
use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder};
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use tantivy::tokenizer::{Token, Tokenizer, TokenStream};
use lindera::tokenizer::Tokenizer as LTokenizer;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
use crate::error::{Result,TantivyBindingError};
pub struct LinderaTokenStream<'a> {
pub tokens: Vec<LToken<'a>>,
@ -52,7 +51,7 @@ impl LinderaTokenizer {
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
let kind = fetch_lindera_kind(params)?;
let dictionary = load_dictionary_from_kind(kind);
if dictionary.is_err(){
if dictionary.is_err() {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer with invalid dict_kind"
)));
@ -87,9 +86,9 @@ trait DictionaryKindParser {
fn into_dict_kind(self) -> Result<DictionaryKind>;
}
impl DictionaryKindParser for &str{
impl DictionaryKindParser for &str {
fn into_dict_kind(self) -> Result<DictionaryKind> {
match self{
match self {
"ipadic" => Ok(DictionaryKind::IPADIC),
"ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd),
"unidic" => Ok(DictionaryKind::UniDic),
@ -98,21 +97,21 @@ impl DictionaryKindParser for &str{
other => Err(TantivyBindingError::InvalidArgument(format!(
"unsupported lindera dict type: {}",
other
)))
))),
}
}
}
fn fetch_lindera_kind(params:&json::Map<String, json::Value>) -> Result<DictionaryKind>{
match params.get("dict_kind"){
fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
match params.get("dict_kind") {
Some(val) => {
if !val.is_string(){
if !val.is_string() {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict kind should be string"
)))
)));
}
val.as_str().unwrap().into_dict_kind()
},
}
_ => {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict_kind must be set"
@ -128,29 +127,29 @@ mod tests {
use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer;
#[test]
fn test_lindera_tokenizer(){
fn test_lindera_tokenizer() {
let params = r#"{
"type": "lindera",
"dict_kind": "ipadic"
}"#;
let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
assert!(json_param.is_ok());
let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}
#[test]
#[cfg(feature = "lindera-cc-cedict")]
fn test_lindera_tokenizer_cc(){
fn test_lindera_tokenizer_cc() {
let params = r#"{
"type": "lindera",
"dict_kind": "cc-cedict"
}"#;
let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
assert!(json_param.is_ok());
let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}
}
}

View File

@ -1,13 +1,10 @@
use tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
use lindera::segmenter::Segmenter;
use tantivy::tokenizer::*;
use lindera::mode::Mode;
use serde_json as json;
use log::warn;
use serde_json as json;
use tantivy::tokenizer::*;
use tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
use crate::analyzer::tokenizers::{JiebaTokenizer, LinderaTokenizer};
use crate::error::{Result,TantivyBindingError};
use crate::error::{Result, TantivyBindingError};
pub fn standard_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
@ -21,11 +18,13 @@ pub fn jieba_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}
pub fn lindera_builder(params: Option<&json::Map<String, json::Value>>) -> Result<TextAnalyzerBuilder>{
if params.is_none(){
pub fn lindera_builder(
params: Option<&json::Map<String, json::Value>>,
) -> Result<TextAnalyzerBuilder> {
if params.is_none() {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer must be costum"
)))
)));
}
let tokenizer = LinderaTokenizer::from_json(params.unwrap())?;
Ok(TextAnalyzer::builder(tokenizer).dynamic())
@ -34,25 +33,25 @@ pub fn lindera_builder(params: Option<&json::Map<String, json::Value>>) -> Resul
pub fn get_builder_with_tokenizer(params: &json::Value) -> Result<TextAnalyzerBuilder> {
let name;
let params_map;
if params.is_string(){
if params.is_string() {
name = params.as_str().unwrap();
params_map = None;
}else{
} else {
let m = params.as_object().unwrap();
match m.get("type"){
match m.get("type") {
Some(val) => {
if !val.is_string(){
if !val.is_string() {
return Err(TantivyBindingError::InvalidArgument(format!(
"tokenizer type should be string"
)))
)));
}
name = val.as_str().unwrap();
},
}
_ => {
return Err(TantivyBindingError::InvalidArgument(format!(
"costum tokenizer must set type"
)))
},
}
}
params_map = Some(m);
}

View File

@ -98,6 +98,7 @@ pub extern "C" fn free_rust_array_i64(array: RustArrayI64) {
}
}
#[allow(dead_code)]
#[repr(C)]
pub enum Value {
None(()),
@ -192,11 +193,9 @@ pub extern "C" fn free_rust_error(error: *const c_char) {
#[macro_export]
macro_rules! cstr_to_str {
($cstr:expr) => {
unsafe {
match CStr::from_ptr($cstr).to_str() {
Ok(f) => f,
Err(e) => return RustResult::from_error(e.to_string()),
}
match unsafe { CStr::from_ptr($cstr).to_str() } {
Ok(f) => f,
Err(e) => return RustResult::from_error(e.to_string()),
}
};
}
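
The reworked macro narrows the unsafe block to just the pointer dereference, leaving the match in safe code. A standalone sketch of the same idiom (the to_str helper is hypothetical, not part of the bindings; the caller must keep the C string alive for the returned lifetime):

use std::ffi::{CStr, CString};
use std::os::raw::c_char;

// Only the pointer dereference is unsafe; the UTF-8 check stays safe code.
fn to_str<'a>(ptr: *const c_char) -> Result<&'a str, String> {
    match unsafe { CStr::from_ptr(ptr).to_str() } {
        Ok(s) => Ok(s),
        Err(e) => Err(e.to_string()),
    }
}

fn main() {
    let c = CString::new("hello").unwrap();
    assert_eq!(to_str(c.as_ptr()).unwrap(), "hello");
}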

View File

@ -1,3 +1,4 @@
#[allow(dead_code)]
#[repr(u8)]
#[derive(Debug)]
pub enum TantivyDataType {

View File

@ -65,7 +65,7 @@ impl SegmentCollector for DocIdChildCollector<u32> {
self.collect_block(&[doc]);
}
fn harvest(mut self) -> Self::Fruit {
fn harvest(self) -> Self::Fruit {
self.milvus_doc_ids
}
}
@ -117,7 +117,7 @@ impl SegmentCollector for DocIdChildCollector<i64> {
self.collect_block(&[doc]);
}
fn harvest(mut self) -> Self::Fruit {
fn harvest(self) -> Self::Fruit {
self.milvus_doc_ids
}
}

View File

@ -4,6 +4,7 @@ use core::{fmt, str};
pub enum TantivyBindingError {
JsonError(serde_json::Error),
TantivyError(tantivy::TantivyError),
TantivyErrorV5(tantivy_5::TantivyError),
InvalidArgument(String),
InternalError(String),
}
@ -20,11 +21,18 @@ impl From<tantivy::TantivyError> for TantivyBindingError {
}
}
impl From<tantivy_5::TantivyError> for TantivyBindingError {
fn from(value: tantivy_5::TantivyError) -> Self {
TantivyBindingError::TantivyErrorV5(value)
}
}
impl fmt::Display for TantivyBindingError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
TantivyBindingError::JsonError(e) => write!(f, "JsonError: {}", e),
TantivyBindingError::TantivyError(e) => write!(f, "TantivyError: {}", e),
TantivyBindingError::TantivyErrorV5(e) => write!(f, "TantivyErrorV5: {}", e),
TantivyBindingError::InvalidArgument(e) => write!(f, "InvalidArgument: {}", e),
TantivyBindingError::InternalError(e) => write!(f, "InternalError: {}", e),
}
@ -36,6 +44,7 @@ impl std::error::Error for TantivyBindingError {
match self {
TantivyBindingError::JsonError(e) => Some(e),
TantivyBindingError::TantivyError(e) => Some(e),
TantivyBindingError::TantivyErrorV5(e) => Some(e),
TantivyBindingError::InvalidArgument(_) => None,
TantivyBindingError::InternalError(_) => None,
}
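
A minimal sketch of the wrap-both-error-types pattern used here, with stub structs standing in for the TantivyError types of the two tantivy crates (the real types come from the tantivy and tantivy_5 dependencies):

use std::fmt;

// Stub error types; the real ones are tantivy::TantivyError and
// tantivy_5::TantivyError from the two vendored crates.
#[allow(dead_code)]
#[derive(Debug)]
struct TantivyErrorV7(String);
#[derive(Debug)]
struct TantivyErrorV5(String);

#[allow(dead_code)]
#[derive(Debug)]
enum TantivyBindingError {
    TantivyError(TantivyErrorV7),
    TantivyErrorV5(TantivyErrorV5),
}

// `From` lets binding code use `?` on results from either crate.
impl From<TantivyErrorV5> for TantivyBindingError {
    fn from(value: TantivyErrorV5) -> Self {
        TantivyBindingError::TantivyErrorV5(value)
    }
}

impl fmt::Display for TantivyBindingError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            TantivyBindingError::TantivyError(e) => write!(f, "TantivyError: {:?}", e),
            TantivyBindingError::TantivyErrorV5(e) => write!(f, "TantivyErrorV5: {:?}", e),
        }
    }
}

fn main() {
    let err: TantivyBindingError = TantivyErrorV5("boom".into()).into();
    println!("{}", err);
}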

View File

@ -12,6 +12,7 @@ use crate::vec_collector::VecCollector;
use crate::error::{Result, TantivyBindingError};
#[allow(dead_code)]
pub(crate) struct IndexReaderWrapper {
pub(crate) field_name: String,
pub(crate) field: Field,

View File

@ -65,25 +65,26 @@ mod tests {
use tantivy::query::TermQuery;
use tempfile::TempDir;
use crate::{analyzer::create_analyzer, index_writer::IndexWriterWrapper};
use crate::{index_writer::IndexWriterWrapper, TantivyIndexVersion};
#[test]
fn test_jeba() {
let params = "{\"tokenizer\": \"jieba\"}".to_string();
let tokenizer = create_analyzer(&params).unwrap();
let dir = TempDir::new().unwrap();
let mut writer = IndexWriterWrapper::create_text_writer(
"text".to_string(),
dir.path().to_str().unwrap().to_string(),
"jieba".to_string(),
tokenizer,
"text",
dir.path().to_str().unwrap(),
"jieba",
&params,
1,
50_000_000,
false,
);
TantivyIndexVersion::default_version(),
)
.unwrap();
writer.add_string("网球和滑雪", 0).unwrap();
writer.add_string("网球以及滑雪", 1).unwrap();
writer.add("网球和滑雪", Some(0)).unwrap();
writer.add("网球以及滑雪", Some(1)).unwrap();
writer.commit().unwrap();
@ -100,20 +101,21 @@ mod tests {
#[test]
fn test_read() {
let tokenizer = create_analyzer("").unwrap();
let dir = TempDir::new().unwrap();
let mut writer = IndexWriterWrapper::create_text_writer(
"text".to_string(),
dir.path().to_str().unwrap().to_string(),
"default".to_string(),
tokenizer,
"text",
dir.path().to_str().unwrap(),
"default",
"",
1,
50_000_000,
false,
);
TantivyIndexVersion::default_version(),
)
.unwrap();
for i in 0..10000 {
writer.add_string("hello world", i).unwrap();
writer.add("hello world", Some(i)).unwrap();
}
writer.commit().unwrap();

View File

@ -3,17 +3,15 @@ use std::ffi::CStr;
use libc::{c_char, c_void};
use crate::{
array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log,
analyzer::create_analyzer,
analyzer::create_analyzer, array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper,
log::init_log,
};
#[no_mangle]
pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustResult {
let real = ptr as *mut IndexReaderWrapper;
unsafe {
let query = cstr_to_str!(query);
(*real).match_query(query).into()
}
let query = cstr_to_str!(query);
unsafe { (*real).match_query(query).into() }
}
#[no_mangle]

View File

@ -1,389 +1,285 @@
use std::ffi::CStr;
use std::sync::Arc;
use either::Either;
use futures::executor::block_on;
use index_writer_v5::TantivyDocumentV5;
use index_writer_v7::TantivyDocumentV7;
use libc::c_char;
use log::info;
use tantivy::schema::{
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
};
use tantivy::{doc, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument};
use crate::data_type::TantivyDataType;
use crate::error::Result;
use crate::error::{Result, TantivyBindingError};
use crate::index_reader::IndexReaderWrapper;
use crate::log::init_log;
use crate::{index_writer_v5, index_writer_v7, TantivyIndexVersion};
pub(crate) struct IndexWriterWrapper {
pub(crate) field: Field,
pub(crate) index_writer: Either<IndexWriter, SingleSegmentIndexWriter>,
pub(crate) id_field: Option<Field>,
pub(crate) index: Arc<Index>,
pub trait TantivyValue<D> {
fn add_to_document(&self, field: u32, document: &mut D);
}
#[inline]
fn schema_builder_add_field(
schema_builder: &mut SchemaBuilder,
field_name: &str,
data_type: TantivyDataType,
) -> Field {
match data_type {
TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
TantivyDataType::Keyword => {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
schema_builder.add_text_field(&field_name, text_options)
}
TantivyDataType::Text => {
panic!("text should be indexed with analyzer");
}
}
pub enum IndexWriterWrapper {
V5(index_writer_v5::IndexWriterWrapperImpl),
V7(index_writer_v7::IndexWriterWrapperImpl),
}
impl IndexWriterWrapper {
// Create an IndexWriterWrapper according to `tanviy_index_version`.
// Version 7 is the latest and is what we should use in most cases.
// We may also build with version 5 for compatibility with reader nodes running older versions.
pub fn new(
field_name: String,
field_name: &str,
data_type: TantivyDataType,
path: String,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
tanviy_index_version: TantivyIndexVersion,
) -> Result<IndexWriterWrapper> {
init_log();
info!(
"create index writer, field_name: {}, data_type: {:?}",
field_name, data_type
);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
// We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapper {
field,
index_writer: Either::Left(index_writer),
id_field: Some(id_field),
index: Arc::new(index),
})
match tanviy_index_version {
TantivyIndexVersion::V5 => {
let writer = index_writer_v5::IndexWriterWrapperImpl::new(
field_name,
data_type,
path,
num_threads,
overall_memory_budget_in_bytes,
)?;
Ok(IndexWriterWrapper::V5(writer))
}
TantivyIndexVersion::V7 => {
let writer = index_writer_v7::IndexWriterWrapperImpl::new(
field_name,
data_type,
path,
num_threads,
overall_memory_budget_in_bytes,
)?;
Ok(IndexWriterWrapper::V7(writer))
}
}
}
pub fn new_with_single_segment(
field_name: String,
field_name: &str,
data_type: TantivyDataType,
path: String,
tanviy_index_version: TantivyIndexVersion,
) -> Result<IndexWriterWrapper> {
init_log();
info!(
"create single segment index writer, field_name: {}, data_type: {:?}",
field_name, data_type
);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?;
Ok(IndexWriterWrapper {
field,
index_writer: Either::Right(index_writer),
id_field: None,
index: Arc::new(index),
})
match tanviy_index_version {
TantivyIndexVersion::V5 => {
let writer = index_writer_v5::IndexWriterWrapperImpl::new_with_single_segment(
field_name, data_type, path,
)?;
Ok(IndexWriterWrapper::V5(writer))
}
TantivyIndexVersion::V7 => {
let writer = index_writer_v7::IndexWriterWrapperImpl::new_with_single_segment(
field_name, data_type, path,
)?;
Ok(IndexWriterWrapper::V7(writer))
}
}
}
pub fn create_reader(&self) -> Result<IndexReaderWrapper> {
    match self {
        IndexWriterWrapper::V5(_) => {
            return Err(TantivyBindingError::InternalError(
                "create reader with tantivy index version 5 is not supported from tantivy with version 7".into(),
            ));
        }
        IndexWriterWrapper::V7(writer) => writer.create_reader(),
    }
}

pub fn add<T>(&mut self, data: T, offset: Option<i64>) -> Result<()>
where
    T: TantivyValue<TantivyDocumentV5> + TantivyValue<TantivyDocumentV7>,
{
    match self {
        IndexWriterWrapper::V5(writer) => writer.add(data, offset),
        IndexWriterWrapper::V7(writer) => writer.add(data, offset),
    }
}

pub fn add_array<T, I>(&mut self, data: I, offset: Option<i64>) -> Result<()>
where
    I: IntoIterator<Item = T>,
    T: TantivyValue<TantivyDocumentV5> + TantivyValue<TantivyDocumentV7>,
{
    match self {
        IndexWriterWrapper::V5(writer) => writer.add_array(data, offset),
        IndexWriterWrapper::V7(writer) => writer.add_array(data, offset),
    }
}

pub fn add_array_keywords(
    &mut self,
    datas: &[*const c_char],
    offset: Option<i64>,
) -> Result<()> {
    match self {
        IndexWriterWrapper::V5(writer) => writer.add_array_keywords(datas, offset),
        IndexWriterWrapper::V7(writer) => writer.add_array_keywords(datas, offset),
    }
}

#[allow(dead_code)]
pub fn manual_merge(&mut self) -> Result<()> {
    match self {
        IndexWriterWrapper::V5(writer) => writer.manual_merge(),
        IndexWriterWrapper::V7(writer) => writer.manual_merge(),
    }
}

#[allow(dead_code)]
pub fn commit(&mut self) -> Result<()> {
    match self {
        IndexWriterWrapper::V5(writer) => writer.commit(),
        IndexWriterWrapper::V7(writer) => writer.commit(),
    }
}

#[allow(dead_code)]
pub fn finish(self) -> Result<()> {
    match self {
        IndexWriterWrapper::V5(writer) => writer.finish(),
        IndexWriterWrapper::V7(writer) => writer.finish(),
    }
}
}
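// Not part of the commit: the `TantivyValue` bound used by `add` and
// `add_array` above is generic over the document type, so each scalar type is
// implemented once per tantivy crate. Consistent with the impls later in this
// diff (e.g. `impl TantivyValue<TantivyDocument> for i64`), the trait is
// presumably just:
//
// pub trait TantivyValue<D> {
//     fn add_to_document(&self, field: u32, document: &mut D);
// }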
#[cfg(test)]
mod tests {
use std::ops::Bound;
use tempfile::TempDir;
use crate::{data_type::TantivyDataType, TantivyIndexVersion};
use super::IndexWriterWrapper;
#[test]
fn test_build_index_version5() {
let field_name = "number";
let data_type = TantivyDataType::I64;
let dir = TempDir::new().unwrap();
{
let mut index_wrapper = IndexWriterWrapper::new(
field_name,
data_type,
dir.path().to_str().unwrap().to_string(),
1,
50_000_000,
TantivyIndexVersion::V5,
)
.unwrap();
for i in 0..10 {
index_wrapper.add::<i64>(i, Some(i as i64)).unwrap();
}
index_wrapper.commit().unwrap();
}
use tantivy_5::{collector, query, Index, ReloadPolicy};
let index = Index::open_in_dir(dir.path()).unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let query = query::RangeQuery::new_i64_bounds(
field_name.to_string(),
Bound::Included(0),
Bound::Included(9),
);
let res = reader
.searcher()
.search(&query, &collector::TopDocs::with_limit(10))
.unwrap();
assert_eq!(res.len(), 10);
}
#[test]
fn test_build_index_version5_single_segment() {
let field_name = "number";
let data_type = TantivyDataType::I64;
let dir = TempDir::new().unwrap();
{
let mut index_wrapper = IndexWriterWrapper::new_with_single_segment(
field_name,
data_type,
dir.path().to_str().unwrap().to_string(),
TantivyIndexVersion::V5,
)
.unwrap();
for i in 0..10 {
index_wrapper.add::<i64>(i, None).unwrap();
}
index_wrapper.finish().unwrap();
}
use tantivy_5::{collector, query, Index, ReloadPolicy};
let index = Index::open_in_dir(dir.path()).unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let query = query::RangeQuery::new_i64_bounds(
field_name.to_string(),
Bound::Included(0),
Bound::Included(9),
);
let res = reader
.searcher()
.search(&query, &collector::TopDocs::with_limit(10))
.unwrap();
assert_eq!(res.len(), 10);
}
#[test]
fn test_build_text_index_version5() {
let field_name = "text";
let dir = TempDir::new().unwrap();
{
let mut index_wrapper = IndexWriterWrapper::create_text_writer(
field_name,
dir.path().to_str().unwrap(),
"default",
"",
1,
50_000_000,
false,
TantivyIndexVersion::V5,
)
.unwrap();
for i in 0..10 {
index_wrapper.add("hello", Some(i as i64)).unwrap();
}
index_wrapper.commit().unwrap();
}
use tantivy_5::{collector, query, schema, Index, ReloadPolicy, Term};
let index = Index::open_in_dir(dir.path()).unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let text = index.schema().get_field("text").unwrap();
let query = query::TermQuery::new(
Term::from_field_text(text, "hello"),
schema::IndexRecordOption::Basic,
);
let res = reader
.searcher()
.search(&query, &collector::TopDocs::with_limit(10))
.unwrap();
assert_eq!(res.len(), 10);
}
}

View File

@ -8,6 +8,7 @@ use crate::{
error::Result,
index_writer::IndexWriterWrapper,
util::{create_binding, free_binding},
TantivyIndexVersion,
};
macro_rules! convert_to_rust_slice {
@ -25,17 +26,25 @@ pub extern "C" fn tantivy_create_index(
field_name: *const c_char,
data_type: TantivyDataType,
path: *const c_char,
tantivy_index_version: u32,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> RustResult {
let field_name_str = cstr_to_str!(field_name);
let path_str = cstr_to_str!(path);
let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) {
Ok(v) => v,
Err(e) => return RustResult::from_error(e.to_string()),
};
match IndexWriterWrapper::new(
    field_name_str,
    data_type,
    String::from(path_str),
    num_threads,
    overall_memory_budget_in_bytes,
    tantivy_index_version,
) {
Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)),
Err(e) => RustResult::from_error(e.to_string()),
@ -47,13 +56,21 @@ pub extern "C" fn tantivy_create_index_with_single_segment(
field_name: *const c_char,
data_type: TantivyDataType,
path: *const c_char,
tantivy_index_version: u32,
) -> RustResult {
let field_name_str = cstr_to_str!(field_name);
let path_str = cstr_to_str!(path);
let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) {
Ok(v) => v,
Err(e) => return RustResult::from_error(e.to_string()),
};
match IndexWriterWrapper::new_with_single_segment(
    field_name_str,
    data_type,
    String::from(path_str),
    tantivy_index_version,
) {
Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)),
Err(e) => RustResult::from_error(e.to_string()),
@ -90,25 +107,31 @@ pub extern "C" fn tantivy_create_reader_from_writer(ptr: *mut c_void) -> RustRes
}
// -------------------------build--------------------
fn execute<T: Copy, I>(
    arr: I,
    offset: i64,
    e: fn(&mut IndexWriterWrapper, T, Option<i64>) -> Result<()>,
    w: &mut IndexWriterWrapper,
) -> Result<()>
where
    I: IntoIterator<Item = T>,
{
    for (index, data) in arr.into_iter().enumerate() {
        e(w, data, Some(offset + (index as i64)))?;
    }
    Ok(())
}
fn execute_by_single_segment_writer<T: Copy, I>(
    arr: I,
    e: fn(&mut IndexWriterWrapper, T, Option<i64>) -> Result<()>,
    w: &mut IndexWriterWrapper,
) -> Result<()>
where
    I: IntoIterator<Item = T>,
{
    for data in arr.into_iter() {
        e(w, data, None)?;
    }
    Ok(())
}
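// Not part of the commit: a worked example of the helpers above. A call such
// as `execute(arr.iter().copied(), 100, IndexWriterWrapper::add::<i64>, w)`
// expands to `w.add(arr[0], Some(100))`, `w.add(arr[1], Some(101))`, and so
// on, assigning consecutive doc offsets, while
// `execute_by_single_segment_writer` passes `offset = None` for every element.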
@ -122,7 +145,15 @@ pub extern "C" fn tantivy_index_add_int8s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() }
unsafe {
execute(
arr.into_iter().map(|num| *num as i64),
offset_begin,
IndexWriterWrapper::add::<i64>,
&mut (*real),
)
.into()
}
}
#[no_mangle]
@ -135,8 +166,8 @@ pub extern "C" fn tantivy_index_add_int8s_by_single_segment_writer(
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_i8_by_single_segment_writer,
arr.into_iter().map(|num| *num as i64),
IndexWriterWrapper::add::<i64>,
&mut (*real),
)
.into()
@ -152,7 +183,15 @@ pub extern "C" fn tantivy_index_add_int16s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() }
unsafe {
execute(
arr.into_iter().map(|num| *num as i64),
offset_begin,
IndexWriterWrapper::add::<i64>,
&mut (*real),
)
.into()
}
}
#[no_mangle]
@ -165,8 +204,8 @@ pub extern "C" fn tantivy_index_add_int16s_by_single_segment_writer(
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_i16_by_single_segment_writer,
arr.into_iter().map(|num| *num as i64),
IndexWriterWrapper::add::<i64>,
&mut (*real),
)
.into()
@ -182,7 +221,15 @@ pub extern "C" fn tantivy_index_add_int32s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() }
unsafe {
execute(
arr.into_iter().map(|num| *num as i64),
offset_begin,
IndexWriterWrapper::add::<i64>,
&mut (*real),
)
.into()
}
}
#[no_mangle]
@ -195,8 +242,8 @@ pub extern "C" fn tantivy_index_add_int32s_by_single_segment_writer(
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_i32_by_single_segment_writer,
arr.into_iter().map(|num| *num as i64),
IndexWriterWrapper::add::<i64>,
&mut (*real),
)
.into()
@ -213,7 +260,15 @@ pub extern "C" fn tantivy_index_add_int64s(
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() }
unsafe {
execute(
arr.iter().copied(),
offset_begin,
IndexWriterWrapper::add::<i64>,
&mut (*real),
)
.into()
}
}
#[no_mangle]
@ -227,8 +282,8 @@ pub extern "C" fn tantivy_index_add_int64s_by_single_segment_writer(
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_i64_by_single_segment_writer,
arr.iter().copied(),
IndexWriterWrapper::add::<i64>,
&mut (*real),
)
.into()
@ -244,7 +299,15 @@ pub extern "C" fn tantivy_index_add_f32s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() }
unsafe {
execute(
arr.into_iter().map(|num| *num as f64),
offset_begin,
IndexWriterWrapper::add::<f64>,
&mut (*real),
)
.into()
}
}
#[no_mangle]
@ -257,8 +320,8 @@ pub extern "C" fn tantivy_index_add_f32s_by_single_segment_writer(
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_f32_by_single_segment_writer,
arr.into_iter().map(|num| *num as f64),
IndexWriterWrapper::add::<f64>,
&mut (*real),
)
.into()
@ -274,7 +337,15 @@ pub extern "C" fn tantivy_index_add_f64s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() }
unsafe {
execute(
arr.iter().copied(),
offset_begin,
IndexWriterWrapper::add::<f64>,
&mut (*real),
)
.into()
}
}
#[no_mangle]
@ -287,8 +358,8 @@ pub extern "C" fn tantivy_index_add_f64s_by_single_segment_writer(
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_f64_by_single_segment_writer,
arr.into_iter().map(|num| *num as f64),
IndexWriterWrapper::add::<f64>,
&mut (*real),
)
.into()
@ -306,9 +377,9 @@ pub extern "C" fn tantivy_index_add_bools(
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute(
arr,
arr.iter().copied(),
offset_begin,
IndexWriterWrapper::add_bool,
IndexWriterWrapper::add::<bool>,
&mut (*real),
)
.into()
@ -325,8 +396,8 @@ pub extern "C" fn tantivy_index_add_bools_by_single_segment_writer(
let arr = unsafe { slice::from_raw_parts(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
IndexWriterWrapper::add_bool_by_single_segment_writer,
arr.iter().copied(),
IndexWriterWrapper::add::<bool>,
&mut (*real),
)
.into()
@ -343,7 +414,7 @@ pub extern "C" fn tantivy_index_add_string(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let s = cstr_to_str!(s);
unsafe { (*real).add_string(s, offset).into() }
unsafe { (*real).add::<&str>(s, Some(offset)).into() }
}
#[no_mangle]
@ -353,7 +424,7 @@ pub extern "C" fn tantivy_index_add_string_by_single_segment_writer(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let s = cstr_to_str!(s);
unsafe { (*real).add_string_by_single_segment_writer(s).into() }
unsafe { (*real).add::<&str>(s, None).into() }
}
// --------------------------------------------- array ------------------------------------------
@ -368,7 +439,9 @@ pub extern "C" fn tantivy_index_add_array_int8s(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_i8s(arr, offset).into()
(*real)
.add_array::<i64, _>(arr.into_iter().map(|num| *num as i64), Some(offset))
.into()
}
}
@ -381,7 +454,9 @@ pub extern "C" fn tantivy_index_add_array_int8s_by_single_segment_writer(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_i8s_by_single_segment_writer(arr).into()
(*real)
.add_array::<i64, _>(arr.into_iter().map(|num| *num as i64), None)
.into()
}
}
@ -395,7 +470,9 @@ pub extern "C" fn tantivy_index_add_array_int16s(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_i16s(arr, offset).into()
(*real)
.add_array::<i64, _>(arr.into_iter().map(|num| *num as i64), Some(offset))
.into()
}
}
@ -408,7 +485,9 @@ pub extern "C" fn tantivy_index_add_array_int16s_by_single_segment_writer(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_i16s_by_single_segment_writer(arr).into()
(*real)
.add_array::<i64, _>(arr.into_iter().map(|num| *num as i64), None)
.into()
}
}
@ -422,7 +501,9 @@ pub extern "C" fn tantivy_index_add_array_int32s(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_i32s(arr, offset).into()
(*real)
.add_array::<i64, _>(arr.into_iter().map(|num| *num as i64), Some(offset))
.into()
}
}
@ -435,7 +516,9 @@ pub extern "C" fn tantivy_index_add_array_int32s_by_single_segment_writer(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_i32s_by_single_segment_writer(arr).into()
(*real)
.add_array::<i64, _>(arr.into_iter().map(|num| *num as i64), None)
.into()
}
}
@ -449,7 +532,9 @@ pub extern "C" fn tantivy_index_add_array_int64s(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_i64s(arr, offset).into()
(*real)
.add_array::<i64, _>(arr.iter().copied(), Some(offset))
.into()
}
}
@ -462,7 +547,9 @@ pub extern "C" fn tantivy_index_add_array_int64s_by_single_segment_writer(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_i64s_by_single_segment_writer(arr).into()
(*real)
.add_array::<i64, _>(arr.iter().copied(), None)
.into()
}
}
@ -476,7 +563,9 @@ pub extern "C" fn tantivy_index_add_array_f32s(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_f32s(arr, offset).into()
(*real)
.add_array::<f64, _>(arr.into_iter().map(|num| *num as f64), Some(offset))
.into()
}
}
@ -489,7 +578,9 @@ pub extern "C" fn tantivy_index_add_array_f32s_by_single_segment_writer(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_f32s_by_single_segment_writer(arr).into()
(*real)
.add_array::<f64, _>(arr.into_iter().map(|num| *num as f64), None)
.into()
}
}
@ -503,7 +594,9 @@ pub extern "C" fn tantivy_index_add_array_f64s(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_f64s(arr, offset).into()
(*real)
.add_array::<f64, _>(arr.iter().copied(), Some(offset))
.into()
}
}
@ -516,7 +609,9 @@ pub extern "C" fn tantivy_index_add_array_f64s_by_single_segment_writer(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_f64s_by_single_segment_writer(arr).into()
(*real)
.add_array::<f64, _>(arr.iter().copied(), None)
.into()
}
}
@ -530,7 +625,9 @@ pub extern "C" fn tantivy_index_add_array_bools(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_bools(arr, offset).into()
(*real)
.add_array::<bool, _>(arr.iter().copied(), Some(offset))
.into()
}
}
@ -543,7 +640,9 @@ pub extern "C" fn tantivy_index_add_array_bools_by_single_segment_writer(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_bools_by_single_segment_writer(arr).into()
(*real)
.add_array::<bool, _>(arr.iter().copied(), None)
.into()
}
}
@ -557,7 +656,7 @@ pub extern "C" fn tantivy_index_add_array_keywords(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real).add_array_keywords(arr, offset).into()
(*real).add_array_keywords(arr, Some(offset)).into()
}
}
@ -570,8 +669,6 @@ pub extern "C" fn tantivy_index_add_array_keywords_by_single_segment_writer(
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = convert_to_rust_slice!(array, len);
(*real)
.add_array_keywords_by_single_segment_writer(arr)
.into()
(*real).add_array_keywords(arr, None).into()
}
}

View File

@ -1,53 +1,44 @@
use crate::error::Result;
use crate::index_writer::IndexWriterWrapper;
use crate::{index_writer_v5, index_writer_v7, TantivyIndexVersion};
impl IndexWriterWrapper {
    // create a text writer according to `tantivy_index_version`.
    // version 7 is the latest version and is what we should use in most cases.
    // We may also build with version 5 for compatibility with reader nodes on older versions.
    pub(crate) fn create_text_writer(
        field_name: &str,
        path: &str,
        tokenizer_name: &str,
        tokenizer_params: &str,
        num_threads: usize,
        overall_memory_budget_in_bytes: usize,
        in_ram: bool,
        tantivy_index_version: TantivyIndexVersion,
    ) -> Result<IndexWriterWrapper> {
        match tantivy_index_version {
            TantivyIndexVersion::V5 => Ok(IndexWriterWrapper::V5(
                index_writer_v5::IndexWriterWrapperImpl::create_text_writer(
                    field_name,
                    path,
                    tokenizer_name,
                    tokenizer_params,
                    num_threads,
                    overall_memory_budget_in_bytes,
                    in_ram,
                )?,
            )),
            TantivyIndexVersion::V7 => Ok(IndexWriterWrapper::V7(
                index_writer_v7::IndexWriterWrapperImpl::create_text_writer(
                    field_name,
                    path,
                    tokenizer_name,
                    tokenizer_params,
                    num_threads,
                    overall_memory_budget_in_bytes,
                    in_ram,
                )?,
            )),
        }
    }
}

View File

@ -5,13 +5,14 @@ use crate::array::RustResult;
use crate::cstr_to_str;
use crate::index_writer::IndexWriterWrapper;
use crate::log::init_log;
use crate::analyzer::create_analyzer;
use crate::util::create_binding;
use crate::TantivyIndexVersion;
#[no_mangle]
pub extern "C" fn tantivy_create_text_writer(
field_name: *const c_char,
path: *const c_char,
tantivy_index_version: u32,
tokenizer_name: *const c_char,
analyzer_params: *const c_char,
num_threads: usize,
@ -23,20 +24,23 @@ pub extern "C" fn tantivy_create_text_writer(
let path_str = cstr_to_str!(path);
let tokenizer_name_str = cstr_to_str!(tokenizer_name);
let params = cstr_to_str!(analyzer_params);
let analyzer = create_analyzer(params);
match analyzer {
Ok(text_analyzer) => {
let wrapper = IndexWriterWrapper::create_text_writer(
String::from(field_name_str),
String::from(path_str),
String::from(tokenizer_name_str),
text_analyzer,
num_threads,
overall_memory_budget_in_bytes,
in_ram,
);
RustResult::from_ptr(create_binding(wrapper))
}
let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) {
Ok(v) => v,
Err(e) => return RustResult::from_error(e.to_string()),
};
match IndexWriterWrapper::create_text_writer(
field_name_str,
path_str,
tokenizer_name_str,
params,
num_threads,
overall_memory_budget_in_bytes,
in_ram,
tantivy_index_version,
) {
Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)),
Err(err) => RustResult::from_error(format!(
"create tokenizer failed with error: {} param: {}",
err.to_string(),

View File

@ -0,0 +1,288 @@
use serde_json as json;
use std::collections::HashMap;
use tantivy_5::tokenizer::*;
use crate::error::{Result, TantivyBindingError};
use super::{
build_in_analyzer::{chinese_analyzer, english_analyzer},
filter::SystemFilter,
standard_analyzer,
tokenizers::get_builder_with_tokenizer,
util::{get_stop_words_list, get_string_list},
};
struct AnalyzerBuilder<'a> {
filters: HashMap<String, SystemFilter>,
params: &'a json::Map<String, json::Value>,
}
impl AnalyzerBuilder<'_> {
fn new(params: &json::Map<String, json::Value>) -> AnalyzerBuilder {
AnalyzerBuilder {
filters: HashMap::new(),
params: params,
}
}
fn get_tokenizer_params(&self) -> Result<&json::Value> {
let tokenizer = self.params.get("tokenizer");
if tokenizer.is_none() {
return Err(TantivyBindingError::InternalError(format!(
"tokenizer name or type must be set"
)));
}
let value = tokenizer.unwrap();
if value.is_object() || value.is_string() {
return Ok(tokenizer.unwrap());
}
Err(TantivyBindingError::InternalError(format!(
"tokenizer name should be string or dict"
)))
}
fn add_custom_filter(
&mut self,
name: &String,
params: &json::Map<String, json::Value>,
) -> Result<()> {
match SystemFilter::try_from(params) {
Ok(filter) => {
self.filters.insert(name.to_string(), filter);
Ok(())
}
Err(e) => Err(e),
}
}
fn add_custom_filters(&mut self, params: &json::Map<String, json::Value>) -> Result<()> {
for (name, value) in params {
if !value.is_object() {
continue;
}
self.add_custom_filter(name, value.as_object().unwrap())?;
}
Ok(())
}
fn build_filter(
&mut self,
mut builder: TextAnalyzerBuilder,
params: &json::Value,
) -> Result<TextAnalyzerBuilder> {
if !params.is_array() {
return Err(TantivyBindingError::InternalError(
"filter params should be array".to_string(),
));
}
let filters = params.as_array().unwrap();
for filter in filters {
if filter.is_string() {
let filter_name = filter.as_str().unwrap();
// a custom filter registered under this name takes precedence
if let Some(custom) = self.filters.remove(filter_name) {
builder = custom.transform(builder);
continue;
}
// check if filter was system filter
let system = SystemFilter::from(filter_name);
match system {
SystemFilter::Invalid => {
return Err(TantivyBindingError::InternalError(format!(
"build analyzer failed, filter not found :{}",
filter_name
)))
}
other => {
builder = other.transform(builder);
}
}
} else if filter.is_object() {
let filter = SystemFilter::try_from(filter.as_object().unwrap())?;
builder = filter.transform(builder);
}
}
Ok(builder)
}
fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result<TextAnalyzerBuilder> {
for (key, value) in self.params {
match key.as_str() {
"tokenizer" => {}
"filter" => {
// build with filter if filter param exist
builder = self.build_filter(builder, value)?;
}
other => {
return Err(TantivyBindingError::InternalError(format!(
"unknown analyzer option key: {}",
other
)))
}
}
}
Ok(builder)
}
fn get_stop_words_option(&self) -> Result<Vec<String>> {
let value = self.params.get("stop_words");
match value {
Some(value) => {
let str_list = get_string_list(value, "filter stop_words")?;
Ok(get_stop_words_list(str_list))
}
_ => Ok(vec![]),
}
}
fn build_template(self, type_: &str) -> Result<TextAnalyzer> {
match type_ {
"standard" => Ok(standard_analyzer(self.get_stop_words_option()?)),
"chinese" => Ok(chinese_analyzer(self.get_stop_words_option()?)),
"english" => Ok(english_analyzer(self.get_stop_words_option()?)),
other_ => Err(TantivyBindingError::InternalError(format!(
"unknown build-in analyzer type: {}",
other_
))),
}
}
fn build(mut self) -> Result<TextAnalyzer> {
// build a base built-in analyzer if a template type is given
match self.params.get("type") {
Some(type_) => {
if !type_.is_string() {
return Err(TantivyBindingError::InternalError(format!(
"analyzer type shoud be string"
)));
}
return self.build_template(type_.as_str().unwrap());
}
None => {}
};
//build custom analyzer
let tokenizer_params = self.get_tokenizer_params()?;
let mut builder = get_builder_with_tokenizer(&tokenizer_params)?;
// build with option
builder = self.build_option(builder)?;
Ok(builder.build())
}
}
pub(crate) fn create_analyzer_with_filter(params: &String) -> Result<TextAnalyzer> {
match json::from_str::<json::Value>(&params) {
Ok(value) => {
if value.is_null() {
return Ok(standard_analyzer(vec![]));
}
if !value.is_object() {
return Err(TantivyBindingError::InternalError(
"tokenizer params should be a json map".to_string(),
));
}
let json_params = value.as_object().unwrap();
// create builder
let analyzer_params = json_params.get("analyzer");
if analyzer_params.is_none() {
return Ok(standard_analyzer(vec![]));
}
if !analyzer_params.unwrap().is_object() {
return Err(TantivyBindingError::InternalError(
"analyzer params should be a json map".to_string(),
));
}
let builder_params = analyzer_params.unwrap().as_object().unwrap();
if builder_params.is_empty() {
return Ok(standard_analyzer(vec![]));
}
let mut builder = AnalyzerBuilder::new(builder_params);
// build custom filter
let filter_params = json_params.get("filter");
if !filter_params.is_none() && filter_params.unwrap().is_object() {
builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?;
}
// build analyzer
builder.build()
}
Err(err) => Err(err.into()),
}
}
pub(crate) fn create_analyzer(params: &str) -> Result<TextAnalyzer> {
if params.is_empty() {
return Ok(standard_analyzer(vec![]));
}
create_analyzer_with_filter(&format!("{{\"analyzer\":{}}}", params))
}
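// Not part of the commit: a sketch of the params shape accepted above,
// combining the handled keys ("tokenizer", "filter", "type", "stop_words").
// For example, a custom analyzer using the standard tokenizer with a
// lowercase filter and an inline stop-word filter:
//
// let analyzer = create_analyzer(
//     r#"{
//         "tokenizer": "standard",
//         "filter": ["lowercase", {"type": "stop", "stop_words": ["_english_"]}]
//     }"#,
// );
// assert!(analyzer.is_ok());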
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_standard_analyzer() {
let params = r#"{
"type": "standard",
"stop_words": ["_english_"]
}"#;
let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}
#[test]
fn test_chinese_analyzer() {
let params = r#"{
"type": "chinese"
}"#;
let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
let mut analyzer = tokenizer.unwrap();
let mut stream = analyzer.token_stream("系统安全;,'';lxyz密码");
let mut results = Vec::<String>::new();
while stream.advance() {
let token = stream.token();
results.push(token.text.clone());
}
print!("test tokens :{:?}\n", results)
}
#[test]
fn test_lindera_analyzer() {
let params = r#"{
"tokenizer": {
"type": "lindera",
"dict_kind": "ipadic"
}
}"#;
let tokenizer = create_analyzer(&params.to_string());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
let mut analyzer = tokenizer.unwrap();
let mut stream =
analyzer.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です");
let mut results = Vec::<String>::new();
while stream.advance() {
let token = stream.token();
results.push(token.text.clone());
}
print!("test tokens :{:?}\n", results)
}
}

View File

@ -0,0 +1,40 @@
use tantivy_5::tokenizer::*;
use super::filter::*;
use super::stop_words;
use super::tokenizers::*;
// default built-in analyzers
pub(crate) fn standard_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder().filter(LowerCaser);
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
pub fn chinese_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = jieba_builder().filter(CnAlphaNumOnlyFilter);
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}
pub fn english_analyzer(stop_words: Vec<String>) -> TextAnalyzer {
let builder = standard_builder()
.filter(LowerCaser)
.filter(Stemmer::new(Language::English))
.filter(StopWordFilter::remove(
stop_words::ENGLISH.iter().map(|&word| word.to_owned()),
));
if stop_words.len() > 0 {
return builder.filter(StopWordFilter::remove(stop_words)).build();
}
builder.build()
}

View File

@ -0,0 +1,285 @@
use serde_json as json;
use tantivy_5::tokenizer::*;
use super::util::*;
use crate::error::{Result, TantivyBindingError};
pub(crate) enum SystemFilter {
Invalid,
LowerCase(LowerCaser),
AsciiFolding(AsciiFoldingFilter),
AlphaNumOnly(AlphaNumOnlyFilter),
CnCharOnly(CnCharOnlyFilter),
CnAlphaNumOnly(CnAlphaNumOnlyFilter),
Length(RemoveLongFilter),
Stop(StopWordFilter),
Decompounder(SplitCompoundWords),
Stemmer(Stemmer),
}
impl SystemFilter {
pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder {
match self {
Self::LowerCase(filter) => builder.filter(filter).dynamic(),
Self::AsciiFolding(filter) => builder.filter(filter).dynamic(),
Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(),
Self::CnCharOnly(filter) => builder.filter(filter).dynamic(),
Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(),
Self::Length(filter) => builder.filter(filter).dynamic(),
Self::Stop(filter) => builder.filter(filter).dynamic(),
Self::Decompounder(filter) => builder.filter(filter).dynamic(),
Self::Stemmer(filter) => builder.filter(filter).dynamic(),
Self::Invalid => builder,
}
}
}
// create length filter from params
// {
// "type": "length",
// "max": 10, // length
// }
// TODO support min length
fn get_length_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
let limit_str = params.get("max");
if limit_str.is_none() || !limit_str.unwrap().is_u64() {
return Err(TantivyBindingError::InternalError(
"lenth max param was none or not uint".to_string(),
));
}
let limit = limit_str.unwrap().as_u64().unwrap() as usize;
Ok(SystemFilter::Length(RemoveLongFilter::limit(limit + 1)))
}
fn get_stop_words_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
let value = params.get("stop_words");
if value.is_none() {
return Err(TantivyBindingError::InternalError(
"stop filter stop_words can't be empty".to_string(),
));
}
let str_list = get_string_list(value.unwrap(), "stop_words filter")?;
Ok(SystemFilter::Stop(StopWordFilter::remove(
get_stop_words_list(str_list),
)))
}
fn get_decompounder_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
let value = params.get("word_list");
if value.is_none() || !value.unwrap().is_array() {
return Err(TantivyBindingError::InternalError(
"decompounder word list should be array".to_string(),
));
}
let stop_words = value.unwrap().as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words {
match element.as_str() {
Some(word) => str_list.push(word.to_string()),
_ => {
return Err(TantivyBindingError::InternalError(
"decompounder word list item should be string".to_string(),
))
}
}
}
match SplitCompoundWords::from_dictionary(str_list) {
Ok(f) => Ok(SystemFilter::Decompounder(f)),
Err(e) => Err(TantivyBindingError::InternalError(format!(
"create decompounder failed: {}",
e.to_string()
))),
}
}
fn get_stemmer_filter(params: &json::Map<String, json::Value>) -> Result<SystemFilter> {
let value = params.get("language");
if value.is_none() || !value.unwrap().is_string() {
return Err(TantivyBindingError::InternalError(
"stemmer language field should be string".to_string(),
));
}
match value.unwrap().as_str().unwrap().into_language() {
Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))),
Err(e) => Err(TantivyBindingError::InternalError(format!(
"create stemmer failed : {}",
e.to_string()
))),
}
}
trait LanguageParser {
fn into_language(self) -> Result<Language>;
}
impl LanguageParser for &str {
fn into_language(self) -> Result<Language> {
match self.to_lowercase().as_str() {
"arabig" => Ok(Language::Arabic),
"danish" => Ok(Language::Danish),
"dutch" => Ok(Language::Dutch),
"english" => Ok(Language::English),
"finnish" => Ok(Language::Finnish),
"french" => Ok(Language::French),
"german" => Ok(Language::German),
"greek" => Ok(Language::Greek),
"hungarian" => Ok(Language::Hungarian),
"italian" => Ok(Language::Italian),
"norwegian" => Ok(Language::Norwegian),
"portuguese" => Ok(Language::Portuguese),
"romanian" => Ok(Language::Romanian),
"russian" => Ok(Language::Russian),
"spanish" => Ok(Language::Spanish),
"swedish" => Ok(Language::Swedish),
"tamil" => Ok(Language::Tamil),
"turkish" => Ok(Language::Turkish),
other => Err(TantivyBindingError::InternalError(format!(
"unsupport language: {}",
other
))),
}
}
}
impl From<&str> for SystemFilter {
fn from(value: &str) -> Self {
match value {
"lowercase" => Self::LowerCase(LowerCaser),
"asciifolding" => Self::AsciiFolding(AsciiFoldingFilter),
"alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter),
"cncharonly" => Self::CnCharOnly(CnCharOnlyFilter),
"cnalphanumonly" => Self::CnAlphaNumOnly(CnAlphaNumOnlyFilter),
_ => Self::Invalid,
}
}
}
impl TryFrom<&json::Map<String, json::Value>> for SystemFilter {
type Error = TantivyBindingError;
fn try_from(params: &json::Map<String, json::Value>) -> Result<Self> {
match params.get(&"type".to_string()) {
Some(value) => {
if !value.is_string() {
return Err(TantivyBindingError::InternalError(
"filter type should be string".to_string(),
));
};
match value.as_str().unwrap() {
"length" => get_length_filter(params),
"stop" => get_stop_words_filter(params),
"decompounder" => get_decompounder_filter(params),
"stemmer" => get_stemmer_filter(params),
other => Err(TantivyBindingError::InternalError(format!(
"unsupport filter type: {}",
other
))),
}
}
None => Err(TantivyBindingError::InternalError(
"no type field in filter params".to_string(),
)),
}
}
}
pub struct CnCharOnlyFilter;
pub struct CnCharOnlyFilterStream<T> {
regex: regex::Regex,
tail: T,
}
impl TokenFilter for CnCharOnlyFilter {
type Tokenizer<T: Tokenizer> = CnCharOnlyFilterWrapper<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> CnCharOnlyFilterWrapper<T> {
CnCharOnlyFilterWrapper(tokenizer)
}
}
#[derive(Clone)]
pub struct CnCharOnlyFilterWrapper<T>(T);
impl<T: Tokenizer> Tokenizer for CnCharOnlyFilterWrapper<T> {
type TokenStream<'a> = CnCharOnlyFilterStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
CnCharOnlyFilterStream {
regex: regex::Regex::new("\\p{Han}+").unwrap(),
tail: self.0.token_stream(text),
}
}
}
impl<T: TokenStream> TokenStream for CnCharOnlyFilterStream<T> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.regex.is_match(&self.tail.token().text) {
return true;
}
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
pub struct CnAlphaNumOnlyFilter;
pub struct CnAlphaNumOnlyFilterStream<T> {
regex: regex::Regex,
tail: T,
}
impl TokenFilter for CnAlphaNumOnlyFilter {
type Tokenizer<T: Tokenizer> = CnAlphaNumOnlyFilterWrapper<T>;
fn transform<T: Tokenizer>(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper<T> {
CnAlphaNumOnlyFilterWrapper(tokenizer)
}
}
#[derive(Clone)]
pub struct CnAlphaNumOnlyFilterWrapper<T>(T);
impl<T: Tokenizer> Tokenizer for CnAlphaNumOnlyFilterWrapper<T> {
type TokenStream<'a> = CnAlphaNumOnlyFilterStream<T::TokenStream<'a>>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
CnAlphaNumOnlyFilterStream {
regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(),
tail: self.0.token_stream(text),
}
}
}
impl<T: TokenStream> TokenStream for CnAlphaNumOnlyFilterStream<T> {
fn advance(&mut self) -> bool {
while self.tail.advance() {
if self.regex.is_match(&self.tail.token().text) {
return true;
}
}
false
}
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
}
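// Not part of the commit: examples of the object-form filter specs that
// `SystemFilter::try_from` accepts, per the parsers above:
//
// {"type": "length", "max": 10}
// {"type": "stop", "stop_words": ["_english_"]}
// {"type": "stemmer", "language": "english"}
// {"type": "decompounder", "word_list": ["data", "base"]}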

View File

@ -0,0 +1,11 @@
//! This is totally copied from src/analyzer
mod analyzer;
mod build_in_analyzer;
mod filter;
mod stop_words;
mod tokenizers;
mod util;
pub(crate) use self::analyzer::create_analyzer;
pub(crate) use self::build_in_analyzer::standard_analyzer;

View File

@ -0,0 +1,5 @@
pub const ENGLISH: &[&str] = &[
"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in",
"into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the",
"their", "then", "there", "these", "they", "this", "to", "was", "will", "with",
];

View File

@ -0,0 +1,83 @@
use jieba_rs;
use lazy_static::lazy_static;
use tantivy_5::tokenizer::{Token, TokenStream, Tokenizer};
lazy_static! {
static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}
#[allow(dead_code)]
#[derive(Clone)]
pub enum JiebaMode {
Exact,
Search,
}
#[derive(Clone)]
pub struct JiebaTokenizer {
mode: JiebaMode,
hmm: bool,
}
pub struct JiebaTokenStream {
tokens: Vec<Token>,
index: usize,
}
impl TokenStream for JiebaTokenStream {
fn advance(&mut self) -> bool {
if self.index < self.tokens.len() {
self.index += 1;
true
} else {
false
}
}
fn token(&self) -> &Token {
&self.tokens[self.index - 1]
}
fn token_mut(&mut self) -> &mut Token {
&mut self.tokens[self.index - 1]
}
}
impl JiebaTokenizer {
pub fn new() -> JiebaTokenizer {
JiebaTokenizer {
mode: JiebaMode::Search,
hmm: true,
}
}
fn tokenize(&self, text: &str) -> Vec<Token> {
let mut indices = text.char_indices().collect::<Vec<_>>();
indices.push((text.len(), '\0'));
let ori_tokens = match self.mode {
JiebaMode::Exact => JIEBA.tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm),
JiebaMode::Search => JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm),
};
let mut tokens = Vec::with_capacity(ori_tokens.len());
for token in ori_tokens {
tokens.push(Token {
offset_from: indices[token.start].0,
offset_to: indices[token.end].0,
position: token.start,
text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]),
position_length: token.end - token.start,
});
}
tokens
}
}
impl Tokenizer for JiebaTokenizer {
type TokenStream<'a> = JiebaTokenStream;
fn token_stream(&mut self, text: &str) -> JiebaTokenStream {
let tokens = self.tokenize(text);
JiebaTokenStream { tokens, index: 0 }
}
}
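// Not part of the commit: a minimal usage sketch of the tokenizer above,
// driven through tantivy_5's `Tokenizer`/`TokenStream` traits the same way the
// analyzer tests elsewhere in this diff drive a full `TextAnalyzer`.
#[cfg(test)]
mod tests {
    use super::JiebaTokenizer;
    use tantivy_5::tokenizer::{TokenStream, Tokenizer};

    #[test]
    fn test_jieba_tokenizer() {
        let mut tokenizer = JiebaTokenizer::new();
        let mut stream = tokenizer.token_stream("系统安全");
        let mut texts = Vec::new();
        while stream.advance() {
            texts.push(stream.token().text.clone());
        }
        assert!(!texts.is_empty());
    }
}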

View File

@ -0,0 +1,152 @@
use core::result::Result::Err;
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token::Token as LToken;
use lindera::tokenizer::Tokenizer as LTokenizer;
use tantivy_5::tokenizer::{Token, TokenStream, Tokenizer};
use crate::error::{Result, TantivyBindingError};
use serde_json as json;
pub struct LinderaTokenStream<'a> {
pub tokens: Vec<LToken<'a>>,
pub token: &'a mut Token,
}
impl<'a> TokenStream for LinderaTokenStream<'a> {
fn advance(&mut self) -> bool {
if self.tokens.is_empty() {
return false;
}
let token = self.tokens.remove(0);
self.token.text = token.text.to_string();
self.token.offset_from = token.byte_start;
self.token.offset_to = token.byte_end;
self.token.position = token.position;
self.token.position_length = token.position_length;
true
}
fn token(&self) -> &Token {
self.token
}
fn token_mut(&mut self) -> &mut Token {
self.token
}
}
#[derive(Clone)]
pub struct LinderaTokenizer {
tokenizer: LTokenizer,
token: Token,
}
impl LinderaTokenizer {
/// Create a new `LinderaTokenizer`.
/// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable.
pub fn from_json(params: &json::Map<String, json::Value>) -> Result<LinderaTokenizer> {
let kind = fetch_lindera_kind(params)?;
let dictionary = load_dictionary_from_kind(kind);
if dictionary.is_err() {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer with invalid dict_kind"
)));
}
let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None);
Ok(LinderaTokenizer::from_segmenter(segmenter))
}
/// Create a new `LinderaTokenizer`.
/// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
LinderaTokenizer {
tokenizer: LTokenizer::new(segmenter),
token: Default::default(),
}
}
}
impl Tokenizer for LinderaTokenizer {
type TokenStream<'a> = LinderaTokenStream<'a>;
fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
self.token.reset();
LinderaTokenStream {
tokens: self.tokenizer.tokenize(text).unwrap(),
token: &mut self.token,
}
}
}
trait DictionaryKindParser {
fn into_dict_kind(self) -> Result<DictionaryKind>;
}
impl DictionaryKindParser for &str {
fn into_dict_kind(self) -> Result<DictionaryKind> {
match self {
"ipadic" => Ok(DictionaryKind::IPADIC),
"ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd),
"unidic" => Ok(DictionaryKind::UniDic),
"ko-dic" => Ok(DictionaryKind::KoDic),
"cc-cedict" => Ok(DictionaryKind::CcCedict),
other => Err(TantivyBindingError::InvalidArgument(format!(
"unsupported lindera dict type: {}",
other
))),
}
}
}
fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> {
match params.get("dict_kind") {
Some(val) => {
if !val.is_string() {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict kind should be string"
)));
}
val.as_str().unwrap().into_dict_kind()
}
_ => {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer dict_kind must be set"
)))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_lindera_tokenizer() {
let params = r#"{
"type": "lindera",
"dict_kind": "ipadic"
}"#;
let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
assert!(json_param.is_ok());
let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}
#[test]
#[cfg(feature = "lindera-cc-cedict")]
fn test_lindera_tokenizer_cc() {
let params = r#"{
"type": "lindera",
"dict_kind": "cc-cedict"
}"#;
let json_param = json::from_str::<json::Map<String, json::Value>>(&params);
assert!(json_param.is_ok());
let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap());
assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap());
}
}

View File

@ -0,0 +1,5 @@
mod jieba_tokenizer;
mod lindera_tokenizer;
mod tokenizer;
pub(crate) use self::tokenizer::*;

View File

@ -0,0 +1,74 @@
use log::warn;
use serde_json as json;
use tantivy_5::tokenizer::*;
use tantivy_5::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
use crate::error::{Result, TantivyBindingError};
use super::jieba_tokenizer::JiebaTokenizer;
use super::lindera_tokenizer::LinderaTokenizer;
pub fn standard_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(SimpleTokenizer::default()).dynamic()
}
pub fn whitespace_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()
}
pub fn jieba_builder() -> TextAnalyzerBuilder {
TextAnalyzer::builder(JiebaTokenizer::new()).dynamic()
}
pub fn lindera_builder(
params: Option<&json::Map<String, json::Value>>,
) -> Result<TextAnalyzerBuilder> {
if params.is_none() {
return Err(TantivyBindingError::InvalidArgument(format!(
"lindera tokenizer must be costum"
)));
}
let tokenizer = LinderaTokenizer::from_json(params.unwrap())?;
Ok(TextAnalyzer::builder(tokenizer).dynamic())
}
pub fn get_builder_with_tokenizer(params: &json::Value) -> Result<TextAnalyzerBuilder> {
let name;
let params_map;
if params.is_string() {
name = params.as_str().unwrap();
params_map = None;
} else {
let m = params.as_object().unwrap();
match m.get("type") {
Some(val) => {
if !val.is_string() {
return Err(TantivyBindingError::InvalidArgument(format!(
"tokenizer type should be string"
)));
}
name = val.as_str().unwrap();
}
_ => {
return Err(TantivyBindingError::InvalidArgument(format!(
"costum tokenizer must set type"
)))
}
}
params_map = Some(m);
}
match name {
"standard" => Ok(standard_builder()),
"whitespace" => Ok(whitespace_builder()),
"jieba" => Ok(jieba_builder()),
"lindera" => lindera_builder(params_map),
other => {
warn!("unsupported tokenizer: {}", other);
Err(TantivyBindingError::InvalidArgument(format!(
"unsupported tokenizer: {}",
other
)))
}
}
}

View File

@ -0,0 +1,45 @@
use serde_json as json;
use super::stop_words;
use crate::error::{Result, TantivyBindingError};
pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result<Vec<String>> {
if !value.is_array() {
return Err(TantivyBindingError::InternalError(
format!("{} should be array", label).to_string(),
));
}
let stop_words = value.as_array().unwrap();
let mut str_list = Vec::<String>::new();
for element in stop_words {
match element.as_str() {
Some(word) => str_list.push(word.to_string()),
_ => {
return Err(TantivyBindingError::InternalError(
format!("{} list item should be string", label).to_string(),
))
}
}
}
Ok(str_list)
}
pub(crate) fn get_stop_words_list(str_list: Vec<String>) -> Vec<String> {
let mut stop_words = Vec::new();
for str in str_list {
if str.starts_with('_') {
match str.as_str() {
"_english_" => {
for word in stop_words::ENGLISH {
stop_words.push(word.to_string());
}
continue;
}
_other => {}
}
}
stop_words.push(str);
}
stop_words
}
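// Not part of the commit: entries wrapped in underscores name a built-in list,
// anything else passes through unchanged, e.g.
// get_stop_words_list(vec!["_english_".to_string(), "foo".to_string()])
// yields every word in stop_words::ENGLISH followed by "foo".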

View File

@ -0,0 +1,219 @@
use std::ffi::CStr;
use std::sync::Arc;
use either::Either;
use futures::executor::block_on;
use libc::c_char;
use log::info;
use tantivy_5::schema::{
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
};
use tantivy_5::{Document as TantivyDocument, Index, IndexWriter, SingleSegmentIndexWriter};
use crate::data_type::TantivyDataType;
use crate::error::Result;
use crate::index_writer::TantivyValue;
use crate::log::init_log;
pub(crate) struct IndexWriterWrapperImpl {
pub(crate) field: Field,
pub(crate) index_writer: Either<IndexWriter, SingleSegmentIndexWriter>,
pub(crate) id_field: Option<Field>,
pub(crate) _index: Arc<Index>,
}
#[inline]
fn schema_builder_add_field(
schema_builder: &mut SchemaBuilder,
field_name: &str,
data_type: TantivyDataType,
) -> Field {
match data_type {
TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
TantivyDataType::Keyword => {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
schema_builder.add_text_field(&field_name, text_options)
}
TantivyDataType::Text => {
panic!("text should be indexed with analyzer");
}
}
}
impl TantivyValue<TantivyDocument> for i64 {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_i64(Field::from_field_id(field), *self);
}
}
impl TantivyValue<TantivyDocument> for u64 {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_u64(Field::from_field_id(field), *self);
}
}
impl TantivyValue<TantivyDocument> for f64 {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_f64(Field::from_field_id(field), *self);
}
}
impl TantivyValue<TantivyDocument> for &str {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_text(Field::from_field_id(field), *self);
}
}
impl TantivyValue<TantivyDocument> for bool {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_bool(Field::from_field_id(field), *self);
}
}
impl IndexWriterWrapperImpl {
pub fn new(
field_name: &str,
data_type: TantivyDataType,
path: String,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> Result<IndexWriterWrapperImpl> {
info!(
"create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5",
field_name, data_type
);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, field_name, data_type);
// We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field.
let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapperImpl {
field,
index_writer: Either::Left(index_writer),
id_field: Some(id_field),
_index: Arc::new(index),
})
}
pub fn new_with_single_segment(
field_name: &str,
data_type: TantivyDataType,
path: String,
) -> Result<IndexWriterWrapperImpl> {
init_log();
info!(
"create single segment index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5",
field_name, data_type
);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, field_name, data_type);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?;
Ok(IndexWriterWrapperImpl {
field,
index_writer: Either::Right(index_writer),
id_field: None,
_index: Arc::new(index),
})
}
#[inline]
fn add_document(&mut self, mut document: TantivyDocument, offset: Option<i64>) -> Result<()> {
if let Some(id_field) = self.id_field {
document.add_i64(id_field, offset.unwrap());
}
match &mut self.index_writer {
Either::Left(writer) => {
let _ = writer.add_document(document)?;
}
Either::Right(single_segment_writer) => {
let _ = single_segment_writer.add_document(document)?;
}
}
Ok(())
}
pub fn add<T: TantivyValue<TantivyDocument>>(
&mut self,
data: T,
offset: Option<i64>,
) -> Result<()> {
let mut document = TantivyDocument::default();
data.add_to_document(self.field.field_id(), &mut document);
self.add_document(document, offset)
}
pub fn add_array<T: TantivyValue<TantivyDocument>, I>(
&mut self,
data: I,
offset: Option<i64>,
) -> Result<()>
where
I: IntoIterator<Item = T>,
{
let mut document = TantivyDocument::default();
data.into_iter()
.for_each(|d| d.add_to_document(self.field.field_id(), &mut document));
self.add_document(document, offset)
}
pub fn add_array_keywords(
&mut self,
datas: &[*const c_char],
offset: Option<i64>,
) -> Result<()> {
let mut document = TantivyDocument::default();
for element in datas {
let data = unsafe { CStr::from_ptr(*element) };
document.add_field_value(self.field, data.to_str()?);
}
self.add_document(document, offset)
}
pub fn manual_merge(&mut self) -> Result<()> {
let index_writer = self.index_writer.as_mut().left().unwrap();
let metas = index_writer.index().searchable_segment_metas()?;
let policy = index_writer.get_merge_policy();
let candidates = policy.compute_merge_candidates(metas.as_slice());
for candidate in candidates {
index_writer.merge(candidate.0.as_slice()).wait()?;
}
Ok(())
}
pub fn finish(self) -> Result<()> {
match self.index_writer {
Either::Left(mut index_writer) => {
index_writer.commit()?;
// self.manual_merge();
block_on(index_writer.garbage_collect_files())?;
index_writer.wait_merging_threads()?;
}
Either::Right(single_segment_index_writer) => {
single_segment_index_writer
.finalize()
.expect("failed to build inverted index");
}
}
Ok(())
}
pub(crate) fn commit(&mut self) -> Result<()> {
self.index_writer.as_mut().left().unwrap().commit()?;
Ok(())
}
}
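For orientation, here is a minimal usage sketch of the version-5 wrapper above. The field name, path, thread count, and memory budget are hypothetical, and error handling is reduced to `?`:

use crate::data_type::TantivyDataType;
use crate::error::Result;
use crate::index_writer_v5::IndexWriterWrapperImpl;

fn build_i64_index_v5() -> Result<()> {
    let mut writer = IndexWriterWrapperImpl::new(
        "age",                              // hypothetical field name
        TantivyDataType::I64,
        "/tmp/tantivy_v5_demo".to_string(), // hypothetical path
        4,                                  // num_threads
        64 * 1024 * 1024,                   // overall memory budget in bytes
    )?;
    // Each row is tagged with its Milvus offset via the doc_id fast field.
    for (offset, value) in [18i64, 42, 73].into_iter().enumerate() {
        writer.add(value, Some(offset as i64))?;
    }
    // Commits, garbage-collects stale files, and joins merging threads.
    writer.finish()
}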

View File

@ -0,0 +1,58 @@
use std::sync::Arc;
use either::Either;
use tantivy_5::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
use tantivy_5::Index;
use crate::error::Result;
use crate::log::init_log;
use super::analyzer::create_analyzer;
use super::IndexWriterWrapperImpl;
fn build_text_schema(field_name: &str, tokenizer_name: &str) -> (Schema, Field, Field) {
let mut schema_builder = Schema::builder();
// Positions are required for phrase matching.
let indexing = TextFieldIndexing::default()
.set_tokenizer(tokenizer_name)
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let option = TextOptions::default().set_indexing_options(indexing);
let field = schema_builder.add_text_field(field_name, option);
let id_field = schema_builder.add_i64_field("doc_id", FAST);
(schema_builder.build(), field, id_field)
}
impl IndexWriterWrapperImpl {
pub(crate) fn create_text_writer(
field_name: &str,
path: &str,
tokenizer_name: &str,
tokenizer_params: &str,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> Result<IndexWriterWrapperImpl> {
init_log();
let tokenizer = create_analyzer(tokenizer_params)?;
let (schema, field, id_field) = build_text_schema(field_name, tokenizer_name);
let index = if in_ram {
Index::create_in_ram(schema)
} else {
Index::create_in_dir(path.to_string(), schema)?
};
index.tokenizers().register(tokenizer_name, tokenizer);
let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapperImpl {
field,
index_writer: Either::Left(index_writer),
id_field: Some(id_field),
_index: Arc::new(index),
})
}
}
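A similar sketch for the text writer; the tokenizer name and analyzer params below are placeholders (the params string is handed to `create_analyzer`):

use crate::error::Result;
use crate::index_writer_v5::IndexWriterWrapperImpl;

fn build_text_index_v5() -> Result<()> {
    let mut writer = IndexWriterWrapperImpl::create_text_writer(
        "title",                     // hypothetical field name
        "/tmp/tantivy_v5_text_demo", // hypothetical path, ignored when in_ram
        "default",                   // hypothetical tokenizer name
        "{}",                        // hypothetical analyzer params
        1,                           // num_threads
        15 * 1024 * 1024,            // memory budget in bytes
        false,                       // persist to the directory, not RAM
    )?;
    writer.add("hello tantivy", Some(0))?;
    writer.finish()
}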

View File

@ -0,0 +1,11 @@
//! Tantivy index version 5
//! This is the older Tantivy index format (the one used by Milvus 2.4.x, for example).
//! We may still build a version-5 tantivy index for compatibility, since some
//! read nodes can only read version-5 indexes.
mod analyzer;
pub(crate) mod index_writer;
pub(crate) mod index_writer_text;
pub(crate) use index_writer::IndexWriterWrapperImpl;
pub(crate) use tantivy_5::Document as TantivyDocumentV5;

View File

@ -0,0 +1,224 @@
use std::ffi::CStr;
use std::sync::Arc;
use either::Either;
use futures::executor::block_on;
use libc::c_char;
use log::info;
use tantivy::schema::{
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
};
use tantivy::{Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument};
use crate::data_type::TantivyDataType;
use crate::error::Result;
use crate::index_reader::IndexReaderWrapper;
use crate::index_writer::TantivyValue;
use crate::log::init_log;
#[inline]
fn schema_builder_add_field(
schema_builder: &mut SchemaBuilder,
field_name: &str,
data_type: TantivyDataType,
) -> Field {
match data_type {
TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
TantivyDataType::Keyword => {
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
schema_builder.add_text_field(&field_name, text_options)
}
TantivyDataType::Text => {
panic!("text should be indexed with analyzer");
}
}
}
impl TantivyValue<TantivyDocument> for i64 {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_i64(Field::from_field_id(field), *self);
}
}
impl TantivyValue<TantivyDocument> for u64 {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_u64(Field::from_field_id(field), *self);
}
}
impl TantivyValue<TantivyDocument> for f64 {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_f64(Field::from_field_id(field), *self);
}
}
impl TantivyValue<TantivyDocument> for &str {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_text(Field::from_field_id(field), *self);
}
}
impl TantivyValue<TantivyDocument> for bool {
fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
document.add_bool(Field::from_field_id(field), *self);
}
}
pub struct IndexWriterWrapperImpl {
pub(crate) field: Field,
pub(crate) index_writer: Either<IndexWriter, SingleSegmentIndexWriter>,
pub(crate) id_field: Option<Field>,
pub(crate) index: Arc<Index>,
}
impl IndexWriterWrapperImpl {
pub fn new(
field_name: &str,
data_type: TantivyDataType,
path: String,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> Result<IndexWriterWrapperImpl> {
info!(
"create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7",
field_name, data_type
);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, field_name, data_type);
// Rows in a multi-segment index cannot be mapped directly back to Milvus row data, so we store the Milvus offset in this doc_id field.
let id_field = schema_builder.add_i64_field("doc_id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapperImpl {
field,
index_writer: Either::Left(index_writer),
id_field: Some(id_field),
index: Arc::new(index),
})
}
pub fn new_with_single_segment(
field_name: &str,
data_type: TantivyDataType,
path: String,
) -> Result<IndexWriterWrapperImpl> {
init_log();
info!(
"create single segment index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7",
field_name, data_type
);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, field_name, data_type);
let schema = schema_builder.build();
let index = Index::create_in_dir(path.clone(), schema)?;
let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?;
Ok(IndexWriterWrapperImpl {
field,
index_writer: Either::Right(index_writer),
id_field: None,
index: Arc::new(index),
})
}
pub fn create_reader(&self) -> Result<IndexReaderWrapper> {
IndexReaderWrapper::from_index(self.index.clone())
}
#[inline]
fn add_document(&mut self, mut document: TantivyDocument, offset: Option<i64>) -> Result<()> {
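// id_field is Some for all but the single-segment writer; those writers require a Milvus row offset here.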
if let Some(id_field) = self.id_field {
document.add_i64(id_field, offset.unwrap());
}
match &mut self.index_writer {
Either::Left(writer) => {
let _ = writer.add_document(document)?;
}
Either::Right(single_segment_writer) => {
let _ = single_segment_writer.add_document(document)?;
}
}
Ok(())
}
pub fn add<T: TantivyValue<TantivyDocument>>(
&mut self,
data: T,
offset: Option<i64>,
) -> Result<()> {
let mut document = TantivyDocument::default();
data.add_to_document(self.field.field_id(), &mut document);
self.add_document(document, offset)
}
pub fn add_array<T: TantivyValue<TantivyDocument>, I>(
&mut self,
data: I,
offset: Option<i64>,
) -> Result<()>
where
I: IntoIterator<Item = T>,
{
let mut document = TantivyDocument::default();
data.into_iter()
.for_each(|d| d.add_to_document(self.field.field_id(), &mut document));
self.add_document(document, offset)
}
pub fn add_array_keywords(
&mut self,
datas: &[*const c_char],
offset: Option<i64>,
) -> Result<()> {
let mut document = TantivyDocument::default();
for element in datas {
let data = unsafe { CStr::from_ptr(*element) };
document.add_field_value(self.field, data.to_str()?);
}
self.add_document(document, offset)
}
pub fn manual_merge(&mut self) -> Result<()> {
let index_writer = self.index_writer.as_mut().left().unwrap();
let metas = index_writer.index().searchable_segment_metas()?;
let policy = index_writer.get_merge_policy();
let candidates = policy.compute_merge_candidates(metas.as_slice());
for candidate in candidates {
index_writer.merge(candidate.0.as_slice()).wait()?;
}
Ok(())
}
pub fn finish(self) -> Result<()> {
match self.index_writer {
Either::Left(mut index_writer) => {
index_writer.commit()?;
// self.manual_merge();
block_on(index_writer.garbage_collect_files())?;
index_writer.wait_merging_threads()?;
}
Either::Right(single_segment_index_writer) => {
single_segment_index_writer
.finalize()
.expect("failed to build inverted index");
}
}
Ok(())
}
pub(crate) fn commit(&mut self) -> Result<()> {
self.index_writer.as_mut().left().unwrap().commit()?;
Ok(())
}
}
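Unlike the v5 wrapper, the v7 wrapper keeps its `Index` reachable through `create_reader`, so callers can search an index that is still being written. A hedged sketch with hypothetical names:

use crate::data_type::TantivyDataType;
use crate::error::Result;
use crate::index_writer_v7::IndexWriterWrapperImpl;

fn build_and_read_v7() -> Result<()> {
    let mut writer = IndexWriterWrapperImpl::new(
        "price",                            // hypothetical field name
        TantivyDataType::F64,
        "/tmp/tantivy_v7_demo".to_string(), // hypothetical path
        4,                                  // num_threads
        64 * 1024 * 1024,                   // memory budget in bytes
    )?;
    writer.add(9.99f64, Some(0))?;
    writer.commit()?; // make the row visible to readers
    let _reader = writer.create_reader()?; // search over the live index
    writer.finish()
}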

View File

@ -0,0 +1,57 @@
use std::sync::Arc;
use either::Either;
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
use tantivy::Index;
use crate::analyzer::create_analyzer;
use crate::error::Result;
use crate::log::init_log;
use super::IndexWriterWrapperImpl;
fn build_text_schema(field_name: &str, tokenizer_name: &str) -> (Schema, Field, Field) {
let mut schema_builder = Schema::builder();
// Positions are required for phrase matching.
let indexing = TextFieldIndexing::default()
.set_tokenizer(tokenizer_name)
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let option = TextOptions::default().set_indexing_options(indexing);
let field = schema_builder.add_text_field(field_name, option);
let id_field = schema_builder.add_i64_field("doc_id", FAST);
(schema_builder.build(), field, id_field)
}
impl IndexWriterWrapperImpl {
pub(crate) fn create_text_writer(
field_name: &str,
path: &str,
tokenizer_name: &str,
tokenizer_params: &str,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
in_ram: bool,
) -> Result<IndexWriterWrapperImpl> {
init_log();
let tokenizer = create_analyzer(tokenizer_params)?;
let (schema, field, id_field) = build_text_schema(field_name, tokenizer_name);
let index = if in_ram {
Index::create_in_ram(schema)
} else {
Index::create_in_dir(path.to_string(), schema)?
};
index.tokenizers().register(tokenizer_name, tokenizer);
let index_writer =
index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
Ok(IndexWriterWrapperImpl {
field,
index_writer: Either::Left(index_writer),
id_field: Some(id_field),
index: Arc::new(index),
})
}
}
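The `in_ram` flag builds the whole index in memory, in which case the path is ignored; this is presumably what in-memory text indexes over live data use. A sketch under the same assumptions as above:

use crate::error::Result;
use crate::index_writer_v7::IndexWriterWrapperImpl;

fn build_in_ram_text_index_v7() -> Result<()> {
    let mut writer = IndexWriterWrapperImpl::create_text_writer(
        "body",    // hypothetical field name
        "",        // unused when in_ram is true
        "default", // hypothetical tokenizer name
        "{}",      // hypothetical analyzer params
        1,
        15 * 1024 * 1024,
        true, // in_ram
    )?;
    writer.add("streaming text", Some(0))?;
    writer.commit()
}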

View File

@ -0,0 +1,9 @@
//! Tantivy index version 7
//! This is the latest Tantivy index format and the one we plan to use
//! in most cases.
pub(crate) mod index_writer;
pub(crate) mod index_writer_text;
pub(crate) use index_writer::IndexWriterWrapperImpl;
pub(crate) use tantivy::TantivyDocument as TantivyDocumentV7;

View File

@ -1,3 +1,6 @@
use error::TantivyBindingError;
mod analyzer;
mod array;
mod data_type;
mod demo_c;
@ -12,26 +15,44 @@ mod index_writer;
mod index_writer_c;
mod index_writer_text;
mod index_writer_text_c;
mod index_writer_v5;
mod index_writer_v7;
mod log;
mod string_c;
mod token_stream_c;
mod tokenizer_c;
mod util;
mod util_c;
mod vec_collector;
use error::Result;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TantivyIndexVersion {
V5, // Version for compatibility (for 2.4.x)
V7, // Latest version
}
impl TantivyIndexVersion {
pub fn from_u32(version: u32) -> Result<Self> {
match version {
5 => Ok(Self::V5),
7 => Ok(Self::V7),
_ => Err(TantivyBindingError::InvalidArgument(format!(
"unsupported version {}",
version
))),
}
}
pub fn as_u32(&self) -> u32 {
match self {
Self::V5 => 5,
Self::V7 => 7,
}
}
pub fn default_version() -> Self {
Self::V7
}
}
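With the enum in place, callers can select an implementation per index version. The `AnyIndexWriter` enum and `create_writer` helper below are hypothetical; the actual dispatch lives in the refactored `IndexWriterWrapper`:

// Hypothetical wrapper over the two version-specific writers.
enum AnyIndexWriter {
    V5(index_writer_v5::IndexWriterWrapperImpl),
    V7(index_writer_v7::IndexWriterWrapperImpl),
}

fn create_writer(
    version: u32,
    field_name: &str,
    data_type: data_type::TantivyDataType,
    path: String,
) -> Result<AnyIndexWriter> {
    Ok(match TantivyIndexVersion::from_u32(version)? {
        TantivyIndexVersion::V5 => AnyIndexWriter::V5(
            index_writer_v5::IndexWriterWrapperImpl::new(
                field_name, data_type, path, 1, 15 * 1024 * 1024,
            )?,
        ),
        TantivyIndexVersion::V7 => AnyIndexWriter::V7(
            index_writer_v7::IndexWriterWrapperImpl::new(
                field_name, data_type, path, 1, 15 * 1024 * 1024,
            )?,
        ),
    })
}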

View File

@ -82,6 +82,7 @@ struct TantivyIndexWrapper {
TantivyIndexWrapper(const char* field_name,
TantivyDataType data_type,
const char* path,
uint32_t tantivy_index_version,
bool inverted_single_segment = false,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
@ -89,12 +90,13 @@ struct TantivyIndexWrapper {
RustResultWrapper res;
if (inverted_single_segment) {
res = RustResultWrapper(tantivy_create_index_with_single_segment(
field_name, data_type, path, tantivy_index_version));
} else {
res = RustResultWrapper(
tantivy_create_index(field_name,
data_type,
path,
tantivy_index_version,
num_threads,
overall_memory_budget_in_bytes));
}
@ -120,6 +122,7 @@ struct TantivyIndexWrapper {
TantivyIndexWrapper(const char* field_name,
bool in_ram,
const char* path,
uint32_t tantivy_index_version,
const char* tokenizer_name = DEFAULT_TOKENIZER_NAME,
const char* analyzer_params = DEFAULT_analyzer_params,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
@ -128,6 +131,7 @@ struct TantivyIndexWrapper {
auto res = RustResultWrapper(
tantivy_create_text_writer(field_name,
path,
tantivy_index_version,
tokenizer_name,
analyzer_params,
num_threads,

View File

@ -39,7 +39,8 @@ struct TokenStream {
return s;
}
TantivyToken
get_detailed_token() {
return tantivy_token_stream_get_detailed_token(ptr_);
}

View File

@ -70,16 +70,15 @@ TEST(CTokenizer, Default) {
ASSERT_FALSE(token_stream_advance(token_stream));
free_token_stream(token_stream);
token_stream = create_token_stream(tokenizer, text.c_str(), text.length());
for (int i = 0; i < 3; i++) {
ASSERT_TRUE(token_stream_advance(token_stream));
auto token = token_stream_get_detailed_token(token_stream);
ASSERT_EQ(refs[i], std::string(token.token));
ASSERT_EQ(offsets[i], token.start_offset);
free_token(const_cast<char*>(token.token));
}
ASSERT_FALSE(token_stream_advance(token_stream));