diff --git a/internal/core/src/exec/Driver.cpp b/internal/core/src/exec/Driver.cpp index 58c07337a2..301a814026 100644 --- a/internal/core/src/exec/Driver.cpp +++ b/internal/core/src/exec/Driver.cpp @@ -176,21 +176,21 @@ Driver::Next(std::shared_ptr& blocking_state) { return result; } -#define CALL_OPERATOR(call_func, operator, method_name) \ - try { \ - call_func; \ - } catch (std::exception & e) { \ - std::string stack_trace = milvus::impl::EasyStackTrace(); \ - auto err_msg = fmt::format( \ - "Operator::{} failed for [Operator:{}, plan node id: " \ - "{}] : {}\nStack trace: {}", \ - method_name, \ - operator->ToString() , \ - operator->get_plannode_id(), \ - e.what(), \ - stack_trace); \ - LOG_ERROR(err_msg); \ - throw ExecOperatorException(err_msg); \ +#define CALL_OPERATOR(call_func, operator, method_name) \ + try { \ + call_func; \ + } catch (std::exception & e) { \ + std::string stack_trace = milvus::impl::EasyStackTrace(); \ + auto err_msg = fmt::format( \ + "Operator::{} failed for [Operator:{}, plan node id: " \ + "{}] : {}\nStack trace: {}", \ + method_name, \ + operator->ToString(), \ + operator->get_plannode_id(), \ + e.what(), \ + stack_trace); \ + LOG_ERROR(err_msg); \ + throw ExecOperatorException(err_msg); \ } StopReason diff --git a/internal/core/src/index/HybridScalarIndex.cpp b/internal/core/src/index/HybridScalarIndex.cpp index 4c0ff5345e..7bbf996b20 100644 --- a/internal/core/src/index/HybridScalarIndex.cpp +++ b/internal/core/src/index/HybridScalarIndex.cpp @@ -29,9 +29,11 @@ namespace index { template HybridScalarIndex::HybridScalarIndex( + uint32_t tantivy_index_version, const storage::FileManagerContext& file_manager_context) : ScalarIndex(HYBRID_INDEX_TYPE), is_built_(false), + tantivy_index_version_(tantivy_index_version), bitmap_index_cardinality_limit_( DEFAULT_HYBRID_INDEX_BITMAP_CARDINALITY_LIMIT), file_manager_context_(file_manager_context) { @@ -191,8 +193,8 @@ HybridScalarIndex::GetInternalIndex() { internal_index_ = 
std::make_shared>(file_manager_context_); } else if (internal_index_type_ == ScalarIndexType::INVERTED) { - internal_index_ = - std::make_shared>(file_manager_context_); + internal_index_ = std::make_shared>( + tantivy_index_version_, file_manager_context_); } else { PanicInfo(UnexpectedError, "unknown index type when get internal index"); @@ -215,7 +217,7 @@ HybridScalarIndex::GetInternalIndex() { std::make_shared(file_manager_context_); } else if (internal_index_type_ == ScalarIndexType::INVERTED) { internal_index_ = std::make_shared>( - file_manager_context_); + tantivy_index_version_, file_manager_context_); } else { PanicInfo(UnexpectedError, "unknown index type when get internal index"); diff --git a/internal/core/src/index/HybridScalarIndex.h b/internal/core/src/index/HybridScalarIndex.h index 301f54fea0..3bcc349fea 100644 --- a/internal/core/src/index/HybridScalarIndex.h +++ b/internal/core/src/index/HybridScalarIndex.h @@ -42,6 +42,7 @@ template class HybridScalarIndex : public ScalarIndex { public: explicit HybridScalarIndex( + uint32_t tantivy_index_version, const storage::FileManagerContext& file_manager_context = storage::FileManagerContext()); @@ -193,6 +194,13 @@ class HybridScalarIndex : public ScalarIndex { std::shared_ptr> internal_index_{nullptr}; storage::FileManagerContext file_manager_context_; std::shared_ptr mem_file_manager_{nullptr}; + + // `tantivy_index_version_` is used to control which kind of tantivy index should be used. + // There could be the case where milvus version of read node is lower than the version of index builder node(and read node + // may not be upgraded to a higher version in a predictable time), so we are using a lower version of tantivy to read index + // built from a higher version of tantivy which is not supported. + // Therefore, we should provide a way to allow higher version of milvus to build tantivy index with low version. 
+ uint32_t tantivy_index_version_{0}; }; } // namespace index diff --git a/internal/core/src/index/IndexFactory.cpp b/internal/core/src/index/IndexFactory.cpp index b59758d50c..0e56d1c99f 100644 --- a/internal/core/src/index/IndexFactory.cpp +++ b/internal/core/src/index/IndexFactory.cpp @@ -46,8 +46,10 @@ IndexFactory::CreatePrimitiveScalarIndex( const storage::FileManagerContext& file_manager_context) { auto index_type = create_index_info.index_type; if (index_type == INVERTED_INDEX_TYPE) { + assert(create_index_info.tantivy_index_version != 0); // scalar_index_engine_version 0 means we should built tantivy index within single segment return std::make_unique>( + create_index_info.tantivy_index_version, file_manager_context, create_index_info.scalar_index_engine_version == 0); } @@ -55,7 +57,8 @@ IndexFactory::CreatePrimitiveScalarIndex( return std::make_unique>(file_manager_context); } if (index_type == HYBRID_INDEX_TYPE) { - return std::make_unique>(file_manager_context); + return std::make_unique>( + create_index_info.tantivy_index_version, file_manager_context); } return CreateScalarIndexSort(file_manager_context); } @@ -75,8 +78,10 @@ IndexFactory::CreatePrimitiveScalarIndex( auto index_type = create_index_info.index_type; #if defined(__linux__) || defined(__APPLE__) if (index_type == INVERTED_INDEX_TYPE) { + assert(create_index_info.tantivy_index_version != 0); // scalar_index_engine_version 0 means we should built tantivy index within single segment return std::make_unique>( + create_index_info.tantivy_index_version, file_manager_context, create_index_info.scalar_index_engine_version == 0); } @@ -85,7 +90,7 @@ IndexFactory::CreatePrimitiveScalarIndex( } if (index_type == HYBRID_INDEX_TYPE) { return std::make_unique>( - file_manager_context); + create_index_info.tantivy_index_version, file_manager_context); } return CreateStringIndexMarisa(file_manager_context); #else diff --git a/internal/core/src/index/IndexInfo.h b/internal/core/src/index/IndexInfo.h 
index a86258f8b5..4e44664a1d 100644 --- a/internal/core/src/index/IndexInfo.h +++ b/internal/core/src/index/IndexInfo.h @@ -27,7 +27,8 @@ struct CreateIndexInfo { IndexVersion index_engine_version; std::string field_name; int64_t dim; - int32_t scalar_index_engine_version; + int32_t scalar_index_engine_version{1}; + uint32_t tantivy_index_version{7}; JsonCastType json_cast_type; std::string json_path; }; diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp index de231eeb50..243e2d7347 100644 --- a/internal/core/src/index/InvertedIndexTantivy.cpp +++ b/internal/core/src/index/InvertedIndexTantivy.cpp @@ -55,15 +55,22 @@ InvertedIndexTantivy::InitForBuildIndex() { "build inverted index temp dir:{} not empty", path_); } - wrapper_ = std::make_shared( - field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_); + wrapper_ = + std::make_shared(field.c_str(), + d_type_, + path_.c_str(), + tantivy_index_version_, + inverted_index_single_segment_); } template InvertedIndexTantivy::InvertedIndexTantivy( - const storage::FileManagerContext& ctx, bool inverted_index_single_segment) + uint32_t tantivy_index_version, + const storage::FileManagerContext& ctx, + bool inverted_index_single_segment) : ScalarIndex(INVERTED_INDEX_TYPE), schema_(ctx.fieldDataMeta.field_schema), + tantivy_index_version_(tantivy_index_version), inverted_index_single_segment_(inverted_index_single_segment) { mem_file_manager_ = std::make_shared(ctx); disk_file_manager_ = std::make_shared(ctx); @@ -465,8 +472,16 @@ InvertedIndexTantivy::BuildWithRawDataForUT(size_t n, GetValueFromConfig(config, milvus::index::SCALAR_INDEX_ENGINE_VERSION) .value_or(1) == 0; - wrapper_ = std::make_shared( - field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_); + tantivy_index_version_ = + GetValueFromConfig(config, + milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + wrapper_ = + 
std::make_shared(field.c_str(), + d_type_, + path_.c_str(), + tantivy_index_version_, + inverted_index_single_segment_); if (!inverted_index_single_segment_) { if (config.find("is_array") != config.end()) { // only used in ut. diff --git a/internal/core/src/index/InvertedIndexTantivy.h b/internal/core/src/index/InvertedIndexTantivy.h index 99186dac1d..6bd08ea68d 100644 --- a/internal/core/src/index/InvertedIndexTantivy.h +++ b/internal/core/src/index/InvertedIndexTantivy.h @@ -69,7 +69,9 @@ class InvertedIndexTantivy : public ScalarIndex { InvertedIndexTantivy() : ScalarIndex(INVERTED_INDEX_TYPE) { } - explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx, + // Default, we build tantivy index with version 7 (newest version now). + explicit InvertedIndexTantivy(uint32_t tantivy_index_version, + const storage::FileManagerContext& ctx, bool inverted_index_single_segment = false); ~InvertedIndexTantivy(); @@ -254,5 +256,12 @@ class InvertedIndexTantivy : public ScalarIndex { // new version while the query node is a older version. So we have this `inverted_index_single_segment_` to control the index // building node to build specific type of tantivy index. bool inverted_index_single_segment_{false}; + + // `tantivy_index_version_` is used to control which kind of tantivy index should be used. + // There could be the case where milvus version of read node is lower than the version of index builder node(and read node + // may not be upgraded to a higher version in a predictable time), so we are using a lower version of tantivy to read index + // built from a higher version of tantivy which is not supported. + // Therefore, we should provide a way to allow higher version of milvus to build tantivy index with low version. 
+ uint32_t tantivy_index_version_{0}; }; } // namespace milvus::index diff --git a/internal/core/src/index/JsonInvertedIndex.h b/internal/core/src/index/JsonInvertedIndex.h index a76decedcd..bc4917352a 100644 --- a/internal/core/src/index/JsonInvertedIndex.h +++ b/internal/core/src/index/JsonInvertedIndex.h @@ -89,7 +89,10 @@ class JsonInvertedIndex : public index::InvertedIndexTantivy { std::string field_name = std::to_string( this->disk_file_manager_->GetFieldDataMeta().field_id); this->wrapper_ = std::make_shared( - field_name.c_str(), this->d_type_, this->path_.c_str()); + field_name.c_str(), + this->d_type_, + this->path_.c_str(), + TANTIVY_INDEX_LATEST_VERSION /* json index is not supported in old version */); } void diff --git a/internal/core/src/index/Meta.h b/internal/core/src/index/Meta.h index 0887cc96e9..a46fe0d752 100644 --- a/internal/core/src/index/Meta.h +++ b/internal/core/src/index/Meta.h @@ -48,6 +48,8 @@ constexpr const char* BITMAP_INDEX_TYPE = "BITMAP"; constexpr const char* HYBRID_INDEX_TYPE = "HYBRID"; constexpr const char* SCALAR_INDEX_ENGINE_VERSION = "scalar_index_engine_version"; +constexpr const char* TANTIVY_INDEX_VERSION = "tantivy_index_version"; +constexpr uint32_t TANTIVY_INDEX_LATEST_VERSION = 7; // index meta constexpr const char* COLLECTION_ID = "collection_id"; diff --git a/internal/core/src/index/TextMatchIndex.cpp b/internal/core/src/index/TextMatchIndex.cpp index fd22099e2e..93529f0ace 100644 --- a/internal/core/src/index/TextMatchIndex.cpp +++ b/internal/core/src/index/TextMatchIndex.cpp @@ -28,11 +28,18 @@ TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms, last_commit_time_(stdclock::now()) { d_type_ = TantivyDataType::Text; wrapper_ = std::make_shared( - unique_id, true, "", tokenizer_name, analyzer_params); + unique_id, + true, + "", + TANTIVY_INDEX_LATEST_VERSION /* Growing segment has no reason to use old version index*/ + , + tokenizer_name, + analyzer_params); } TextMatchIndex::TextMatchIndex(const 
std::string& path, const char* unique_id, + uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params) : commit_interval_in_ms_(std::numeric_limits::max()), @@ -42,11 +49,16 @@ TextMatchIndex::TextMatchIndex(const std::string& path, boost::filesystem::path sub_path = unique_id; path_ = (prefix / sub_path).string(); boost::filesystem::create_directories(path_); - wrapper_ = std::make_shared( - unique_id, false, path_.c_str(), tokenizer_name, analyzer_params); + wrapper_ = std::make_shared(unique_id, + false, + path_.c_str(), + tantivy_index_version, + tokenizer_name, + analyzer_params); } TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx, + uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params) : commit_interval_in_ms_(std::numeric_limits::max()), @@ -65,6 +77,7 @@ TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx, wrapper_ = std::make_shared(field_name.c_str(), false, path_.c_str(), + tantivy_index_version, tokenizer_name, analyzer_params); } diff --git a/internal/core/src/index/TextMatchIndex.h b/internal/core/src/index/TextMatchIndex.h index edeee8b0be..a2d35deb36 100644 --- a/internal/core/src/index/TextMatchIndex.h +++ b/internal/core/src/index/TextMatchIndex.h @@ -30,10 +30,12 @@ class TextMatchIndex : public InvertedIndexTantivy { // for sealed segment. explicit TextMatchIndex(const std::string& path, const char* unique_id, + uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params); // for building index. 
explicit TextMatchIndex(const storage::FileManagerContext& ctx, + uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params); // for loading index diff --git a/internal/core/src/indexbuilder/ScalarIndexCreator.cpp b/internal/core/src/indexbuilder/ScalarIndexCreator.cpp index c06a16a8a4..7419115015 100644 --- a/internal/core/src/indexbuilder/ScalarIndexCreator.cpp +++ b/internal/core/src/indexbuilder/ScalarIndexCreator.cpp @@ -41,6 +41,11 @@ ScalarIndexCreator::ScalarIndexCreator( config, milvus::index::SCALAR_INDEX_ENGINE_VERSION) .value_or(1); + index_info.tantivy_index_version = + milvus::index::GetValueFromConfig( + config, milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + index_info.field_type = dtype_; index_info.index_type = index_type(); if (dtype == DataType::JSON) { diff --git a/internal/core/src/indexbuilder/index_c.cpp b/internal/core/src/indexbuilder/index_c.cpp index 80cc08bc2f..11bd458537 100644 --- a/internal/core/src/indexbuilder/index_c.cpp +++ b/internal/core/src/indexbuilder/index_c.cpp @@ -267,10 +267,16 @@ BuildTextIndex(ProtoLayoutInterface result, milvus::storage::FileManagerContext fileManagerContext( field_meta, index_meta, chunk_manager); + uint32_t tantivy_index_version = + milvus::index::GetValueFromConfig( + config, milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + auto field_schema = FieldMeta::ParseFrom(build_index_info->field_schema()); auto index = std::make_unique( fileManagerContext, + tantivy_index_version, "milvus_tokenizer", field_schema.get_analyzer_params().c_str()); index->Build(config); diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp index aa318dbbf4..f2e8d36ade 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp @@ -1476,6 +1476,8 @@ 
ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) { index = std::make_unique( cfg.GetMmapPath(), unique_id.c_str(), + // todo: make it configurable + index::TANTIVY_INDEX_LATEST_VERSION, "milvus_tokenizer", field_meta.get_analyzer_params().c_str()); } diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 238e91fb26..e6c76a3a1a 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -2079,6 +2079,8 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) { index = std::make_unique( cfg.GetMmapPath(), unique_id.c_str(), + // todo: make it configurable + index::TANTIVY_INDEX_LATEST_VERSION, "milvus_tokenizer", field_meta.get_analyzer_params().c_str()); } diff --git a/internal/core/src/segcore/load_index_c.cpp b/internal/core/src/segcore/load_index_c.cpp index b847c4d297..a26fa027c2 100644 --- a/internal/core/src/segcore/load_index_c.cpp +++ b/internal/core/src/segcore/load_index_c.cpp @@ -194,6 +194,21 @@ appendScalarIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) { index_info.field_type = milvus::DataType(field_type); index_info.index_type = index_params["index_type"]; + auto config = milvus::index::ParseConfigFromIndexParams( + load_index_info->index_params); + + // Config should have value for milvus::index::SCALAR_INDEX_ENGINE_VERSION for production calling chain. 
+ // Use value_or(1) for unit test without setting this value + index_info.scalar_index_engine_version = + milvus::index::GetValueFromConfig( + config, milvus::index::SCALAR_INDEX_ENGINE_VERSION) + .value_or(1); + + index_info.tantivy_index_version = + milvus::index::GetValueFromConfig( + config, milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + load_index_info->index = milvus::index::IndexFactory::GetInstance().CreateIndex( index_info, milvus::storage::FileManagerContext()); @@ -262,6 +277,21 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) { index_info.field_type = load_index_info->field_type; index_info.index_engine_version = engine_version; + auto config = milvus::index::ParseConfigFromIndexParams( + load_index_info->index_params); + + // Config should have value for milvus::index::SCALAR_INDEX_ENGINE_VERSION for production calling chain. + // Use value_or(1) for unit test without setting this value + index_info.scalar_index_engine_version = + milvus::index::GetValueFromConfig( + config, milvus::index::SCALAR_INDEX_ENGINE_VERSION) + .value_or(1); + + index_info.tantivy_index_version = + milvus::index::GetValueFromConfig( + config, milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + auto ctx = milvus::tracer::TraceContext{ c_trace.traceID, c_trace.spanID, c_trace.traceFlags}; auto span = milvus::tracer::StartSpan("SegCoreLoadIndex", &ctx); @@ -303,8 +333,6 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) { milvus::storage::RemoteChunkManagerSingleton::GetInstance() .GetRemoteChunkManager(); - auto config = milvus::index::ParseConfigFromIndexParams( - load_index_info->index_params); config[milvus::index::INDEX_FILES] = load_index_info->index_files; if (load_index_info->field_type == milvus::DataType::JSON) { diff --git a/internal/core/src/segcore/token_stream_c.cpp b/internal/core/src/segcore/token_stream_c.cpp index 
11530d2d24..0cf3794b06 100644 --- a/internal/core/src/segcore/token_stream_c.cpp +++ b/internal/core/src/segcore/token_stream_c.cpp @@ -30,11 +30,13 @@ token_stream_get_token(CTokenStream token_stream) { CToken token_stream_get_detailed_token(CTokenStream token_stream) { - auto token= static_cast(token_stream) - ->get_detailed_token(); - return CToken{ - token.token, token.start_offset, token.end_offset, token.position, token.position_length - }; + auto token = static_cast(token_stream) + ->get_detailed_token(); + return CToken{token.token, + token.start_offset, + token.end_offset, + token.position, + token.position_length}; } void diff --git a/internal/core/src/segcore/token_stream_c.h b/internal/core/src/segcore/token_stream_c.h index 9cad7f7161..c19ae87113 100644 --- a/internal/core/src/segcore/token_stream_c.h +++ b/internal/core/src/segcore/token_stream_c.h @@ -21,13 +21,13 @@ extern "C" { #endif typedef void* CTokenStream; -typedef struct CToken{ - const char *token; - int64_t start_offset; - int64_t end_offset; - int64_t position; - int64_t position_length; -}CToken; +typedef struct CToken { + const char* token; + int64_t start_offset; + int64_t end_offset; + int64_t position; + int64_t position_length; +} CToken; void free_token_stream(CTokenStream); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock index d48d88d627..d45e87f255 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock @@ -17,6 +17,18 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + 
"once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -129,7 +141,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi", ] @@ -152,9 +164,15 @@ dependencies = [ "miniz_oxide", "object", "rustc-demangle", - "windows-targets", + "windows-targets 0.52.6", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" @@ -182,6 +200,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" +[[package]] +name = "bitpacking" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7" +dependencies = [ + "crunchy", +] + [[package]] name = "bitpacking" version = "0.9.2" @@ -721,6 +748,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs4" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" +dependencies = [ + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "fs4" version = "0.8.4" @@ -849,7 +886,7 @@ dependencies = [ "cfg-if", "libc", "wasi 0.13.3+wasi-0.2.2", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -894,6 +931,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "hashbrown" @@ -927,6 
+968,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + [[package]] name = "htmlescape" version = "0.3.1" @@ -1225,6 +1272,18 @@ dependencies = [ "hashbrown 0.15.2", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -1237,6 +1296,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -1394,7 +1462,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror", + "thiserror 2.0.11", "yada", ] @@ -1468,6 +1536,15 @@ version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" +[[package]] +name = "lru" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "lru" version = "0.12.5" @@ -1483,6 +1560,16 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" +[[package]] +name = "measure_time" +version = "0.8.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc" +dependencies = [ + "instant", + "log", +] + [[package]] name = "measure_time" version = "0.9.0" @@ -1498,6 +1585,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +dependencies = [ + "libc", +] + [[package]] name = "memmap2" version = "0.9.5" @@ -1588,6 +1684,16 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 0.3.9", + "libc", +] + [[package]] name = "object" version = "0.36.7" @@ -1659,6 +1765,14 @@ version = "6.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1" +[[package]] +name = "ownedbytes" +version = "0.6.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "ownedbytes" version = "0.7.0" @@ -1856,7 +1970,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.5", ] [[package]] @@ -1867,9 +1981,15 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.5", ] +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.5" @@ -1882,7 +2002,7 @@ version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "encoding_rs", "futures-core", @@ -1951,6 +2071,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.1" @@ -2128,6 +2254,15 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +[[package]] +name = "sketches-ddsketch" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" +dependencies = [ + "serde", +] + [[package]] name = "sketches-ddsketch" version = "0.3.0" @@ -2277,6 +2412,61 @@ dependencies = [ "libc", ] +[[package]] +name = "tantivy" +version = "0.21.1" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "aho-corasick", + "arc-swap", + "async-channel", + "async-trait", + "base64 0.21.7", + "bitpacking 0.8.4", + "byteorder", + "census", + "crc32fast", + "crossbeam-channel", + "downcast-rs", + "fastdivide", + "fs4 0.6.6", + "htmlescape", + "itertools 0.11.0", + "lazy_static", + "levenshtein_automata", + "log", + "lru 0.11.1", + "lz4_flex", + "measure_time 0.8.3", + "memmap2 0.7.1", + "murmurhash32", + "num_cpus", + "once_cell", + "oneshot", + "rayon", + 
"regex", + "rust-stemmers", + "rustc-hash 1.1.0", + "serde", + "serde_json", + "sketches-ddsketch 0.2.2", + "smallvec", + "tantivy-bitpacker 0.5.0", + "tantivy-columnar 0.2.0", + "tantivy-common 0.6.0", + "tantivy-fst 0.4.0", + "tantivy-query-grammar 0.21.0", + "tantivy-stacker 0.2.0", + "tantivy-tokenizer-api 0.2.0", + "tempfile", + "thiserror 1.0.69", + "time", + "tokio", + "uuid", + "winapi", + "zstd-sys", +] + [[package]] name = "tantivy" version = "0.23.0" @@ -2285,8 +2475,8 @@ dependencies = [ "aho-corasick", "arc-swap", "async-channel", - "base64", - "bitpacking", + "base64 0.22.1", + "bitpacking 0.9.2", "bon", "byteorder", "census", @@ -2295,36 +2485,36 @@ dependencies = [ "downcast-rs", "fastdivide", "fnv", - "fs4", + "fs4 0.8.4", "htmlescape", "hyperloglogplus", - "itertools", + "itertools 0.14.0", "lazy_static", "levenshtein_automata", "log", - "lru", + "lru 0.12.5", "lz4_flex", - "measure_time", - "memmap2", + "measure_time 0.9.0", + "memmap2 0.9.5", "once_cell", "oneshot", "rayon", "regex", "rust-stemmers", - "rustc-hash", + "rustc-hash 2.1.1", "serde", "serde_json", - "sketches-ddsketch", + "sketches-ddsketch 0.3.0", "smallvec", - "tantivy-bitpacker", - "tantivy-columnar", - "tantivy-common", - "tantivy-fst", - "tantivy-query-grammar", - "tantivy-stacker", - "tantivy-tokenizer-api", + "tantivy-bitpacker 0.6.0", + "tantivy-columnar 0.3.0", + "tantivy-common 0.7.0", + "tantivy-fst 0.5.0", + "tantivy-query-grammar 0.22.0", + "tantivy-stacker 0.3.0", + "tantivy-tokenizer-api 0.3.0", "tempfile", - "thiserror", + "thiserror 2.0.11", "time", "tokio", "uuid", @@ -2348,17 +2538,41 @@ dependencies = [ "regex", "scopeguard", "serde_json", - "tantivy", + "tantivy 0.21.1", + "tantivy 0.23.0", "tempfile", "zstd-sys", ] +[[package]] +name = "tantivy-bitpacker" +version = "0.5.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "bitpacking 0.8.4", +] + [[package]] name = 
"tantivy-bitpacker" version = "0.6.0" source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ - "bitpacking", + "bitpacking 0.9.2", +] + +[[package]] +name = "tantivy-columnar" +version = "0.2.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "fastdivide", + "fnv", + "itertools 0.11.0", + "serde", + "tantivy-bitpacker 0.5.0", + "tantivy-common 0.6.0", + "tantivy-sstable 0.2.0", + "tantivy-stacker 0.2.0", ] [[package]] @@ -2368,12 +2582,24 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a dependencies = [ "downcast-rs", "fastdivide", - "itertools", + "itertools 0.14.0", "serde", - "tantivy-bitpacker", - "tantivy-common", - "tantivy-sstable", - "tantivy-stacker", + "tantivy-bitpacker 0.6.0", + "tantivy-common 0.7.0", + "tantivy-sstable 0.3.0", + "tantivy-stacker 0.3.0", +] + +[[package]] +name = "tantivy-common" +version = "0.6.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "async-trait", + "byteorder", + "ownedbytes 0.6.0", + "serde", + "time", ] [[package]] @@ -2383,12 +2609,23 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a dependencies = [ "async-trait", "byteorder", - "ownedbytes", + "ownedbytes 0.7.0", "serde", "time", "tokio", ] +[[package]] +name = "tantivy-fst" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc3c506b1a8443a3a65352df6382a1fb6a7afe1a02e871cee0d25e2c3d5f3944" +dependencies = [ + "byteorder", + "regex-syntax 0.6.29", + "utf8-ranges", +] + [[package]] name = "tantivy-fst" version = "0.5.0" @@ -2396,10 +2633,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" dependencies = [ "byteorder", - 
"regex-syntax", + "regex-syntax 0.8.5", "utf8-ranges", ] +[[package]] +name = "tantivy-query-grammar" +version = "0.21.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "nom", +] + [[package]] name = "tantivy-query-grammar" version = "0.22.0" @@ -2408,17 +2653,36 @@ dependencies = [ "nom", ] +[[package]] +name = "tantivy-sstable" +version = "0.2.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "tantivy-common 0.6.0", + "tantivy-fst 0.4.0", + "zstd 0.12.4", +] + [[package]] name = "tantivy-sstable" version = "0.3.0" source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "futures-util", - "itertools", - "tantivy-bitpacker", - "tantivy-common", - "tantivy-fst", - "zstd", + "itertools 0.14.0", + "tantivy-bitpacker 0.6.0", + "tantivy-common 0.7.0", + "tantivy-fst 0.5.0", + "zstd 0.13.0", +] + +[[package]] +name = "tantivy-stacker" +version = "0.2.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "murmurhash32", + "tantivy-common 0.6.0", ] [[package]] @@ -2428,7 +2692,15 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a dependencies = [ "murmurhash32", "rand_distr", - "tantivy-common", + "tantivy-common 0.7.0", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.2.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "serde", ] [[package]] @@ -2479,13 +2751,33 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9" +[[package]] +name = "thiserror" +version = "1.0.69" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + [[package]] name = "thiserror" version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.11", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", ] [[package]] @@ -2766,6 +3058,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "want" version = "0.3.1" @@ -2910,7 +3208,7 @@ checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" dependencies = [ "windows-result", "windows-strings", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2919,7 +3217,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2929,7 +3227,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" dependencies = [ "windows-result", - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", ] [[package]] @@ -2938,7 +3245,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2947,7 +3254,22 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -2956,28 +3278,46 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -2990,24 +3330,48 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = 
"windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -3146,13 +3510,32 @@ dependencies = [ "syn 2.0.98", ] +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe 6.0.6", +] + [[package]] name = "zstd" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" dependencies = [ - "zstd-safe", + "zstd-safe 7.0.0", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", ] [[package]] diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml index 8d93e097e0..6d641de4f3 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml @@ -15,6 +15,7 @@ lindera-cc-cedict = ["lindera/cc-cedict"] [dependencies] tantivy = { git = "https://github.com/zilliztech/tantivy.git" } +tantivy-5 = { package = "tantivy", git = "https://github.com/milvus-io/tantivy.git", tag = 
"0.21.1-fix3" } lindera = "0.40.1" futures = "0.3.21" libc = "0.2" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h index 30daf793fb..1e9d588cad 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h +++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h @@ -178,12 +178,14 @@ RustResult tantivy_register_tokenizer(void *ptr, RustResult tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path, + uint32_t tantivy_index_version, uintptr_t num_threads, uintptr_t overall_memory_budget_in_bytes); RustResult tantivy_create_index_with_single_segment(const char *field_name, TantivyDataType data_type, - const char *path); + const char *path, + uint32_t tantivy_index_version); void tantivy_free_index_writer(void *ptr); @@ -334,6 +336,7 @@ RustResult tantivy_index_add_array_keywords_by_single_segment_writer(void *ptr, RustResult tantivy_create_text_writer(const char *field_name, const char *path, + uint32_t tantivy_index_version, const char *tokenizer_name, const char *analyzer_params, uintptr_t num_threads, diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs index 6b25bb699d..9ab188ab40 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs @@ -6,6 +6,7 @@ lazy_static! 
{ static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new(); } +#[allow(dead_code)] #[derive(Clone)] pub enum JiebaMode { Exact, diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs index bee22114d3..2f7a26ff5e 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs @@ -1,15 +1,14 @@ - use core::result::Result::Err; +use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind}; use lindera::mode::Mode; use lindera::segmenter::Segmenter; use lindera::token::Token as LToken; -use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder}; -use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind}; -use tantivy::tokenizer::{Token, Tokenizer, TokenStream}; +use lindera::tokenizer::Tokenizer as LTokenizer; +use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; +use crate::error::{Result, TantivyBindingError}; use serde_json as json; -use crate::error::{Result,TantivyBindingError}; pub struct LinderaTokenStream<'a> { pub tokens: Vec>, @@ -52,7 +51,7 @@ impl LinderaTokenizer { pub fn from_json(params: &json::Map) -> Result { let kind = fetch_lindera_kind(params)?; let dictionary = load_dictionary_from_kind(kind); - if dictionary.is_err(){ + if dictionary.is_err() { return Err(TantivyBindingError::InvalidArgument(format!( "lindera tokenizer with invalid dict_kind" ))); @@ -87,9 +86,9 @@ trait DictionaryKindParser { fn into_dict_kind(self) -> Result; } -impl DictionaryKindParser for &str{ +impl DictionaryKindParser for &str { fn into_dict_kind(self) -> Result { - match self{ + match self { "ipadic" => Ok(DictionaryKind::IPADIC), "ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd), "unidic" => Ok(DictionaryKind::UniDic), @@ -98,21 +97,21 @@ impl 
DictionaryKindParser for &str{ other => Err(TantivyBindingError::InvalidArgument(format!( "unsupported lindera dict type: {}", other - ))) + ))), } } } -fn fetch_lindera_kind(params:&json::Map) -> Result{ - match params.get("dict_kind"){ +fn fetch_lindera_kind(params: &json::Map) -> Result { + match params.get("dict_kind") { Some(val) => { - if !val.is_string(){ + if !val.is_string() { return Err(TantivyBindingError::InvalidArgument(format!( "lindera tokenizer dict kind should be string" - ))) + ))); } val.as_str().unwrap().into_dict_kind() - }, + } _ => { return Err(TantivyBindingError::InvalidArgument(format!( "lindera tokenizer dict_kind must be set" @@ -128,29 +127,29 @@ mod tests { use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer; #[test] - fn test_lindera_tokenizer(){ + fn test_lindera_tokenizer() { let params = r#"{ "type": "lindera", "dict_kind": "ipadic" }"#; let json_param = json::from_str::>(¶ms); assert!(json_param.is_ok()); - + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); } #[test] #[cfg(feature = "lindera-cc-cedict")] - fn test_lindera_tokenizer_cc(){ + fn test_lindera_tokenizer_cc() { let params = r#"{ "type": "lindera", "dict_kind": "cc-cedict" }"#; let json_param = json::from_str::>(¶ms); assert!(json_param.is_ok()); - + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); } -} \ No newline at end of file +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs index 1644fbe4fa..f5f6aa1dfd 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs @@ -1,13 +1,10 @@ -use 
tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder}; -use lindera::segmenter::Segmenter; -use tantivy::tokenizer::*; -use lindera::mode::Mode; -use serde_json as json; use log::warn; +use serde_json as json; +use tantivy::tokenizer::*; +use tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder}; use crate::analyzer::tokenizers::{JiebaTokenizer, LinderaTokenizer}; -use crate::error::{Result,TantivyBindingError}; - +use crate::error::{Result, TantivyBindingError}; pub fn standard_builder() -> TextAnalyzerBuilder { TextAnalyzer::builder(SimpleTokenizer::default()).dynamic() @@ -21,11 +18,13 @@ pub fn jieba_builder() -> TextAnalyzerBuilder { TextAnalyzer::builder(JiebaTokenizer::new()).dynamic() } -pub fn lindera_builder(params: Option<&json::Map>) -> Result{ - if params.is_none(){ +pub fn lindera_builder( + params: Option<&json::Map>, +) -> Result { + if params.is_none() { return Err(TantivyBindingError::InvalidArgument(format!( "lindera tokenizer must be costum" - ))) + ))); } let tokenizer = LinderaTokenizer::from_json(params.unwrap())?; Ok(TextAnalyzer::builder(tokenizer).dynamic()) @@ -34,25 +33,25 @@ pub fn lindera_builder(params: Option<&json::Map>) -> Resul pub fn get_builder_with_tokenizer(params: &json::Value) -> Result { let name; let params_map; - if params.is_string(){ + if params.is_string() { name = params.as_str().unwrap(); params_map = None; - }else{ + } else { let m = params.as_object().unwrap(); - match m.get("type"){ + match m.get("type") { Some(val) => { - if !val.is_string(){ + if !val.is_string() { return Err(TantivyBindingError::InvalidArgument(format!( "tokenizer type should be string" - ))) + ))); } name = val.as_str().unwrap(); - }, + } _ => { return Err(TantivyBindingError::InvalidArgument(format!( "costum tokenizer must set type" ))) - }, + } } params_map = Some(m); } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs index f0e2553f77..cc83b35061 
100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs @@ -98,6 +98,7 @@ pub extern "C" fn free_rust_array_i64(array: RustArrayI64) { } } +#[allow(dead_code)] #[repr(C)] pub enum Value { None(()), @@ -192,11 +193,9 @@ pub extern "C" fn free_rust_error(error: *const c_char) { #[macro_export] macro_rules! cstr_to_str { ($cstr:expr) => { - unsafe { - match CStr::from_ptr($cstr).to_str() { - Ok(f) => f, - Err(e) => return RustResult::from_error(e.to_string()), - } + match unsafe { CStr::from_ptr($cstr).to_str() } { + Ok(f) => f, + Err(e) => return RustResult::from_error(e.to_string()), } }; } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs index 63285abce3..72b43a7565 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs @@ -1,3 +1,4 @@ +#[allow(dead_code)] #[repr(u8)] #[derive(Debug)] pub enum TantivyDataType { diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs index ed813a9e6d..b9192fa568 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs @@ -65,7 +65,7 @@ impl SegmentCollector for DocIdChildCollector { self.collect_block(&[doc]); } - fn harvest(mut self) -> Self::Fruit { + fn harvest(self) -> Self::Fruit { self.milvus_doc_ids } } @@ -117,7 +117,7 @@ impl SegmentCollector for DocIdChildCollector { self.collect_block(&[doc]); } - fn harvest(mut self) -> Self::Fruit { + fn harvest(self) -> Self::Fruit { self.milvus_doc_ids } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs index 
77c922f824..01b9e5ac6a 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs @@ -4,6 +4,7 @@ use core::{fmt, str}; pub enum TantivyBindingError { JsonError(serde_json::Error), TantivyError(tantivy::TantivyError), + TantivyErrorV5(tantivy_5::TantivyError), InvalidArgument(String), InternalError(String), } @@ -20,11 +21,18 @@ impl From for TantivyBindingError { } } +impl From for TantivyBindingError { + fn from(value: tantivy_5::TantivyError) -> Self { + TantivyBindingError::TantivyErrorV5(value) + } +} + impl fmt::Display for TantivyBindingError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { TantivyBindingError::JsonError(e) => write!(f, "JsonError: {}", e), TantivyBindingError::TantivyError(e) => write!(f, "TantivyError: {}", e), + TantivyBindingError::TantivyErrorV5(e) => write!(f, "TantivyErrorV5: {}", e), TantivyBindingError::InvalidArgument(e) => write!(f, "InvalidArgument: {}", e), TantivyBindingError::InternalError(e) => write!(f, "InternalError: {}", e), } @@ -36,6 +44,7 @@ impl std::error::Error for TantivyBindingError { match self { TantivyBindingError::JsonError(e) => Some(e), TantivyBindingError::TantivyError(e) => Some(e), + TantivyBindingError::TantivyErrorV5(e) => Some(e), TantivyBindingError::InvalidArgument(_) => None, TantivyBindingError::InternalError(_) => None, } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs index af86535a89..2497d52d0f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs @@ -12,6 +12,7 @@ use crate::vec_collector::VecCollector; use crate::error::{Result, TantivyBindingError}; +#[allow(dead_code)] pub(crate) struct IndexReaderWrapper { pub(crate) field_name: String, pub(crate) field: Field, diff --git 
a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs index 9e62186d74..93a961df92 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs @@ -65,25 +65,26 @@ mod tests { use tantivy::query::TermQuery; use tempfile::TempDir; - use crate::{analyzer::create_analyzer, index_writer::IndexWriterWrapper}; + use crate::{index_writer::IndexWriterWrapper, TantivyIndexVersion}; #[test] fn test_jeba() { let params = "{\"tokenizer\": \"jieba\"}".to_string(); - let tokenizer = create_analyzer(¶ms).unwrap(); let dir = TempDir::new().unwrap(); let mut writer = IndexWriterWrapper::create_text_writer( - "text".to_string(), - dir.path().to_str().unwrap().to_string(), - "jieba".to_string(), - tokenizer, + "text", + dir.path().to_str().unwrap(), + "jieba", + ¶ms, 1, 50_000_000, false, - ); + TantivyIndexVersion::default_version(), + ) + .unwrap(); - writer.add_string("网球和滑雪", 0).unwrap(); - writer.add_string("网球以及滑雪", 1).unwrap(); + writer.add("网球和滑雪", Some(0)).unwrap(); + writer.add("网球以及滑雪", Some(1)).unwrap(); writer.commit().unwrap(); @@ -100,20 +101,21 @@ mod tests { #[test] fn test_read() { - let tokenizer = create_analyzer("").unwrap(); let dir = TempDir::new().unwrap(); let mut writer = IndexWriterWrapper::create_text_writer( - "text".to_string(), - dir.path().to_str().unwrap().to_string(), - "default".to_string(), - tokenizer, + "text", + dir.path().to_str().unwrap(), + "default", + "", 1, 50_000_000, false, - ); + TantivyIndexVersion::default_version(), + ) + .unwrap(); for i in 0..10000 { - writer.add_string("hello world", i).unwrap(); + writer.add("hello world", Some(i)).unwrap(); } writer.commit().unwrap(); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs 
b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs index f5b80a3cc8..67d4736f0c 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs @@ -3,17 +3,15 @@ use std::ffi::CStr; use libc::{c_char, c_void}; use crate::{ - array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log, - analyzer::create_analyzer, + analyzer::create_analyzer, array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, + log::init_log, }; #[no_mangle] pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustResult { let real = ptr as *mut IndexReaderWrapper; - unsafe { - let query = cstr_to_str!(query); - (*real).match_query(query).into() - } + let query = cstr_to_str!(query); + unsafe { (*real).match_query(query).into() } } #[no_mangle] diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs index 5c00fc62a1..66adc211f0 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs @@ -1,389 +1,285 @@ -use std::ffi::CStr; -use std::sync::Arc; - -use either::Either; -use futures::executor::block_on; +use index_writer_v5::TantivyDocumentV5; +use index_writer_v7::TantivyDocumentV7; use libc::c_char; -use log::info; -use tantivy::schema::{ - Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED, -}; -use tantivy::{doc, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument}; use crate::data_type::TantivyDataType; -use crate::error::Result; +use crate::error::{Result, TantivyBindingError}; use crate::index_reader::IndexReaderWrapper; use crate::log::init_log; +use crate::{index_writer_v5, index_writer_v7, TantivyIndexVersion}; -pub(crate) struct IndexWriterWrapper { - 
pub(crate) field: Field, - pub(crate) index_writer: Either, - pub(crate) id_field: Option, - pub(crate) index: Arc, +pub trait TantivyValue { + fn add_to_document(&self, field: u32, document: &mut D); } -#[inline] -fn schema_builder_add_field( - schema_builder: &mut SchemaBuilder, - field_name: &str, - data_type: TantivyDataType, -) -> Field { - match data_type { - TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED), - TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED), - TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED), - TantivyDataType::Keyword => { - let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("raw") - .set_index_option(IndexRecordOption::Basic); - let text_options = TextOptions::default().set_indexing_options(text_field_indexing); - schema_builder.add_text_field(&field_name, text_options) - } - TantivyDataType::Text => { - panic!("text should be indexed with analyzer"); - } - } +pub enum IndexWriterWrapper { + V5(index_writer_v5::IndexWriterWrapperImpl), + V7(index_writer_v7::IndexWriterWrapperImpl), } impl IndexWriterWrapper { + // create a IndexWriterWrapper according to `tanviy_index_version`. + // version 7 is the latest version and is what we should use in most cases. + // We may also build with version 5 for compatibility for reader nodes with older versions. pub fn new( - field_name: String, + field_name: &str, data_type: TantivyDataType, path: String, num_threads: usize, overall_memory_budget_in_bytes: usize, + tanviy_index_version: TantivyIndexVersion, ) -> Result { init_log(); - info!( - "create index writer, field_name: {}, data_type: {:?}", - field_name, data_type - ); - let mut schema_builder = Schema::builder(); - let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type); - // We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field. 
- let id_field = schema_builder.add_i64_field("doc_id", FAST); - let schema = schema_builder.build(); - let index = Index::create_in_dir(path.clone(), schema)?; - let index_writer = - index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?; - Ok(IndexWriterWrapper { - field, - index_writer: Either::Left(index_writer), - id_field: Some(id_field), - index: Arc::new(index), - }) + match tanviy_index_version { + TantivyIndexVersion::V5 => { + let writer = index_writer_v5::IndexWriterWrapperImpl::new( + field_name, + data_type, + path, + num_threads, + overall_memory_budget_in_bytes, + )?; + Ok(IndexWriterWrapper::V5(writer)) + } + TantivyIndexVersion::V7 => { + let writer = index_writer_v7::IndexWriterWrapperImpl::new( + field_name, + data_type, + path, + num_threads, + overall_memory_budget_in_bytes, + )?; + Ok(IndexWriterWrapper::V7(writer)) + } + } } pub fn new_with_single_segment( - field_name: String, + field_name: &str, data_type: TantivyDataType, path: String, + tanviy_index_version: TantivyIndexVersion, ) -> Result { init_log(); - info!( - "create single segment index writer, field_name: {}, data_type: {:?}", - field_name, data_type - ); - let mut schema_builder = Schema::builder(); - let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type); - let schema = schema_builder.build(); - let index = Index::create_in_dir(path.clone(), schema)?; - let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?; - Ok(IndexWriterWrapper { - field, - index_writer: Either::Right(index_writer), - id_field: None, - index: Arc::new(index), - }) + match tanviy_index_version { + TantivyIndexVersion::V5 => { + let writer = index_writer_v5::IndexWriterWrapperImpl::new_with_single_segment( + field_name, data_type, path, + )?; + Ok(IndexWriterWrapper::V5(writer)) + } + TantivyIndexVersion::V7 => { + let writer = index_writer_v7::IndexWriterWrapperImpl::new_with_single_segment( + field_name, data_type, path, + )?; + 
Ok(IndexWriterWrapper::V7(writer)) + } + } } pub fn create_reader(&self) -> Result { - IndexReaderWrapper::from_index(self.index.clone()) - } - - fn index_writer_add_document(&self, document: TantivyDocument) -> Result<()> { - match self.index_writer { - Either::Left(ref writer) => { - let _ = writer.add_document(document)?; - } - Either::Right(_) => { - panic!("unexpected writer"); + match self { + IndexWriterWrapper::V5(_) => { + return Err(TantivyBindingError::InternalError( + "create reader with tantivy index version 5 + is not supported from tantivy with version 7" + .into(), + )); } + IndexWriterWrapper::V7(writer) => writer.create_reader(), } - Ok(()) } - fn single_segment_index_writer_add_document( - &mut self, - document: TantivyDocument, - ) -> Result<()> { - match self.index_writer { - Either::Left(_) => { - panic!("unexpected writer"); - } - Either::Right(ref mut single_segmnet_writer) => { - let _ = single_segmnet_writer.add_document(document)?; - } + pub fn add(&mut self, data: T, offset: Option) -> Result<()> + where + T: TantivyValue + TantivyValue, + { + match self { + IndexWriterWrapper::V5(writer) => writer.add(data, offset), + IndexWriterWrapper::V7(writer) => writer.add(data, offset), } - Ok(()) } - pub fn add_i8(&mut self, data: i8, offset: i64) -> Result<()> { - self.add_i64(data.into(), offset) - } - - pub fn add_i16(&mut self, data: i16, offset: i64) -> Result<()> { - self.add_i64(data.into(), offset) - } - - pub fn add_i32(&mut self, data: i32, offset: i64) -> Result<()> { - self.add_i64(data.into(), offset) - } - - pub fn add_i64(&mut self, data: i64, offset: i64) -> Result<()> { - self.index_writer_add_document(doc!( - self.field => data, - self.id_field.unwrap() => offset, - )) - } - - pub fn add_f32(&mut self, data: f32, offset: i64) -> Result<()> { - self.add_f64(data.into(), offset) - } - - pub fn add_f64(&mut self, data: f64, offset: i64) -> Result<()> { - self.index_writer_add_document(doc!( - self.field => data, - 
self.id_field.unwrap() => offset, - )) - } - - pub fn add_bool(&mut self, data: bool, offset: i64) -> Result<()> { - self.index_writer_add_document(doc!( - self.field => data, - self.id_field.unwrap() => offset, - )) - } - - pub fn add_string(&mut self, data: &str, offset: i64) -> Result<()> { - self.index_writer_add_document(doc!( - self.field => data, - self.id_field.unwrap() => offset, - )) - } - - pub fn add_array_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); + pub fn add_array(&mut self, data: I, offset: Option) -> Result<()> + where + I: IntoIterator, + T: TantivyValue + TantivyValue, + { + match self { + IndexWriterWrapper::V5(writer) => writer.add_array(data, offset), + IndexWriterWrapper::V7(writer) => writer.add_array(data, offset), } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) } - pub fn add_array_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> { - 
let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as f64)); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for element in datas { - let data = unsafe { CStr::from_ptr(*element) }; - document.add_field_value(self.field, data.to_str()?); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_i8_by_single_segment_writer(&mut self, data: i8) -> Result<()> { - self.add_i64_by_single_segment_writer(data.into()) - } - - pub fn add_i16_by_single_segment_writer(&mut self, data: i16) -> Result<()> { - self.add_i64_by_single_segment_writer(data.into()) - } - - pub fn add_i32_by_single_segment_writer(&mut self, data: i32) -> Result<()> { - self.add_i64_by_single_segment_writer(data.into()) - } - - pub fn add_i64_by_single_segment_writer(&mut self, data: i64) -> Result<()> { - self.single_segment_index_writer_add_document(doc!( - self.field => data - )) - } - - pub fn add_f32_by_single_segment_writer(&mut self, data: f32) -> Result<()> { - self.add_f64_by_single_segment_writer(data.into()) - } - - pub fn add_f64_by_single_segment_writer(&mut self, data: f64) -> 
Result<()> { - self.single_segment_index_writer_add_document(doc!( - self.field => data - )) - } - - pub fn add_bool_by_single_segment_writer(&mut self, data: bool) -> Result<()> { - self.single_segment_index_writer_add_document(doc!( - self.field => data - )) - } - - pub fn add_string_by_single_segment_writer(&mut self, data: &str) -> Result<()> { - self.single_segment_index_writer_add_document(doc!( - self.field => data - )) - } - - pub fn add_array_i8s_by_single_segment_writer(&mut self, datas: &[i8]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_i16s_by_single_segment_writer(&mut self, datas: &[i16]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_i32s_by_single_segment_writer(&mut self, datas: &[i32]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_i64s_by_single_segment_writer(&mut self, datas: &[i64]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_f32s_by_single_segment_writer(&mut self, datas: &[f32]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as f64)); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_f64s_by_single_segment_writer(&mut self, datas: &[f64]) -> Result<()> { - let mut document = TantivyDocument::default(); 
- for data in datas { - document.add_field_value(self.field, data); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_bools_by_single_segment_writer(&mut self, datas: &[bool]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_keywords_by_single_segment_writer( + pub fn add_array_keywords( &mut self, datas: &[*const c_char], + offset: Option, ) -> Result<()> { - let mut document = TantivyDocument::default(); - for element in datas { - let data = unsafe { CStr::from_ptr(*element) }; - document.add_field_value(self.field, data.to_str()?); + match self { + IndexWriterWrapper::V5(writer) => writer.add_array_keywords(datas, offset), + IndexWriterWrapper::V7(writer) => writer.add_array_keywords(datas, offset), } - self.single_segment_index_writer_add_document(document) } - fn manual_merge(&mut self) -> Result<()> { - let index_writer = self.index_writer.as_mut().left().unwrap(); - let metas = index_writer.index().searchable_segment_metas()?; - let policy = index_writer.get_merge_policy(); - let candidates = policy.compute_merge_candidates(metas.as_slice()); - for candidate in candidates { - index_writer.merge(candidate.0.as_slice()).wait()?; + #[allow(dead_code)] + pub fn manual_merge(&mut self) -> Result<()> { + match self { + IndexWriterWrapper::V5(writer) => writer.manual_merge(), + IndexWriterWrapper::V7(writer) => writer.manual_merge(), } - Ok(()) } + #[allow(dead_code)] + pub fn commit(&mut self) -> Result<()> { + match self { + IndexWriterWrapper::V5(writer) => writer.commit(), + IndexWriterWrapper::V7(writer) => writer.commit(), + } + } + + #[allow(dead_code)] pub fn finish(self) -> Result<()> { - match self.index_writer { - Either::Left(mut index_writer) => { - index_writer.commit()?; - // self.manual_merge(); - block_on(index_writer.garbage_collect_files())?; 
- index_writer.wait_merging_threads()?; - } - Either::Right(single_segment_index_writer) => { - single_segment_index_writer - .finalize() - .expect("failed to build inverted index"); - } + match self { + IndexWriterWrapper::V5(writer) => writer.finish(), + IndexWriterWrapper::V7(writer) => writer.finish(), } - Ok(()) - } - - pub(crate) fn commit(&mut self) -> Result<()> { - self.index_writer.as_mut().left().unwrap().commit()?; - Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::ops::Bound; + + use tempfile::TempDir; + + use crate::{data_type::TantivyDataType, TantivyIndexVersion}; + + use super::IndexWriterWrapper; + + #[test] + fn test_build_index_version5() { + let field_name = "number"; + let data_type = TantivyDataType::I64; + let dir = TempDir::new().unwrap(); + + { + let mut index_wrapper = IndexWriterWrapper::new( + field_name, + data_type, + dir.path().to_str().unwrap().to_string(), + 1, + 50_000_000, + TantivyIndexVersion::V5, + ) + .unwrap(); + + for i in 0..10 { + index_wrapper.add::(i, Some(i as i64)).unwrap(); + } + index_wrapper.commit().unwrap(); + } + + use tantivy_5::{collector, query, Index, ReloadPolicy}; + let index = Index::open_in_dir(dir.path()).unwrap(); + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + .unwrap(); + let query = query::RangeQuery::new_i64_bounds( + field_name.to_string(), + Bound::Included(0), + Bound::Included(9), + ); + let res = reader + .searcher() + .search(&query, &tantivy_5::collector::TopDocs::with_limit(10)) + .unwrap(); + assert_eq!(res.len(), 10); + } + + #[test] + fn test_build_index_version5_single_segment() { + let field_name = "number"; + let data_type = TantivyDataType::I64; + let dir = TempDir::new().unwrap(); + + { + let mut index_wrapper = IndexWriterWrapper::new_with_single_segment( + field_name, + data_type, + dir.path().to_str().unwrap().to_string(), + TantivyIndexVersion::V5, + ) + .unwrap(); + + for i in 0..10 { + index_wrapper.add::(i, None).unwrap(); 
+ } + index_wrapper.finish().unwrap(); + } + + use tantivy_5::{collector, query, Index, ReloadPolicy}; + let index = Index::open_in_dir(dir.path()).unwrap(); + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + .unwrap(); + let query = query::RangeQuery::new_i64_bounds( + field_name.to_string(), + Bound::Included(0), + Bound::Included(9), + ); + let res = reader + .searcher() + .search(&query, &collector::TopDocs::with_limit(10)) + .unwrap(); + assert_eq!(res.len(), 10); + } + + #[test] + fn test_build_text_index_version5() { + let field_name = "text"; + let dir = TempDir::new().unwrap(); + + { + let mut index_wrapper = IndexWriterWrapper::create_text_writer( + field_name, + dir.path().to_str().unwrap(), + "default", + "", + 1, + 50_000_000, + false, + TantivyIndexVersion::V5, + ) + .unwrap(); + + for i in 0..10 { + index_wrapper.add("hello", Some(i as i64)).unwrap(); + } + index_wrapper.commit().unwrap(); + } + + use tantivy_5::{collector, query, schema, Index, ReloadPolicy, Term}; + let index = Index::open_in_dir(dir.path()).unwrap(); + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + .unwrap(); + let text = index.schema().get_field("text").unwrap(); + let query = query::TermQuery::new( + Term::from_field_text(text, "hello"), + schema::IndexRecordOption::Basic, + ); + let res = reader + .searcher() + .search(&query, &collector::TopDocs::with_limit(10)) + .unwrap(); + assert_eq!(res.len(), 10); } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs index 10171ecb92..4747398f59 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs @@ -8,6 +8,7 @@ use crate::{ error::Result, index_writer::IndexWriterWrapper, util::{create_binding, free_binding}, + TantivyIndexVersion, }; 
macro_rules! convert_to_rust_slice { @@ -25,17 +26,25 @@ pub extern "C" fn tantivy_create_index( field_name: *const c_char, data_type: TantivyDataType, path: *const c_char, + tantivy_index_version: u32, num_threads: usize, overall_memory_budget_in_bytes: usize, ) -> RustResult { let field_name_str = cstr_to_str!(field_name); let path_str = cstr_to_str!(path); + + let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) { + Ok(v) => v, + Err(e) => return RustResult::from_error(e.to_string()), + }; + match IndexWriterWrapper::new( - String::from(field_name_str), + field_name_str, data_type, String::from(path_str), num_threads, overall_memory_budget_in_bytes, + tantivy_index_version, ) { Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)), Err(e) => RustResult::from_error(e.to_string()), @@ -47,13 +56,21 @@ pub extern "C" fn tantivy_create_index_with_single_segment( field_name: *const c_char, data_type: TantivyDataType, path: *const c_char, + tantivy_index_version: u32, ) -> RustResult { let field_name_str = cstr_to_str!(field_name); let path_str = cstr_to_str!(path); + + let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) { + Ok(v) => v, + Err(e) => return RustResult::from_error(e.to_string()), + }; + match IndexWriterWrapper::new_with_single_segment( - String::from(field_name_str), + field_name_str, data_type, String::from(path_str), + tantivy_index_version, ) { Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)), Err(e) => RustResult::from_error(e.to_string()), @@ -90,25 +107,31 @@ pub extern "C" fn tantivy_create_reader_from_writer(ptr: *mut c_void) -> RustRes } // -------------------------build-------------------- -fn execute( - arr: &[T], +fn execute( + arr: I, offset: i64, - e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>, + e: fn(&mut IndexWriterWrapper, T, Option) -> Result<()>, w: &mut IndexWriterWrapper, -) -> Result<()> { - for (index, data) in arr.iter().enumerate() { 
- e(w, *data, offset + (index as i64))?; +) -> Result<()> +where + I: IntoIterator, +{ + for (index, data) in arr.into_iter().enumerate() { + e(w, data, Some(offset + (index as i64)))?; } Ok(()) } -fn execute_by_single_segment_writer( - arr: &[T], - e: fn(&mut IndexWriterWrapper, T) -> Result<()>, +fn execute_by_single_segment_writer( + arr: I, + e: fn(&mut IndexWriterWrapper, T, Option) -> Result<()>, w: &mut IndexWriterWrapper, -) -> Result<()> { - for (_, data) in arr.iter().enumerate() { - e(w, *data)?; +) -> Result<()> +where + I: IntoIterator, +{ + for data in arr.into_iter() { + e(w, data, None)?; } Ok(()) } @@ -122,7 +145,15 @@ pub extern "C" fn tantivy_index_add_int8s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() } + unsafe { + execute( + arr.into_iter().map(|num| *num as i64), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -135,8 +166,8 @@ pub extern "C" fn tantivy_index_add_int8s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_i8_by_single_segment_writer, + arr.into_iter().map(|num| *num as i64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -152,7 +183,15 @@ pub extern "C" fn tantivy_index_add_int16s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() } + unsafe { + execute( + arr.into_iter().map(|num| *num as i64), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -165,8 +204,8 @@ pub extern "C" fn tantivy_index_add_int16s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { 
execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_i16_by_single_segment_writer, + arr.into_iter().map(|num| *num as i64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -182,7 +221,15 @@ pub extern "C" fn tantivy_index_add_int32s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() } + unsafe { + execute( + arr.into_iter().map(|num| *num as i64), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -195,8 +242,8 @@ pub extern "C" fn tantivy_index_add_int32s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_i32_by_single_segment_writer, + arr.into_iter().map(|num| *num as i64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -213,7 +260,15 @@ pub extern "C" fn tantivy_index_add_int64s( let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() } + unsafe { + execute( + arr.iter().copied(), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -227,8 +282,8 @@ pub extern "C" fn tantivy_index_add_int64s_by_single_segment_writer( unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_i64_by_single_segment_writer, + arr.iter().copied(), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -244,7 +299,15 @@ pub extern "C" fn tantivy_index_add_f32s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() } + unsafe { + execute( + arr.into_iter().map(|num| *num as f64), + offset_begin, + 
IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -257,8 +320,8 @@ pub extern "C" fn tantivy_index_add_f32s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_f32_by_single_segment_writer, + arr.into_iter().map(|num| *num as f64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -274,7 +337,15 @@ pub extern "C" fn tantivy_index_add_f64s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() } + unsafe { + execute( + arr.iter().copied(), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -287,8 +358,8 @@ pub extern "C" fn tantivy_index_add_f64s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_f64_by_single_segment_writer, + arr.into_iter().map(|num| *num as f64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -306,9 +377,9 @@ pub extern "C" fn tantivy_index_add_bools( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute( - arr, + arr.iter().copied(), offset_begin, - IndexWriterWrapper::add_bool, + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -325,8 +396,8 @@ pub extern "C" fn tantivy_index_add_bools_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_bool_by_single_segment_writer, + arr.iter().copied(), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -343,7 +414,7 @@ pub extern "C" fn tantivy_index_add_string( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let s = cstr_to_str!(s); - unsafe { (*real).add_string(s, offset).into() } + unsafe { (*real).add::<&str>(s, 
Some(offset)).into() } } #[no_mangle] @@ -353,7 +424,7 @@ pub extern "C" fn tantivy_index_add_string_by_single_segment_writer( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let s = cstr_to_str!(s); - unsafe { (*real).add_string_by_single_segment_writer(s).into() } + unsafe { (*real).add::<&str>(s, None).into() } } // --------------------------------------------- array ------------------------------------------ @@ -368,7 +439,9 @@ pub extern "C" fn tantivy_index_add_array_int8s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i8s(arr, offset).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), Some(offset)) + .into() } } @@ -381,7 +454,9 @@ pub extern "C" fn tantivy_index_add_array_int8s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i8s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), None) + .into() } } @@ -395,7 +470,9 @@ pub extern "C" fn tantivy_index_add_array_int16s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i16s(arr, offset).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), Some(offset)) + .into() } } @@ -408,7 +485,9 @@ pub extern "C" fn tantivy_index_add_array_int16s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i16s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), None) + .into() } } @@ -422,7 +501,9 @@ pub extern "C" fn tantivy_index_add_array_int32s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i32s(arr, offset).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), 
Some(offset)) + .into() } } @@ -435,7 +516,9 @@ pub extern "C" fn tantivy_index_add_array_int32s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i32s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), None) + .into() } } @@ -449,7 +532,9 @@ pub extern "C" fn tantivy_index_add_array_int64s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i64s(arr, offset).into() + (*real) + .add_array::(arr.iter().copied(), Some(offset)) + .into() } } @@ -462,7 +547,9 @@ pub extern "C" fn tantivy_index_add_array_int64s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i64s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.iter().copied(), None) + .into() } } @@ -476,7 +563,9 @@ pub extern "C" fn tantivy_index_add_array_f32s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_f32s(arr, offset).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as f64), Some(offset)) + .into() } } @@ -489,7 +578,9 @@ pub extern "C" fn tantivy_index_add_array_f32s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_f32s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as f64), None) + .into() } } @@ -503,7 +594,9 @@ pub extern "C" fn tantivy_index_add_array_f64s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_f64s(arr, offset).into() + (*real) + .add_array::(arr.iter().copied(), Some(offset)) + .into() } } @@ -516,7 +609,9 @@ pub extern "C" fn tantivy_index_add_array_f64s_by_single_segment_writer( let 
real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_f64s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.iter().copied(), None) + .into() } } @@ -530,7 +625,9 @@ pub extern "C" fn tantivy_index_add_array_bools( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_bools(arr, offset).into() + (*real) + .add_array::(arr.iter().copied(), Some(offset)) + .into() } } @@ -543,7 +640,9 @@ pub extern "C" fn tantivy_index_add_array_bools_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_bools_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.iter().copied(), None) + .into() } } @@ -557,7 +656,7 @@ pub extern "C" fn tantivy_index_add_array_keywords( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_keywords(arr, offset).into() + (*real).add_array_keywords(arr, Some(offset)).into() } } @@ -570,8 +669,6 @@ pub extern "C" fn tantivy_index_add_array_keywords_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real) - .add_array_keywords_by_single_segment_writer(arr) - .into() + (*real).add_array_keywords(arr, None).into() } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text.rs index abddfc707e..2532f1bbbd 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text.rs @@ -1,53 +1,44 @@ -use std::sync::Arc; - -use either::Either; -use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST}; -use tantivy::tokenizer::TextAnalyzer; -use 
tantivy::Index; - -use crate::{index_writer::IndexWriterWrapper, log::init_log}; - -fn build_text_schema(field_name: &String, tokenizer_name: &String) -> (Schema, Field, Field) { - let mut schema_builder = Schema::builder(); - // positions is required for matching phase. - let indexing = TextFieldIndexing::default() - .set_tokenizer(&tokenizer_name) - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let option = TextOptions::default().set_indexing_options(indexing); - let field = schema_builder.add_text_field(&field_name, option); - let id_field = schema_builder.add_i64_field("doc_id", FAST); - (schema_builder.build(), field, id_field) -} +use crate::error::Result; +use crate::index_writer::IndexWriterWrapper; +use crate::{index_writer_v5, index_writer_v7, TantivyIndexVersion}; impl IndexWriterWrapper { + // create a text writer according to `tanviy_index_version`. + // version 7 is the latest version and is what we should use in most cases. + // We may also build with version 5 for compatibility for reader nodes with older versions. 
pub(crate) fn create_text_writer( - field_name: String, - path: String, - tokenizer_name: String, - tokenizer: TextAnalyzer, + field_name: &str, + path: &str, + tokenizer_name: &str, + tokenizer_params: &str, num_threads: usize, overall_memory_budget_in_bytes: usize, in_ram: bool, - ) -> IndexWriterWrapper { - init_log(); - - let (schema, field, id_field) = build_text_schema(&field_name, &tokenizer_name); - let index: Index; - if in_ram { - index = Index::create_in_ram(schema); - } else { - index = Index::create_in_dir(path.clone(), schema).unwrap(); - } - index.tokenizers().register(&tokenizer_name, tokenizer); - let index_writer = index - .writer_with_num_threads(num_threads, overall_memory_budget_in_bytes) - .unwrap(); - - IndexWriterWrapper { - field, - index_writer: Either::Left(index_writer), - id_field: Some(id_field), - index: Arc::new(index), + tanviy_index_version: TantivyIndexVersion, + ) -> Result { + match tanviy_index_version { + TantivyIndexVersion::V5 => Ok(IndexWriterWrapper::V5( + index_writer_v5::IndexWriterWrapperImpl::create_text_writer( + field_name, + path, + tokenizer_name, + tokenizer_params, + num_threads, + overall_memory_budget_in_bytes, + in_ram, + )?, + )), + TantivyIndexVersion::V7 => Ok(IndexWriterWrapper::V7( + index_writer_v7::IndexWriterWrapperImpl::create_text_writer( + field_name, + path, + tokenizer_name, + tokenizer_params, + num_threads, + overall_memory_budget_in_bytes, + in_ram, + )?, + )), } } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs index 44d4c7435d..ca781ce7c3 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs @@ -5,13 +5,14 @@ use crate::array::RustResult; use crate::cstr_to_str; use crate::index_writer::IndexWriterWrapper; use crate::log::init_log; -use 
crate::analyzer::create_analyzer; use crate::util::create_binding; +use crate::TantivyIndexVersion; #[no_mangle] pub extern "C" fn tantivy_create_text_writer( field_name: *const c_char, path: *const c_char, + tantivy_index_version: u32, tokenizer_name: *const c_char, analyzer_params: *const c_char, num_threads: usize, @@ -23,20 +24,23 @@ pub extern "C" fn tantivy_create_text_writer( let path_str = cstr_to_str!(path); let tokenizer_name_str = cstr_to_str!(tokenizer_name); let params = cstr_to_str!(analyzer_params); - let analyzer = create_analyzer(params); - match analyzer { - Ok(text_analyzer) => { - let wrapper = IndexWriterWrapper::create_text_writer( - String::from(field_name_str), - String::from(path_str), - String::from(tokenizer_name_str), - text_analyzer, - num_threads, - overall_memory_budget_in_bytes, - in_ram, - ); - RustResult::from_ptr(create_binding(wrapper)) - } + + let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) { + Ok(v) => v, + Err(e) => return RustResult::from_error(e.to_string()), + }; + + match IndexWriterWrapper::create_text_writer( + field_name_str, + path_str, + tokenizer_name_str, + params, + num_threads, + overall_memory_budget_in_bytes, + in_ram, + tantivy_index_version, + ) { + Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)), Err(err) => RustResult::from_error(format!( "create tokenizer failed with error: {} param: {}", err.to_string(), diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/analyzer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/analyzer.rs new file mode 100644 index 0000000000..91ee027d39 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/analyzer.rs @@ -0,0 +1,288 @@ +use serde_json as json; +use std::collections::HashMap; +use tantivy_5::tokenizer::*; + +use crate::error::{Result, TantivyBindingError}; + +use super::{ + 
build_in_analyzer::{chinese_analyzer, english_analyzer}, + filter::SystemFilter, + standard_analyzer, + tokenizers::get_builder_with_tokenizer, + util::{get_stop_words_list, get_string_list}, +}; + +struct AnalyzerBuilder<'a> { + filters: HashMap, + params: &'a json::Map, +} + +impl AnalyzerBuilder<'_> { + fn new(params: &json::Map) -> AnalyzerBuilder { + AnalyzerBuilder { + filters: HashMap::new(), + params: params, + } + } + + fn get_tokenizer_params(&self) -> Result<&json::Value> { + let tokenizer = self.params.get("tokenizer"); + if tokenizer.is_none() { + return Err(TantivyBindingError::InternalError(format!( + "tokenizer name or type must be set" + ))); + } + let value = tokenizer.unwrap(); + if value.is_object() || value.is_string() { + return Ok(tokenizer.unwrap()); + } + + Err(TantivyBindingError::InternalError(format!( + "tokenizer name should be string or dict" + ))) + } + + fn add_custom_filter( + &mut self, + name: &String, + params: &json::Map, + ) -> Result<()> { + match SystemFilter::try_from(params) { + Ok(filter) => { + self.filters.insert(name.to_string(), filter); + Ok(()) + } + Err(e) => Err(e), + } + } + + fn add_custom_filters(&mut self, params: &json::Map) -> Result<()> { + for (name, value) in params { + if !value.is_object() { + continue; + } + self.add_custom_filter(name, value.as_object().unwrap())?; + } + Ok(()) + } + + fn build_filter( + &mut self, + mut builder: TextAnalyzerBuilder, + params: &json::Value, + ) -> Result { + if !params.is_array() { + return Err(TantivyBindingError::InternalError( + "filter params should be array".to_string(), + )); + } + + let filters = params.as_array().unwrap(); + + for filter in filters { + if filter.is_string() { + let filter_name = filter.as_str().unwrap(); + let costum = self.filters.remove(filter_name); + if !costum.is_none() { + builder = costum.unwrap().transform(builder); + continue; + } + + // check if filter was system filter + let system = SystemFilter::from(filter_name); + match system { 
+ SystemFilter::Invalid => { + return Err(TantivyBindingError::InternalError(format!( + "build analyzer failed, filter not found :{}", + filter_name + ))) + } + other => { + builder = other.transform(builder); + } + } + } else if filter.is_object() { + let filter = SystemFilter::try_from(filter.as_object().unwrap())?; + builder = filter.transform(builder); + } + } + Ok(builder) + } + + fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result { + for (key, value) in self.params { + match key.as_str() { + "tokenizer" => {} + "filter" => { + // build with filter if filter param exist + builder = self.build_filter(builder, value)?; + } + other => { + return Err(TantivyBindingError::InternalError(format!( + "unknown analyzer option key: {}", + other + ))) + } + } + } + Ok(builder) + } + + fn get_stop_words_option(&self) -> Result> { + let value = self.params.get("stop_words"); + match value { + Some(value) => { + let str_list = get_string_list(value, "filter stop_words")?; + Ok(get_stop_words_list(str_list)) + } + _ => Ok(vec![]), + } + } + + fn build_template(self, type_: &str) -> Result { + match type_ { + "standard" => Ok(standard_analyzer(self.get_stop_words_option()?)), + "chinese" => Ok(chinese_analyzer(self.get_stop_words_option()?)), + "english" => Ok(english_analyzer(self.get_stop_words_option()?)), + other_ => Err(TantivyBindingError::InternalError(format!( + "unknown build-in analyzer type: {}", + other_ + ))), + } + } + + fn build(mut self) -> Result { + // build base build-in analyzer + match self.params.get("type") { + Some(type_) => { + if !type_.is_string() { + return Err(TantivyBindingError::InternalError(format!( + "analyzer type shoud be string" + ))); + } + return self.build_template(type_.as_str().unwrap()); + } + None => {} + }; + + //build custom analyzer + let tokenizer_params = self.get_tokenizer_params()?; + let mut builder = get_builder_with_tokenizer(&tokenizer_params)?; + + // build with option + builder = 
self.build_option(builder)?; + Ok(builder.build()) + } +} + +pub(crate) fn create_analyzer_with_filter(params: &String) -> Result { + match json::from_str::(¶ms) { + Ok(value) => { + if value.is_null() { + return Ok(standard_analyzer(vec![])); + } + if !value.is_object() { + return Err(TantivyBindingError::InternalError( + "tokenizer params should be a json map".to_string(), + )); + } + let json_params = value.as_object().unwrap(); + + // create builder + let analyzer_params = json_params.get("analyzer"); + if analyzer_params.is_none() { + return Ok(standard_analyzer(vec![])); + } + if !analyzer_params.unwrap().is_object() { + return Err(TantivyBindingError::InternalError( + "analyzer params should be a json map".to_string(), + )); + } + + let builder_params = analyzer_params.unwrap().as_object().unwrap(); + if builder_params.is_empty() { + return Ok(standard_analyzer(vec![])); + } + + let mut builder = AnalyzerBuilder::new(builder_params); + + // build custom filter + let filter_params = json_params.get("filter"); + if !filter_params.is_none() && filter_params.unwrap().is_object() { + builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?; + } + + // build analyzer + builder.build() + } + Err(err) => Err(err.into()), + } +} + +pub(crate) fn create_analyzer(params: &str) -> Result { + if params.len() == 0 { + return Ok(standard_analyzer(vec![])); + } + create_analyzer_with_filter(&format!("{{\"analyzer\":{}}}", params)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_standard_analyzer() { + let params = r#"{ + "type": "standard", + "stop_words": ["_english_"] + }"#; + + let tokenizer = create_analyzer(¶ms.to_string()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + } + + #[test] + fn test_chinese_analyzer() { + let params = r#"{ + "type": "chinese" + }"#; + + let tokenizer = create_analyzer(¶ms.to_string()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + let mut bining = 
tokenizer.unwrap(); + let mut stream = bining.token_stream("系统安全;,'';lxyz密码"); + + let mut results = Vec::::new(); + while stream.advance() { + let token = stream.token(); + results.push(token.text.clone()); + } + + print!("test tokens :{:?}\n", results) + } + + #[test] + fn test_lindera_analyzer() { + let params = r#"{ + "tokenizer": { + "type": "lindera", + "dict_kind": "ipadic" + } + }"#; + + let tokenizer = create_analyzer(¶ms.to_string()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + + let mut bining = tokenizer.unwrap(); + let mut stream = + bining.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です"); + + let mut results = Vec::::new(); + while stream.advance() { + let token = stream.token(); + results.push(token.text.clone()); + } + + print!("test tokens :{:?}\n", results) + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/build_in_analyzer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/build_in_analyzer.rs new file mode 100644 index 0000000000..515a1182ff --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/build_in_analyzer.rs @@ -0,0 +1,40 @@ +use tantivy_5::tokenizer::*; + +use super::filter::*; +use super::stop_words; +use super::tokenizers::*; + +// default build-in analyzer +pub(crate) fn standard_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = standard_builder().filter(LowerCaser); + + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} + +pub fn chinese_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = jieba_builder().filter(CnAlphaNumOnlyFilter); + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} + +pub fn english_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = standard_builder() + .filter(LowerCaser) + 
.filter(Stemmer::new(Language::English)) + .filter(StopWordFilter::remove( + stop_words::ENGLISH.iter().map(|&word| word.to_owned()), + )); + + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/filter.rs new file mode 100644 index 0000000000..b74bc29b6c --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/filter.rs @@ -0,0 +1,285 @@ +use serde_json as json; +use tantivy_5::tokenizer::*; + +use super::util::*; +use crate::error::{Result, TantivyBindingError}; + +pub(crate) enum SystemFilter { + Invalid, + LowerCase(LowerCaser), + AsciiFolding(AsciiFoldingFilter), + AlphaNumOnly(AlphaNumOnlyFilter), + CnCharOnly(CnCharOnlyFilter), + CnAlphaNumOnly(CnAlphaNumOnlyFilter), + Length(RemoveLongFilter), + Stop(StopWordFilter), + Decompounder(SplitCompoundWords), + Stemmer(Stemmer), +} + +impl SystemFilter { + pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder { + match self { + Self::LowerCase(filter) => builder.filter(filter).dynamic(), + Self::AsciiFolding(filter) => builder.filter(filter).dynamic(), + Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(), + Self::CnCharOnly(filter) => builder.filter(filter).dynamic(), + Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(), + Self::Length(filter) => builder.filter(filter).dynamic(), + Self::Stop(filter) => builder.filter(filter).dynamic(), + Self::Decompounder(filter) => builder.filter(filter).dynamic(), + Self::Stemmer(filter) => builder.filter(filter).dynamic(), + Self::Invalid => builder, + } + } +} + +// create length filter from params +// { +// "type": "length", +// "max": 10, // length +// } +// TODO support min length +fn get_length_filter(params: &json::Map) -> Result 
{ + let limit_str = params.get("max"); + if limit_str.is_none() || !limit_str.unwrap().is_u64() { + return Err(TantivyBindingError::InternalError( + "lenth max param was none or not uint".to_string(), + )); + } + let limit = limit_str.unwrap().as_u64().unwrap() as usize; + Ok(SystemFilter::Length(RemoveLongFilter::limit(limit + 1))) +} + +fn get_stop_words_filter(params: &json::Map) -> Result { + let value = params.get("stop_words"); + if value.is_none() { + return Err(TantivyBindingError::InternalError( + "stop filter stop_words can't be empty".to_string(), + )); + } + let str_list = get_string_list(value.unwrap(), "stop_words filter")?; + Ok(SystemFilter::Stop(StopWordFilter::remove( + get_stop_words_list(str_list), + ))) +} + +fn get_decompounder_filter(params: &json::Map) -> Result { + let value = params.get("word_list"); + if value.is_none() || !value.unwrap().is_array() { + return Err(TantivyBindingError::InternalError( + "decompounder word list should be array".to_string(), + )); + } + + let stop_words = value.unwrap().as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words { + match element.as_str() { + Some(word) => str_list.push(word.to_string()), + _ => { + return Err(TantivyBindingError::InternalError( + "decompounder word list item should be string".to_string(), + )) + } + } + } + + match SplitCompoundWords::from_dictionary(str_list) { + Ok(f) => Ok(SystemFilter::Decompounder(f)), + Err(e) => Err(TantivyBindingError::InternalError(format!( + "create decompounder failed: {}", + e.to_string() + ))), + } +} + +fn get_stemmer_filter(params: &json::Map) -> Result { + let value = params.get("language"); + if value.is_none() || !value.unwrap().is_string() { + return Err(TantivyBindingError::InternalError( + "stemmer language field should be string".to_string(), + )); + } + + match value.unwrap().as_str().unwrap().into_language() { + Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))), + Err(e) => 
Err(TantivyBindingError::InternalError(format!( + "create stemmer failed : {}", + e.to_string() + ))), + } +} + +trait LanguageParser { + fn into_language(self) -> Result; +} + +impl LanguageParser for &str { + fn into_language(self) -> Result { + match self.to_lowercase().as_str() { + "arabig" => Ok(Language::Arabic), + "danish" => Ok(Language::Danish), + "dutch" => Ok(Language::Dutch), + "english" => Ok(Language::English), + "finnish" => Ok(Language::Finnish), + "french" => Ok(Language::French), + "german" => Ok(Language::German), + "greek" => Ok(Language::Greek), + "hungarian" => Ok(Language::Hungarian), + "italian" => Ok(Language::Italian), + "norwegian" => Ok(Language::Norwegian), + "portuguese" => Ok(Language::Portuguese), + "romanian" => Ok(Language::Romanian), + "russian" => Ok(Language::Russian), + "spanish" => Ok(Language::Spanish), + "swedish" => Ok(Language::Swedish), + "tamil" => Ok(Language::Tamil), + "turkish" => Ok(Language::Turkish), + other => Err(TantivyBindingError::InternalError(format!( + "unsupport language: {}", + other + ))), + } + } +} + +impl From<&str> for SystemFilter { + fn from(value: &str) -> Self { + match value { + "lowercase" => Self::LowerCase(LowerCaser), + "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter), + "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter), + "cncharonly" => Self::CnCharOnly(CnCharOnlyFilter), + "cnalphanumonly" => Self::CnAlphaNumOnly(CnAlphaNumOnlyFilter), + _ => Self::Invalid, + } + } +} + +impl TryFrom<&json::Map> for SystemFilter { + type Error = TantivyBindingError; + + fn try_from(params: &json::Map) -> Result { + match params.get(&"type".to_string()) { + Some(value) => { + if !value.is_string() { + return Err(TantivyBindingError::InternalError( + "filter type should be string".to_string(), + )); + }; + + match value.as_str().unwrap() { + "length" => get_length_filter(params), + "stop" => get_stop_words_filter(params), + "decompounder" => get_decompounder_filter(params), + "stemmer" => 
get_stemmer_filter(params), + other => Err(TantivyBindingError::InternalError(format!( + "unsupport filter type: {}", + other + ))), + } + } + None => Err(TantivyBindingError::InternalError( + "no type field in filter params".to_string(), + )), + } + } +} + +pub struct CnCharOnlyFilter; + +pub struct CnCharOnlyFilterStream { + regex: regex::Regex, + tail: T, +} + +impl TokenFilter for CnCharOnlyFilter { + type Tokenizer = CnCharOnlyFilterWrapper; + + fn transform(self, tokenizer: T) -> CnCharOnlyFilterWrapper { + CnCharOnlyFilterWrapper(tokenizer) + } +} + +#[derive(Clone)] +pub struct CnCharOnlyFilterWrapper(T); + +impl Tokenizer for CnCharOnlyFilterWrapper { + type TokenStream<'a> = CnCharOnlyFilterStream>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + CnCharOnlyFilterStream { + regex: regex::Regex::new("\\p{Han}+").unwrap(), + tail: self.0.token_stream(text), + } + } +} + +impl TokenStream for CnCharOnlyFilterStream { + fn advance(&mut self) -> bool { + while self.tail.advance() { + if self.regex.is_match(&self.tail.token().text) { + return true; + } + } + + false + } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } +} + +pub struct CnAlphaNumOnlyFilter; + +pub struct CnAlphaNumOnlyFilterStream { + regex: regex::Regex, + tail: T, +} + +impl TokenFilter for CnAlphaNumOnlyFilter { + type Tokenizer = CnAlphaNumOnlyFilterWrapper; + + fn transform(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper { + CnAlphaNumOnlyFilterWrapper(tokenizer) + } +} +#[derive(Clone)] +pub struct CnAlphaNumOnlyFilterWrapper(T); + +impl Tokenizer for CnAlphaNumOnlyFilterWrapper { + type TokenStream<'a> = CnAlphaNumOnlyFilterStream>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + CnAlphaNumOnlyFilterStream { + regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(), + tail: self.0.token_stream(text), + } + } +} + +impl TokenStream for 
CnAlphaNumOnlyFilterStream { + fn advance(&mut self) -> bool { + while self.tail.advance() { + if self.regex.is_match(&self.tail.token().text) { + return true; + } + } + + false + } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/mod.rs new file mode 100644 index 0000000000..df2ad4d68c --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/mod.rs @@ -0,0 +1,11 @@ +//! This is totally copied from src/analyzer + +mod analyzer; +mod build_in_analyzer; +mod filter; +mod stop_words; +mod tokenizers; +mod util; + +pub(crate) use self::analyzer::create_analyzer; +pub(crate) use self::build_in_analyzer::standard_analyzer; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs new file mode 100644 index 0000000000..ae78b86f12 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs @@ -0,0 +1,5 @@ +pub const ENGLISH: &[&str] = &[ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", + "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", + "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", +]; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/jieba_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/jieba_tokenizer.rs new file mode 100644 index 0000000000..a4d8e24163 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/jieba_tokenizer.rs @@ -0,0 +1,83 
@@ +use jieba_rs; +use lazy_static::lazy_static; +use tantivy_5::tokenizer::{Token, TokenStream, Tokenizer}; + +lazy_static! { + static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new(); +} + +#[allow(dead_code)] +#[derive(Clone)] +pub enum JiebaMode { + Exact, + Search, +} + +#[derive(Clone)] +pub struct JiebaTokenizer { + mode: JiebaMode, + hmm: bool, +} + +pub struct JiebaTokenStream { + tokens: Vec, + index: usize, +} + +impl TokenStream for JiebaTokenStream { + fn advance(&mut self) -> bool { + if self.index < self.tokens.len() { + self.index += 1; + true + } else { + false + } + } + + fn token(&self) -> &Token { + &self.tokens[self.index - 1] + } + + fn token_mut(&mut self) -> &mut Token { + &mut self.tokens[self.index - 1] + } +} + +impl JiebaTokenizer { + pub fn new() -> JiebaTokenizer { + JiebaTokenizer { + mode: JiebaMode::Search, + hmm: true, + } + } + + fn tokenize(&self, text: &str) -> Vec { + let mut indices = text.char_indices().collect::>(); + indices.push((text.len(), '\0')); + let ori_tokens = match self.mode { + JiebaMode::Exact => JIEBA.tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm), + JiebaMode::Search => JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm), + }; + + let mut tokens = Vec::with_capacity(ori_tokens.len()); + for token in ori_tokens { + tokens.push(Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), + position_length: token.end - token.start, + }); + } + tokens + } +} + +impl Tokenizer for JiebaTokenizer { + type TokenStream<'a> = JiebaTokenStream; + + fn token_stream(&mut self, text: &str) -> JiebaTokenStream { + let tokens = self.tokenize(text); + JiebaTokenStream { tokens, index: 0 } + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/lindera_tokenizer.rs 
b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/lindera_tokenizer.rs new file mode 100644 index 0000000000..4b2b734766 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/lindera_tokenizer.rs @@ -0,0 +1,152 @@ +use core::result::Result::Err; + +use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind}; +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; +use lindera::token::Token as LToken; +use lindera::tokenizer::Tokenizer as LTokenizer; +use tantivy_5::tokenizer::{Token, TokenStream, Tokenizer}; + +use crate::error::{Result, TantivyBindingError}; +use serde_json as json; + +pub struct LinderaTokenStream<'a> { + pub tokens: Vec>, + pub token: &'a mut Token, +} + +impl<'a> TokenStream for LinderaTokenStream<'a> { + fn advance(&mut self) -> bool { + if self.tokens.is_empty() { + return false; + } + let token = self.tokens.remove(0); + self.token.text = token.text.to_string(); + self.token.offset_from = token.byte_start; + self.token.offset_to = token.byte_end; + self.token.position = token.position; + self.token.position_length = token.position_length; + + true + } + + fn token(&self) -> &Token { + self.token + } + + fn token_mut(&mut self) -> &mut Token { + self.token + } +} + +#[derive(Clone)] +pub struct LinderaTokenizer { + tokenizer: LTokenizer, + token: Token, +} + +impl LinderaTokenizer { + /// Create a new `LinderaTokenizer`. + /// This function will create a new `LinderaTokenizer` from the given JSON params, loading the Lindera dictionary named by the required `dict_kind` field.
+ pub fn from_json(params: &json::Map) -> Result { + let kind = fetch_lindera_kind(params)?; + let dictionary = load_dictionary_from_kind(kind); + if dictionary.is_err() { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer with invalid dict_kind" + ))); + } + let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None); + Ok(LinderaTokenizer::from_segmenter(segmenter)) + } + + /// Create a new `LinderaTokenizer`. + /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`. + pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer { + LinderaTokenizer { + tokenizer: LTokenizer::new(segmenter), + token: Default::default(), + } + } +} + +impl Tokenizer for LinderaTokenizer { + type TokenStream<'a> = LinderaTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> { + self.token.reset(); + LinderaTokenStream { + tokens: self.tokenizer.tokenize(text).unwrap(), + token: &mut self.token, + } + } +} + +trait DictionaryKindParser { + fn into_dict_kind(self) -> Result; +} + +impl DictionaryKindParser for &str { + fn into_dict_kind(self) -> Result { + match self { + "ipadic" => Ok(DictionaryKind::IPADIC), + "ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd), + "unidic" => Ok(DictionaryKind::UniDic), + "ko-dic" => Ok(DictionaryKind::KoDic), + "cc-cedict" => Ok(DictionaryKind::CcCedict), + other => Err(TantivyBindingError::InvalidArgument(format!( + "unsupported lindera dict type: {}", + other + ))), + } + } +} + +fn fetch_lindera_kind(params: &json::Map) -> Result { + match params.get("dict_kind") { + Some(val) => { + if !val.is_string() { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer dict kind should be string" + ))); + } + val.as_str().unwrap().into_dict_kind() + } + _ => { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer dict_kind must be set" + ))) + } + 
} +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_lindera_tokenizer() { + let params = r#"{ + "type": "lindera", + "dict_kind": "ipadic" + }"#; + let json_param = json::from_str::>(¶ms); + assert!(json_param.is_ok()); + + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + } + + #[test] + #[cfg(feature = "lindera-cc-cedict")] + fn test_lindera_tokenizer_cc() { + let params = r#"{ + "type": "lindera", + "dict_kind": "cc-cedict" + }"#; + let json_param = json::from_str::>(¶ms); + assert!(json_param.is_ok()); + + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/mod.rs new file mode 100644 index 0000000000..fd922d0b5a --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/mod.rs @@ -0,0 +1,5 @@ +mod jieba_tokenizer; +mod lindera_tokenizer; +mod tokenizer; + +pub(crate) use self::tokenizer::*; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/tokenizer.rs new file mode 100644 index 0000000000..5cc5d6808d --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/tokenizer.rs @@ -0,0 +1,74 @@ +use log::warn; +use serde_json as json; +use tantivy_5::tokenizer::*; +use tantivy_5::tokenizer::{TextAnalyzer, TextAnalyzerBuilder}; + +use crate::error::{Result, TantivyBindingError}; + +use super::jieba_tokenizer::JiebaTokenizer; +use super::lindera_tokenizer::LinderaTokenizer; + +pub fn standard_builder() -> TextAnalyzerBuilder { 
+ TextAnalyzer::builder(SimpleTokenizer::default()).dynamic() +} + +pub fn whitespace_builder() -> TextAnalyzerBuilder { + TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic() +} + +pub fn jieba_builder() -> TextAnalyzerBuilder { + TextAnalyzer::builder(JiebaTokenizer::new()).dynamic() +} + +pub fn lindera_builder( + params: Option<&json::Map>, +) -> Result { + if params.is_none() { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer must be costum" + ))); + } + let tokenizer = LinderaTokenizer::from_json(params.unwrap())?; + Ok(TextAnalyzer::builder(tokenizer).dynamic()) +} + +pub fn get_builder_with_tokenizer(params: &json::Value) -> Result { + let name; + let params_map; + if params.is_string() { + name = params.as_str().unwrap(); + params_map = None; + } else { + let m = params.as_object().unwrap(); + match m.get("type") { + Some(val) => { + if !val.is_string() { + return Err(TantivyBindingError::InvalidArgument(format!( + "tokenizer type should be string" + ))); + } + name = val.as_str().unwrap(); + } + _ => { + return Err(TantivyBindingError::InvalidArgument(format!( + "costum tokenizer must set type" + ))) + } + } + params_map = Some(m); + } + + match name { + "standard" => Ok(standard_builder()), + "whitespace" => Ok(whitespace_builder()), + "jieba" => Ok(jieba_builder()), + "lindera" => lindera_builder(params_map), + other => { + warn!("unsupported tokenizer: {}", other); + Err(TantivyBindingError::InvalidArgument(format!( + "unsupported tokenizer: {}", + other + ))) + } + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/util.rs new file mode 100644 index 0000000000..c480cb4fd9 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/util.rs @@ -0,0 +1,45 @@ +use serde_json as json; + +use super::stop_words; +use crate::error::{Result, 
TantivyBindingError}; + +pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result> { + if !value.is_array() { + return Err(TantivyBindingError::InternalError( + format!("{} should be array", label).to_string(), + )); + } + + let stop_words = value.as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words { + match element.as_str() { + Some(word) => str_list.push(word.to_string()), + _ => { + return Err(TantivyBindingError::InternalError( + format!("{} list item should be string", label).to_string(), + )) + } + } + } + Ok(str_list) +} + +pub(crate) fn get_stop_words_list(str_list: Vec) -> Vec { + let mut stop_words = Vec::new(); + for str in str_list { + if str.len() > 0 && str.chars().nth(0).unwrap() == '_' { + match str.as_str() { + "_english_" => { + for word in stop_words::ENGLISH { + stop_words.push(word.to_string()); + } + continue; + } + _other => {} + } + } + stop_words.push(str); + } + stop_words +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer.rs new file mode 100644 index 0000000000..4fd7cef681 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer.rs @@ -0,0 +1,219 @@ +use std::ffi::CStr; +use std::sync::Arc; + +use either::Either; +use futures::executor::block_on; +use libc::c_char; +use log::info; +use tantivy_5::schema::{ + Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED, +}; +use tantivy_5::{Document as TantivyDocument, Index, IndexWriter, SingleSegmentIndexWriter}; + +use crate::data_type::TantivyDataType; + +use crate::error::Result; +use crate::index_writer::TantivyValue; +use crate::log::init_log; + +pub(crate) struct IndexWriterWrapperImpl { + pub(crate) field: Field, + pub(crate) index_writer: Either, + pub(crate) id_field: Option, + pub(crate) _index: Arc, +} + +#[inline] +fn 
schema_builder_add_field( + schema_builder: &mut SchemaBuilder, + field_name: &str, + data_type: TantivyDataType, +) -> Field { + match data_type { + TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED), + TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED), + TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED), + TantivyDataType::Keyword => { + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("raw") + .set_index_option(IndexRecordOption::Basic); + let text_options = TextOptions::default().set_indexing_options(text_field_indexing); + schema_builder.add_text_field(&field_name, text_options) + } + TantivyDataType::Text => { + panic!("text should be indexed with analyzer"); + } + } +} + +impl TantivyValue for i64 { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_i64(Field::from_field_id(field), *self); + } +} + +impl TantivyValue for u64 { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_u64(Field::from_field_id(field), *self); + } +} + +impl TantivyValue for f64 { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_f64(Field::from_field_id(field), *self); + } +} + +impl TantivyValue for &str { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_text(Field::from_field_id(field), *self); + } +} + +impl TantivyValue for bool { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_bool(Field::from_field_id(field), *self); + } +} + +impl IndexWriterWrapperImpl { + pub fn new( + field_name: &str, + data_type: TantivyDataType, + path: String, + num_threads: usize, + overall_memory_budget_in_bytes: usize, + ) -> Result { + info!( + "create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5", + field_name, data_type + ); + let mut schema_builder = Schema::builder(); + let field = 
schema_builder_add_field(&mut schema_builder, field_name, data_type); + // We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field. + let id_field = schema_builder.add_i64_field("doc_id", FAST); + let schema = schema_builder.build(); + let index = Index::create_in_dir(path.clone(), schema)?; + let index_writer = + index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?; + Ok(IndexWriterWrapperImpl { + field, + index_writer: Either::Left(index_writer), + id_field: Some(id_field), + _index: Arc::new(index), + }) + } + + pub fn new_with_single_segment( + field_name: &str, + data_type: TantivyDataType, + path: String, + ) -> Result { + init_log(); + info!( + "create single segment index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5", + field_name, data_type + ); + let mut schema_builder = Schema::builder(); + let field = schema_builder_add_field(&mut schema_builder, field_name, data_type); + let schema = schema_builder.build(); + let index = Index::create_in_dir(path.clone(), schema)?; + let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?; + Ok(IndexWriterWrapperImpl { + field, + index_writer: Either::Right(index_writer), + id_field: None, + _index: Arc::new(index), + }) + } + + #[inline] + fn add_document(&mut self, mut document: TantivyDocument, offset: Option) -> Result<()> { + if let Some(id_field) = self.id_field { + document.add_i64(id_field, offset.unwrap()); + } + + match &mut self.index_writer { + Either::Left(writer) => { + let _ = writer.add_document(document)?; + } + Either::Right(single_segment_writer) => { + let _ = single_segment_writer.add_document(document)?; + } + } + Ok(()) + } + + pub fn add>( + &mut self, + data: T, + offset: Option, + ) -> Result<()> { + let mut document = TantivyDocument::default(); + data.add_to_document(self.field.field_id(), &mut document); + + self.add_document(document, offset) + } + + pub fn 
add_array, I>( + &mut self, + data: I, + offset: Option, + ) -> Result<()> + where + I: IntoIterator, + { + let mut document = TantivyDocument::default(); + data.into_iter() + .for_each(|d| d.add_to_document(self.field.field_id(), &mut document)); + + self.add_document(document, offset) + } + + pub fn add_array_keywords( + &mut self, + datas: &[*const c_char], + offset: Option, + ) -> Result<()> { + let mut document = TantivyDocument::default(); + for element in datas { + let data = unsafe { CStr::from_ptr(*element) }; + document.add_field_value(self.field, data.to_str()?); + } + + self.add_document(document, offset) + } + + pub fn manual_merge(&mut self) -> Result<()> { + let index_writer = self.index_writer.as_mut().left().unwrap(); + let metas = index_writer.index().searchable_segment_metas()?; + let policy = index_writer.get_merge_policy(); + let candidates = policy.compute_merge_candidates(metas.as_slice()); + for candidate in candidates { + index_writer.merge(candidate.0.as_slice()).wait()?; + } + Ok(()) + } + + pub fn finish(self) -> Result<()> { + match self.index_writer { + Either::Left(mut index_writer) => { + index_writer.commit()?; + // self.manual_merge(); + block_on(index_writer.garbage_collect_files())?; + index_writer.wait_merging_threads()?; + } + Either::Right(single_segment_index_writer) => { + single_segment_index_writer + .finalize() + .expect("failed to build inverted index"); + } + } + Ok(()) + } + + pub(crate) fn commit(&mut self) -> Result<()> { + self.index_writer.as_mut().left().unwrap().commit()?; + Ok(()) + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer_text.rs new file mode 100644 index 0000000000..c377dd1fba --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer_text.rs @@ -0,0 +1,58 @@ +use std::sync::Arc; + +use either::Either; +use 
tantivy_5::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST}; +use tantivy_5::Index; + +use crate::error::Result; +use crate::log::init_log; + +use super::analyzer::create_analyzer; +use super::IndexWriterWrapperImpl; + +fn build_text_schema(field_name: &str, tokenizer_name: &str) -> (Schema, Field, Field) { + let mut schema_builder = Schema::builder(); + // positions is required for matching phase. + let indexing = TextFieldIndexing::default() + .set_tokenizer(tokenizer_name) + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let option = TextOptions::default().set_indexing_options(indexing); + let field = schema_builder.add_text_field(field_name, option); + let id_field = schema_builder.add_i64_field("doc_id", FAST); + (schema_builder.build(), field, id_field) +} + +impl IndexWriterWrapperImpl { + pub(crate) fn create_text_writer( + field_name: &str, + path: &str, + tokenizer_name: &str, + tokenizer_params: &str, + num_threads: usize, + overall_memory_budget_in_bytes: usize, + in_ram: bool, + ) -> Result { + init_log(); + + let tokenizer = create_analyzer(tokenizer_params)?; + + let (schema, field, id_field) = build_text_schema(field_name, tokenizer_name); + let index: Index; + if in_ram { + index = Index::create_in_ram(schema); + } else { + index = Index::create_in_dir(path.to_string(), schema).unwrap(); + } + index.tokenizers().register(&tokenizer_name, tokenizer); + let index_writer = index + .writer_with_num_threads(num_threads, overall_memory_budget_in_bytes) + .unwrap(); + + Ok(IndexWriterWrapperImpl { + field, + index_writer: Either::Left(index_writer), + id_field: Some(id_field), + _index: Arc::new(index), + }) + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/mod.rs new file mode 100644 index 0000000000..f146ee496b --- /dev/null +++ 
b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/mod.rs @@ -0,0 +1,11 @@ +//! Tantivy index version 5 +//! This is the old version of Tantivy index (ex: Milvus 2.4.x uses). +//! We may still build tantivy index with version 5 for compatibility reasons where +//! there are some read nodes that can only read tantivy index with version 5. + +mod analyzer; +pub(crate) mod index_writer; +pub(crate) mod index_writer_text; + +pub(crate) use index_writer::IndexWriterWrapperImpl; +pub(crate) use tantivy_5::Document as TantivyDocumentV5; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer.rs new file mode 100644 index 0000000000..ccbc4c6abb --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer.rs @@ -0,0 +1,224 @@ +use std::ffi::CStr; +use std::sync::Arc; + +use either::Either; +use futures::executor::block_on; +use libc::c_char; +use log::info; +use tantivy::schema::{ + Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED, +}; +use tantivy::{Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument}; + +use crate::data_type::TantivyDataType; + +use crate::error::Result; +use crate::index_reader::IndexReaderWrapper; +use crate::index_writer::TantivyValue; +use crate::log::init_log; + +#[inline] +fn schema_builder_add_field( + schema_builder: &mut SchemaBuilder, + field_name: &str, + data_type: TantivyDataType, +) -> Field { + match data_type { + TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED), + TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED), + TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED), + TantivyDataType::Keyword => { + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("raw") + .set_index_option(IndexRecordOption::Basic); + 
let text_options = TextOptions::default().set_indexing_options(text_field_indexing); + schema_builder.add_text_field(&field_name, text_options) + } + TantivyDataType::Text => { + panic!("text should be indexed with analyzer"); + } + } +} + +impl TantivyValue for i64 { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_i64(Field::from_field_id(field), *self); + } +} + +impl TantivyValue for u64 { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_u64(Field::from_field_id(field), *self); + } +} + +impl TantivyValue for f64 { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_f64(Field::from_field_id(field), *self); + } +} + +impl TantivyValue for &str { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_text(Field::from_field_id(field), *self); + } +} + +impl TantivyValue for bool { + fn add_to_document(&self, field: u32, document: &mut TantivyDocument) { + document.add_bool(Field::from_field_id(field), *self); + } +} + +pub struct IndexWriterWrapperImpl { + pub(crate) field: Field, + pub(crate) index_writer: Either, + pub(crate) id_field: Option, + pub(crate) index: Arc, +} + +impl IndexWriterWrapperImpl { + pub fn new( + field_name: &str, + data_type: TantivyDataType, + path: String, + num_threads: usize, + overall_memory_budget_in_bytes: usize, + ) -> Result { + info!( + "create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7", + field_name, data_type + ); + let mut schema_builder = Schema::builder(); + let field = schema_builder_add_field(&mut schema_builder, field_name, data_type); + // We cannot build a direct mapping from rows in multiple segments to Milvus row data, so we add this doc_id field. 
+ let id_field = schema_builder.add_i64_field("doc_id", FAST); + let schema = schema_builder.build(); + let index = Index::create_in_dir(path.clone(), schema)?; + let index_writer = + index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?; + Ok(IndexWriterWrapperImpl { + field, + index_writer: Either::Left(index_writer), + id_field: Some(id_field), + index: Arc::new(index), + }) + } + + pub fn new_with_single_segment( + field_name: &str, + data_type: TantivyDataType, + path: String, + ) -> Result { + init_log(); + info!( + "create single segment index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7", + field_name, data_type + ); + let mut schema_builder = Schema::builder(); + let field = schema_builder_add_field(&mut schema_builder, field_name, data_type); + let schema = schema_builder.build(); + let index = Index::create_in_dir(path.clone(), schema)?; + let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?; + Ok(IndexWriterWrapperImpl { + field, + index_writer: Either::Right(index_writer), + id_field: None, + index: Arc::new(index), + }) + } + + pub fn create_reader(&self) -> Result { + IndexReaderWrapper::from_index(self.index.clone()) + } + + #[inline] + fn add_document(&mut self, mut document: TantivyDocument, offset: Option) -> Result<()> { + if let Some(id_field) = self.id_field { + document.add_i64(id_field, offset.unwrap()); + } + + match &mut self.index_writer { + Either::Left(writer) => { + let _ = writer.add_document(document)?; + } + Either::Right(single_segment_writer) => { + let _ = single_segment_writer.add_document(document)?; + } + } + Ok(()) + } + + pub fn add>( + &mut self, + data: T, + offset: Option, + ) -> Result<()> { + let mut document = TantivyDocument::default(); + data.add_to_document(self.field.field_id(), &mut document); + + self.add_document(document, offset) + } + + pub fn add_array, I>( + &mut self, + data: I, + offset: Option, + ) -> Result<()> + where + I: 
IntoIterator, + { + let mut document = TantivyDocument::default(); + data.into_iter() + .for_each(|d| d.add_to_document(self.field.field_id(), &mut document)); + + self.add_document(document, offset) + } + + pub fn add_array_keywords( + &mut self, + datas: &[*const c_char], + offset: Option, + ) -> Result<()> { + let mut document = TantivyDocument::default(); + for element in datas { + let data = unsafe { CStr::from_ptr(*element) }; + document.add_field_value(self.field, data.to_str()?); + } + + self.add_document(document, offset) + } + + pub fn manual_merge(&mut self) -> Result<()> { + let index_writer = self.index_writer.as_mut().left().unwrap(); + let metas = index_writer.index().searchable_segment_metas()?; + let policy = index_writer.get_merge_policy(); + let candidates = policy.compute_merge_candidates(metas.as_slice()); + for candidate in candidates { + index_writer.merge(candidate.0.as_slice()).wait()?; + } + Ok(()) + } + + pub fn finish(self) -> Result<()> { + match self.index_writer { + Either::Left(mut index_writer) => { + index_writer.commit()?; + // self.manual_merge(); + block_on(index_writer.garbage_collect_files())?; + index_writer.wait_merging_threads()?; + } + Either::Right(single_segment_index_writer) => { + single_segment_index_writer + .finalize() + .expect("failed to build inverted index"); + } + } + Ok(()) + } + + pub(crate) fn commit(&mut self) -> Result<()> { + self.index_writer.as_mut().left().unwrap().commit()?; + Ok(()) + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer_text.rs new file mode 100644 index 0000000000..808860e32e --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer_text.rs @@ -0,0 +1,57 @@ +use std::sync::Arc; + +use either::Either; +use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST}; +use 
tantivy::Index; + +use crate::analyzer::create_analyzer; +use crate::error::Result; +use crate::log::init_log; + +use super::IndexWriterWrapperImpl; + +fn build_text_schema(field_name: &str, tokenizer_name: &str) -> (Schema, Field, Field) { + let mut schema_builder = Schema::builder(); + // positions are required for phrase matching. + let indexing = TextFieldIndexing::default() + .set_tokenizer(tokenizer_name) + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let option = TextOptions::default().set_indexing_options(indexing); + let field = schema_builder.add_text_field(field_name, option); + let id_field = schema_builder.add_i64_field("doc_id", FAST); + (schema_builder.build(), field, id_field) +} + +impl IndexWriterWrapperImpl { + pub(crate) fn create_text_writer( + field_name: &str, + path: &str, + tokenizer_name: &str, + tokenizer_params: &str, + num_threads: usize, + overall_memory_budget_in_bytes: usize, + in_ram: bool, + ) -> Result { + init_log(); + let tokenizer = create_analyzer(tokenizer_params)?; + + let (schema, field, id_field) = build_text_schema(field_name, tokenizer_name); + let index: Index; + if in_ram { + index = Index::create_in_ram(schema); + } else { + index = Index::create_in_dir(path.to_string(), schema).unwrap(); + } + index.tokenizers().register(&tokenizer_name, tokenizer); + let index_writer = index + .writer_with_num_threads(num_threads, overall_memory_budget_in_bytes) + .unwrap(); + + Ok(IndexWriterWrapperImpl { + field, + index_writer: Either::Left(index_writer), + id_field: Some(id_field), + index: Arc::new(index), + }) + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/mod.rs new file mode 100644 index 0000000000..7122a00164 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/mod.rs @@ -0,0 +1,9 @@ +//! Tantivy index version 7 +//! 
This is the latest version of Tantivy index and is what we plan to use +//! in most cases. + +pub(crate) mod index_writer; +pub(crate) mod index_writer_text; + +pub(crate) use index_writer::IndexWriterWrapperImpl; +pub(crate) use tantivy::TantivyDocument as TantivyDocumentV7; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs index 508c8a1448..dfb06b77a3 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs @@ -1,3 +1,6 @@ +use error::TantivyBindingError; + +mod analyzer; mod array; mod data_type; mod demo_c; @@ -12,26 +15,44 @@ mod index_writer; mod index_writer_c; mod index_writer_text; mod index_writer_text_c; +mod index_writer_v5; +mod index_writer_v7; mod log; mod string_c; mod token_stream_c; -mod analyzer; mod tokenizer_c; mod util; mod util_c; mod vec_collector; -pub fn add(left: usize, right: usize) -> usize { - left + right +use error::Result; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TantivyIndexVersion { + V5, // Version for compatibility (for 2.4.x) + V7, // Latest version } -#[cfg(test)] -mod tests { - use super::*; +impl TantivyIndexVersion { + pub fn from_u32(version: u32) -> Result { + match version { + 5 => Ok(Self::V5), + 7 => Ok(Self::V7), + _ => Err(TantivyBindingError::InvalidArgument(format!( + "unsupported version {}", + version + ))), + } + } - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); + pub fn as_u32(&self) -> u32 { + match self { + Self::V5 => 5, + Self::V7 => 7, + } + } + + pub fn default_version() -> Self { + Self::V7 } } diff --git a/internal/core/thirdparty/tantivy/tantivy-wrapper.h b/internal/core/thirdparty/tantivy/tantivy-wrapper.h index 1e06c7273f..2125fe493f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-wrapper.h +++ b/internal/core/thirdparty/tantivy/tantivy-wrapper.h @@ -82,6 +82,7 @@ struct 
TantivyIndexWrapper { TantivyIndexWrapper(const char* field_name, TantivyDataType data_type, const char* path, + uint32_t tantivy_index_version, bool inverted_single_semgnent = false, uintptr_t num_threads = DEFAULT_NUM_THREADS, uintptr_t overall_memory_budget_in_bytes = @@ -89,12 +90,13 @@ struct TantivyIndexWrapper { RustResultWrapper res; if (inverted_single_semgnent) { res = RustResultWrapper(tantivy_create_index_with_single_segment( - field_name, data_type, path)); + field_name, data_type, path, tantivy_index_version)); } else { res = RustResultWrapper( tantivy_create_index(field_name, data_type, path, + tantivy_index_version, num_threads, overall_memory_budget_in_bytes)); } @@ -120,6 +122,7 @@ struct TantivyIndexWrapper { TantivyIndexWrapper(const char* field_name, bool in_ram, const char* path, + uint32_t tantivy_index_version, const char* tokenizer_name = DEFAULT_TOKENIZER_NAME, const char* analyzer_params = DEFAULT_analyzer_params, uintptr_t num_threads = DEFAULT_NUM_THREADS, @@ -128,6 +131,7 @@ struct TantivyIndexWrapper { auto res = RustResultWrapper( tantivy_create_text_writer(field_name, path, + tantivy_index_version, tokenizer_name, analyzer_params, num_threads, diff --git a/internal/core/thirdparty/tantivy/token-stream.h b/internal/core/thirdparty/tantivy/token-stream.h index ab9415488c..374e064711 100644 --- a/internal/core/thirdparty/tantivy/token-stream.h +++ b/internal/core/thirdparty/tantivy/token-stream.h @@ -39,7 +39,8 @@ struct TokenStream { return s; } - TantivyToken get_detailed_token() { + TantivyToken + get_detailed_token() { return tantivy_token_stream_get_detailed_token(ptr_); } diff --git a/internal/core/unittest/test_c_tokenizer.cpp b/internal/core/unittest/test_c_tokenizer.cpp index ff45745170..2722538492 100644 --- a/internal/core/unittest/test_c_tokenizer.cpp +++ b/internal/core/unittest/test_c_tokenizer.cpp @@ -70,16 +70,15 @@ TEST(CTokenizer, Default) { ASSERT_FALSE(token_stream_advance(token_stream)); 
free_token_stream(token_stream); - - token_stream = - create_token_stream(tokenizer, text.c_str(), text.length()); + + token_stream = create_token_stream(tokenizer, text.c_str(), text.length()); for (int i = 0; i < 3; i++) { ASSERT_TRUE(token_stream_advance(token_stream)); auto token = token_stream_get_detailed_token(token_stream); ASSERT_EQ(refs[i], std::string(token.token)); ASSERT_EQ(offsets[i], token.start_offset); - + free_token(const_cast(token.token)); } ASSERT_FALSE(token_stream_advance(token_stream));