From f552ec67ddfde8fe4484a4434734d06209d13a61 Mon Sep 17 00:00:00 2001 From: Spade A <71589810+SpadeA-Tang@users.noreply.github.com> Date: Wed, 2 Apr 2025 18:46:20 +0800 Subject: [PATCH] fix: support building tantivy index with low version(5) (#40822) fix: https://github.com/milvus-io/milvus/issues/40823 To solve the problem in the issue, we have to support building tantivy index with low version for those query nodes with low tantivy version. This PR does two things: 1. refactor codes for IndexWriterWrapper to make it concise 2. enable IndexWriterWrapper to build tantivy index by different tantivy crate --------- Signed-off-by: SpadeA --- internal/core/src/exec/Driver.cpp | 30 +- internal/core/src/index/HybridScalarIndex.cpp | 8 +- internal/core/src/index/HybridScalarIndex.h | 8 + internal/core/src/index/IndexFactory.cpp | 9 +- internal/core/src/index/IndexInfo.h | 3 +- .../core/src/index/InvertedIndexTantivy.cpp | 25 +- .../core/src/index/InvertedIndexTantivy.h | 11 +- internal/core/src/index/JsonInvertedIndex.h | 5 +- internal/core/src/index/Meta.h | 2 + internal/core/src/index/TextMatchIndex.cpp | 19 +- internal/core/src/index/TextMatchIndex.h | 2 + .../src/indexbuilder/ScalarIndexCreator.cpp | 5 + internal/core/src/indexbuilder/index_c.cpp | 6 + .../src/segcore/ChunkedSegmentSealedImpl.cpp | 2 + .../core/src/segcore/SegmentSealedImpl.cpp | 2 + internal/core/src/segcore/load_index_c.cpp | 32 +- internal/core/src/segcore/token_stream_c.cpp | 12 +- internal/core/src/segcore/token_stream_c.h | 14 +- .../tantivy/tantivy-binding/Cargo.lock | 489 +++++++++++++-- .../tantivy/tantivy-binding/Cargo.toml | 1 + .../tantivy-binding/include/tantivy-binding.h | 5 +- .../analyzer/tokenizers/jieba_tokenizer.rs | 1 + .../analyzer/tokenizers/lindera_tokenizer.rs | 37 +- .../src/analyzer/tokenizers/tokenizer.rs | 33 +- .../tantivy/tantivy-binding/src/array.rs | 9 +- .../tantivy/tantivy-binding/src/data_type.rs | 1 + .../tantivy-binding/src/docid_collector.rs | 4 +- 
.../tantivy/tantivy-binding/src/error.rs | 9 + .../tantivy-binding/src/index_reader.rs | 1 + .../tantivy-binding/src/index_reader_text.rs | 34 +- .../src/index_reader_text_c.rs | 10 +- .../tantivy-binding/src/index_writer.rs | 570 +++++++----------- .../tantivy-binding/src/index_writer_c.rs | 209 +++++-- .../tantivy-binding/src/index_writer_text.rs | 79 ++- .../src/index_writer_text_c.rs | 34 +- .../src/index_writer_v5/analyzer/analyzer.rs | 288 +++++++++ .../analyzer/build_in_analyzer.rs | 40 ++ .../src/index_writer_v5/analyzer/filter.rs | 285 +++++++++ .../src/index_writer_v5/analyzer/mod.rs | 11 + .../index_writer_v5/analyzer/stop_words.rs | 5 + .../analyzer/tokenizers/jieba_tokenizer.rs | 83 +++ .../analyzer/tokenizers/lindera_tokenizer.rs | 152 +++++ .../analyzer/tokenizers/mod.rs | 5 + .../analyzer/tokenizers/tokenizer.rs | 74 +++ .../src/index_writer_v5/analyzer/util.rs | 45 ++ .../src/index_writer_v5/index_writer.rs | 219 +++++++ .../src/index_writer_v5/index_writer_text.rs | 58 ++ .../src/index_writer_v5/mod.rs | 11 + .../src/index_writer_v7/index_writer.rs | 224 +++++++ .../src/index_writer_v7/index_writer_text.rs | 57 ++ .../src/index_writer_v7/mod.rs | 9 + .../tantivy/tantivy-binding/src/lib.rs | 41 +- .../core/thirdparty/tantivy/tantivy-wrapper.h | 6 +- .../core/thirdparty/tantivy/token-stream.h | 3 +- internal/core/unittest/test_c_tokenizer.cpp | 7 +- 55 files changed, 2712 insertions(+), 632 deletions(-) create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/analyzer.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/build_in_analyzer.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/filter.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/mod.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs create 
mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/jieba_tokenizer.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/lindera_tokenizer.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/mod.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/tokenizer.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/util.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer_text.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/mod.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer_text.rs create mode 100644 internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/mod.rs diff --git a/internal/core/src/exec/Driver.cpp b/internal/core/src/exec/Driver.cpp index 58c07337a2..301a814026 100644 --- a/internal/core/src/exec/Driver.cpp +++ b/internal/core/src/exec/Driver.cpp @@ -176,21 +176,21 @@ Driver::Next(std::shared_ptr& blocking_state) { return result; } -#define CALL_OPERATOR(call_func, operator, method_name) \ - try { \ - call_func; \ - } catch (std::exception & e) { \ - std::string stack_trace = milvus::impl::EasyStackTrace(); \ - auto err_msg = fmt::format( \ - "Operator::{} failed for [Operator:{}, plan node id: " \ - "{}] : {}\nStack trace: {}", \ - method_name, \ - operator->ToString() , \ - operator->get_plannode_id(), \ - e.what(), \ - stack_trace); \ - LOG_ERROR(err_msg); \ - throw ExecOperatorException(err_msg); \ +#define 
CALL_OPERATOR(call_func, operator, method_name) \ + try { \ + call_func; \ + } catch (std::exception & e) { \ + std::string stack_trace = milvus::impl::EasyStackTrace(); \ + auto err_msg = fmt::format( \ + "Operator::{} failed for [Operator:{}, plan node id: " \ + "{}] : {}\nStack trace: {}", \ + method_name, \ + operator->ToString(), \ + operator->get_plannode_id(), \ + e.what(), \ + stack_trace); \ + LOG_ERROR(err_msg); \ + throw ExecOperatorException(err_msg); \ } StopReason diff --git a/internal/core/src/index/HybridScalarIndex.cpp b/internal/core/src/index/HybridScalarIndex.cpp index 4c0ff5345e..7bbf996b20 100644 --- a/internal/core/src/index/HybridScalarIndex.cpp +++ b/internal/core/src/index/HybridScalarIndex.cpp @@ -29,9 +29,11 @@ namespace index { template HybridScalarIndex::HybridScalarIndex( + uint32_t tantivy_index_version, const storage::FileManagerContext& file_manager_context) : ScalarIndex(HYBRID_INDEX_TYPE), is_built_(false), + tantivy_index_version_(tantivy_index_version), bitmap_index_cardinality_limit_( DEFAULT_HYBRID_INDEX_BITMAP_CARDINALITY_LIMIT), file_manager_context_(file_manager_context) { @@ -191,8 +193,8 @@ HybridScalarIndex::GetInternalIndex() { internal_index_ = std::make_shared>(file_manager_context_); } else if (internal_index_type_ == ScalarIndexType::INVERTED) { - internal_index_ = - std::make_shared>(file_manager_context_); + internal_index_ = std::make_shared>( + tantivy_index_version_, file_manager_context_); } else { PanicInfo(UnexpectedError, "unknown index type when get internal index"); @@ -215,7 +217,7 @@ HybridScalarIndex::GetInternalIndex() { std::make_shared(file_manager_context_); } else if (internal_index_type_ == ScalarIndexType::INVERTED) { internal_index_ = std::make_shared>( - file_manager_context_); + tantivy_index_version_, file_manager_context_); } else { PanicInfo(UnexpectedError, "unknown index type when get internal index"); diff --git a/internal/core/src/index/HybridScalarIndex.h 
b/internal/core/src/index/HybridScalarIndex.h index 301f54fea0..3bcc349fea 100644 --- a/internal/core/src/index/HybridScalarIndex.h +++ b/internal/core/src/index/HybridScalarIndex.h @@ -42,6 +42,7 @@ template class HybridScalarIndex : public ScalarIndex { public: explicit HybridScalarIndex( + uint32_t tantivy_index_version, const storage::FileManagerContext& file_manager_context = storage::FileManagerContext()); @@ -193,6 +194,13 @@ class HybridScalarIndex : public ScalarIndex { std::shared_ptr> internal_index_{nullptr}; storage::FileManagerContext file_manager_context_; std::shared_ptr mem_file_manager_{nullptr}; + + // `tantivy_index_version_` controls which tantivy index format version is built. + // A read node may run a lower milvus version than the index builder node (and the read node + // may not be upgraded to a higher version in a predictable time); it would then use a lower version of tantivy to read an index + // built by a higher version of tantivy, which is not supported. + // Therefore, we provide a way for a higher version of milvus to build a tantivy index with a lower version. 
+ uint32_t tantivy_index_version_{0}; }; } // namespace index diff --git a/internal/core/src/index/IndexFactory.cpp b/internal/core/src/index/IndexFactory.cpp index b59758d50c..0e56d1c99f 100644 --- a/internal/core/src/index/IndexFactory.cpp +++ b/internal/core/src/index/IndexFactory.cpp @@ -46,8 +46,10 @@ IndexFactory::CreatePrimitiveScalarIndex( const storage::FileManagerContext& file_manager_context) { auto index_type = create_index_info.index_type; if (index_type == INVERTED_INDEX_TYPE) { + assert(create_index_info.tantivy_index_version != 0); // scalar_index_engine_version 0 means we should built tantivy index within single segment return std::make_unique>( + create_index_info.tantivy_index_version, file_manager_context, create_index_info.scalar_index_engine_version == 0); } @@ -55,7 +57,8 @@ IndexFactory::CreatePrimitiveScalarIndex( return std::make_unique>(file_manager_context); } if (index_type == HYBRID_INDEX_TYPE) { - return std::make_unique>(file_manager_context); + return std::make_unique>( + create_index_info.tantivy_index_version, file_manager_context); } return CreateScalarIndexSort(file_manager_context); } @@ -75,8 +78,10 @@ IndexFactory::CreatePrimitiveScalarIndex( auto index_type = create_index_info.index_type; #if defined(__linux__) || defined(__APPLE__) if (index_type == INVERTED_INDEX_TYPE) { + assert(create_index_info.tantivy_index_version != 0); // scalar_index_engine_version 0 means we should built tantivy index within single segment return std::make_unique>( + create_index_info.tantivy_index_version, file_manager_context, create_index_info.scalar_index_engine_version == 0); } @@ -85,7 +90,7 @@ IndexFactory::CreatePrimitiveScalarIndex( } if (index_type == HYBRID_INDEX_TYPE) { return std::make_unique>( - file_manager_context); + create_index_info.tantivy_index_version, file_manager_context); } return CreateStringIndexMarisa(file_manager_context); #else diff --git a/internal/core/src/index/IndexInfo.h b/internal/core/src/index/IndexInfo.h 
index a86258f8b5..4e44664a1d 100644 --- a/internal/core/src/index/IndexInfo.h +++ b/internal/core/src/index/IndexInfo.h @@ -27,7 +27,8 @@ struct CreateIndexInfo { IndexVersion index_engine_version; std::string field_name; int64_t dim; - int32_t scalar_index_engine_version; + int32_t scalar_index_engine_version{1}; + uint32_t tantivy_index_version{7}; JsonCastType json_cast_type; std::string json_path; }; diff --git a/internal/core/src/index/InvertedIndexTantivy.cpp b/internal/core/src/index/InvertedIndexTantivy.cpp index de231eeb50..243e2d7347 100644 --- a/internal/core/src/index/InvertedIndexTantivy.cpp +++ b/internal/core/src/index/InvertedIndexTantivy.cpp @@ -55,15 +55,22 @@ InvertedIndexTantivy::InitForBuildIndex() { "build inverted index temp dir:{} not empty", path_); } - wrapper_ = std::make_shared( - field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_); + wrapper_ = + std::make_shared(field.c_str(), + d_type_, + path_.c_str(), + tantivy_index_version_, + inverted_index_single_segment_); } template InvertedIndexTantivy::InvertedIndexTantivy( - const storage::FileManagerContext& ctx, bool inverted_index_single_segment) + uint32_t tantivy_index_version, + const storage::FileManagerContext& ctx, + bool inverted_index_single_segment) : ScalarIndex(INVERTED_INDEX_TYPE), schema_(ctx.fieldDataMeta.field_schema), + tantivy_index_version_(tantivy_index_version), inverted_index_single_segment_(inverted_index_single_segment) { mem_file_manager_ = std::make_shared(ctx); disk_file_manager_ = std::make_shared(ctx); @@ -465,8 +472,16 @@ InvertedIndexTantivy::BuildWithRawDataForUT(size_t n, GetValueFromConfig(config, milvus::index::SCALAR_INDEX_ENGINE_VERSION) .value_or(1) == 0; - wrapper_ = std::make_shared( - field.c_str(), d_type_, path_.c_str(), inverted_index_single_segment_); + tantivy_index_version_ = + GetValueFromConfig(config, + milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + wrapper_ = + 
std::make_shared(field.c_str(), d_type_, path_.c_str(), tantivy_index_version_, inverted_index_single_segment_); if (!inverted_index_single_segment_) { if (config.find("is_array") != config.end()) { // only used in ut. diff --git a/internal/core/src/index/InvertedIndexTantivy.h b/internal/core/src/index/InvertedIndexTantivy.h index 99186dac1d..6bd08ea68d 100644 --- a/internal/core/src/index/InvertedIndexTantivy.h +++ b/internal/core/src/index/InvertedIndexTantivy.h @@ -69,7 +69,9 @@ class InvertedIndexTantivy : public ScalarIndex { InvertedIndexTantivy() : ScalarIndex(INVERTED_INDEX_TYPE) { } - explicit InvertedIndexTantivy(const storage::FileManagerContext& ctx, + // By default, callers build the tantivy index with version 7 (the newest version now). + explicit InvertedIndexTantivy(uint32_t tantivy_index_version, + const storage::FileManagerContext& ctx, bool inverted_index_single_segment = false); ~InvertedIndexTantivy(); @@ -254,5 +256,12 @@ class InvertedIndexTantivy : public ScalarIndex { // new version while the query node is a older version. So we have this `inverted_index_single_segment_` to control the index // building node to build specific type of tantivy index. bool inverted_index_single_segment_{false}; + + // `tantivy_index_version_` controls which tantivy index format version is built. + // A read node may run a lower milvus version than the index builder node (and the read node + // may not be upgraded to a higher version in a predictable time); it would then use a lower version of tantivy to read an index + // built by a higher version of tantivy, which is not supported. + // Therefore, we provide a way for a higher version of milvus to build a tantivy index with a lower version. 
+ uint32_t tantivy_index_version_{0}; }; } // namespace milvus::index diff --git a/internal/core/src/index/JsonInvertedIndex.h b/internal/core/src/index/JsonInvertedIndex.h index a76decedcd..bc4917352a 100644 --- a/internal/core/src/index/JsonInvertedIndex.h +++ b/internal/core/src/index/JsonInvertedIndex.h @@ -89,7 +89,10 @@ class JsonInvertedIndex : public index::InvertedIndexTantivy { std::string field_name = std::to_string( this->disk_file_manager_->GetFieldDataMeta().field_id); this->wrapper_ = std::make_shared( - field_name.c_str(), this->d_type_, this->path_.c_str()); + field_name.c_str(), + this->d_type_, + this->path_.c_str(), + TANTIVY_INDEX_LATEST_VERSION /* json index is not supported in old version */); } void diff --git a/internal/core/src/index/Meta.h b/internal/core/src/index/Meta.h index 0887cc96e9..a46fe0d752 100644 --- a/internal/core/src/index/Meta.h +++ b/internal/core/src/index/Meta.h @@ -48,6 +48,8 @@ constexpr const char* BITMAP_INDEX_TYPE = "BITMAP"; constexpr const char* HYBRID_INDEX_TYPE = "HYBRID"; constexpr const char* SCALAR_INDEX_ENGINE_VERSION = "scalar_index_engine_version"; +constexpr const char* TANTIVY_INDEX_VERSION = "tantivy_index_version"; +constexpr uint32_t TANTIVY_INDEX_LATEST_VERSION = 7; // index meta constexpr const char* COLLECTION_ID = "collection_id"; diff --git a/internal/core/src/index/TextMatchIndex.cpp b/internal/core/src/index/TextMatchIndex.cpp index fd22099e2e..93529f0ace 100644 --- a/internal/core/src/index/TextMatchIndex.cpp +++ b/internal/core/src/index/TextMatchIndex.cpp @@ -28,11 +28,18 @@ TextMatchIndex::TextMatchIndex(int64_t commit_interval_in_ms, last_commit_time_(stdclock::now()) { d_type_ = TantivyDataType::Text; wrapper_ = std::make_shared( - unique_id, true, "", tokenizer_name, analyzer_params); + unique_id, + true, + "", + TANTIVY_INDEX_LATEST_VERSION /* Growing segment has no reason to use old version index*/ + , + tokenizer_name, + analyzer_params); } TextMatchIndex::TextMatchIndex(const 
std::string& path, const char* unique_id, + uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params) : commit_interval_in_ms_(std::numeric_limits::max()), @@ -42,11 +49,16 @@ TextMatchIndex::TextMatchIndex(const std::string& path, boost::filesystem::path sub_path = unique_id; path_ = (prefix / sub_path).string(); boost::filesystem::create_directories(path_); - wrapper_ = std::make_shared( - unique_id, false, path_.c_str(), tokenizer_name, analyzer_params); + wrapper_ = std::make_shared(unique_id, + false, + path_.c_str(), + tantivy_index_version, + tokenizer_name, + analyzer_params); } TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx, + uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params) : commit_interval_in_ms_(std::numeric_limits::max()), @@ -65,6 +77,7 @@ TextMatchIndex::TextMatchIndex(const storage::FileManagerContext& ctx, wrapper_ = std::make_shared(field_name.c_str(), false, path_.c_str(), + tantivy_index_version, tokenizer_name, analyzer_params); } diff --git a/internal/core/src/index/TextMatchIndex.h b/internal/core/src/index/TextMatchIndex.h index edeee8b0be..a2d35deb36 100644 --- a/internal/core/src/index/TextMatchIndex.h +++ b/internal/core/src/index/TextMatchIndex.h @@ -30,10 +30,12 @@ class TextMatchIndex : public InvertedIndexTantivy { // for sealed segment. explicit TextMatchIndex(const std::string& path, const char* unique_id, + uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params); // for building index. 
explicit TextMatchIndex(const storage::FileManagerContext& ctx, + uint32_t tantivy_index_version, const char* tokenizer_name, const char* analyzer_params); // for loading index diff --git a/internal/core/src/indexbuilder/ScalarIndexCreator.cpp b/internal/core/src/indexbuilder/ScalarIndexCreator.cpp index c06a16a8a4..7419115015 100644 --- a/internal/core/src/indexbuilder/ScalarIndexCreator.cpp +++ b/internal/core/src/indexbuilder/ScalarIndexCreator.cpp @@ -41,6 +41,11 @@ ScalarIndexCreator::ScalarIndexCreator( config, milvus::index::SCALAR_INDEX_ENGINE_VERSION) .value_or(1); + index_info.tantivy_index_version = + milvus::index::GetValueFromConfig( + config, milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + index_info.field_type = dtype_; index_info.index_type = index_type(); if (dtype == DataType::JSON) { diff --git a/internal/core/src/indexbuilder/index_c.cpp b/internal/core/src/indexbuilder/index_c.cpp index 80cc08bc2f..11bd458537 100644 --- a/internal/core/src/indexbuilder/index_c.cpp +++ b/internal/core/src/indexbuilder/index_c.cpp @@ -267,10 +267,16 @@ BuildTextIndex(ProtoLayoutInterface result, milvus::storage::FileManagerContext fileManagerContext( field_meta, index_meta, chunk_manager); + uint32_t tantivy_index_version = + milvus::index::GetValueFromConfig( + config, milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + auto field_schema = FieldMeta::ParseFrom(build_index_info->field_schema()); auto index = std::make_unique( fileManagerContext, + tantivy_index_version, "milvus_tokenizer", field_schema.get_analyzer_params().c_str()); index->Build(config); diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp index aa318dbbf4..f2e8d36ade 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp @@ -1476,6 +1476,8 @@ 
ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) { index = std::make_unique( cfg.GetMmapPath(), unique_id.c_str(), + // todo: make it configurable + index::TANTIVY_INDEX_LATEST_VERSION, "milvus_tokenizer", field_meta.get_analyzer_params().c_str()); } diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 238e91fb26..e6c76a3a1a 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -2079,6 +2079,8 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) { index = std::make_unique( cfg.GetMmapPath(), unique_id.c_str(), + // todo: make it configurable + index::TANTIVY_INDEX_LATEST_VERSION, "milvus_tokenizer", field_meta.get_analyzer_params().c_str()); } diff --git a/internal/core/src/segcore/load_index_c.cpp b/internal/core/src/segcore/load_index_c.cpp index b847c4d297..a26fa027c2 100644 --- a/internal/core/src/segcore/load_index_c.cpp +++ b/internal/core/src/segcore/load_index_c.cpp @@ -194,6 +194,21 @@ appendScalarIndex(CLoadIndexInfo c_load_index_info, CBinarySet c_binary_set) { index_info.field_type = milvus::DataType(field_type); index_info.index_type = index_params["index_type"]; + auto config = milvus::index::ParseConfigFromIndexParams( + load_index_info->index_params); + + // Config should have value for milvus::index::SCALAR_INDEX_ENGINE_VERSION for production calling chain. 
+ // Use value_or(1) for unit test without setting this value + index_info.scalar_index_engine_version = + milvus::index::GetValueFromConfig( + config, milvus::index::SCALAR_INDEX_ENGINE_VERSION) + .value_or(1); + + index_info.tantivy_index_version = + milvus::index::GetValueFromConfig( + config, milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + load_index_info->index = milvus::index::IndexFactory::GetInstance().CreateIndex( index_info, milvus::storage::FileManagerContext()); @@ -262,6 +277,21 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) { index_info.field_type = load_index_info->field_type; index_info.index_engine_version = engine_version; + auto config = milvus::index::ParseConfigFromIndexParams( + load_index_info->index_params); + + // Config should have value for milvus::index::SCALAR_INDEX_ENGINE_VERSION for production calling chain. + // Use value_or(1) for unit test without setting this value + index_info.scalar_index_engine_version = + milvus::index::GetValueFromConfig( + config, milvus::index::SCALAR_INDEX_ENGINE_VERSION) + .value_or(1); + + index_info.tantivy_index_version = + milvus::index::GetValueFromConfig( + config, milvus::index::TANTIVY_INDEX_VERSION) + .value_or(milvus::index::TANTIVY_INDEX_LATEST_VERSION); + auto ctx = milvus::tracer::TraceContext{ c_trace.traceID, c_trace.spanID, c_trace.traceFlags}; auto span = milvus::tracer::StartSpan("SegCoreLoadIndex", &ctx); @@ -303,8 +333,6 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) { milvus::storage::RemoteChunkManagerSingleton::GetInstance() .GetRemoteChunkManager(); - auto config = milvus::index::ParseConfigFromIndexParams( - load_index_info->index_params); config[milvus::index::INDEX_FILES] = load_index_info->index_files; if (load_index_info->field_type == milvus::DataType::JSON) { diff --git a/internal/core/src/segcore/token_stream_c.cpp b/internal/core/src/segcore/token_stream_c.cpp index 
11530d2d24..0cf3794b06 100644 --- a/internal/core/src/segcore/token_stream_c.cpp +++ b/internal/core/src/segcore/token_stream_c.cpp @@ -30,11 +30,13 @@ token_stream_get_token(CTokenStream token_stream) { CToken token_stream_get_detailed_token(CTokenStream token_stream) { - auto token= static_cast(token_stream) - ->get_detailed_token(); - return CToken{ - token.token, token.start_offset, token.end_offset, token.position, token.position_length - }; + auto token = static_cast(token_stream) + ->get_detailed_token(); + return CToken{token.token, + token.start_offset, + token.end_offset, + token.position, + token.position_length}; } void diff --git a/internal/core/src/segcore/token_stream_c.h b/internal/core/src/segcore/token_stream_c.h index 9cad7f7161..c19ae87113 100644 --- a/internal/core/src/segcore/token_stream_c.h +++ b/internal/core/src/segcore/token_stream_c.h @@ -21,13 +21,13 @@ extern "C" { #endif typedef void* CTokenStream; -typedef struct CToken{ - const char *token; - int64_t start_offset; - int64_t end_offset; - int64_t position; - int64_t position_length; -}CToken; +typedef struct CToken { + const char* token; + int64_t start_offset; + int64_t end_offset; + int64_t position; + int64_t position_length; +} CToken; void free_token_stream(CTokenStream); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock index d48d88d627..d45e87f255 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock @@ -17,6 +17,18 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + 
"once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -129,7 +141,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi", ] @@ -152,9 +164,15 @@ dependencies = [ "miniz_oxide", "object", "rustc-demangle", - "windows-targets", + "windows-targets 0.52.6", ] +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" @@ -182,6 +200,15 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" +[[package]] +name = "bitpacking" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c7d2ac73c167c06af4a5f37e6e59d84148d57ccbe4480b76f0273eefea82d7" +dependencies = [ + "crunchy", +] + [[package]] name = "bitpacking" version = "0.9.2" @@ -721,6 +748,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs4" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" +dependencies = [ + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "fs4" version = "0.8.4" @@ -849,7 +886,7 @@ dependencies = [ "cfg-if", "libc", "wasi 0.13.3+wasi-0.2.2", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -894,6 +931,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "hashbrown" @@ -927,6 
+968,12 @@ dependencies = [ "libc", ] +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + [[package]] name = "htmlescape" version = "0.3.1" @@ -1225,6 +1272,18 @@ dependencies = [ "hashbrown 0.15.2", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -1237,6 +1296,15 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -1394,7 +1462,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror", + "thiserror 2.0.11", "yada", ] @@ -1468,6 +1536,15 @@ version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" +[[package]] +name = "lru" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a83fb7698b3643a0e34f9ae6f2e8f0178c0fd42f8b59d493aa271ff3a5bf21" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "lru" version = "0.12.5" @@ -1483,6 +1560,16 @@ version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" +[[package]] +name = "measure_time" +version = "0.8.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbefd235b0aadd181626f281e1d684e116972988c14c264e42069d5e8a5775cc" +dependencies = [ + "instant", + "log", +] + [[package]] name = "measure_time" version = "0.9.0" @@ -1498,6 +1585,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" +dependencies = [ + "libc", +] + [[package]] name = "memmap2" version = "0.9.5" @@ -1588,6 +1684,16 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 0.3.9", + "libc", +] + [[package]] name = "object" version = "0.36.7" @@ -1659,6 +1765,14 @@ version = "6.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1" +[[package]] +name = "ownedbytes" +version = "0.6.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "ownedbytes" version = "0.7.0" @@ -1856,7 +1970,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.5", ] [[package]] @@ -1867,9 +1981,15 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.5", ] +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.5" @@ -1882,7 +2002,7 @@ version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "encoding_rs", "futures-core", @@ -1951,6 +2071,12 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc-hash" version = "2.1.1" @@ -2128,6 +2254,15 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +[[package]] +name = "sketches-ddsketch" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85636c14b73d81f541e525f585c0a2109e6744e1565b5c1668e31c70c10ed65c" +dependencies = [ + "serde", +] + [[package]] name = "sketches-ddsketch" version = "0.3.0" @@ -2277,6 +2412,61 @@ dependencies = [ "libc", ] +[[package]] +name = "tantivy" +version = "0.21.1" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "aho-corasick", + "arc-swap", + "async-channel", + "async-trait", + "base64 0.21.7", + "bitpacking 0.8.4", + "byteorder", + "census", + "crc32fast", + "crossbeam-channel", + "downcast-rs", + "fastdivide", + "fs4 0.6.6", + "htmlescape", + "itertools 0.11.0", + "lazy_static", + "levenshtein_automata", + "log", + "lru 0.11.1", + "lz4_flex", + "measure_time 0.8.3", + "memmap2 0.7.1", + "murmurhash32", + "num_cpus", + "once_cell", + "oneshot", + "rayon", + 
"regex", + "rust-stemmers", + "rustc-hash 1.1.0", + "serde", + "serde_json", + "sketches-ddsketch 0.2.2", + "smallvec", + "tantivy-bitpacker 0.5.0", + "tantivy-columnar 0.2.0", + "tantivy-common 0.6.0", + "tantivy-fst 0.4.0", + "tantivy-query-grammar 0.21.0", + "tantivy-stacker 0.2.0", + "tantivy-tokenizer-api 0.2.0", + "tempfile", + "thiserror 1.0.69", + "time", + "tokio", + "uuid", + "winapi", + "zstd-sys", +] + [[package]] name = "tantivy" version = "0.23.0" @@ -2285,8 +2475,8 @@ dependencies = [ "aho-corasick", "arc-swap", "async-channel", - "base64", - "bitpacking", + "base64 0.22.1", + "bitpacking 0.9.2", "bon", "byteorder", "census", @@ -2295,36 +2485,36 @@ dependencies = [ "downcast-rs", "fastdivide", "fnv", - "fs4", + "fs4 0.8.4", "htmlescape", "hyperloglogplus", - "itertools", + "itertools 0.14.0", "lazy_static", "levenshtein_automata", "log", - "lru", + "lru 0.12.5", "lz4_flex", - "measure_time", - "memmap2", + "measure_time 0.9.0", + "memmap2 0.9.5", "once_cell", "oneshot", "rayon", "regex", "rust-stemmers", - "rustc-hash", + "rustc-hash 2.1.1", "serde", "serde_json", - "sketches-ddsketch", + "sketches-ddsketch 0.3.0", "smallvec", - "tantivy-bitpacker", - "tantivy-columnar", - "tantivy-common", - "tantivy-fst", - "tantivy-query-grammar", - "tantivy-stacker", - "tantivy-tokenizer-api", + "tantivy-bitpacker 0.6.0", + "tantivy-columnar 0.3.0", + "tantivy-common 0.7.0", + "tantivy-fst 0.5.0", + "tantivy-query-grammar 0.22.0", + "tantivy-stacker 0.3.0", + "tantivy-tokenizer-api 0.3.0", "tempfile", - "thiserror", + "thiserror 2.0.11", "time", "tokio", "uuid", @@ -2348,17 +2538,41 @@ dependencies = [ "regex", "scopeguard", "serde_json", - "tantivy", + "tantivy 0.21.1", + "tantivy 0.23.0", "tempfile", "zstd-sys", ] +[[package]] +name = "tantivy-bitpacker" +version = "0.5.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "bitpacking 0.8.4", +] + [[package]] name = 
"tantivy-bitpacker" version = "0.6.0" source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ - "bitpacking", + "bitpacking 0.9.2", +] + +[[package]] +name = "tantivy-columnar" +version = "0.2.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "fastdivide", + "fnv", + "itertools 0.11.0", + "serde", + "tantivy-bitpacker 0.5.0", + "tantivy-common 0.6.0", + "tantivy-sstable 0.2.0", + "tantivy-stacker 0.2.0", ] [[package]] @@ -2368,12 +2582,24 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a dependencies = [ "downcast-rs", "fastdivide", - "itertools", + "itertools 0.14.0", "serde", - "tantivy-bitpacker", - "tantivy-common", - "tantivy-sstable", - "tantivy-stacker", + "tantivy-bitpacker 0.6.0", + "tantivy-common 0.7.0", + "tantivy-sstable 0.3.0", + "tantivy-stacker 0.3.0", +] + +[[package]] +name = "tantivy-common" +version = "0.6.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "async-trait", + "byteorder", + "ownedbytes 0.6.0", + "serde", + "time", ] [[package]] @@ -2383,12 +2609,23 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a dependencies = [ "async-trait", "byteorder", - "ownedbytes", + "ownedbytes 0.7.0", "serde", "time", "tokio", ] +[[package]] +name = "tantivy-fst" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc3c506b1a8443a3a65352df6382a1fb6a7afe1a02e871cee0d25e2c3d5f3944" +dependencies = [ + "byteorder", + "regex-syntax 0.6.29", + "utf8-ranges", +] + [[package]] name = "tantivy-fst" version = "0.5.0" @@ -2396,10 +2633,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" dependencies = [ "byteorder", - 
"regex-syntax", + "regex-syntax 0.8.5", "utf8-ranges", ] +[[package]] +name = "tantivy-query-grammar" +version = "0.21.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "nom", +] + [[package]] name = "tantivy-query-grammar" version = "0.22.0" @@ -2408,17 +2653,36 @@ dependencies = [ "nom", ] +[[package]] +name = "tantivy-sstable" +version = "0.2.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "tantivy-common 0.6.0", + "tantivy-fst 0.4.0", + "zstd 0.12.4", +] + [[package]] name = "tantivy-sstable" version = "0.3.0" source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "futures-util", - "itertools", - "tantivy-bitpacker", - "tantivy-common", - "tantivy-fst", - "zstd", + "itertools 0.14.0", + "tantivy-bitpacker 0.6.0", + "tantivy-common 0.7.0", + "tantivy-fst 0.5.0", + "zstd 0.13.0", +] + +[[package]] +name = "tantivy-stacker" +version = "0.2.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "murmurhash32", + "tantivy-common 0.6.0", ] [[package]] @@ -2428,7 +2692,15 @@ source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a dependencies = [ "murmurhash32", "rand_distr", - "tantivy-common", + "tantivy-common 0.7.0", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.2.0" +source = "git+https://github.com/milvus-io/tantivy.git?tag=0.21.1-fix3#fbf1fd51983b6ddc5777ea3c96823557b9da0473" +dependencies = [ + "serde", ] [[package]] @@ -2479,13 +2751,33 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9" +[[package]] +name = "thiserror" +version = "1.0.69" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + [[package]] name = "thiserror" version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.11", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.98", ] [[package]] @@ -2766,6 +3058,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "want" version = "0.3.1" @@ -2910,7 +3208,7 @@ checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" dependencies = [ "windows-result", "windows-strings", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2919,7 +3217,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2929,7 +3227,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" dependencies = [ "windows-result", - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", ] [[package]] @@ -2938,7 +3245,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -2947,7 +3254,22 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -2956,28 +3278,46 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -2990,24 +3330,48 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = 
"windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -3146,13 +3510,32 @@ dependencies = [ "syn 2.0.98", ] +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe 6.0.6", +] + [[package]] name = "zstd" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" dependencies = [ - "zstd-safe", + "zstd-safe 7.0.0", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", ] [[package]] diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml index 8d93e097e0..6d641de4f3 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml @@ -15,6 +15,7 @@ lindera-cc-cedict = ["lindera/cc-cedict"] [dependencies] tantivy = { git = "https://github.com/zilliztech/tantivy.git" } +tantivy-5 = { package = "tantivy", git = "https://github.com/milvus-io/tantivy.git", tag = 
"0.21.1-fix3" } lindera = "0.40.1" futures = "0.3.21" libc = "0.2" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h index 30daf793fb..1e9d588cad 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h +++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h @@ -178,12 +178,14 @@ RustResult tantivy_register_tokenizer(void *ptr, RustResult tantivy_create_index(const char *field_name, TantivyDataType data_type, const char *path, + uint32_t tantivy_index_version, uintptr_t num_threads, uintptr_t overall_memory_budget_in_bytes); RustResult tantivy_create_index_with_single_segment(const char *field_name, TantivyDataType data_type, - const char *path); + const char *path, + uint32_t tantivy_index_version); void tantivy_free_index_writer(void *ptr); @@ -334,6 +336,7 @@ RustResult tantivy_index_add_array_keywords_by_single_segment_writer(void *ptr, RustResult tantivy_create_text_writer(const char *field_name, const char *path, + uint32_t tantivy_index_version, const char *tokenizer_name, const char *analyzer_params, uintptr_t num_threads, diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs index 6b25bb699d..9ab188ab40 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/jieba_tokenizer.rs @@ -6,6 +6,7 @@ lazy_static! 
{ static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new(); } +#[allow(dead_code)] #[derive(Clone)] pub enum JiebaMode { Exact, diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs index bee22114d3..2f7a26ff5e 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs @@ -1,15 +1,14 @@ - use core::result::Result::Err; +use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind}; use lindera::mode::Mode; use lindera::segmenter::Segmenter; use lindera::token::Token as LToken; -use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder}; -use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind}; -use tantivy::tokenizer::{Token, Tokenizer, TokenStream}; +use lindera::tokenizer::Tokenizer as LTokenizer; +use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; +use crate::error::{Result, TantivyBindingError}; use serde_json as json; -use crate::error::{Result,TantivyBindingError}; pub struct LinderaTokenStream<'a> { pub tokens: Vec>, @@ -52,7 +51,7 @@ impl LinderaTokenizer { pub fn from_json(params: &json::Map) -> Result { let kind = fetch_lindera_kind(params)?; let dictionary = load_dictionary_from_kind(kind); - if dictionary.is_err(){ + if dictionary.is_err() { return Err(TantivyBindingError::InvalidArgument(format!( "lindera tokenizer with invalid dict_kind" ))); @@ -87,9 +86,9 @@ trait DictionaryKindParser { fn into_dict_kind(self) -> Result; } -impl DictionaryKindParser for &str{ +impl DictionaryKindParser for &str { fn into_dict_kind(self) -> Result { - match self{ + match self { "ipadic" => Ok(DictionaryKind::IPADIC), "ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd), "unidic" => Ok(DictionaryKind::UniDic), @@ -98,21 +97,21 @@ impl 
DictionaryKindParser for &str{ other => Err(TantivyBindingError::InvalidArgument(format!( "unsupported lindera dict type: {}", other - ))) + ))), } } } -fn fetch_lindera_kind(params:&json::Map<String, json::Value>) -> Result<DictionaryKind>{ - match params.get("dict_kind"){ +fn fetch_lindera_kind(params: &json::Map<String, json::Value>) -> Result<DictionaryKind> { + match params.get("dict_kind") { Some(val) => { - if !val.is_string(){ + if !val.is_string() { return Err(TantivyBindingError::InvalidArgument(format!( "lindera tokenizer dict kind should be string" - ))) + ))); } val.as_str().unwrap().into_dict_kind() - }, + } _ => { return Err(TantivyBindingError::InvalidArgument(format!( "lindera tokenizer dict_kind must be set" @@ -128,29 +127,29 @@ mod tests { use crate::analyzer::tokenizers::lindera_tokenizer::LinderaTokenizer; #[test] - fn test_lindera_tokenizer(){ + fn test_lindera_tokenizer() { let params = r#"{ "type": "lindera", "dict_kind": "ipadic" }"#; let json_param = json::from_str::<json::Map<String, json::Value>>(&params); assert!(json_param.is_ok()); - + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); } #[test] #[cfg(feature = "lindera-cc-cedict")] - fn test_lindera_tokenizer_cc(){ + fn test_lindera_tokenizer_cc() { let params = r#"{ "type": "lindera", "dict_kind": "cc-cedict" }"#; let json_param = json::from_str::<json::Map<String, json::Value>>(&params); assert!(json_param.is_ok()); - + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); } -} \ No newline at end of file +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs index 1644fbe4fa..f5f6aa1dfd 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/tokenizer.rs @@ -1,13 +1,10 @@ -use 
tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder}; -use lindera::segmenter::Segmenter; -use tantivy::tokenizer::*; -use lindera::mode::Mode; -use serde_json as json; use log::warn; +use serde_json as json; +use tantivy::tokenizer::*; +use tantivy::tokenizer::{TextAnalyzer, TextAnalyzerBuilder}; use crate::analyzer::tokenizers::{JiebaTokenizer, LinderaTokenizer}; -use crate::error::{Result,TantivyBindingError}; - +use crate::error::{Result, TantivyBindingError}; pub fn standard_builder() -> TextAnalyzerBuilder { TextAnalyzer::builder(SimpleTokenizer::default()).dynamic() @@ -21,11 +18,13 @@ pub fn jieba_builder() -> TextAnalyzerBuilder { TextAnalyzer::builder(JiebaTokenizer::new()).dynamic() } -pub fn lindera_builder(params: Option<&json::Map>) -> Result{ - if params.is_none(){ +pub fn lindera_builder( + params: Option<&json::Map>, +) -> Result { + if params.is_none() { return Err(TantivyBindingError::InvalidArgument(format!( "lindera tokenizer must be costum" - ))) + ))); } let tokenizer = LinderaTokenizer::from_json(params.unwrap())?; Ok(TextAnalyzer::builder(tokenizer).dynamic()) @@ -34,25 +33,25 @@ pub fn lindera_builder(params: Option<&json::Map>) -> Resul pub fn get_builder_with_tokenizer(params: &json::Value) -> Result { let name; let params_map; - if params.is_string(){ + if params.is_string() { name = params.as_str().unwrap(); params_map = None; - }else{ + } else { let m = params.as_object().unwrap(); - match m.get("type"){ + match m.get("type") { Some(val) => { - if !val.is_string(){ + if !val.is_string() { return Err(TantivyBindingError::InvalidArgument(format!( "tokenizer type should be string" - ))) + ))); } name = val.as_str().unwrap(); - }, + } _ => { return Err(TantivyBindingError::InvalidArgument(format!( "costum tokenizer must set type" ))) - }, + } } params_map = Some(m); } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs index f0e2553f77..cc83b35061 
100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs @@ -98,6 +98,7 @@ pub extern "C" fn free_rust_array_i64(array: RustArrayI64) { } } +#[allow(dead_code)] #[repr(C)] pub enum Value { None(()), @@ -192,11 +193,9 @@ pub extern "C" fn free_rust_error(error: *const c_char) { #[macro_export] macro_rules! cstr_to_str { ($cstr:expr) => { - unsafe { - match CStr::from_ptr($cstr).to_str() { - Ok(f) => f, - Err(e) => return RustResult::from_error(e.to_string()), - } + match unsafe { CStr::from_ptr($cstr).to_str() } { + Ok(f) => f, + Err(e) => return RustResult::from_error(e.to_string()), } }; } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs index 63285abce3..72b43a7565 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/data_type.rs @@ -1,3 +1,4 @@ +#[allow(dead_code)] #[repr(u8)] #[derive(Debug)] pub enum TantivyDataType { diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs index ed813a9e6d..b9192fa568 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs @@ -65,7 +65,7 @@ impl SegmentCollector for DocIdChildCollector { self.collect_block(&[doc]); } - fn harvest(mut self) -> Self::Fruit { + fn harvest(self) -> Self::Fruit { self.milvus_doc_ids } } @@ -117,7 +117,7 @@ impl SegmentCollector for DocIdChildCollector { self.collect_block(&[doc]); } - fn harvest(mut self) -> Self::Fruit { + fn harvest(self) -> Self::Fruit { self.milvus_doc_ids } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs index 
77c922f824..01b9e5ac6a 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/error.rs @@ -4,6 +4,7 @@ use core::{fmt, str}; pub enum TantivyBindingError { JsonError(serde_json::Error), TantivyError(tantivy::TantivyError), + TantivyErrorV5(tantivy_5::TantivyError), InvalidArgument(String), InternalError(String), } @@ -20,11 +21,18 @@ impl From for TantivyBindingError { } } +impl From for TantivyBindingError { + fn from(value: tantivy_5::TantivyError) -> Self { + TantivyBindingError::TantivyErrorV5(value) + } +} + impl fmt::Display for TantivyBindingError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { TantivyBindingError::JsonError(e) => write!(f, "JsonError: {}", e), TantivyBindingError::TantivyError(e) => write!(f, "TantivyError: {}", e), + TantivyBindingError::TantivyErrorV5(e) => write!(f, "TantivyErrorV5: {}", e), TantivyBindingError::InvalidArgument(e) => write!(f, "InvalidArgument: {}", e), TantivyBindingError::InternalError(e) => write!(f, "InternalError: {}", e), } @@ -36,6 +44,7 @@ impl std::error::Error for TantivyBindingError { match self { TantivyBindingError::JsonError(e) => Some(e), TantivyBindingError::TantivyError(e) => Some(e), + TantivyBindingError::TantivyErrorV5(e) => Some(e), TantivyBindingError::InvalidArgument(_) => None, TantivyBindingError::InternalError(_) => None, } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs index af86535a89..2497d52d0f 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs @@ -12,6 +12,7 @@ use crate::vec_collector::VecCollector; use crate::error::{Result, TantivyBindingError}; +#[allow(dead_code)] pub(crate) struct IndexReaderWrapper { pub(crate) field_name: String, pub(crate) field: Field, diff --git 
a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs index 9e62186d74..93a961df92 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs @@ -65,25 +65,26 @@ mod tests { use tantivy::query::TermQuery; use tempfile::TempDir; - use crate::{analyzer::create_analyzer, index_writer::IndexWriterWrapper}; + use crate::{index_writer::IndexWriterWrapper, TantivyIndexVersion}; #[test] fn test_jeba() { let params = "{\"tokenizer\": \"jieba\"}".to_string(); - let tokenizer = create_analyzer(&params).unwrap(); let dir = TempDir::new().unwrap(); let mut writer = IndexWriterWrapper::create_text_writer( - "text".to_string(), - dir.path().to_str().unwrap().to_string(), - "jieba".to_string(), - tokenizer, + "text", + dir.path().to_str().unwrap(), + "jieba", + &params, 1, 50_000_000, false, - ); + TantivyIndexVersion::default_version(), + ) + .unwrap(); - writer.add_string("网球和滑雪", 0).unwrap(); - writer.add_string("网球以及滑雪", 1).unwrap(); + writer.add("网球和滑雪", Some(0)).unwrap(); + writer.add("网球以及滑雪", Some(1)).unwrap(); writer.commit().unwrap(); @@ -100,20 +101,21 @@ mod tests { #[test] fn test_read() { - let tokenizer = create_analyzer("").unwrap(); let dir = TempDir::new().unwrap(); let mut writer = IndexWriterWrapper::create_text_writer( - "text".to_string(), - dir.path().to_str().unwrap().to_string(), - "default".to_string(), - tokenizer, + "text", + dir.path().to_str().unwrap(), + "default", + "", 1, 50_000_000, false, - ); + TantivyIndexVersion::default_version(), + ) + .unwrap(); for i in 0..10000 { - writer.add_string("hello world", i).unwrap(); + writer.add("hello world", Some(i)).unwrap(); } writer.commit().unwrap(); diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs 
b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs index f5b80a3cc8..67d4736f0c 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text_c.rs @@ -3,17 +3,15 @@ use std::ffi::CStr; use libc::{c_char, c_void}; use crate::{ - array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, log::init_log, - analyzer::create_analyzer, + analyzer::create_analyzer, array::RustResult, cstr_to_str, index_reader::IndexReaderWrapper, + log::init_log, }; #[no_mangle] pub extern "C" fn tantivy_match_query(ptr: *mut c_void, query: *const c_char) -> RustResult { let real = ptr as *mut IndexReaderWrapper; - unsafe { - let query = cstr_to_str!(query); - (*real).match_query(query).into() - } + let query = cstr_to_str!(query); + unsafe { (*real).match_query(query).into() } } #[no_mangle] diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs index 5c00fc62a1..66adc211f0 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs @@ -1,389 +1,285 @@ -use std::ffi::CStr; -use std::sync::Arc; - -use either::Either; -use futures::executor::block_on; +use index_writer_v5::TantivyDocumentV5; +use index_writer_v7::TantivyDocumentV7; use libc::c_char; -use log::info; -use tantivy::schema::{ - Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED, -}; -use tantivy::{doc, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument}; use crate::data_type::TantivyDataType; -use crate::error::Result; +use crate::error::{Result, TantivyBindingError}; use crate::index_reader::IndexReaderWrapper; use crate::log::init_log; +use crate::{index_writer_v5, index_writer_v7, TantivyIndexVersion}; -pub(crate) struct IndexWriterWrapper { - 
pub(crate) field: Field, - pub(crate) index_writer: Either, - pub(crate) id_field: Option, - pub(crate) index: Arc, +pub trait TantivyValue { + fn add_to_document(&self, field: u32, document: &mut D); } -#[inline] -fn schema_builder_add_field( - schema_builder: &mut SchemaBuilder, - field_name: &str, - data_type: TantivyDataType, -) -> Field { - match data_type { - TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED), - TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED), - TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED), - TantivyDataType::Keyword => { - let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("raw") - .set_index_option(IndexRecordOption::Basic); - let text_options = TextOptions::default().set_indexing_options(text_field_indexing); - schema_builder.add_text_field(&field_name, text_options) - } - TantivyDataType::Text => { - panic!("text should be indexed with analyzer"); - } - } +pub enum IndexWriterWrapper { + V5(index_writer_v5::IndexWriterWrapperImpl), + V7(index_writer_v7::IndexWriterWrapperImpl), } impl IndexWriterWrapper { + // create a IndexWriterWrapper according to `tanviy_index_version`. + // version 7 is the latest version and is what we should use in most cases. + // We may also build with version 5 for compatibility for reader nodes with older versions. pub fn new( - field_name: String, + field_name: &str, data_type: TantivyDataType, path: String, num_threads: usize, overall_memory_budget_in_bytes: usize, + tanviy_index_version: TantivyIndexVersion, ) -> Result { init_log(); - info!( - "create index writer, field_name: {}, data_type: {:?}", - field_name, data_type - ); - let mut schema_builder = Schema::builder(); - let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type); - // We cannot build direct connection from rows in multi-segments to milvus row data. So we have this doc_id field. 
- let id_field = schema_builder.add_i64_field("doc_id", FAST); - let schema = schema_builder.build(); - let index = Index::create_in_dir(path.clone(), schema)?; - let index_writer = - index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?; - Ok(IndexWriterWrapper { - field, - index_writer: Either::Left(index_writer), - id_field: Some(id_field), - index: Arc::new(index), - }) + match tanviy_index_version { + TantivyIndexVersion::V5 => { + let writer = index_writer_v5::IndexWriterWrapperImpl::new( + field_name, + data_type, + path, + num_threads, + overall_memory_budget_in_bytes, + )?; + Ok(IndexWriterWrapper::V5(writer)) + } + TantivyIndexVersion::V7 => { + let writer = index_writer_v7::IndexWriterWrapperImpl::new( + field_name, + data_type, + path, + num_threads, + overall_memory_budget_in_bytes, + )?; + Ok(IndexWriterWrapper::V7(writer)) + } + } } pub fn new_with_single_segment( - field_name: String, + field_name: &str, data_type: TantivyDataType, path: String, + tanviy_index_version: TantivyIndexVersion, ) -> Result { init_log(); - info!( - "create single segment index writer, field_name: {}, data_type: {:?}", - field_name, data_type - ); - let mut schema_builder = Schema::builder(); - let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type); - let schema = schema_builder.build(); - let index = Index::create_in_dir(path.clone(), schema)?; - let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?; - Ok(IndexWriterWrapper { - field, - index_writer: Either::Right(index_writer), - id_field: None, - index: Arc::new(index), - }) + match tanviy_index_version { + TantivyIndexVersion::V5 => { + let writer = index_writer_v5::IndexWriterWrapperImpl::new_with_single_segment( + field_name, data_type, path, + )?; + Ok(IndexWriterWrapper::V5(writer)) + } + TantivyIndexVersion::V7 => { + let writer = index_writer_v7::IndexWriterWrapperImpl::new_with_single_segment( + field_name, data_type, path, + )?; + 
Ok(IndexWriterWrapper::V7(writer)) + } + } } pub fn create_reader(&self) -> Result { - IndexReaderWrapper::from_index(self.index.clone()) - } - - fn index_writer_add_document(&self, document: TantivyDocument) -> Result<()> { - match self.index_writer { - Either::Left(ref writer) => { - let _ = writer.add_document(document)?; - } - Either::Right(_) => { - panic!("unexpected writer"); + match self { + IndexWriterWrapper::V5(_) => { + return Err(TantivyBindingError::InternalError( + "create reader with tantivy index version 5 + is not supported from tantivy with version 7" + .into(), + )); } + IndexWriterWrapper::V7(writer) => writer.create_reader(), } - Ok(()) } - fn single_segment_index_writer_add_document( - &mut self, - document: TantivyDocument, - ) -> Result<()> { - match self.index_writer { - Either::Left(_) => { - panic!("unexpected writer"); - } - Either::Right(ref mut single_segmnet_writer) => { - let _ = single_segmnet_writer.add_document(document)?; - } + pub fn add(&mut self, data: T, offset: Option) -> Result<()> + where + T: TantivyValue + TantivyValue, + { + match self { + IndexWriterWrapper::V5(writer) => writer.add(data, offset), + IndexWriterWrapper::V7(writer) => writer.add(data, offset), } - Ok(()) } - pub fn add_i8(&mut self, data: i8, offset: i64) -> Result<()> { - self.add_i64(data.into(), offset) - } - - pub fn add_i16(&mut self, data: i16, offset: i64) -> Result<()> { - self.add_i64(data.into(), offset) - } - - pub fn add_i32(&mut self, data: i32, offset: i64) -> Result<()> { - self.add_i64(data.into(), offset) - } - - pub fn add_i64(&mut self, data: i64, offset: i64) -> Result<()> { - self.index_writer_add_document(doc!( - self.field => data, - self.id_field.unwrap() => offset, - )) - } - - pub fn add_f32(&mut self, data: f32, offset: i64) -> Result<()> { - self.add_f64(data.into(), offset) - } - - pub fn add_f64(&mut self, data: f64, offset: i64) -> Result<()> { - self.index_writer_add_document(doc!( - self.field => data, - 
self.id_field.unwrap() => offset, - )) - } - - pub fn add_bool(&mut self, data: bool, offset: i64) -> Result<()> { - self.index_writer_add_document(doc!( - self.field => data, - self.id_field.unwrap() => offset, - )) - } - - pub fn add_string(&mut self, data: &str, offset: i64) -> Result<()> { - self.index_writer_add_document(doc!( - self.field => data, - self.id_field.unwrap() => offset, - )) - } - - pub fn add_array_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); + pub fn add_array(&mut self, data: I, offset: Option) -> Result<()> + where + I: IntoIterator, + T: TantivyValue + TantivyValue, + { + match self { + IndexWriterWrapper::V5(writer) => writer.add_array(data, offset), + IndexWriterWrapper::V7(writer) => writer.add_array(data, offset), } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) } - pub fn add_array_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> { - 
let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as f64)); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_array_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> { - let mut document = TantivyDocument::default(); - for element in datas { - let data = unsafe { CStr::from_ptr(*element) }; - document.add_field_value(self.field, data.to_str()?); - } - document.add_i64(self.id_field.unwrap(), offset); - self.index_writer_add_document(document) - } - - pub fn add_i8_by_single_segment_writer(&mut self, data: i8) -> Result<()> { - self.add_i64_by_single_segment_writer(data.into()) - } - - pub fn add_i16_by_single_segment_writer(&mut self, data: i16) -> Result<()> { - self.add_i64_by_single_segment_writer(data.into()) - } - - pub fn add_i32_by_single_segment_writer(&mut self, data: i32) -> Result<()> { - self.add_i64_by_single_segment_writer(data.into()) - } - - pub fn add_i64_by_single_segment_writer(&mut self, data: i64) -> Result<()> { - self.single_segment_index_writer_add_document(doc!( - self.field => data - )) - } - - pub fn add_f32_by_single_segment_writer(&mut self, data: f32) -> Result<()> { - self.add_f64_by_single_segment_writer(data.into()) - } - - pub fn add_f64_by_single_segment_writer(&mut self, data: f64) -> 
Result<()> { - self.single_segment_index_writer_add_document(doc!( - self.field => data - )) - } - - pub fn add_bool_by_single_segment_writer(&mut self, data: bool) -> Result<()> { - self.single_segment_index_writer_add_document(doc!( - self.field => data - )) - } - - pub fn add_string_by_single_segment_writer(&mut self, data: &str) -> Result<()> { - self.single_segment_index_writer_add_document(doc!( - self.field => data - )) - } - - pub fn add_array_i8s_by_single_segment_writer(&mut self, datas: &[i8]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_i16s_by_single_segment_writer(&mut self, datas: &[i16]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_i32s_by_single_segment_writer(&mut self, datas: &[i32]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as i64)); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_i64s_by_single_segment_writer(&mut self, datas: &[i64]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_f32s_by_single_segment_writer(&mut self, datas: &[f32]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, &(*data as f64)); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_f64s_by_single_segment_writer(&mut self, datas: &[f64]) -> Result<()> { - let mut document = TantivyDocument::default(); 
- for data in datas { - document.add_field_value(self.field, data); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_bools_by_single_segment_writer(&mut self, datas: &[bool]) -> Result<()> { - let mut document = TantivyDocument::default(); - for data in datas { - document.add_field_value(self.field, data); - } - self.single_segment_index_writer_add_document(document) - } - - pub fn add_array_keywords_by_single_segment_writer( + pub fn add_array_keywords( &mut self, datas: &[*const c_char], + offset: Option, ) -> Result<()> { - let mut document = TantivyDocument::default(); - for element in datas { - let data = unsafe { CStr::from_ptr(*element) }; - document.add_field_value(self.field, data.to_str()?); + match self { + IndexWriterWrapper::V5(writer) => writer.add_array_keywords(datas, offset), + IndexWriterWrapper::V7(writer) => writer.add_array_keywords(datas, offset), } - self.single_segment_index_writer_add_document(document) } - fn manual_merge(&mut self) -> Result<()> { - let index_writer = self.index_writer.as_mut().left().unwrap(); - let metas = index_writer.index().searchable_segment_metas()?; - let policy = index_writer.get_merge_policy(); - let candidates = policy.compute_merge_candidates(metas.as_slice()); - for candidate in candidates { - index_writer.merge(candidate.0.as_slice()).wait()?; + #[allow(dead_code)] + pub fn manual_merge(&mut self) -> Result<()> { + match self { + IndexWriterWrapper::V5(writer) => writer.manual_merge(), + IndexWriterWrapper::V7(writer) => writer.manual_merge(), } - Ok(()) } + #[allow(dead_code)] + pub fn commit(&mut self) -> Result<()> { + match self { + IndexWriterWrapper::V5(writer) => writer.commit(), + IndexWriterWrapper::V7(writer) => writer.commit(), + } + } + + #[allow(dead_code)] pub fn finish(self) -> Result<()> { - match self.index_writer { - Either::Left(mut index_writer) => { - index_writer.commit()?; - // self.manual_merge(); - block_on(index_writer.garbage_collect_files())?; 
- index_writer.wait_merging_threads()?; - } - Either::Right(single_segment_index_writer) => { - single_segment_index_writer - .finalize() - .expect("failed to build inverted index"); - } + match self { + IndexWriterWrapper::V5(writer) => writer.finish(), + IndexWriterWrapper::V7(writer) => writer.finish(), } - Ok(()) - } - - pub(crate) fn commit(&mut self) -> Result<()> { - self.index_writer.as_mut().left().unwrap().commit()?; - Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::ops::Bound; + + use tempfile::TempDir; + + use crate::{data_type::TantivyDataType, TantivyIndexVersion}; + + use super::IndexWriterWrapper; + + #[test] + fn test_build_index_version5() { + let field_name = "number"; + let data_type = TantivyDataType::I64; + let dir = TempDir::new().unwrap(); + + { + let mut index_wrapper = IndexWriterWrapper::new( + field_name, + data_type, + dir.path().to_str().unwrap().to_string(), + 1, + 50_000_000, + TantivyIndexVersion::V5, + ) + .unwrap(); + + for i in 0..10 { + index_wrapper.add::(i, Some(i as i64)).unwrap(); + } + index_wrapper.commit().unwrap(); + } + + use tantivy_5::{collector, query, Index, ReloadPolicy}; + let index = Index::open_in_dir(dir.path()).unwrap(); + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + .unwrap(); + let query = query::RangeQuery::new_i64_bounds( + field_name.to_string(), + Bound::Included(0), + Bound::Included(9), + ); + let res = reader + .searcher() + .search(&query, &tantivy_5::collector::TopDocs::with_limit(10)) + .unwrap(); + assert_eq!(res.len(), 10); + } + + #[test] + fn test_build_index_version5_single_segment() { + let field_name = "number"; + let data_type = TantivyDataType::I64; + let dir = TempDir::new().unwrap(); + + { + let mut index_wrapper = IndexWriterWrapper::new_with_single_segment( + field_name, + data_type, + dir.path().to_str().unwrap().to_string(), + TantivyIndexVersion::V5, + ) + .unwrap(); + + for i in 0..10 { + index_wrapper.add::(i, None).unwrap(); 
+ } + index_wrapper.finish().unwrap(); + } + + use tantivy_5::{collector, query, Index, ReloadPolicy}; + let index = Index::open_in_dir(dir.path()).unwrap(); + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + .unwrap(); + let query = query::RangeQuery::new_i64_bounds( + field_name.to_string(), + Bound::Included(0), + Bound::Included(9), + ); + let res = reader + .searcher() + .search(&query, &collector::TopDocs::with_limit(10)) + .unwrap(); + assert_eq!(res.len(), 10); + } + + #[test] + fn test_build_text_index_version5() { + let field_name = "text"; + let dir = TempDir::new().unwrap(); + + { + let mut index_wrapper = IndexWriterWrapper::create_text_writer( + field_name, + dir.path().to_str().unwrap(), + "default", + "", + 1, + 50_000_000, + false, + TantivyIndexVersion::V5, + ) + .unwrap(); + + for i in 0..10 { + index_wrapper.add("hello", Some(i as i64)).unwrap(); + } + index_wrapper.commit().unwrap(); + } + + use tantivy_5::{collector, query, schema, Index, ReloadPolicy, Term}; + let index = Index::open_in_dir(dir.path()).unwrap(); + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + .unwrap(); + let text = index.schema().get_field("text").unwrap(); + let query = query::TermQuery::new( + Term::from_field_text(text, "hello"), + schema::IndexRecordOption::Basic, + ); + let res = reader + .searcher() + .search(&query, &collector::TopDocs::with_limit(10)) + .unwrap(); + assert_eq!(res.len(), 10); } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs index 10171ecb92..4747398f59 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_c.rs @@ -8,6 +8,7 @@ use crate::{ error::Result, index_writer::IndexWriterWrapper, util::{create_binding, free_binding}, + TantivyIndexVersion, }; 
macro_rules! convert_to_rust_slice { @@ -25,17 +26,25 @@ pub extern "C" fn tantivy_create_index( field_name: *const c_char, data_type: TantivyDataType, path: *const c_char, + tantivy_index_version: u32, num_threads: usize, overall_memory_budget_in_bytes: usize, ) -> RustResult { let field_name_str = cstr_to_str!(field_name); let path_str = cstr_to_str!(path); + + let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) { + Ok(v) => v, + Err(e) => return RustResult::from_error(e.to_string()), + }; + match IndexWriterWrapper::new( - String::from(field_name_str), + field_name_str, data_type, String::from(path_str), num_threads, overall_memory_budget_in_bytes, + tantivy_index_version, ) { Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)), Err(e) => RustResult::from_error(e.to_string()), @@ -47,13 +56,21 @@ pub extern "C" fn tantivy_create_index_with_single_segment( field_name: *const c_char, data_type: TantivyDataType, path: *const c_char, + tantivy_index_version: u32, ) -> RustResult { let field_name_str = cstr_to_str!(field_name); let path_str = cstr_to_str!(path); + + let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) { + Ok(v) => v, + Err(e) => return RustResult::from_error(e.to_string()), + }; + match IndexWriterWrapper::new_with_single_segment( - String::from(field_name_str), + field_name_str, data_type, String::from(path_str), + tantivy_index_version, ) { Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)), Err(e) => RustResult::from_error(e.to_string()), @@ -90,25 +107,31 @@ pub extern "C" fn tantivy_create_reader_from_writer(ptr: *mut c_void) -> RustRes } // -------------------------build-------------------- -fn execute( - arr: &[T], +fn execute( + arr: I, offset: i64, - e: fn(&mut IndexWriterWrapper, T, i64) -> Result<()>, + e: fn(&mut IndexWriterWrapper, T, Option) -> Result<()>, w: &mut IndexWriterWrapper, -) -> Result<()> { - for (index, data) in arr.iter().enumerate() { 
- e(w, *data, offset + (index as i64))?; +) -> Result<()> +where + I: IntoIterator, +{ + for (index, data) in arr.into_iter().enumerate() { + e(w, data, Some(offset + (index as i64)))?; } Ok(()) } -fn execute_by_single_segment_writer( - arr: &[T], - e: fn(&mut IndexWriterWrapper, T) -> Result<()>, +fn execute_by_single_segment_writer( + arr: I, + e: fn(&mut IndexWriterWrapper, T, Option) -> Result<()>, w: &mut IndexWriterWrapper, -) -> Result<()> { - for (_, data) in arr.iter().enumerate() { - e(w, *data)?; +) -> Result<()> +where + I: IntoIterator, +{ + for data in arr.into_iter() { + e(w, data, None)?; } Ok(()) } @@ -122,7 +145,15 @@ pub extern "C" fn tantivy_index_add_int8s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() } + unsafe { + execute( + arr.into_iter().map(|num| *num as i64), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -135,8 +166,8 @@ pub extern "C" fn tantivy_index_add_int8s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_i8_by_single_segment_writer, + arr.into_iter().map(|num| *num as i64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -152,7 +183,15 @@ pub extern "C" fn tantivy_index_add_int16s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() } + unsafe { + execute( + arr.into_iter().map(|num| *num as i64), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -165,8 +204,8 @@ pub extern "C" fn tantivy_index_add_int16s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { 
execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_i16_by_single_segment_writer, + arr.into_iter().map(|num| *num as i64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -182,7 +221,15 @@ pub extern "C" fn tantivy_index_add_int32s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() } + unsafe { + execute( + arr.into_iter().map(|num| *num as i64), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -195,8 +242,8 @@ pub extern "C" fn tantivy_index_add_int32s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_i32_by_single_segment_writer, + arr.into_iter().map(|num| *num as i64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -213,7 +260,15 @@ pub extern "C" fn tantivy_index_add_int64s( let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() } + unsafe { + execute( + arr.iter().copied(), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -227,8 +282,8 @@ pub extern "C" fn tantivy_index_add_int64s_by_single_segment_writer( unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_i64_by_single_segment_writer, + arr.iter().copied(), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -244,7 +299,15 @@ pub extern "C" fn tantivy_index_add_f32s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() } + unsafe { + execute( + arr.into_iter().map(|num| *num as f64), + offset_begin, + 
IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -257,8 +320,8 @@ pub extern "C" fn tantivy_index_add_f32s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_f32_by_single_segment_writer, + arr.into_iter().map(|num| *num as f64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -274,7 +337,15 @@ pub extern "C" fn tantivy_index_add_f64s( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let arr = unsafe { slice::from_raw_parts(array, len) }; - unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() } + unsafe { + execute( + arr.iter().copied(), + offset_begin, + IndexWriterWrapper::add::, + &mut (*real), + ) + .into() + } } #[no_mangle] @@ -287,8 +358,8 @@ pub extern "C" fn tantivy_index_add_f64s_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_f64_by_single_segment_writer, + arr.into_iter().map(|num| *num as f64), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -306,9 +377,9 @@ pub extern "C" fn tantivy_index_add_bools( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute( - arr, + arr.iter().copied(), offset_begin, - IndexWriterWrapper::add_bool, + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -325,8 +396,8 @@ pub extern "C" fn tantivy_index_add_bools_by_single_segment_writer( let arr = unsafe { slice::from_raw_parts(array, len) }; unsafe { execute_by_single_segment_writer( - arr, - IndexWriterWrapper::add_bool_by_single_segment_writer, + arr.iter().copied(), + IndexWriterWrapper::add::, &mut (*real), ) .into() @@ -343,7 +414,7 @@ pub extern "C" fn tantivy_index_add_string( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let s = cstr_to_str!(s); - unsafe { (*real).add_string(s, offset).into() } + unsafe { (*real).add::<&str>(s, 
Some(offset)).into() } } #[no_mangle] @@ -353,7 +424,7 @@ pub extern "C" fn tantivy_index_add_string_by_single_segment_writer( ) -> RustResult { let real = ptr as *mut IndexWriterWrapper; let s = cstr_to_str!(s); - unsafe { (*real).add_string_by_single_segment_writer(s).into() } + unsafe { (*real).add::<&str>(s, None).into() } } // --------------------------------------------- array ------------------------------------------ @@ -368,7 +439,9 @@ pub extern "C" fn tantivy_index_add_array_int8s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i8s(arr, offset).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), Some(offset)) + .into() } } @@ -381,7 +454,9 @@ pub extern "C" fn tantivy_index_add_array_int8s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i8s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), None) + .into() } } @@ -395,7 +470,9 @@ pub extern "C" fn tantivy_index_add_array_int16s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i16s(arr, offset).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), Some(offset)) + .into() } } @@ -408,7 +485,9 @@ pub extern "C" fn tantivy_index_add_array_int16s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i16s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), None) + .into() } } @@ -422,7 +501,9 @@ pub extern "C" fn tantivy_index_add_array_int32s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i32s(arr, offset).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), 
Some(offset)) + .into() } } @@ -435,7 +516,9 @@ pub extern "C" fn tantivy_index_add_array_int32s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i32s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as i64), None) + .into() } } @@ -449,7 +532,9 @@ pub extern "C" fn tantivy_index_add_array_int64s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i64s(arr, offset).into() + (*real) + .add_array::(arr.iter().copied(), Some(offset)) + .into() } } @@ -462,7 +547,9 @@ pub extern "C" fn tantivy_index_add_array_int64s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_i64s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.iter().copied(), None) + .into() } } @@ -476,7 +563,9 @@ pub extern "C" fn tantivy_index_add_array_f32s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_f32s(arr, offset).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as f64), Some(offset)) + .into() } } @@ -489,7 +578,9 @@ pub extern "C" fn tantivy_index_add_array_f32s_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_f32s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.into_iter().map(|num| *num as f64), None) + .into() } } @@ -503,7 +594,9 @@ pub extern "C" fn tantivy_index_add_array_f64s( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_f64s(arr, offset).into() + (*real) + .add_array::(arr.iter().copied(), Some(offset)) + .into() } } @@ -516,7 +609,9 @@ pub extern "C" fn tantivy_index_add_array_f64s_by_single_segment_writer( let 
real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_f64s_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.iter().copied(), None) + .into() } } @@ -530,7 +625,9 @@ pub extern "C" fn tantivy_index_add_array_bools( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_bools(arr, offset).into() + (*real) + .add_array::(arr.iter().copied(), Some(offset)) + .into() } } @@ -543,7 +640,9 @@ pub extern "C" fn tantivy_index_add_array_bools_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_bools_by_single_segment_writer(arr).into() + (*real) + .add_array::(arr.iter().copied(), None) + .into() } } @@ -557,7 +656,7 @@ pub extern "C" fn tantivy_index_add_array_keywords( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real).add_array_keywords(arr, offset).into() + (*real).add_array_keywords(arr, Some(offset)).into() } } @@ -570,8 +669,6 @@ pub extern "C" fn tantivy_index_add_array_keywords_by_single_segment_writer( let real = ptr as *mut IndexWriterWrapper; unsafe { let arr = convert_to_rust_slice!(array, len); - (*real) - .add_array_keywords_by_single_segment_writer(arr) - .into() + (*real).add_array_keywords(arr, None).into() } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text.rs index abddfc707e..2532f1bbbd 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text.rs @@ -1,53 +1,44 @@ -use std::sync::Arc; - -use either::Either; -use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST}; -use tantivy::tokenizer::TextAnalyzer; -use 
tantivy::Index; - -use crate::{index_writer::IndexWriterWrapper, log::init_log}; - -fn build_text_schema(field_name: &String, tokenizer_name: &String) -> (Schema, Field, Field) { - let mut schema_builder = Schema::builder(); - // positions is required for matching phase. - let indexing = TextFieldIndexing::default() - .set_tokenizer(&tokenizer_name) - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let option = TextOptions::default().set_indexing_options(indexing); - let field = schema_builder.add_text_field(&field_name, option); - let id_field = schema_builder.add_i64_field("doc_id", FAST); - (schema_builder.build(), field, id_field) -} +use crate::error::Result; +use crate::index_writer::IndexWriterWrapper; +use crate::{index_writer_v5, index_writer_v7, TantivyIndexVersion}; impl IndexWriterWrapper { + // create a text writer according to `tanviy_index_version`. + // version 7 is the latest version and is what we should use in most cases. + // We may also build with version 5 for compatibility for reader nodes with older versions. 
pub(crate) fn create_text_writer( - field_name: String, - path: String, - tokenizer_name: String, - tokenizer: TextAnalyzer, + field_name: &str, + path: &str, + tokenizer_name: &str, + tokenizer_params: &str, num_threads: usize, overall_memory_budget_in_bytes: usize, in_ram: bool, - ) -> IndexWriterWrapper { - init_log(); - - let (schema, field, id_field) = build_text_schema(&field_name, &tokenizer_name); - let index: Index; - if in_ram { - index = Index::create_in_ram(schema); - } else { - index = Index::create_in_dir(path.clone(), schema).unwrap(); - } - index.tokenizers().register(&tokenizer_name, tokenizer); - let index_writer = index - .writer_with_num_threads(num_threads, overall_memory_budget_in_bytes) - .unwrap(); - - IndexWriterWrapper { - field, - index_writer: Either::Left(index_writer), - id_field: Some(id_field), - index: Arc::new(index), + tanviy_index_version: TantivyIndexVersion, + ) -> Result { + match tanviy_index_version { + TantivyIndexVersion::V5 => Ok(IndexWriterWrapper::V5( + index_writer_v5::IndexWriterWrapperImpl::create_text_writer( + field_name, + path, + tokenizer_name, + tokenizer_params, + num_threads, + overall_memory_budget_in_bytes, + in_ram, + )?, + )), + TantivyIndexVersion::V7 => Ok(IndexWriterWrapper::V7( + index_writer_v7::IndexWriterWrapperImpl::create_text_writer( + field_name, + path, + tokenizer_name, + tokenizer_params, + num_threads, + overall_memory_budget_in_bytes, + in_ram, + )?, + )), } } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs index 44d4c7435d..ca781ce7c3 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs @@ -5,13 +5,14 @@ use crate::array::RustResult; use crate::cstr_to_str; use crate::index_writer::IndexWriterWrapper; use crate::log::init_log; -use 
crate::analyzer::create_analyzer; use crate::util::create_binding; +use crate::TantivyIndexVersion; #[no_mangle] pub extern "C" fn tantivy_create_text_writer( field_name: *const c_char, path: *const c_char, + tantivy_index_version: u32, tokenizer_name: *const c_char, analyzer_params: *const c_char, num_threads: usize, @@ -23,20 +24,23 @@ pub extern "C" fn tantivy_create_text_writer( let path_str = cstr_to_str!(path); let tokenizer_name_str = cstr_to_str!(tokenizer_name); let params = cstr_to_str!(analyzer_params); - let analyzer = create_analyzer(params); - match analyzer { - Ok(text_analyzer) => { - let wrapper = IndexWriterWrapper::create_text_writer( - String::from(field_name_str), - String::from(path_str), - String::from(tokenizer_name_str), - text_analyzer, - num_threads, - overall_memory_budget_in_bytes, - in_ram, - ); - RustResult::from_ptr(create_binding(wrapper)) - } + + let tantivy_index_version = match TantivyIndexVersion::from_u32(tantivy_index_version) { + Ok(v) => v, + Err(e) => return RustResult::from_error(e.to_string()), + }; + + match IndexWriterWrapper::create_text_writer( + field_name_str, + path_str, + tokenizer_name_str, + params, + num_threads, + overall_memory_budget_in_bytes, + in_ram, + tantivy_index_version, + ) { + Ok(wrapper) => RustResult::from_ptr(create_binding(wrapper)), Err(err) => RustResult::from_error(format!( "create tokenizer failed with error: {} param: {}", err.to_string(), diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/analyzer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/analyzer.rs new file mode 100644 index 0000000000..91ee027d39 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/analyzer.rs @@ -0,0 +1,288 @@ +use serde_json as json; +use std::collections::HashMap; +use tantivy_5::tokenizer::*; + +use crate::error::{Result, TantivyBindingError}; + +use super::{ + 
build_in_analyzer::{chinese_analyzer, english_analyzer}, + filter::SystemFilter, + standard_analyzer, + tokenizers::get_builder_with_tokenizer, + util::{get_stop_words_list, get_string_list}, +}; + +struct AnalyzerBuilder<'a> { + filters: HashMap, + params: &'a json::Map, +} + +impl AnalyzerBuilder<'_> { + fn new(params: &json::Map) -> AnalyzerBuilder { + AnalyzerBuilder { + filters: HashMap::new(), + params: params, + } + } + + fn get_tokenizer_params(&self) -> Result<&json::Value> { + let tokenizer = self.params.get("tokenizer"); + if tokenizer.is_none() { + return Err(TantivyBindingError::InternalError(format!( + "tokenizer name or type must be set" + ))); + } + let value = tokenizer.unwrap(); + if value.is_object() || value.is_string() { + return Ok(tokenizer.unwrap()); + } + + Err(TantivyBindingError::InternalError(format!( + "tokenizer name should be string or dict" + ))) + } + + fn add_custom_filter( + &mut self, + name: &String, + params: &json::Map, + ) -> Result<()> { + match SystemFilter::try_from(params) { + Ok(filter) => { + self.filters.insert(name.to_string(), filter); + Ok(()) + } + Err(e) => Err(e), + } + } + + fn add_custom_filters(&mut self, params: &json::Map) -> Result<()> { + for (name, value) in params { + if !value.is_object() { + continue; + } + self.add_custom_filter(name, value.as_object().unwrap())?; + } + Ok(()) + } + + fn build_filter( + &mut self, + mut builder: TextAnalyzerBuilder, + params: &json::Value, + ) -> Result { + if !params.is_array() { + return Err(TantivyBindingError::InternalError( + "filter params should be array".to_string(), + )); + } + + let filters = params.as_array().unwrap(); + + for filter in filters { + if filter.is_string() { + let filter_name = filter.as_str().unwrap(); + let costum = self.filters.remove(filter_name); + if !costum.is_none() { + builder = costum.unwrap().transform(builder); + continue; + } + + // check if filter was system filter + let system = SystemFilter::from(filter_name); + match system { 
+ SystemFilter::Invalid => { + return Err(TantivyBindingError::InternalError(format!( + "build analyzer failed, filter not found :{}", + filter_name + ))) + } + other => { + builder = other.transform(builder); + } + } + } else if filter.is_object() { + let filter = SystemFilter::try_from(filter.as_object().unwrap())?; + builder = filter.transform(builder); + } + } + Ok(builder) + } + + fn build_option(&mut self, mut builder: TextAnalyzerBuilder) -> Result { + for (key, value) in self.params { + match key.as_str() { + "tokenizer" => {} + "filter" => { + // build with filter if filter param exist + builder = self.build_filter(builder, value)?; + } + other => { + return Err(TantivyBindingError::InternalError(format!( + "unknown analyzer option key: {}", + other + ))) + } + } + } + Ok(builder) + } + + fn get_stop_words_option(&self) -> Result> { + let value = self.params.get("stop_words"); + match value { + Some(value) => { + let str_list = get_string_list(value, "filter stop_words")?; + Ok(get_stop_words_list(str_list)) + } + _ => Ok(vec![]), + } + } + + fn build_template(self, type_: &str) -> Result { + match type_ { + "standard" => Ok(standard_analyzer(self.get_stop_words_option()?)), + "chinese" => Ok(chinese_analyzer(self.get_stop_words_option()?)), + "english" => Ok(english_analyzer(self.get_stop_words_option()?)), + other_ => Err(TantivyBindingError::InternalError(format!( + "unknown build-in analyzer type: {}", + other_ + ))), + } + } + + fn build(mut self) -> Result { + // build base build-in analyzer + match self.params.get("type") { + Some(type_) => { + if !type_.is_string() { + return Err(TantivyBindingError::InternalError(format!( + "analyzer type shoud be string" + ))); + } + return self.build_template(type_.as_str().unwrap()); + } + None => {} + }; + + //build custom analyzer + let tokenizer_params = self.get_tokenizer_params()?; + let mut builder = get_builder_with_tokenizer(&tokenizer_params)?; + + // build with option + builder = 
self.build_option(builder)?; + Ok(builder.build()) + } +} + +pub(crate) fn create_analyzer_with_filter(params: &String) -> Result<TextAnalyzer> { // reads the top-level "analyzer" and "filter" keys of the JSON params + match json::from_str::<json::Value>(&params) { + Ok(value) => { + if value.is_null() { + return Ok(standard_analyzer(vec![])); + } + if !value.is_object() { + return Err(TantivyBindingError::InternalError( + "tokenizer params should be a json map".to_string(), + )); + } + let json_params = value.as_object().unwrap(); + + // create builder + let analyzer_params = json_params.get("analyzer"); + if analyzer_params.is_none() { + return Ok(standard_analyzer(vec![])); + } + if !analyzer_params.unwrap().is_object() { + return Err(TantivyBindingError::InternalError( + "analyzer params should be a json map".to_string(), + )); + } + + let builder_params = analyzer_params.unwrap().as_object().unwrap(); + if builder_params.is_empty() { + return Ok(standard_analyzer(vec![])); + } + + let mut builder = AnalyzerBuilder::new(builder_params); + + // build custom filter + let filter_params = json_params.get("filter"); + if !filter_params.is_none() && filter_params.unwrap().is_object() { + builder.add_custom_filters(filter_params.unwrap().as_object().unwrap())?; + } + + // build analyzer + builder.build() + } + Err(err) => Err(err.into()), + } +} + +pub(crate) fn create_analyzer(params: &str) -> Result<TextAnalyzer> { // empty params select the standard analyzer + if params.len() == 0 { + return Ok(standard_analyzer(vec![])); + } + create_analyzer_with_filter(&format!("{{\"analyzer\":{}}}", params)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_standard_analyzer() { + let params = r#"{ + "type": "standard", + "stop_words": ["_english_"] + }"#; + + let tokenizer = create_analyzer(&params.to_string()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + } + + #[test] + fn test_chinese_analyzer() { + let params = r#"{ + "type": "chinese" + }"#; + + let tokenizer = create_analyzer(&params.to_string()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + let mut bining = 
tokenizer.unwrap(); + let mut stream = bining.token_stream("系统安全;,'';lxyz密码"); + + let mut results = Vec::::new(); + while stream.advance() { + let token = stream.token(); + results.push(token.text.clone()); + } + + print!("test tokens :{:?}\n", results) + } + + #[test] + fn test_lindera_analyzer() { + let params = r#"{ + "tokenizer": { + "type": "lindera", + "dict_kind": "ipadic" + } + }"#; + + let tokenizer = create_analyzer(¶ms.to_string()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + + let mut bining = tokenizer.unwrap(); + let mut stream = + bining.token_stream("東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です"); + + let mut results = Vec::::new(); + while stream.advance() { + let token = stream.token(); + results.push(token.text.clone()); + } + + print!("test tokens :{:?}\n", results) + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/build_in_analyzer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/build_in_analyzer.rs new file mode 100644 index 0000000000..515a1182ff --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/build_in_analyzer.rs @@ -0,0 +1,40 @@ +use tantivy_5::tokenizer::*; + +use super::filter::*; +use super::stop_words; +use super::tokenizers::*; + +// default build-in analyzer +pub(crate) fn standard_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = standard_builder().filter(LowerCaser); + + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} + +pub fn chinese_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = jieba_builder().filter(CnAlphaNumOnlyFilter); + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} + +pub fn english_analyzer(stop_words: Vec) -> TextAnalyzer { + let builder = standard_builder() + .filter(LowerCaser) + 
.filter(Stemmer::new(Language::English)) + .filter(StopWordFilter::remove( + stop_words::ENGLISH.iter().map(|&word| word.to_owned()), + )); + + if stop_words.len() > 0 { + return builder.filter(StopWordFilter::remove(stop_words)).build(); + } + + builder.build() +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/filter.rs new file mode 100644 index 0000000000..b74bc29b6c --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/filter.rs @@ -0,0 +1,285 @@ +use serde_json as json; +use tantivy_5::tokenizer::*; + +use super::util::*; +use crate::error::{Result, TantivyBindingError}; + +pub(crate) enum SystemFilter { + Invalid, + LowerCase(LowerCaser), + AsciiFolding(AsciiFoldingFilter), + AlphaNumOnly(AlphaNumOnlyFilter), + CnCharOnly(CnCharOnlyFilter), + CnAlphaNumOnly(CnAlphaNumOnlyFilter), + Length(RemoveLongFilter), + Stop(StopWordFilter), + Decompounder(SplitCompoundWords), + Stemmer(Stemmer), +} + +impl SystemFilter { + pub(crate) fn transform(self, builder: TextAnalyzerBuilder) -> TextAnalyzerBuilder { + match self { + Self::LowerCase(filter) => builder.filter(filter).dynamic(), + Self::AsciiFolding(filter) => builder.filter(filter).dynamic(), + Self::AlphaNumOnly(filter) => builder.filter(filter).dynamic(), + Self::CnCharOnly(filter) => builder.filter(filter).dynamic(), + Self::CnAlphaNumOnly(filter) => builder.filter(filter).dynamic(), + Self::Length(filter) => builder.filter(filter).dynamic(), + Self::Stop(filter) => builder.filter(filter).dynamic(), + Self::Decompounder(filter) => builder.filter(filter).dynamic(), + Self::Stemmer(filter) => builder.filter(filter).dynamic(), + Self::Invalid => builder, + } + } +} + +// create length filter from params +// { +// "type": "length", +// "max": 10, // length +// } +// TODO support min length +fn get_length_filter(params: &json::Map) -> Result 
{ + let limit_str = params.get("max"); + if limit_str.is_none() || !limit_str.unwrap().is_u64() { + return Err(TantivyBindingError::InternalError( + "lenth max param was none or not uint".to_string(), + )); + } + let limit = limit_str.unwrap().as_u64().unwrap() as usize; + Ok(SystemFilter::Length(RemoveLongFilter::limit(limit + 1))) +} + +fn get_stop_words_filter(params: &json::Map) -> Result { + let value = params.get("stop_words"); + if value.is_none() { + return Err(TantivyBindingError::InternalError( + "stop filter stop_words can't be empty".to_string(), + )); + } + let str_list = get_string_list(value.unwrap(), "stop_words filter")?; + Ok(SystemFilter::Stop(StopWordFilter::remove( + get_stop_words_list(str_list), + ))) +} + +fn get_decompounder_filter(params: &json::Map) -> Result { + let value = params.get("word_list"); + if value.is_none() || !value.unwrap().is_array() { + return Err(TantivyBindingError::InternalError( + "decompounder word list should be array".to_string(), + )); + } + + let stop_words = value.unwrap().as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words { + match element.as_str() { + Some(word) => str_list.push(word.to_string()), + _ => { + return Err(TantivyBindingError::InternalError( + "decompounder word list item should be string".to_string(), + )) + } + } + } + + match SplitCompoundWords::from_dictionary(str_list) { + Ok(f) => Ok(SystemFilter::Decompounder(f)), + Err(e) => Err(TantivyBindingError::InternalError(format!( + "create decompounder failed: {}", + e.to_string() + ))), + } +} + +fn get_stemmer_filter(params: &json::Map) -> Result { + let value = params.get("language"); + if value.is_none() || !value.unwrap().is_string() { + return Err(TantivyBindingError::InternalError( + "stemmer language field should be string".to_string(), + )); + } + + match value.unwrap().as_str().unwrap().into_language() { + Ok(language) => Ok(SystemFilter::Stemmer(Stemmer::new(language))), + Err(e) => 
Err(TantivyBindingError::InternalError(format!( + "create stemmer failed : {}", + e.to_string() + ))), + } +} + +trait LanguageParser { // parses a case-insensitive language name into the stemmer `Language` + fn into_language(self) -> Result<Language>; +} + +impl LanguageParser for &str { + fn into_language(self) -> Result<Language> { + match self.to_lowercase().as_str() { + "arabic" | "arabig" => Ok(Language::Arabic), // "arabig" is the earlier misspelling, still accepted for compatibility + "danish" => Ok(Language::Danish), + "dutch" => Ok(Language::Dutch), + "english" => Ok(Language::English), + "finnish" => Ok(Language::Finnish), + "french" => Ok(Language::French), + "german" => Ok(Language::German), + "greek" => Ok(Language::Greek), + "hungarian" => Ok(Language::Hungarian), + "italian" => Ok(Language::Italian), + "norwegian" => Ok(Language::Norwegian), + "portuguese" => Ok(Language::Portuguese), + "romanian" => Ok(Language::Romanian), + "russian" => Ok(Language::Russian), + "spanish" => Ok(Language::Spanish), + "swedish" => Ok(Language::Swedish), + "tamil" => Ok(Language::Tamil), + "turkish" => Ok(Language::Turkish), + other => Err(TantivyBindingError::InternalError(format!( + "unsupport language: {}", + other + ))), + } + } +} + +impl From<&str> for SystemFilter { + fn from(value: &str) -> Self { + match value { + "lowercase" => Self::LowerCase(LowerCaser), + "asciifolding" => Self::AsciiFolding(AsciiFoldingFilter), + "alphanumonly" => Self::AlphaNumOnly(AlphaNumOnlyFilter), + "cncharonly" => Self::CnCharOnly(CnCharOnlyFilter), + "cnalphanumonly" => Self::CnAlphaNumOnly(CnAlphaNumOnlyFilter), + _ => Self::Invalid, + } + } +} + +impl TryFrom<&json::Map<String, json::Value>> for SystemFilter { + type Error = TantivyBindingError; + + fn try_from(params: &json::Map<String, json::Value>) -> Result<Self> { + match params.get(&"type".to_string()) { + Some(value) => { + if !value.is_string() { + return Err(TantivyBindingError::InternalError( + "filter type should be string".to_string(), + )); + }; + + match value.as_str().unwrap() { + "length" => get_length_filter(params), + "stop" => get_stop_words_filter(params), + "decompounder" => get_decompounder_filter(params), + "stemmer" => 
get_stemmer_filter(params), + other => Err(TantivyBindingError::InternalError(format!( + "unsupport filter type: {}", + other + ))), + } + } + None => Err(TantivyBindingError::InternalError( + "no type field in filter params".to_string(), + )), + } + } +} + +pub struct CnCharOnlyFilter; + +pub struct CnCharOnlyFilterStream { + regex: regex::Regex, + tail: T, +} + +impl TokenFilter for CnCharOnlyFilter { + type Tokenizer = CnCharOnlyFilterWrapper; + + fn transform(self, tokenizer: T) -> CnCharOnlyFilterWrapper { + CnCharOnlyFilterWrapper(tokenizer) + } +} + +#[derive(Clone)] +pub struct CnCharOnlyFilterWrapper(T); + +impl Tokenizer for CnCharOnlyFilterWrapper { + type TokenStream<'a> = CnCharOnlyFilterStream>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + CnCharOnlyFilterStream { + regex: regex::Regex::new("\\p{Han}+").unwrap(), + tail: self.0.token_stream(text), + } + } +} + +impl TokenStream for CnCharOnlyFilterStream { + fn advance(&mut self) -> bool { + while self.tail.advance() { + if self.regex.is_match(&self.tail.token().text) { + return true; + } + } + + false + } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } +} + +pub struct CnAlphaNumOnlyFilter; + +pub struct CnAlphaNumOnlyFilterStream { + regex: regex::Regex, + tail: T, +} + +impl TokenFilter for CnAlphaNumOnlyFilter { + type Tokenizer = CnAlphaNumOnlyFilterWrapper; + + fn transform(self, tokenizer: T) -> CnAlphaNumOnlyFilterWrapper { + CnAlphaNumOnlyFilterWrapper(tokenizer) + } +} +#[derive(Clone)] +pub struct CnAlphaNumOnlyFilterWrapper(T); + +impl Tokenizer for CnAlphaNumOnlyFilterWrapper { + type TokenStream<'a> = CnAlphaNumOnlyFilterStream>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> { + CnAlphaNumOnlyFilterStream { + regex: regex::Regex::new(r"[\p{Han}a-zA-Z0-9]+").unwrap(), + tail: self.0.token_stream(text), + } + } +} + +impl TokenStream for 
CnAlphaNumOnlyFilterStream { + fn advance(&mut self) -> bool { + while self.tail.advance() { + if self.regex.is_match(&self.tail.token().text) { + return true; + } + } + + false + } + + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/mod.rs new file mode 100644 index 0000000000..df2ad4d68c --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/mod.rs @@ -0,0 +1,11 @@ +//! This is totally copied from src/analyzer + +mod analyzer; +mod build_in_analyzer; +mod filter; +mod stop_words; +mod tokenizers; +mod util; + +pub(crate) use self::analyzer::create_analyzer; +pub(crate) use self::build_in_analyzer::standard_analyzer; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs new file mode 100644 index 0000000000..ae78b86f12 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/stop_words.rs @@ -0,0 +1,5 @@ +pub const ENGLISH: &[&str] = &[ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", + "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", + "their", "then", "there", "these", "they", "this", "to", "was", "will", "with", +]; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/jieba_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/jieba_tokenizer.rs new file mode 100644 index 0000000000..a4d8e24163 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/jieba_tokenizer.rs @@ -0,0 +1,83 
@@ +use jieba_rs; +use lazy_static::lazy_static; +use tantivy_5::tokenizer::{Token, TokenStream, Tokenizer}; + +lazy_static! { + static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new(); +} + +#[allow(dead_code)] +#[derive(Clone)] +pub enum JiebaMode { + Exact, + Search, +} + +#[derive(Clone)] +pub struct JiebaTokenizer { + mode: JiebaMode, + hmm: bool, +} + +pub struct JiebaTokenStream { + tokens: Vec, + index: usize, +} + +impl TokenStream for JiebaTokenStream { + fn advance(&mut self) -> bool { + if self.index < self.tokens.len() { + self.index += 1; + true + } else { + false + } + } + + fn token(&self) -> &Token { + &self.tokens[self.index - 1] + } + + fn token_mut(&mut self) -> &mut Token { + &mut self.tokens[self.index - 1] + } +} + +impl JiebaTokenizer { + pub fn new() -> JiebaTokenizer { + JiebaTokenizer { + mode: JiebaMode::Search, + hmm: true, + } + } + + fn tokenize(&self, text: &str) -> Vec { + let mut indices = text.char_indices().collect::>(); + indices.push((text.len(), '\0')); + let ori_tokens = match self.mode { + JiebaMode::Exact => JIEBA.tokenize(text, jieba_rs::TokenizeMode::Default, self.hmm), + JiebaMode::Search => JIEBA.tokenize(text, jieba_rs::TokenizeMode::Search, self.hmm), + }; + + let mut tokens = Vec::with_capacity(ori_tokens.len()); + for token in ori_tokens { + tokens.push(Token { + offset_from: indices[token.start].0, + offset_to: indices[token.end].0, + position: token.start, + text: String::from(&text[(indices[token.start].0)..(indices[token.end].0)]), + position_length: token.end - token.start, + }); + } + tokens + } +} + +impl Tokenizer for JiebaTokenizer { + type TokenStream<'a> = JiebaTokenStream; + + fn token_stream(&mut self, text: &str) -> JiebaTokenStream { + let tokens = self.tokenize(text); + JiebaTokenStream { tokens, index: 0 } + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/lindera_tokenizer.rs 
b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/lindera_tokenizer.rs new file mode 100644 index 0000000000..4b2b734766 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/lindera_tokenizer.rs @@ -0,0 +1,152 @@ +use core::result::Result::Err; + +use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind}; +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; +use lindera::token::Token as LToken; +use lindera::tokenizer::Tokenizer as LTokenizer; +use tantivy_5::tokenizer::{Token, TokenStream, Tokenizer}; + +use crate::error::{Result, TantivyBindingError}; +use serde_json as json; + +pub struct LinderaTokenStream<'a> { + pub tokens: Vec>, + pub token: &'a mut Token, +} + +impl<'a> TokenStream for LinderaTokenStream<'a> { + fn advance(&mut self) -> bool { + if self.tokens.is_empty() { + return false; + } + let token = self.tokens.remove(0); + self.token.text = token.text.to_string(); + self.token.offset_from = token.byte_start; + self.token.offset_to = token.byte_end; + self.token.position = token.position; + self.token.position_length = token.position_length; + + true + } + + fn token(&self) -> &Token { + self.token + } + + fn token_mut(&mut self) -> &mut Token { + self.token + } +} + +#[derive(Clone)] +pub struct LinderaTokenizer { + tokenizer: LTokenizer, + token: Token, +} + +impl LinderaTokenizer { + /// Create a new `LinderaTokenizer`. + /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable. 
+ pub fn from_json(params: &json::Map) -> Result { + let kind = fetch_lindera_kind(params)?; + let dictionary = load_dictionary_from_kind(kind); + if dictionary.is_err() { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer with invalid dict_kind" + ))); + } + let segmenter = Segmenter::new(Mode::Normal, dictionary.unwrap(), None); + Ok(LinderaTokenizer::from_segmenter(segmenter)) + } + + /// Create a new `LinderaTokenizer`. + /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`. + pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer { + LinderaTokenizer { + tokenizer: LTokenizer::new(segmenter), + token: Default::default(), + } + } +} + +impl Tokenizer for LinderaTokenizer { + type TokenStream<'a> = LinderaTokenStream<'a>; + + fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> { + self.token.reset(); + LinderaTokenStream { + tokens: self.tokenizer.tokenize(text).unwrap(), + token: &mut self.token, + } + } +} + +trait DictionaryKindParser { + fn into_dict_kind(self) -> Result; +} + +impl DictionaryKindParser for &str { + fn into_dict_kind(self) -> Result { + match self { + "ipadic" => Ok(DictionaryKind::IPADIC), + "ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd), + "unidic" => Ok(DictionaryKind::UniDic), + "ko-dic" => Ok(DictionaryKind::KoDic), + "cc-cedict" => Ok(DictionaryKind::CcCedict), + other => Err(TantivyBindingError::InvalidArgument(format!( + "unsupported lindera dict type: {}", + other + ))), + } + } +} + +fn fetch_lindera_kind(params: &json::Map) -> Result { + match params.get("dict_kind") { + Some(val) => { + if !val.is_string() { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer dict kind should be string" + ))); + } + val.as_str().unwrap().into_dict_kind() + } + _ => { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer dict_kind must be set" + ))) + } + 
} +} + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_lindera_tokenizer() { + let params = r#"{ + "type": "lindera", + "dict_kind": "ipadic" + }"#; + let json_param = json::from_str::>(¶ms); + assert!(json_param.is_ok()); + + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + } + + #[test] + #[cfg(feature = "lindera-cc-cedict")] + fn test_lindera_tokenizer_cc() { + let params = r#"{ + "type": "lindera", + "dict_kind": "cc-cedict" + }"#; + let json_param = json::from_str::>(¶ms); + assert!(json_param.is_ok()); + + let tokenizer = LinderaTokenizer::from_json(&json_param.unwrap()); + assert!(tokenizer.is_ok(), "error: {}", tokenizer.err().unwrap()); + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/mod.rs new file mode 100644 index 0000000000..fd922d0b5a --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/mod.rs @@ -0,0 +1,5 @@ +mod jieba_tokenizer; +mod lindera_tokenizer; +mod tokenizer; + +pub(crate) use self::tokenizer::*; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/tokenizer.rs new file mode 100644 index 0000000000..5cc5d6808d --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/tokenizers/tokenizer.rs @@ -0,0 +1,74 @@ +use log::warn; +use serde_json as json; +use tantivy_5::tokenizer::*; +use tantivy_5::tokenizer::{TextAnalyzer, TextAnalyzerBuilder}; + +use crate::error::{Result, TantivyBindingError}; + +use super::jieba_tokenizer::JiebaTokenizer; +use super::lindera_tokenizer::LinderaTokenizer; + +pub fn standard_builder() -> TextAnalyzerBuilder { 
+ TextAnalyzer::builder(SimpleTokenizer::default()).dynamic() +} + +pub fn whitespace_builder() -> TextAnalyzerBuilder { + TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic() +} + +pub fn jieba_builder() -> TextAnalyzerBuilder { + TextAnalyzer::builder(JiebaTokenizer::new()).dynamic() +} + +pub fn lindera_builder( + params: Option<&json::Map>, +) -> Result { + if params.is_none() { + return Err(TantivyBindingError::InvalidArgument(format!( + "lindera tokenizer must be costum" + ))); + } + let tokenizer = LinderaTokenizer::from_json(params.unwrap())?; + Ok(TextAnalyzer::builder(tokenizer).dynamic()) +} + +pub fn get_builder_with_tokenizer(params: &json::Value) -> Result { + let name; + let params_map; + if params.is_string() { + name = params.as_str().unwrap(); + params_map = None; + } else { + let m = params.as_object().unwrap(); + match m.get("type") { + Some(val) => { + if !val.is_string() { + return Err(TantivyBindingError::InvalidArgument(format!( + "tokenizer type should be string" + ))); + } + name = val.as_str().unwrap(); + } + _ => { + return Err(TantivyBindingError::InvalidArgument(format!( + "costum tokenizer must set type" + ))) + } + } + params_map = Some(m); + } + + match name { + "standard" => Ok(standard_builder()), + "whitespace" => Ok(whitespace_builder()), + "jieba" => Ok(jieba_builder()), + "lindera" => lindera_builder(params_map), + other => { + warn!("unsupported tokenizer: {}", other); + Err(TantivyBindingError::InvalidArgument(format!( + "unsupported tokenizer: {}", + other + ))) + } + } +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/util.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/util.rs new file mode 100644 index 0000000000..c480cb4fd9 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/analyzer/util.rs @@ -0,0 +1,45 @@ +use serde_json as json; + +use super::stop_words; +use crate::error::{Result, 
TantivyBindingError}; + +pub(crate) fn get_string_list(value: &json::Value, label: &str) -> Result> { + if !value.is_array() { + return Err(TantivyBindingError::InternalError( + format!("{} should be array", label).to_string(), + )); + } + + let stop_words = value.as_array().unwrap(); + let mut str_list = Vec::::new(); + for element in stop_words { + match element.as_str() { + Some(word) => str_list.push(word.to_string()), + _ => { + return Err(TantivyBindingError::InternalError( + format!("{} list item should be string", label).to_string(), + )) + } + } + } + Ok(str_list) +} + +pub(crate) fn get_stop_words_list(str_list: Vec) -> Vec { + let mut stop_words = Vec::new(); + for str in str_list { + if str.len() > 0 && str.chars().nth(0).unwrap() == '_' { + match str.as_str() { + "_english_" => { + for word in stop_words::ENGLISH { + stop_words.push(word.to_string()); + } + continue; + } + _other => {} + } + } + stop_words.push(str); + } + stop_words +} diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer.rs new file mode 100644 index 0000000000..4fd7cef681 --- /dev/null +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer.rs @@ -0,0 +1,219 @@ +use std::ffi::CStr; +use std::sync::Arc; + +use either::Either; +use futures::executor::block_on; +use libc::c_char; +use log::info; +use tantivy_5::schema::{ + Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED, +}; +use tantivy_5::{Document as TantivyDocument, Index, IndexWriter, SingleSegmentIndexWriter}; + +use crate::data_type::TantivyDataType; + +use crate::error::Result; +use crate::index_writer::TantivyValue; +use crate::log::init_log; + +pub(crate) struct IndexWriterWrapperImpl { + pub(crate) field: Field, + pub(crate) index_writer: Either, + pub(crate) id_field: Option, + pub(crate) _index: Arc, +} + +#[inline] +fn 
schema_builder_add_field(
+    schema_builder: &mut SchemaBuilder,
+    field_name: &str,
+    data_type: TantivyDataType,
+) -> Field {
+    match data_type {
+        TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
+        TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
+        TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
+        TantivyDataType::Keyword => {
+            let text_field_indexing = TextFieldIndexing::default()
+                .set_tokenizer("raw")
+                .set_index_option(IndexRecordOption::Basic);
+            let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
+            schema_builder.add_text_field(&field_name, text_options)
+        }
+        TantivyDataType::Text => {
+            panic!("text should be indexed with analyzer");
+        }
+    }
+}
+
+// `TantivyValue` implementations: each one appends `self` to a version-5
+// document under the field whose id is `field`.
+impl TantivyValue<TantivyDocument> for i64 {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_i64(Field::from_field_id(field), *self);
+    }
+}
+
+impl TantivyValue<TantivyDocument> for u64 {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_u64(Field::from_field_id(field), *self);
+    }
+}
+
+impl TantivyValue<TantivyDocument> for f64 {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_f64(Field::from_field_id(field), *self);
+    }
+}
+
+impl TantivyValue<TantivyDocument> for &str {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_text(Field::from_field_id(field), *self);
+    }
+}
+
+impl TantivyValue<TantivyDocument> for bool {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_bool(Field::from_field_id(field), *self);
+    }
+}
+
+impl IndexWriterWrapperImpl {
+    /// Creates a multi-segment writer whose index is stored in the directory `path`.
+    ///
+    /// The schema holds the indexed field plus a `doc_id` fast field, so every
+    /// document added through this writer must come with an offset.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the index cannot be created in `path` or the writer cannot be started.
+    pub fn new(
+        field_name: &str,
+        data_type: TantivyDataType,
+        path: String,
+        num_threads: usize,
+        overall_memory_budget_in_bytes: usize,
+    ) -> Result<IndexWriterWrapperImpl> {
+        info!(
+            "create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5",
+            field_name, data_type
+        );
+        let mut schema_builder = Schema::builder();
+        let field = schema_builder_add_field(&mut schema_builder, field_name, data_type);
+        // Rows spread over several segments cannot be mapped back to Milvus rows
+        // directly, so each document also stores its row offset in `doc_id`.
+        let id_field = schema_builder.add_i64_field("doc_id", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_dir(path, schema)?;
+        let index_writer =
+            index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
+        Ok(IndexWriterWrapperImpl {
+            field,
+            index_writer: Either::Left(index_writer),
+            id_field: Some(id_field),
+            _index: Arc::new(index),
+        })
+    }
+
+    /// Creates a single-segment writer whose index is stored in the directory `path`.
+    ///
+    /// No `doc_id` field is added, so offsets passed to the `add*` methods are ignored.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the index cannot be created in `path` or the writer cannot be started.
+    pub fn new_with_single_segment(
+        field_name: &str,
+        data_type: TantivyDataType,
+        path: String,
+    ) -> Result<IndexWriterWrapperImpl> {
+        init_log();
+        info!(
+            "create single segment index writer, field_name: {}, data_type: {:?}, tantivy_index_version 5",
+            field_name, data_type
+        );
+        let mut schema_builder = Schema::builder();
+        let field = schema_builder_add_field(&mut schema_builder, field_name, data_type);
+        let schema = schema_builder.build();
+        let index = Index::create_in_dir(path, schema)?;
+        // Second argument: 15 MiB, presumably the writer's memory budget.
+        let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?;
+        Ok(IndexWriterWrapperImpl {
+            field,
+            index_writer: Either::Right(index_writer),
+            id_field: None,
+            _index: Arc::new(index),
+        })
+    }
+
+    /// Stamps `document` with `offset` when the schema has a `doc_id` field, then
+    /// passes it to the underlying writer.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArgument` if a `doc_id` field exists but `offset` is `None`,
+    /// and forwards any error reported by the writer.
+    #[inline]
+    fn add_document(&mut self, mut document: TantivyDocument, offset: Option<i64>) -> Result<()> {
+        if let Some(id_field) = self.id_field {
+            let offset = offset.ok_or_else(|| {
+                crate::error::TantivyBindingError::InvalidArgument(
+                    "offset is required when the index has a doc_id field".to_string(),
+                )
+            })?;
+            document.add_i64(id_field, offset);
+        }
+
+        match &mut self.index_writer {
+            Either::Left(writer) => {
+                let _ = writer.add_document(document)?;
+            }
+            Either::Right(single_segment_writer) => {
+                let _ = single_segment_writer.add_document(document)?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Adds a document holding the single value `data`.
+    ///
+    /// `offset` is the row offset stored in `doc_id`; see `add_document`.
+    pub fn add<T: TantivyValue<TantivyDocument>>(
+        &mut self,
+        data: T,
+        offset: Option<i64>,
+    ) -> Result<()> {
+        let mut document = TantivyDocument::default();
+        data.add_to_document(self.field.field_id(), &mut document);
+
+        self.add_document(document, offset)
+    }
+
+    /// Adds one document holding every value yielded by `data`.
+    ///
+    /// `offset` is the row offset stored in `doc_id`; see `add_document`.
+    pub fn add_array<T: TantivyValue<TantivyDocument>, I>(
+        &mut self,
+        data: I,
+        offset: Option<i64>,
+    ) -> Result<()>
+    where
+        I: IntoIterator<Item = T>,
+    {
+        let mut document = TantivyDocument::default();
+        data.into_iter()
+            .for_each(|d| d.add_to_document(self.field.field_id(), &mut document));
+
+        self.add_document(document, offset)
+    }
+
+    /// Adds one document holding every C string in `datas`.
+    ///
+    /// `offset` is the row offset stored in `doc_id`; see `add_document`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if a string is not valid UTF-8.
+    pub fn add_array_keywords(
+        &mut self,
+        datas: &[*const c_char],
+        offset: Option<i64>,
+    ) -> Result<()> {
+        let mut document = TantivyDocument::default();
+        for element in datas {
+            // SAFETY: `CStr::from_ptr` requires a valid, NUL-terminated C string;
+            // the caller must guarantee that for every pointer in `datas`.
+            let data = unsafe { CStr::from_ptr(*element) };
+            document.add_field_value(self.field, data.to_str()?);
+        }
+
+        self.add_document(document, offset)
+    }
+
+    /// Merges the segments selected by the writer's merge policy, waiting for each
+    /// merge to finish.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArgument` for a single-segment writer.
+    pub fn manual_merge(&mut self) -> Result<()> {
+        let index_writer = self.index_writer.as_mut().left().ok_or_else(|| {
+            crate::error::TantivyBindingError::InvalidArgument(
+                "manual_merge is not supported by a single segment index writer".to_string(),
+            )
+        })?;
+        let metas = index_writer.index().searchable_segment_metas()?;
+        let policy = index_writer.get_merge_policy();
+        let candidates = policy.compute_merge_candidates(metas.as_slice());
+        for candidate in candidates {
+            index_writer.merge(candidate.0.as_slice()).wait()?;
+        }
+        Ok(())
+    }
+
+    /// Consumes the writer and flushes everything to the index.
+    ///
+    /// The multi-segment writer commits, garbage-collects unused files and
+    /// waits for merging threads; the single-segment writer is finalized.
+    ///
+    /// # Errors
+    ///
+    /// Forwards any Tantivy error, including a failed single-segment finalize.
+    pub fn finish(self) -> Result<()> {
+        match self.index_writer {
+            Either::Left(mut index_writer) => {
+                index_writer.commit()?;
+                // self.manual_merge();
+                block_on(index_writer.garbage_collect_files())?;
+                index_writer.wait_merging_threads()?;
+            }
+            Either::Right(single_segment_index_writer) => {
+                // Propagate the failure to the caller instead of panicking.
+                single_segment_index_writer.finalize()?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Commits the pending documents of a multi-segment writer.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArgument` for a single-segment writer.
+    pub(crate) fn commit(&mut self) -> Result<()> {
+        let index_writer = self.index_writer.as_mut().left().ok_or_else(|| {
+            crate::error::TantivyBindingError::InvalidArgument(
+                "commit is not supported by a single segment index writer".to_string(),
+            )
+        })?;
+        index_writer.commit()?;
+        Ok(())
+    }
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer_text.rs
new file mode 100644
index 0000000000..c377dd1fba
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/index_writer_text.rs
@@ -0,0 +1,58 @@
+use std::sync::Arc;
+
+use either::Either;
+use 
tantivy_5::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
+use tantivy_5::Index;
+
+use crate::error::Result;
+use crate::log::init_log;
+
+use super::analyzer::create_analyzer;
+use super::IndexWriterWrapperImpl;
+
+/// Builds the schema of a text index: a text field named `field_name` that is
+/// tokenized by `tokenizer_name`, plus an `i64` fast field named `doc_id`.
+///
+/// Returns `(schema, text_field, doc_id_field)`.
+fn build_text_schema(field_name: &str, tokenizer_name: &str) -> (Schema, Field, Field) {
+    let mut schema_builder = Schema::builder();
+    // Positions are required for the matching phase.
+    let indexing = TextFieldIndexing::default()
+        .set_tokenizer(tokenizer_name)
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let option = TextOptions::default().set_indexing_options(indexing);
+    let field = schema_builder.add_text_field(field_name, option);
+    let id_field = schema_builder.add_i64_field("doc_id", FAST);
+    (schema_builder.build(), field, id_field)
+}
+
+impl IndexWriterWrapperImpl {
+    /// Creates a multi-threaded writer for a version-5 text index.
+    ///
+    /// The analyzer built from `tokenizer_params` is registered on the index under
+    /// `tokenizer_name`. The index is created in memory when `in_ram` is true
+    /// (`path` is then unused) and in the directory `path` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the analyzer cannot be built, the index cannot be created in
+    /// `path`, or the writer cannot be started.
+    pub(crate) fn create_text_writer(
+        field_name: &str,
+        path: &str,
+        tokenizer_name: &str,
+        tokenizer_params: &str,
+        num_threads: usize,
+        overall_memory_budget_in_bytes: usize,
+        in_ram: bool,
+    ) -> Result<IndexWriterWrapperImpl> {
+        init_log();
+
+        let tokenizer = create_analyzer(tokenizer_params)?;
+
+        let (schema, field, id_field) = build_text_schema(field_name, tokenizer_name);
+        // Propagate creation failures with `?`, as `IndexWriterWrapperImpl::new` does,
+        // instead of panicking.
+        let index = if in_ram {
+            Index::create_in_ram(schema)
+        } else {
+            Index::create_in_dir(path, schema)?
+        };
+        index.tokenizers().register(tokenizer_name, tokenizer);
+        let index_writer =
+            index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
+
+        Ok(IndexWriterWrapperImpl {
+            field,
+            index_writer: Either::Left(index_writer),
+            id_field: Some(id_field),
+            _index: Arc::new(index),
+        })
+    }
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/mod.rs
new file mode 100644
index 0000000000..f146ee496b
--- /dev/null
+++ 
b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v5/mod.rs
@@ -0,0 +1,11 @@
+//! Tantivy index version 5.
+//! This is the older Tantivy index format (e.g. the one Milvus 2.4.x uses).
+//! We may still build Tantivy indexes with version 5 for compatibility, because
+//! some read nodes can only read Tantivy indexes with version 5.
+
+mod analyzer;
+pub(crate) mod index_writer;
+pub(crate) mod index_writer_text;
+
+pub(crate) use index_writer::IndexWriterWrapperImpl;
+pub(crate) use tantivy_5::Document as TantivyDocumentV5;
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer.rs
new file mode 100644
index 0000000000..ccbc4c6abb
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer.rs
@@ -0,0 +1,311 @@
+use std::ffi::CStr;
+use std::sync::Arc;
+
+use either::Either;
+use futures::executor::block_on;
+use libc::c_char;
+use log::info;
+use tantivy::schema::{
+    Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
+};
+use tantivy::{Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument};
+
+use crate::data_type::TantivyDataType;
+
+use crate::error::Result;
+use crate::index_reader::IndexReaderWrapper;
+use crate::index_writer::TantivyValue;
+use crate::log::init_log;
+
+/// Adds the indexed field for `data_type` to `schema_builder` and returns its handle.
+///
+/// Numeric and boolean types become `INDEXED` fields; `Keyword` becomes a text field
+/// that uses the `raw` tokenizer with `IndexRecordOption::Basic`.
+///
+/// # Panics
+///
+/// Panics for `TantivyDataType::Text`: text fields are built by `create_text_writer`.
+#[inline]
+fn schema_builder_add_field(
+    schema_builder: &mut SchemaBuilder,
+    field_name: &str,
+    data_type: TantivyDataType,
+) -> Field {
+    match data_type {
+        TantivyDataType::I64 => schema_builder.add_i64_field(field_name, INDEXED),
+        TantivyDataType::F64 => schema_builder.add_f64_field(field_name, INDEXED),
+        TantivyDataType::Bool => schema_builder.add_bool_field(field_name, INDEXED),
+        TantivyDataType::Keyword => {
+            let text_field_indexing = TextFieldIndexing::default()
+                .set_tokenizer("raw")
+                .set_index_option(IndexRecordOption::Basic);
+            let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
+            schema_builder.add_text_field(field_name, text_options)
+        }
+        TantivyDataType::Text => {
+            panic!("text should be indexed with analyzer");
+        }
+    }
+}
+
+// `TantivyValue` implementations: each one appends `self` to a version-7
+// `TantivyDocument` under the field whose id is `field`.
+impl TantivyValue<TantivyDocument> for i64 {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_i64(Field::from_field_id(field), *self);
+    }
+}
+
+impl TantivyValue<TantivyDocument> for u64 {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_u64(Field::from_field_id(field), *self);
+    }
+}
+
+impl TantivyValue<TantivyDocument> for f64 {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_f64(Field::from_field_id(field), *self);
+    }
+}
+
+impl TantivyValue<TantivyDocument> for &str {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_text(Field::from_field_id(field), *self);
+    }
+}
+
+impl TantivyValue<TantivyDocument> for bool {
+    fn add_to_document(&self, field: u32, document: &mut TantivyDocument) {
+        document.add_bool(Field::from_field_id(field), *self);
+    }
+}
+
+/// Index writer that builds version-7 Tantivy indexes.
+pub struct IndexWriterWrapperImpl {
+    /// Field that receives the indexed values.
+    pub(crate) field: Field,
+    /// `Left`: multi-threaded writer; `Right`: single-segment writer.
+    pub(crate) index_writer: Either<IndexWriter, SingleSegmentIndexWriter>,
+    /// `doc_id` fast field; `None` when the writer is single-segment.
+    pub(crate) id_field: Option<Field>,
+    /// The index being written; `create_reader` shares it with readers.
+    pub(crate) index: Arc<Index>,
+}
+
+impl IndexWriterWrapperImpl {
+    /// Creates a multi-segment writer whose index is stored in the directory `path`.
+    ///
+    /// The schema holds the indexed field plus a `doc_id` fast field, so every
+    /// document added through this writer must come with an offset.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the index cannot be created in `path` or the writer cannot be started.
+    pub fn new(
+        field_name: &str,
+        data_type: TantivyDataType,
+        path: String,
+        num_threads: usize,
+        overall_memory_budget_in_bytes: usize,
+    ) -> Result<IndexWriterWrapperImpl> {
+        info!(
+            "create index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7",
+            field_name, data_type
+        );
+        let mut schema_builder = Schema::builder();
+        let field = schema_builder_add_field(&mut schema_builder, field_name, data_type);
+        // Rows spread over several segments cannot be mapped back to Milvus rows
+        // directly, so each document also stores its row offset in `doc_id`.
+        let id_field = schema_builder.add_i64_field("doc_id", FAST);
+        let schema = schema_builder.build();
+        let index = Index::create_in_dir(path, schema)?;
+        let index_writer =
+            index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
+        Ok(IndexWriterWrapperImpl {
+            field,
+            index_writer: Either::Left(index_writer),
+            id_field: Some(id_field),
+            index: Arc::new(index),
+        })
+    }
+
+    /// Creates a single-segment writer whose index is stored in the directory `path`.
+    ///
+    /// No `doc_id` field is added, so offsets passed to the `add*` methods are ignored.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the index cannot be created in `path` or the writer cannot be started.
+    pub fn new_with_single_segment(
+        field_name: &str,
+        data_type: TantivyDataType,
+        path: String,
+    ) -> Result<IndexWriterWrapperImpl> {
+        init_log();
+        info!(
+            "create single segment index writer, field_name: {}, data_type: {:?}, tantivy_index_version 7",
+            field_name, data_type
+        );
+        let mut schema_builder = Schema::builder();
+        let field = schema_builder_add_field(&mut schema_builder, field_name, data_type);
+        let schema = schema_builder.build();
+        let index = Index::create_in_dir(path, schema)?;
+        // Second argument: 15 MiB, presumably the writer's memory budget.
+        let index_writer = SingleSegmentIndexWriter::new(index.clone(), 15 * 1024 * 1024)?;
+        Ok(IndexWriterWrapperImpl {
+            field,
+            index_writer: Either::Right(index_writer),
+            id_field: None,
+            index: Arc::new(index),
+        })
+    }
+
+    /// Creates a reader wrapper from the index this writer is building.
+    pub fn create_reader(&self) -> Result<IndexReaderWrapper> {
+        IndexReaderWrapper::from_index(self.index.clone())
+    }
+
+    /// Stamps `document` with `offset` when the schema has a `doc_id` field, then
+    /// passes it to the underlying writer.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArgument` if a `doc_id` field exists but `offset` is `None`,
+    /// and forwards any error reported by the writer.
+    #[inline]
+    fn add_document(&mut self, mut document: TantivyDocument, offset: Option<i64>) -> Result<()> {
+        if let Some(id_field) = self.id_field {
+            let offset = offset.ok_or_else(|| {
+                crate::error::TantivyBindingError::InvalidArgument(
+                    "offset is required when the index has a doc_id field".to_string(),
+                )
+            })?;
+            document.add_i64(id_field, offset);
+        }
+
+        match &mut self.index_writer {
+            Either::Left(writer) => {
+                let _ = writer.add_document(document)?;
+            }
+            Either::Right(single_segment_writer) => {
+                let _ = single_segment_writer.add_document(document)?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Adds a document holding the single value `data`.
+    ///
+    /// `offset` is the row offset stored in `doc_id`; see `add_document`.
+    pub fn add<T: TantivyValue<TantivyDocument>>(
+        &mut self,
+        data: T,
+        offset: Option<i64>,
+    ) -> Result<()> {
+        let mut document = TantivyDocument::default();
+        data.add_to_document(self.field.field_id(), &mut document);
+
+        self.add_document(document, offset)
+    }
+
+    /// Adds one document holding every value yielded by `data`.
+    ///
+    /// `offset` is the row offset stored in `doc_id`; see `add_document`.
+    pub fn add_array<T: TantivyValue<TantivyDocument>, I>(
+        &mut self,
+        data: I,
+        offset: Option<i64>,
+    ) -> Result<()>
+    where
+        I: IntoIterator<Item = T>,
+    {
+        let mut document = TantivyDocument::default();
+        data.into_iter()
+            .for_each(|d| d.add_to_document(self.field.field_id(), &mut document));
+
+        self.add_document(document, offset)
+    }
+
+    /// Adds one document holding every C string in `datas`.
+    ///
+    /// `offset` is the row offset stored in `doc_id`; see `add_document`.
+    ///
+    /// # Errors
+    ///
+    /// Fails if a string is not valid UTF-8.
+    pub fn add_array_keywords(
+        &mut self,
+        datas: &[*const c_char],
+        offset: Option<i64>,
+    ) -> Result<()> {
+        let mut document = TantivyDocument::default();
+        for element in datas {
+            // SAFETY: `CStr::from_ptr` requires a valid, NUL-terminated C string;
+            // the caller must guarantee that for every pointer in `datas`.
+            let data = unsafe { CStr::from_ptr(*element) };
+            document.add_field_value(self.field, data.to_str()?);
+        }
+
+        self.add_document(document, offset)
+    }
+
+    /// Merges the segments selected by the writer's merge policy, waiting for each
+    /// merge to finish.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArgument` for a single-segment writer.
+    pub fn manual_merge(&mut self) -> Result<()> {
+        let index_writer = self.index_writer.as_mut().left().ok_or_else(|| {
+            crate::error::TantivyBindingError::InvalidArgument(
+                "manual_merge is not supported by a single segment index writer".to_string(),
+            )
+        })?;
+        let metas = index_writer.index().searchable_segment_metas()?;
+        let policy = index_writer.get_merge_policy();
+        let candidates = policy.compute_merge_candidates(metas.as_slice());
+        for candidate in candidates {
+            index_writer.merge(candidate.0.as_slice()).wait()?;
+        }
+        Ok(())
+    }
+
+    /// Consumes the writer and flushes everything to the index.
+    ///
+    /// The multi-segment writer commits, garbage-collects unused files and
+    /// waits for merging threads; the single-segment writer is finalized.
+    ///
+    /// # Errors
+    ///
+    /// Forwards any Tantivy error, including a failed single-segment finalize.
+    pub fn finish(self) -> Result<()> {
+        match self.index_writer {
+            Either::Left(mut index_writer) => {
+                index_writer.commit()?;
+                // self.manual_merge();
+                block_on(index_writer.garbage_collect_files())?;
+                index_writer.wait_merging_threads()?;
+            }
+            Either::Right(single_segment_index_writer) => {
+                // Propagate the failure to the caller instead of panicking.
+                single_segment_index_writer.finalize()?;
+            }
+        }
+        Ok(())
+    }
+
+    /// Commits the pending documents of a multi-segment writer.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArgument` for a single-segment writer.
+    pub(crate) fn commit(&mut self) -> Result<()> {
+        let index_writer = self.index_writer.as_mut().left().ok_or_else(|| {
+            crate::error::TantivyBindingError::InvalidArgument(
+                "commit is not supported by a single segment index writer".to_string(),
+            )
+        })?;
+        index_writer.commit()?;
+        Ok(())
+    }
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer_text.rs
new file mode 100644
index 0000000000..808860e32e
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/index_writer_text.rs
@@ -0,0 +1,57 @@
+use std::sync::Arc;
+
+use either::Either;
+use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST};
+use 
tantivy::Index;
+
+use crate::analyzer::create_analyzer;
+use crate::error::Result;
+use crate::log::init_log;
+
+use super::IndexWriterWrapperImpl;
+
+/// Builds the schema of a text index: a text field named `field_name` that is
+/// tokenized by `tokenizer_name`, plus an `i64` fast field named `doc_id`.
+///
+/// Returns `(schema, text_field, doc_id_field)`.
+fn build_text_schema(field_name: &str, tokenizer_name: &str) -> (Schema, Field, Field) {
+    let mut schema_builder = Schema::builder();
+    // Positions are required for the matching phase.
+    let indexing = TextFieldIndexing::default()
+        .set_tokenizer(tokenizer_name)
+        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+    let option = TextOptions::default().set_indexing_options(indexing);
+    let field = schema_builder.add_text_field(field_name, option);
+    let id_field = schema_builder.add_i64_field("doc_id", FAST);
+    (schema_builder.build(), field, id_field)
+}
+
+impl IndexWriterWrapperImpl {
+    /// Creates a multi-threaded writer for a version-7 text index.
+    ///
+    /// The analyzer built from `tokenizer_params` is registered on the index under
+    /// `tokenizer_name`. The index is created in memory when `in_ram` is true
+    /// (`path` is then unused) and in the directory `path` otherwise.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the analyzer cannot be built, the index cannot be created in
+    /// `path`, or the writer cannot be started.
+    pub(crate) fn create_text_writer(
+        field_name: &str,
+        path: &str,
+        tokenizer_name: &str,
+        tokenizer_params: &str,
+        num_threads: usize,
+        overall_memory_budget_in_bytes: usize,
+        in_ram: bool,
+    ) -> Result<IndexWriterWrapperImpl> {
+        init_log();
+        let tokenizer = create_analyzer(tokenizer_params)?;
+
+        let (schema, field, id_field) = build_text_schema(field_name, tokenizer_name);
+        // Propagate creation failures with `?`, as `IndexWriterWrapperImpl::new` does,
+        // instead of panicking.
+        let index = if in_ram {
+            Index::create_in_ram(schema)
+        } else {
+            Index::create_in_dir(path, schema)?
+        };
+        index.tokenizers().register(tokenizer_name, tokenizer);
+        let index_writer =
+            index.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)?;
+
+        Ok(IndexWriterWrapperImpl {
+            field,
+            index_writer: Either::Left(index_writer),
+            id_field: Some(id_field),
+            index: Arc::new(index),
+        })
+    }
+}
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/mod.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/mod.rs
new file mode 100644
index 0000000000..7122a00164
--- /dev/null
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_v7/mod.rs
@@ -0,0 +1,9 @@
+//! Tantivy index version 7
+//! 
This is the latest version of Tantivy index and is what we plan to use
+//! in most cases.
+
+pub(crate) mod index_writer;
+pub(crate) mod index_writer_text;
+
+pub(crate) use index_writer::IndexWriterWrapperImpl;
+pub(crate) use tantivy::TantivyDocument as TantivyDocumentV7;
diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs
index 508c8a1448..dfb06b77a3 100644
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/lib.rs
@@ -1,3 +1,6 @@
+use error::TantivyBindingError;
+
+mod analyzer;
 mod array;
 mod data_type;
 mod demo_c;
@@ -12,26 +15,54 @@ mod index_writer;
 mod index_writer_c;
 mod index_writer_text;
 mod index_writer_text_c;
+mod index_writer_v5;
+mod index_writer_v7;
 mod log;
 mod string_c;
 mod token_stream_c;
-mod analyzer;
 mod tokenizer_c;
 mod util;
 mod util_c;
 mod vec_collector;
 
-pub fn add(left: usize, right: usize) -> usize {
-    left + right
+use error::Result;
+
+/// Tantivy index format versions this binding can build.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TantivyIndexVersion {
+    /// Version kept for compatibility (for Milvus 2.4.x).
+    V5,
+    /// Latest version.
+    V7,
 }
 
-#[cfg(test)]
-mod tests {
-    use super::*;
+impl TantivyIndexVersion {
+    /// Converts a numeric version (`5` or `7`) into a `TantivyIndexVersion`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `InvalidArgument` for any other number.
+    pub fn from_u32(version: u32) -> Result<Self> {
+        match version {
+            5 => Ok(Self::V5),
+            7 => Ok(Self::V7),
+            _ => Err(TantivyBindingError::InvalidArgument(format!(
+                "unsupported version {}",
+                version
+            ))),
+        }
+    }
 
-    #[test]
-    fn it_works() {
-        let result = add(2, 2);
-        assert_eq!(result, 4);
+    /// Returns the numeric form of this version.
+    pub fn as_u32(&self) -> u32 {
+        match self {
+            Self::V5 => 5,
+            Self::V7 => 7,
+        }
+    }
+
+    /// Returns the default index version, `V7`.
+    pub fn default_version() -> Self {
+        Self::V7
     }
 }
diff --git a/internal/core/thirdparty/tantivy/tantivy-wrapper.h b/internal/core/thirdparty/tantivy/tantivy-wrapper.h
index 1e06c7273f..2125fe493f 100644
--- a/internal/core/thirdparty/tantivy/tantivy-wrapper.h
+++ b/internal/core/thirdparty/tantivy/tantivy-wrapper.h
@@ -82,6 +82,7 @@ struct 
TantivyIndexWrapper { TantivyIndexWrapper(const char* field_name, TantivyDataType data_type, const char* path, + uint32_t tantivy_index_version, bool inverted_single_semgnent = false, uintptr_t num_threads = DEFAULT_NUM_THREADS, uintptr_t overall_memory_budget_in_bytes = @@ -89,12 +90,13 @@ struct TantivyIndexWrapper { RustResultWrapper res; if (inverted_single_semgnent) { res = RustResultWrapper(tantivy_create_index_with_single_segment( - field_name, data_type, path)); + field_name, data_type, path, tantivy_index_version)); } else { res = RustResultWrapper( tantivy_create_index(field_name, data_type, path, + tantivy_index_version, num_threads, overall_memory_budget_in_bytes)); } @@ -120,6 +122,7 @@ struct TantivyIndexWrapper { TantivyIndexWrapper(const char* field_name, bool in_ram, const char* path, + uint32_t tantivy_index_version, const char* tokenizer_name = DEFAULT_TOKENIZER_NAME, const char* analyzer_params = DEFAULT_analyzer_params, uintptr_t num_threads = DEFAULT_NUM_THREADS, @@ -128,6 +131,7 @@ struct TantivyIndexWrapper { auto res = RustResultWrapper( tantivy_create_text_writer(field_name, path, + tantivy_index_version, tokenizer_name, analyzer_params, num_threads, diff --git a/internal/core/thirdparty/tantivy/token-stream.h b/internal/core/thirdparty/tantivy/token-stream.h index ab9415488c..374e064711 100644 --- a/internal/core/thirdparty/tantivy/token-stream.h +++ b/internal/core/thirdparty/tantivy/token-stream.h @@ -39,7 +39,8 @@ struct TokenStream { return s; } - TantivyToken get_detailed_token() { + TantivyToken + get_detailed_token() { return tantivy_token_stream_get_detailed_token(ptr_); } diff --git a/internal/core/unittest/test_c_tokenizer.cpp b/internal/core/unittest/test_c_tokenizer.cpp index ff45745170..2722538492 100644 --- a/internal/core/unittest/test_c_tokenizer.cpp +++ b/internal/core/unittest/test_c_tokenizer.cpp @@ -70,16 +70,15 @@ TEST(CTokenizer, Default) { ASSERT_FALSE(token_stream_advance(token_stream)); 
free_token_stream(token_stream); - - token_stream = - create_token_stream(tokenizer, text.c_str(), text.length()); + + token_stream = create_token_stream(tokenizer, text.c_str(), text.length()); for (int i = 0; i < 3; i++) { ASSERT_TRUE(token_stream_advance(token_stream)); auto token = token_stream_get_detailed_token(token_stream); ASSERT_EQ(refs[i], std::string(token.token)); ASSERT_EQ(offsets[i], token.start_offset); - + free_token(const_cast(token.token)); } ASSERT_FALSE(token_stream_advance(token_stream));