diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock index 9b0c70903c..052087a646 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock @@ -1662,7 +1662,7 @@ checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "stable_deref_trait", ] @@ -2280,7 +2280,7 @@ dependencies = [ [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "aho-corasick", "arc-swap", @@ -2356,7 +2356,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "bitpacking", ] @@ -2364,7 +2364,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "downcast-rs", "fastdivide", @@ -2379,13 +2379,14 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "async-trait", "byteorder", "ownedbytes", "serde", "time", + "tokio", ] [[package]] @@ -2402,7 +2403,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "nom", ] @@ -2410,7 +2411,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "futures-util", "itertools", @@ -2423,7 +2424,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "murmurhash32", "rand_distr", @@ -2433,7 +2434,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498" +source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88" dependencies = [ "serde", ] diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml index 08265d2fa0..8227a98ded 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml +++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml @@ -14,7 +14,7 @@ lindera-ko-dic = ["lindera/ko-dic"] lindera-cc-cedict = ["lindera/cc-cedict"] [dependencies] -tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # we have make a private fix for milvus, should be removed in future after milvus fixing the bug. +tantivy = { git = "https://github.com/zilliztech/tantivy.git" } lindera = "0.38.1" futures = "0.3.21" libc = "0.2" diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs index fd4c6d7f57..e4eb11bff4 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs @@ -1,4 +1,3 @@ -use regex; use serde_json as json; use tantivy::tokenizer::*; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs index 548d439793..bee22114d3 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs @@ -1,6 +1,5 @@ use core::result::Result::Err; -use log::warn; use lindera::mode::Mode; use lindera::segmenter::Segmenter; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs index 92541831b0..ed813a9e6d 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs @@ -10,7 +10,7 @@ pub(crate) struct DocIdCollector { } pub(crate) struct DocIdChildCollector { - docs: Vec, + milvus_doc_ids: Vec, column: Column, } @@ -24,7 +24,7 @@ impl Collector for DocIdCollector { segment: &SegmentReader, ) -> tantivy::Result { Ok(DocIdChildCollector { - docs: Vec::new(), + milvus_doc_ids: Vec::new(), column: segment.fast_fields().i64("doc_id").unwrap(), }) } @@ -51,14 +51,22 @@ impl Collector for DocIdCollector { impl SegmentCollector for DocIdChildCollector { type Fruit = Vec; - fn collect(&mut self, doc: DocId, _score: Score) { - self.column.values_for_doc(doc).for_each(|doc_id| { - self.docs.push(doc_id as u32); - }) + fn collect_block(&mut self, docs: &[DocId]) { + self.milvus_doc_ids.extend( + self.column + .values_for_docs_flatten(docs) + .into_iter() + .map(|val| val as u32), + ); } - fn harvest(self) -> Self::Fruit { - self.docs + fn collect(&mut self, doc: DocId, _score: Score) { + // Unreachable code actually. + self.collect_block(&[doc]); + } + + fn harvest(mut self) -> Self::Fruit { + self.milvus_doc_ids } } @@ -72,7 +80,7 @@ impl Collector for DocIdCollector { segment: &SegmentReader, ) -> tantivy::Result { Ok(DocIdChildCollector { - docs: Vec::new(), + milvus_doc_ids: Vec::new(), column: segment.fast_fields().i64("doc_id").unwrap(), }) } @@ -99,13 +107,17 @@ impl Collector for DocIdCollector { impl SegmentCollector for DocIdChildCollector { type Fruit = Vec; - fn collect(&mut self, doc: DocId, _score: Score) { - self.column.values_for_doc(doc).for_each(|doc_id| { - self.docs.push(doc_id); - }) + fn collect_block(&mut self, docs: &[DocId]) { + self.milvus_doc_ids + .extend(self.column.values_for_docs_flatten(docs)); } - fn harvest(self) -> Self::Fruit { - self.docs + fn collect(&mut self, doc: DocId, _score: Score) { + // Unreachable code actually. + self.collect_block(&[doc]); + } + + fn harvest(mut self) -> Self::Fruit { + self.milvus_doc_ids } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs index 3afbd5911f..af86535a89 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs @@ -288,8 +288,8 @@ mod test { use tantivy::{ doc, - schema::{self, Schema, STORED, STRING, TEXT}, - Index, IndexWriter, + schema::{Schema, STORED, STRING}, + Index, }; use super::IndexReaderWrapper; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs index eb1adc3d84..9e62186d74 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs @@ -5,7 +5,7 @@ use tantivy::{ }; use crate::error::Result; -use crate::{index_reader::IndexReaderWrapper, analyzer::standard_analyzer}; +use crate::{analyzer::standard_analyzer, index_reader::IndexReaderWrapper}; impl IndexReaderWrapper { // split the query string into multiple tokens using index's default tokenizer, @@ -62,13 +62,14 @@ impl IndexReaderWrapper { #[cfg(test)] mod tests { + use tantivy::query::TermQuery; use tempfile::TempDir; - use crate::{index_writer::IndexWriterWrapper, tokenizer::create_tokenizer}; + use crate::{analyzer::create_analyzer, index_writer::IndexWriterWrapper}; #[test] fn test_jeba() { let params = "{\"tokenizer\": \"jieba\"}".to_string(); - let tokenizer = create_tokenizer(¶ms).unwrap(); + let tokenizer = create_analyzer(¶ms).unwrap(); let dir = TempDir::new().unwrap(); let mut writer = IndexWriterWrapper::create_text_writer( @@ -96,4 +97,36 @@ mod tests { let res = reader.phrase_match_query("网球滑雪", slop).unwrap(); assert_eq!(res, vec![0, 1]); } + + #[test] + fn test_read() { + let tokenizer = create_analyzer("").unwrap(); + let dir = TempDir::new().unwrap(); + let mut writer = IndexWriterWrapper::create_text_writer( + "text".to_string(), + dir.path().to_str().unwrap().to_string(), + "default".to_string(), + tokenizer, + 1, + 50_000_000, + false, + ); + + for i in 0..10000 { + writer.add_string("hello world", i).unwrap(); + } + writer.commit().unwrap(); + + let reader = writer.create_reader().unwrap(); + + let query = TermQuery::new( + tantivy::Term::from_field_text(reader.field.clone(), "hello"), + tantivy::schema::IndexRecordOption::Basic, + ); + + let res = reader.search(&query).unwrap(); + assert_eq!(res, (0..10000).collect::>()); + let res = reader.search_i64(&query).unwrap(); + assert_eq!(res, (0..10000).collect::>()); + } } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs index 7abc8e7d0d..5c00fc62a1 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs @@ -6,12 +6,9 @@ use futures::executor::block_on; use libc::c_char; use log::info; use tantivy::schema::{ - Field, IndexRecordOption, OwnedValue, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, - FAST, INDEXED, -}; -use tantivy::{ - doc, tokenizer, Document, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument, + Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED, }; +use tantivy::{doc, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument}; use crate::data_type::TantivyDataType; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs index 3c2a6f8a75..44d4c7435d 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs @@ -5,7 +5,6 @@ use crate::array::RustResult; use crate::cstr_to_str; use crate::index_writer::IndexWriterWrapper; use crate::log::init_log; -use crate::string_c::c_str_to_str; use crate::analyzer::create_analyzer; use crate::util::create_binding;