enhance: get doc ids by batch (#40608)

issue: #40607 tantivy change: https://github.com/zilliztech/tantivy/pull/3 Benchmarks: Test Environment: CPU 9900K The data is inserted by: ``` for i in 0..N { for j in 0..UNIQUE { let key = format!("hello{}", j); index_writer.add_string(&key, i * UNIQUE + j).unwrap(); } } ``` So the value of UNIQUE influences the locality of the matched docs. The latency is the average latency over 1000 repeated queries. The results show a 22.5%-34.8% latency reduction. ![image](https://github.com/user-attachments/assets/dd8af75a-ddc3-445d-92df-50d354dd5645) --------- Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
2026-01-07 19:31:51 +08:00 · 2025-03-14 15:48:09 +08:00 · 2025-03-14 15:48:09 +08:00 · 001fc992df
commit 001fc992df
parent 6dbe5d475e
9 changed files with 78 additions and 38 deletions
--- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
@ -1662,7 +1662,7 @@ checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1"
 [[package]]
 name = "ownedbytes"
 version = "0.7.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "stable_deref_trait",
 ]
@ -2280,7 +2280,7 @@ dependencies = [
 [[package]]
 name = "tantivy"
 version = "0.23.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "aho-corasick",
 "arc-swap",
@ -2356,7 +2356,7 @@ dependencies = [
 [[package]]
 name = "tantivy-bitpacker"
 version = "0.6.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "bitpacking",
 ]
@ -2364,7 +2364,7 @@ dependencies = [
 [[package]]
 name = "tantivy-columnar"
 version = "0.3.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "downcast-rs",
 "fastdivide",
@ -2379,13 +2379,14 @@ dependencies = [
 [[package]]
 name = "tantivy-common"
 version = "0.7.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "async-trait",
 "byteorder",
 "ownedbytes",
 "serde",
 "time",
+ "tokio",
 ]

 [[package]]
@ -2402,7 +2403,7 @@ dependencies = [
 [[package]]
 name = "tantivy-query-grammar"
 version = "0.22.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "nom",
 ]
@ -2410,7 +2411,7 @@ dependencies = [
 [[package]]
 name = "tantivy-sstable"
 version = "0.3.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "futures-util",
 "itertools",
@ -2423,7 +2424,7 @@ dependencies = [
 [[package]]
 name = "tantivy-stacker"
 version = "0.3.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "murmurhash32",
 "rand_distr",
@ -2433,7 +2434,7 @@ dependencies = [
 [[package]]
 name = "tantivy-tokenizer-api"
 version = "0.3.0"
-source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
+source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
 dependencies = [
 "serde",
 ]
--- a/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/Cargo.toml
@ -14,7 +14,7 @@ lindera-ko-dic = ["lindera/ko-dic"]
 lindera-cc-cedict = ["lindera/cc-cedict"]

 [dependencies]
-tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # we have make a private fix for milvus, should be removed in future after milvus fixing the bug.
+tantivy = { git = "https://github.com/zilliztech/tantivy.git" }
 lindera = "0.38.1"
 futures = "0.3.21"
 libc = "0.2"
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/filter.rs
@ -1,4 +1,3 @@
-use regex;
 use serde_json as json;
 use tantivy::tokenizer::*;

--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/analyzer/tokenizers/lindera_tokenizer.rs
@ -1,6 +1,5 @@

 use core::result::Result::Err;
-use log::warn;

 use lindera::mode::Mode;
 use lindera::segmenter::Segmenter;
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs
@ -10,7 +10,7 @@ pub(crate) struct DocIdCollector<T> {
 }

 pub(crate) struct DocIdChildCollector<T> {
-    docs: Vec<T>,
+    milvus_doc_ids: Vec<T>,
    column: Column<i64>,
 }

@ -24,7 +24,7 @@ impl Collector for DocIdCollector<u32> {
        segment: &SegmentReader,
    ) -> tantivy::Result<Self::Child> {
        Ok(DocIdChildCollector {
-            docs: Vec::new(),
+            milvus_doc_ids: Vec::new(),
            column: segment.fast_fields().i64("doc_id").unwrap(),
        })
    }
@ -51,14 +51,22 @@ impl Collector for DocIdCollector<u32> {
 impl SegmentCollector for DocIdChildCollector<u32> {
    type Fruit = Vec<u32>;

-    fn collect(&mut self, doc: DocId, _score: Score) {
-        self.column.values_for_doc(doc).for_each(|doc_id| {
-            self.docs.push(doc_id as u32);
-        })
+    fn collect_block(&mut self, docs: &[DocId]) {
+        self.milvus_doc_ids.extend(
+            self.column
+                .values_for_docs_flatten(docs)
+                .into_iter()
+                .map(|val| val as u32),
+        );
    }

-    fn harvest(self) -> Self::Fruit {
-        self.docs
+    fn collect(&mut self, doc: DocId, _score: Score) {
+        // Unreachable code actually.
+        self.collect_block(&[doc]);
+    }
+
+    fn harvest(mut self) -> Self::Fruit {
+        self.milvus_doc_ids
    }
 }

@ -72,7 +80,7 @@ impl Collector for DocIdCollector<i64> {
        segment: &SegmentReader,
    ) -> tantivy::Result<Self::Child> {
        Ok(DocIdChildCollector {
-            docs: Vec::new(),
+            milvus_doc_ids: Vec::new(),
            column: segment.fast_fields().i64("doc_id").unwrap(),
        })
    }
@ -99,13 +107,17 @@ impl Collector for DocIdCollector<i64> {
 impl SegmentCollector for DocIdChildCollector<i64> {
    type Fruit = Vec<i64>;

-    fn collect(&mut self, doc: DocId, _score: Score) {
-        self.column.values_for_doc(doc).for_each(|doc_id| {
-            self.docs.push(doc_id);
-        })
+    fn collect_block(&mut self, docs: &[DocId]) {
+        self.milvus_doc_ids
+            .extend(self.column.values_for_docs_flatten(docs));
    }

-    fn harvest(self) -> Self::Fruit {
-        self.docs
+    fn collect(&mut self, doc: DocId, _score: Score) {
+        // Unreachable code actually.
+        self.collect_block(&[doc]);
+    }
+
+    fn harvest(mut self) -> Self::Fruit {
+        self.milvus_doc_ids
    }
 }
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs
@ -288,8 +288,8 @@ mod test {

    use tantivy::{
        doc,
-        schema::{self, Schema, STORED, STRING, TEXT},
-        Index, IndexWriter,
+        schema::{Schema, STORED, STRING},
+        Index,
    };

    use super::IndexReaderWrapper;
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs
@ -5,7 +5,7 @@ use tantivy::{
 };

 use crate::error::Result;
-use crate::{index_reader::IndexReaderWrapper, analyzer::standard_analyzer};
+use crate::{analyzer::standard_analyzer, index_reader::IndexReaderWrapper};

 impl IndexReaderWrapper {
    // split the query string into multiple tokens using index's default tokenizer,
@ -62,13 +62,14 @@ impl IndexReaderWrapper {

 #[cfg(test)]
 mod tests {
+    use tantivy::query::TermQuery;
    use tempfile::TempDir;

-    use crate::{index_writer::IndexWriterWrapper, tokenizer::create_tokenizer};
+    use crate::{analyzer::create_analyzer, index_writer::IndexWriterWrapper};
    #[test]
    fn test_jeba() {
        let params = "{\"tokenizer\": \"jieba\"}".to_string();
-        let tokenizer = create_tokenizer(&params).unwrap();
+        let tokenizer = create_analyzer(&params).unwrap();
        let dir = TempDir::new().unwrap();

        let mut writer = IndexWriterWrapper::create_text_writer(
@ -96,4 +97,36 @@ mod tests {
        let res = reader.phrase_match_query("网球滑雪", slop).unwrap();
        assert_eq!(res, vec![0, 1]);
    }
+
+    #[test]
+    fn test_read() {
+        let tokenizer = create_analyzer("").unwrap();
+        let dir = TempDir::new().unwrap();
+        let mut writer = IndexWriterWrapper::create_text_writer(
+            "text".to_string(),
+            dir.path().to_str().unwrap().to_string(),
+            "default".to_string(),
+            tokenizer,
+            1,
+            50_000_000,
+            false,
+        );
+
+        for i in 0..10000 {
+            writer.add_string("hello world", i).unwrap();
+        }
+        writer.commit().unwrap();
+
+        let reader = writer.create_reader().unwrap();
+
+        let query = TermQuery::new(
+            tantivy::Term::from_field_text(reader.field.clone(), "hello"),
+            tantivy::schema::IndexRecordOption::Basic,
+        );
+
+        let res = reader.search(&query).unwrap();
+        assert_eq!(res, (0..10000).collect::<Vec<u32>>());
+        let res = reader.search_i64(&query).unwrap();
+        assert_eq!(res, (0..10000).collect::<Vec<i64>>());
+    }
 }
--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer.rs
@ -6,12 +6,9 @@ use futures::executor::block_on;
 use libc::c_char;
 use log::info;
 use tantivy::schema::{
-    Field, IndexRecordOption, OwnedValue, Schema, SchemaBuilder, TextFieldIndexing, TextOptions,
-    FAST, INDEXED,
-};
-use tantivy::{
-    doc, tokenizer, Document, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument,
+    Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
 };
+use tantivy::{doc, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument};

 use crate::data_type::TantivyDataType;

--- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
+++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_writer_text_c.rs
@ -5,7 +5,6 @@ use crate::array::RustResult;
 use crate::cstr_to_str;
 use crate::index_writer::IndexWriterWrapper;
 use crate::log::init_log;
-use crate::string_c::c_str_to_str;
 use crate::analyzer::create_analyzer;
 use crate::util::create_binding;