enhance: get doc ids by batch (#40608)

issue: #40607

tantivy change: https://github.com/zilliztech/tantivy/pull/3

Benchmarks:
Test Environment: CPU 9900K
The data is inserted by:
```
for i in 0..N {
    for j in 0..UNIQUE {
        let key = format!("hello{}", j);
        index_writer.add_string(&key, i * UNIQUE + j).unwrap();
    }
}
```
So the value of UNIQUE influences the locality of the matched docs.
The latency is the average latency over 1000 repeated queries.
The results show a 22.5%-34.8% latency reduction.

![image](https://github.com/user-attachments/assets/dd8af75a-ddc3-445d-92df-50d354dd5645)

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
Spade A 2025-03-14 15:48:09 +08:00 committed by GitHub
parent 6dbe5d475e
commit 001fc992df
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 78 additions and 38 deletions

View File

@ -1662,7 +1662,7 @@ checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1"
[[package]]
name = "ownedbytes"
version = "0.7.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"stable_deref_trait",
]
@ -2280,7 +2280,7 @@ dependencies = [
[[package]]
name = "tantivy"
version = "0.23.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"aho-corasick",
"arc-swap",
@ -2356,7 +2356,7 @@ dependencies = [
[[package]]
name = "tantivy-bitpacker"
version = "0.6.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"bitpacking",
]
@ -2364,7 +2364,7 @@ dependencies = [
[[package]]
name = "tantivy-columnar"
version = "0.3.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"downcast-rs",
"fastdivide",
@ -2379,13 +2379,14 @@ dependencies = [
[[package]]
name = "tantivy-common"
version = "0.7.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"async-trait",
"byteorder",
"ownedbytes",
"serde",
"time",
"tokio",
]
[[package]]
@ -2402,7 +2403,7 @@ dependencies = [
[[package]]
name = "tantivy-query-grammar"
version = "0.22.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"nom",
]
@ -2410,7 +2411,7 @@ dependencies = [
[[package]]
name = "tantivy-sstable"
version = "0.3.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"futures-util",
"itertools",
@ -2423,7 +2424,7 @@ dependencies = [
[[package]]
name = "tantivy-stacker"
version = "0.3.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"murmurhash32",
"rand_distr",
@ -2433,7 +2434,7 @@ dependencies = [
[[package]]
name = "tantivy-tokenizer-api"
version = "0.3.0"
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
dependencies = [
"serde",
]

View File

@ -14,7 +14,7 @@ lindera-ko-dic = ["lindera/ko-dic"]
lindera-cc-cedict = ["lindera/cc-cedict"]
[dependencies]
tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # we have make a private fix for milvus, should be removed in future after milvus fixing the bug.
tantivy = { git = "https://github.com/zilliztech/tantivy.git" }
lindera = "0.38.1"
futures = "0.3.21"
libc = "0.2"

View File

@ -1,4 +1,3 @@
use regex;
use serde_json as json;
use tantivy::tokenizer::*;

View File

@ -1,6 +1,5 @@
use core::result::Result::Err;
use log::warn;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;

View File

@ -10,7 +10,7 @@ pub(crate) struct DocIdCollector<T> {
}
pub(crate) struct DocIdChildCollector<T> {
docs: Vec<T>,
milvus_doc_ids: Vec<T>,
column: Column<i64>,
}
@ -24,7 +24,7 @@ impl Collector for DocIdCollector<u32> {
segment: &SegmentReader,
) -> tantivy::Result<Self::Child> {
Ok(DocIdChildCollector {
docs: Vec::new(),
milvus_doc_ids: Vec::new(),
column: segment.fast_fields().i64("doc_id").unwrap(),
})
}
@ -51,14 +51,22 @@ impl Collector for DocIdCollector<u32> {
impl SegmentCollector for DocIdChildCollector<u32> {
type Fruit = Vec<u32>;
fn collect(&mut self, doc: DocId, _score: Score) {
self.column.values_for_doc(doc).for_each(|doc_id| {
self.docs.push(doc_id as u32);
})
fn collect_block(&mut self, docs: &[DocId]) {
self.milvus_doc_ids.extend(
self.column
.values_for_docs_flatten(docs)
.into_iter()
.map(|val| val as u32),
);
}
fn harvest(self) -> Self::Fruit {
self.docs
fn collect(&mut self, doc: DocId, _score: Score) {
// Unreachable code actually.
self.collect_block(&[doc]);
}
fn harvest(mut self) -> Self::Fruit {
self.milvus_doc_ids
}
}
@ -72,7 +80,7 @@ impl Collector for DocIdCollector<i64> {
segment: &SegmentReader,
) -> tantivy::Result<Self::Child> {
Ok(DocIdChildCollector {
docs: Vec::new(),
milvus_doc_ids: Vec::new(),
column: segment.fast_fields().i64("doc_id").unwrap(),
})
}
@ -99,13 +107,17 @@ impl Collector for DocIdCollector<i64> {
impl SegmentCollector for DocIdChildCollector<i64> {
type Fruit = Vec<i64>;
fn collect(&mut self, doc: DocId, _score: Score) {
self.column.values_for_doc(doc).for_each(|doc_id| {
self.docs.push(doc_id);
})
fn collect_block(&mut self, docs: &[DocId]) {
self.milvus_doc_ids
.extend(self.column.values_for_docs_flatten(docs));
}
fn harvest(self) -> Self::Fruit {
self.docs
fn collect(&mut self, doc: DocId, _score: Score) {
// Unreachable code actually.
self.collect_block(&[doc]);
}
fn harvest(mut self) -> Self::Fruit {
self.milvus_doc_ids
}
}

View File

@ -288,8 +288,8 @@ mod test {
use tantivy::{
doc,
schema::{self, Schema, STORED, STRING, TEXT},
Index, IndexWriter,
schema::{Schema, STORED, STRING},
Index,
};
use super::IndexReaderWrapper;

View File

@ -5,7 +5,7 @@ use tantivy::{
};
use crate::error::Result;
use crate::{index_reader::IndexReaderWrapper, analyzer::standard_analyzer};
use crate::{analyzer::standard_analyzer, index_reader::IndexReaderWrapper};
impl IndexReaderWrapper {
// split the query string into multiple tokens using index's default tokenizer,
@ -62,13 +62,14 @@ impl IndexReaderWrapper {
#[cfg(test)]
mod tests {
use tantivy::query::TermQuery;
use tempfile::TempDir;
use crate::{index_writer::IndexWriterWrapper, tokenizer::create_tokenizer};
use crate::{analyzer::create_analyzer, index_writer::IndexWriterWrapper};
#[test]
fn test_jeba() {
let params = "{\"tokenizer\": \"jieba\"}".to_string();
let tokenizer = create_tokenizer(&params).unwrap();
let tokenizer = create_analyzer(&params).unwrap();
let dir = TempDir::new().unwrap();
let mut writer = IndexWriterWrapper::create_text_writer(
@ -96,4 +97,36 @@ mod tests {
let res = reader.phrase_match_query("网球滑雪", slop).unwrap();
assert_eq!(res, vec![0, 1]);
}
// Round-trip test: writes 10000 documents containing the same text, then
// checks that a term query returns every inserted id through both the
// `u32` (`search`) and `i64` (`search_i64`) read paths.
#[test]
fn test_read() {
// Empty params string; presumably selects the default analyzer — TODO confirm.
let tokenizer = create_analyzer("").unwrap();
// Temporary directory backing the on-disk index for this test.
let dir = TempDir::new().unwrap();
let mut writer = IndexWriterWrapper::create_text_writer(
"text".to_string(),
dir.path().to_str().unwrap().to_string(),
"default".to_string(),
tokenizer,
// NOTE(review): the next three positional arguments are assumed to be the
// thread count, the memory budget in bytes, and an in-RAM flag — verify
// against `create_text_writer`'s signature.
1,
50_000_000,
false,
);
// Every document holds the same text, so a query for "hello" should match
// all of them; `i` is the id passed alongside each string.
for i in 0..10000 {
writer.add_string("hello world", i).unwrap();
}
writer.commit().unwrap();
let reader = writer.create_reader().unwrap();
// Term query for the single token "hello" on the reader's field.
let query = TermQuery::new(
tantivy::Term::from_field_text(reader.field.clone(), "hello"),
tantivy::schema::IndexRecordOption::Basic,
);
// The assertions require the ids to come back complete and in ascending order.
let res = reader.search(&query).unwrap();
assert_eq!(res, (0..10000).collect::<Vec<u32>>());
// Same query through the i64 variant must yield the same ids as `i64`.
let res = reader.search_i64(&query).unwrap();
assert_eq!(res, (0..10000).collect::<Vec<i64>>());
}
}

View File

@ -6,12 +6,9 @@ use futures::executor::block_on;
use libc::c_char;
use log::info;
use tantivy::schema::{
Field, IndexRecordOption, OwnedValue, Schema, SchemaBuilder, TextFieldIndexing, TextOptions,
FAST, INDEXED,
};
use tantivy::{
doc, tokenizer, Document, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument,
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
};
use tantivy::{doc, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument};
use crate::data_type::TantivyDataType;

View File

@ -5,7 +5,6 @@ use crate::array::RustResult;
use crate::cstr_to_str;
use crate::index_writer::IndexWriterWrapper;
use crate::log::init_log;
use crate::string_c::c_str_to_str;
use crate::analyzer::create_analyzer;
use crate::util::create_binding;