mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
enhance: get doc ids by batch (#40608)
issue: #40607 tantivy change: https://github.com/zilliztech/tantivy/pull/3 Benchmarks: Test Environment: CPU 9900K The data is inserted by: ``` for i in 0..N { for j in 0..UNIQUE { let key = format!("hello{}", j); index_writer.add_string(&key, i * UNIQUE + j).unwrap(); } } ``` So the UNIQUE value influences the locality of the matched docs. The latency is the average latency over 1000 repeated queries. The result shows 22.5%-34.8% latency reduction.  --------- Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
parent
6dbe5d475e
commit
001fc992df
19
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
generated
vendored
19
internal/core/thirdparty/tantivy/tantivy-binding/Cargo.lock
generated
vendored
@ -1662,7 +1662,7 @@ checksum = "e2355d85b9a3786f481747ced0e0ff2ba35213a1f9bd406ed906554d7af805a1"
|
||||
[[package]]
|
||||
name = "ownedbytes"
|
||||
version = "0.7.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"stable_deref_trait",
|
||||
]
|
||||
@ -2280,7 +2280,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tantivy"
|
||||
version = "0.23.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"arc-swap",
|
||||
@ -2356,7 +2356,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.6.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"bitpacking",
|
||||
]
|
||||
@ -2364,7 +2364,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tantivy-columnar"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"downcast-rs",
|
||||
"fastdivide",
|
||||
@ -2379,13 +2379,14 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tantivy-common"
|
||||
version = "0.7.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"ownedbytes",
|
||||
"serde",
|
||||
"time",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -2402,7 +2403,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.22.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"nom",
|
||||
]
|
||||
@ -2410,7 +2411,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tantivy-sstable"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"futures-util",
|
||||
"itertools",
|
||||
@ -2423,7 +2424,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tantivy-stacker"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"murmurhash32",
|
||||
"rand_distr",
|
||||
@ -2433,7 +2434,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tantivy-tokenizer-api"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/milvus-io/tantivy?tag=v0.1.0#608b2f5d3ffd6c9b79233f799f71631f9cbbd498"
|
||||
source = "git+https://github.com/zilliztech/tantivy.git#8abf9cee3eb6fe959c83e69a638c33fc3b830e88"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
@ -14,7 +14,7 @@ lindera-ko-dic = ["lindera/ko-dic"]
|
||||
lindera-cc-cedict = ["lindera/cc-cedict"]
|
||||
|
||||
[dependencies]
|
||||
tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # we have make a private fix for milvus, should be removed in future after milvus fixing the bug.
|
||||
tantivy = { git = "https://github.com/zilliztech/tantivy.git" }
|
||||
lindera = "0.38.1"
|
||||
futures = "0.3.21"
|
||||
libc = "0.2"
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
use regex;
|
||||
use serde_json as json;
|
||||
use tantivy::tokenizer::*;
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
|
||||
use core::result::Result::Err;
|
||||
use log::warn;
|
||||
|
||||
use lindera::mode::Mode;
|
||||
use lindera::segmenter::Segmenter;
|
||||
|
||||
@ -10,7 +10,7 @@ pub(crate) struct DocIdCollector<T> {
|
||||
}
|
||||
|
||||
pub(crate) struct DocIdChildCollector<T> {
|
||||
docs: Vec<T>,
|
||||
milvus_doc_ids: Vec<T>,
|
||||
column: Column<i64>,
|
||||
}
|
||||
|
||||
@ -24,7 +24,7 @@ impl Collector for DocIdCollector<u32> {
|
||||
segment: &SegmentReader,
|
||||
) -> tantivy::Result<Self::Child> {
|
||||
Ok(DocIdChildCollector {
|
||||
docs: Vec::new(),
|
||||
milvus_doc_ids: Vec::new(),
|
||||
column: segment.fast_fields().i64("doc_id").unwrap(),
|
||||
})
|
||||
}
|
||||
@ -51,14 +51,22 @@ impl Collector for DocIdCollector<u32> {
|
||||
impl SegmentCollector for DocIdChildCollector<u32> {
|
||||
type Fruit = Vec<u32>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
self.column.values_for_doc(doc).for_each(|doc_id| {
|
||||
self.docs.push(doc_id as u32);
|
||||
})
|
||||
fn collect_block(&mut self, docs: &[DocId]) {
|
||||
self.milvus_doc_ids.extend(
|
||||
self.column
|
||||
.values_for_docs_flatten(docs)
|
||||
.into_iter()
|
||||
.map(|val| val as u32),
|
||||
);
|
||||
}
|
||||
|
||||
fn harvest(self) -> Self::Fruit {
|
||||
self.docs
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
// Unreachable code actually.
|
||||
self.collect_block(&[doc]);
|
||||
}
|
||||
|
||||
fn harvest(mut self) -> Self::Fruit {
|
||||
self.milvus_doc_ids
|
||||
}
|
||||
}
|
||||
|
||||
@ -72,7 +80,7 @@ impl Collector for DocIdCollector<i64> {
|
||||
segment: &SegmentReader,
|
||||
) -> tantivy::Result<Self::Child> {
|
||||
Ok(DocIdChildCollector {
|
||||
docs: Vec::new(),
|
||||
milvus_doc_ids: Vec::new(),
|
||||
column: segment.fast_fields().i64("doc_id").unwrap(),
|
||||
})
|
||||
}
|
||||
@ -99,13 +107,17 @@ impl Collector for DocIdCollector<i64> {
|
||||
impl SegmentCollector for DocIdChildCollector<i64> {
|
||||
type Fruit = Vec<i64>;
|
||||
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
self.column.values_for_doc(doc).for_each(|doc_id| {
|
||||
self.docs.push(doc_id);
|
||||
})
|
||||
fn collect_block(&mut self, docs: &[DocId]) {
|
||||
self.milvus_doc_ids
|
||||
.extend(self.column.values_for_docs_flatten(docs));
|
||||
}
|
||||
|
||||
fn harvest(self) -> Self::Fruit {
|
||||
self.docs
|
||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||
// Unreachable code actually.
|
||||
self.collect_block(&[doc]);
|
||||
}
|
||||
|
||||
fn harvest(mut self) -> Self::Fruit {
|
||||
self.milvus_doc_ids
|
||||
}
|
||||
}
|
||||
|
||||
@ -288,8 +288,8 @@ mod test {
|
||||
|
||||
use tantivy::{
|
||||
doc,
|
||||
schema::{self, Schema, STORED, STRING, TEXT},
|
||||
Index, IndexWriter,
|
||||
schema::{Schema, STORED, STRING},
|
||||
Index,
|
||||
};
|
||||
|
||||
use super::IndexReaderWrapper;
|
||||
|
||||
@ -5,7 +5,7 @@ use tantivy::{
|
||||
};
|
||||
|
||||
use crate::error::Result;
|
||||
use crate::{index_reader::IndexReaderWrapper, analyzer::standard_analyzer};
|
||||
use crate::{analyzer::standard_analyzer, index_reader::IndexReaderWrapper};
|
||||
|
||||
impl IndexReaderWrapper {
|
||||
// split the query string into multiple tokens using index's default tokenizer,
|
||||
@ -62,13 +62,14 @@ impl IndexReaderWrapper {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tantivy::query::TermQuery;
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::{index_writer::IndexWriterWrapper, tokenizer::create_tokenizer};
|
||||
use crate::{analyzer::create_analyzer, index_writer::IndexWriterWrapper};
|
||||
#[test]
|
||||
fn test_jeba() {
|
||||
let params = "{\"tokenizer\": \"jieba\"}".to_string();
|
||||
let tokenizer = create_tokenizer(&params).unwrap();
|
||||
let tokenizer = create_analyzer(&params).unwrap();
|
||||
let dir = TempDir::new().unwrap();
|
||||
|
||||
let mut writer = IndexWriterWrapper::create_text_writer(
|
||||
@ -96,4 +97,36 @@ mod tests {
|
||||
let res = reader.phrase_match_query("网球滑雪", slop).unwrap();
|
||||
assert_eq!(res, vec![0, 1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read() {
|
||||
let tokenizer = create_analyzer("").unwrap();
|
||||
let dir = TempDir::new().unwrap();
|
||||
let mut writer = IndexWriterWrapper::create_text_writer(
|
||||
"text".to_string(),
|
||||
dir.path().to_str().unwrap().to_string(),
|
||||
"default".to_string(),
|
||||
tokenizer,
|
||||
1,
|
||||
50_000_000,
|
||||
false,
|
||||
);
|
||||
|
||||
for i in 0..10000 {
|
||||
writer.add_string("hello world", i).unwrap();
|
||||
}
|
||||
writer.commit().unwrap();
|
||||
|
||||
let reader = writer.create_reader().unwrap();
|
||||
|
||||
let query = TermQuery::new(
|
||||
tantivy::Term::from_field_text(reader.field.clone(), "hello"),
|
||||
tantivy::schema::IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let res = reader.search(&query).unwrap();
|
||||
assert_eq!(res, (0..10000).collect::<Vec<u32>>());
|
||||
let res = reader.search_i64(&query).unwrap();
|
||||
assert_eq!(res, (0..10000).collect::<Vec<i64>>());
|
||||
}
|
||||
}
|
||||
|
||||
@ -6,12 +6,9 @@ use futures::executor::block_on;
|
||||
use libc::c_char;
|
||||
use log::info;
|
||||
use tantivy::schema::{
|
||||
Field, IndexRecordOption, OwnedValue, Schema, SchemaBuilder, TextFieldIndexing, TextOptions,
|
||||
FAST, INDEXED,
|
||||
};
|
||||
use tantivy::{
|
||||
doc, tokenizer, Document, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument,
|
||||
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
|
||||
};
|
||||
use tantivy::{doc, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument};
|
||||
|
||||
use crate::data_type::TantivyDataType;
|
||||
|
||||
|
||||
@ -5,7 +5,6 @@ use crate::array::RustResult;
|
||||
use crate::cstr_to_str;
|
||||
use crate::index_writer::IndexWriterWrapper;
|
||||
use crate::log::init_log;
|
||||
use crate::string_c::c_str_to_str;
|
||||
use crate::analyzer::create_analyzer;
|
||||
use crate::util::create_binding;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user