enhance: update tantivy version (#39253)

https://github.com/milvus-io/milvus/issues/39254

---------

Signed-off-by: sunby <sunbingyi1992@gmail.com>
This commit is contained in:
Bingyi Sun 2025-02-08 14:08:43 +08:00 committed by GitHub
parent cd56d64ec4
commit c13fc8cd19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 391 additions and 415 deletions

File diff suppressed because it is too large Load Diff

View File

@ -6,7 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "0.21.1-fix3" } # We have made a private fix for milvus; this should be removed in the future once milvus fixes the bug.
tantivy = { git = "https://github.com/milvus-io/tantivy", tag = "v0.1.0" } # We have made a private fix for milvus; this should be removed in the future once milvus fixes the bug.
futures = "0.3.21"
libc = "0.2"
scopeguard = "1.2"

View File

@ -40,7 +40,7 @@ impl IndexReaderWrapper {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit) // OnCommit serves the growing segment.
.reload_policy(ReloadPolicy::OnCommitWithDelay) // OnCommitWithDelay serves the growing segment.
.try_into()?;
reader.reload()?;
@ -98,11 +98,9 @@ impl IndexReaderWrapper {
lower_bound: i64,
inclusive: bool,
) -> Result<Vec<u32>> {
let q = RangeQuery::new_i64_bounds(
self.field_name.to_string(),
make_bounds(lower_bound, inclusive),
Bound::Unbounded,
);
let term = Term::from_field_i64(self.field, lower_bound);
let q = RangeQuery::new(make_bounds(term, inclusive), Bound::Unbounded);
self.search(&q)
}
@ -111,11 +109,8 @@ impl IndexReaderWrapper {
upper_bound: i64,
inclusive: bool,
) -> Result<Vec<u32>> {
let q = RangeQuery::new_i64_bounds(
self.field_name.to_string(),
Bound::Unbounded,
make_bounds(upper_bound, inclusive),
);
let term = Term::from_field_i64(self.field, upper_bound);
let q = RangeQuery::new(Bound::Unbounded, make_bounds(term, inclusive));
self.search(&q)
}
@ -126,9 +121,9 @@ impl IndexReaderWrapper {
lb_inclusive: bool,
ub_inclusive: bool,
) -> Result<Vec<u32>> {
let lb = make_bounds(lower_bound, lb_inclusive);
let ub = make_bounds(upper_bound, ub_inclusive);
let q = RangeQuery::new_i64_bounds(self.field_name.to_string(), lb, ub);
let lb = make_bounds(Term::from_field_i64(self.field, lower_bound), lb_inclusive);
let ub = make_bounds(Term::from_field_i64(self.field, upper_bound), ub_inclusive);
let q = RangeQuery::new(lb, ub);
self.search(&q)
}
@ -145,9 +140,8 @@ impl IndexReaderWrapper {
lower_bound: f64,
inclusive: bool,
) -> Result<Vec<u32>> {
let q = RangeQuery::new_f64_bounds(
self.field_name.to_string(),
make_bounds(lower_bound, inclusive),
let q = RangeQuery::new(
make_bounds(Term::from_field_f64(self.field, lower_bound), inclusive),
Bound::Unbounded,
);
self.search(&q)
@ -158,10 +152,9 @@ impl IndexReaderWrapper {
upper_bound: f64,
inclusive: bool,
) -> Result<Vec<u32>> {
let q = RangeQuery::new_f64_bounds(
self.field_name.to_string(),
let q = RangeQuery::new(
Bound::Unbounded,
make_bounds(upper_bound, inclusive),
make_bounds(Term::from_field_f64(self.field, upper_bound), inclusive),
);
self.search(&q)
}
@ -173,9 +166,9 @@ impl IndexReaderWrapper {
lb_inclusive: bool,
ub_inclusive: bool,
) -> Result<Vec<u32>> {
let lb = make_bounds(lower_bound, lb_inclusive);
let ub = make_bounds(upper_bound, ub_inclusive);
let q = RangeQuery::new_f64_bounds(self.field_name.to_string(), lb, ub);
let lb = make_bounds(Term::from_field_f64(self.field, lower_bound), lb_inclusive);
let ub = make_bounds(Term::from_field_f64(self.field, upper_bound), ub_inclusive);
let q = RangeQuery::new(lb, ub);
self.search(&q)
}
@ -200,9 +193,8 @@ impl IndexReaderWrapper {
lower_bound: &str,
inclusive: bool,
) -> Result<Vec<u32>> {
let q = RangeQuery::new_str_bounds(
self.field_name.to_string(),
make_bounds(lower_bound, inclusive),
let q = RangeQuery::new(
make_bounds(Term::from_field_text(self.field, lower_bound), inclusive),
Bound::Unbounded,
);
self.search(&q)
@ -213,10 +205,9 @@ impl IndexReaderWrapper {
upper_bound: &str,
inclusive: bool,
) -> Result<Vec<u32>> {
let q = RangeQuery::new_str_bounds(
self.field_name.to_string(),
let q = RangeQuery::new(
Bound::Unbounded,
make_bounds(upper_bound, inclusive),
make_bounds(Term::from_field_text(self.field, upper_bound), inclusive),
);
self.search(&q)
}
@ -228,9 +219,9 @@ impl IndexReaderWrapper {
lb_inclusive: bool,
ub_inclusive: bool,
) -> Result<Vec<u32>> {
let lb = make_bounds(lower_bound, lb_inclusive);
let ub = make_bounds(upper_bound, ub_inclusive);
let q = RangeQuery::new_str_bounds(self.field_name.to_string(), lb, ub);
let lb = make_bounds(Term::from_field_text(self.field, lower_bound), lb_inclusive);
let ub = make_bounds(Term::from_field_text(self.field, upper_bound), ub_inclusive);
let q = RangeQuery::new(lb, ub);
self.search(&q)
}

View File

@ -6,9 +6,12 @@ use futures::executor::block_on;
use libc::c_char;
use log::info;
use tantivy::schema::{
Field, IndexRecordOption, Schema, SchemaBuilder, TextFieldIndexing, TextOptions, FAST, INDEXED,
Field, IndexRecordOption, OwnedValue, Schema, SchemaBuilder, TextFieldIndexing, TextOptions,
FAST, INDEXED,
};
use tantivy::{
doc, tokenizer, Document, Index, IndexWriter, SingleSegmentIndexWriter, TantivyDocument,
};
use tantivy::{doc, Document, Index, IndexWriter, SingleSegmentIndexWriter};
use crate::data_type::TantivyDataType;
@ -55,7 +58,10 @@ impl IndexWriterWrapper {
overall_memory_budget_in_bytes: usize,
) -> Result<IndexWriterWrapper> {
init_log();
info!("create index writer, field_name: {}, data_type: {:?}", field_name, data_type);
info!(
"create index writer, field_name: {}, data_type: {:?}",
field_name, data_type
);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
// We cannot build a direct connection from rows in multiple segments to milvus row data, so we have this doc_id field.
@ -78,7 +84,10 @@ impl IndexWriterWrapper {
path: String,
) -> Result<IndexWriterWrapper> {
init_log();
info!("create single segment index writer, field_name: {}, data_type: {:?}", field_name, data_type);
info!(
"create single segment index writer, field_name: {}, data_type: {:?}",
field_name, data_type
);
let mut schema_builder = Schema::builder();
let field = schema_builder_add_field(&mut schema_builder, &field_name, data_type);
let schema = schema_builder.build();
@ -96,7 +105,7 @@ impl IndexWriterWrapper {
IndexReaderWrapper::from_index(self.index.clone())
}
fn index_writer_add_document(&self, document: Document) -> Result<()> {
fn index_writer_add_document(&self, document: TantivyDocument) -> Result<()> {
match self.index_writer {
Either::Left(ref writer) => {
let _ = writer.add_document(document)?;
@ -108,7 +117,10 @@ impl IndexWriterWrapper {
Ok(())
}
fn single_segment_index_writer_add_document(&mut self, document: Document) -> Result<()> {
fn single_segment_index_writer_add_document(
&mut self,
document: TantivyDocument,
) -> Result<()> {
match self.index_writer {
Either::Left(_) => {
panic!("unexpected writer");
@ -165,70 +177,70 @@ impl IndexWriterWrapper {
}
pub fn add_multi_i8s(&mut self, datas: &[i8], offset: i64) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
document.add_field_value(self.field, &(*data as i64));
}
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_i16s(&mut self, datas: &[i16], offset: i64) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
document.add_field_value(self.field, &(*data as i64));
}
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_i32s(&mut self, datas: &[i32], offset: i64) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
document.add_field_value(self.field, &(*data as i64));
}
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_i64s(&mut self, datas: &[i64], offset: i64) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data);
document.add_field_value(self.field, data);
}
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_f32s(&mut self, datas: &[f32], offset: i64) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data as f64);
document.add_field_value(self.field, &(*data as f64));
}
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_f64s(&mut self, datas: &[f64], offset: i64) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data);
document.add_field_value(self.field, data);
}
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_bools(&mut self, datas: &[bool], offset: i64) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data);
document.add_field_value(self.field, data);
}
document.add_i64(self.id_field.unwrap(), offset);
self.index_writer_add_document(document)
}
pub fn add_multi_keywords(&mut self, datas: &[*const c_char], offset: i64) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for element in datas {
let data = unsafe { CStr::from_ptr(*element) };
document.add_field_value(self.field, data.to_str()?);
@ -278,57 +290,57 @@ impl IndexWriterWrapper {
}
pub fn add_multi_i8s_by_single_segment_writer(&mut self, datas: &[i8]) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
document.add_field_value(self.field, &(*data as i64));
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_i16s_by_single_segment_writer(&mut self, datas: &[i16]) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
document.add_field_value(self.field, &(*data as i64));
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_i32s_by_single_segment_writer(&mut self, datas: &[i32]) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data as i64);
document.add_field_value(self.field, &(*data as i64));
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_i64s_by_single_segment_writer(&mut self, datas: &[i64]) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data);
document.add_field_value(self.field, data);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_f32s_by_single_segment_writer(&mut self, datas: &[f32]) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data as f64);
document.add_field_value(self.field, &(*data as f64));
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_f64s_by_single_segment_writer(&mut self, datas: &[f64]) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data);
document.add_field_value(self.field, data);
}
self.single_segment_index_writer_add_document(document)
}
pub fn add_multi_bools_by_single_segment_writer(&mut self, datas: &[bool]) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for data in datas {
document.add_field_value(self.field, *data);
document.add_field_value(self.field, data);
}
self.single_segment_index_writer_add_document(document)
}
@ -337,7 +349,7 @@ impl IndexWriterWrapper {
&mut self,
datas: &[*const c_char],
) -> Result<()> {
let mut document = Document::default();
let mut document = TantivyDocument::default();
for element in datas {
let data = unsafe { CStr::from_ptr(*element) };
document.add_field_value(self.field, data.to_str()?);