fix: fix some bugs discovered by chaos tests --- cherry pick (#42909)

master: https://github.com/milvus-io/milvus/pull/42906
issue: https://github.com/milvus-io/milvus/issues/42870

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
Spade A 2025-06-24 17:10:42 +08:00 committed by GitHub
parent 1e55112de7
commit 017eb9ffe2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
20 changed files with 85 additions and 62 deletions

View File

@ -254,8 +254,8 @@ InvertedIndexTantivy<T>::Load(milvus::tracer::TraceContext ctx,
inverted_index_files.end());
disk_file_manager_->CacheIndexToDisk(inverted_index_files);
path_ = prefix;
wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str(),
milvus::index::SetBitset);
wrapper_ = std::make_shared<TantivyIndexWrapper>(
prefix.c_str(), milvus::index::SetBitsetSealed);
}
template <typename T>
@ -482,7 +482,7 @@ InvertedIndexTantivy<T>::BuildWithRawDataForUT(size_t n,
static_cast<const T*>(values), n);
}
}
wrapper_->create_reader();
wrapper_->create_reader(milvus::index::SetBitsetSealed);
finish();
wrapper_->reload();
}

View File

@ -108,8 +108,8 @@ class JsonInvertedIndex : public index::InvertedIndexTantivy<T> {
}
void
create_reader() {
this->wrapper_->create_reader();
create_reader(SetBitsetFn set_bitset) {
this->wrapper_->create_reader(set_bitset);
}
bool

View File

@ -386,8 +386,8 @@ JsonKeyStatsInvertedIndex::Load(milvus::tracer::TraceContext ctx,
disk_file_manager_->CacheJsonKeyIndexToDisk(index_files.value());
AssertInfo(
tantivy_index_exist(path_.c_str()), "index not exist: {}", path_);
wrapper_ = std::make_shared<TantivyIndexWrapper>(path_.c_str(),
milvus::index::SetBitset);
wrapper_ = std::make_shared<TantivyIndexWrapper>(
path_.c_str(), milvus::index::SetBitsetSealed);
LOG_INFO("load json key index done for field id:{} with dir:{}",
field_id_,
path_);
@ -497,8 +497,8 @@ JsonKeyStatsInvertedIndex::Reload() {
}
void
JsonKeyStatsInvertedIndex::CreateReader() {
wrapper_->create_reader();
JsonKeyStatsInvertedIndex::CreateReader(SetBitsetFn set_bitset) {
wrapper_->create_reader(set_bitset);
}
} // namespace milvus::index

View File

@ -133,7 +133,7 @@ class JsonKeyStatsInvertedIndex : public InvertedIndexTantivy<std::string> {
Reload();
void
CreateReader();
CreateReader(SetBitsetFn set_bitset);
bool
has_escape_sequence(const std::string& str) {

View File

@ -149,8 +149,8 @@ TextMatchIndex::Load(const Config& config) {
disk_file_manager_->CacheTextLogToDisk(files_value);
AssertInfo(
tantivy_index_exist(prefix.c_str()), "index not exist: {}", prefix);
wrapper_ = std::make_shared<TantivyIndexWrapper>(prefix.c_str(),
milvus::index::SetBitset);
wrapper_ = std::make_shared<TantivyIndexWrapper>(
prefix.c_str(), milvus::index::SetBitsetSealed);
}
void
@ -269,8 +269,8 @@ TextMatchIndex::Reload() {
}
void
TextMatchIndex::CreateReader() {
wrapper_->create_reader();
TextMatchIndex::CreateReader(SetBitsetFn set_bitset) {
wrapper_->create_reader(set_bitset);
}
void

View File

@ -74,7 +74,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
public:
void
CreateReader();
CreateReader(SetBitsetFn set_bitset);
void
RegisterTokenizer(const char* tokenizer_name, const char* analyzer_params);

View File

@ -199,13 +199,31 @@ CheckAndUpdateKnowhereRangeSearchParam(const SearchInfo& search_info,
const MetricType& metric_type,
knowhere::Json& search_config);
void inline SetBitset(void* bitset, const uint32_t* doc_id, uintptr_t n) {
// For sealed segments, every doc_id is guaranteed to be less than the bitset size, which equals tantivy's doc count before querying.
void inline SetBitsetSealed(void* bitset, const uint32_t* doc_id, uintptr_t n) {
    // `bitset` arrives type-erased through the callback signature; it is
    // treated as a TargetBitmap here.
    auto* target = static_cast<TargetBitmap*>(bitset);
    const auto limit = target->size();
    for (uintptr_t idx = 0; idx != n; ++idx) {
        const uint32_t doc = doc_id[idx];
        // Debug-only guard: ids are not filtered in this variant, so an
        // out-of-range id is only caught when assertions are enabled.
        assert(doc < limit);
        (*target)[doc] = true;
    }
}
// Bitset callback for growing segments. Inserts may run concurrently with a
// query, so a reported doc_id can be at or beyond the bitset size; such ids are
// dropped instead of being written.
void inline SetBitsetGrowing(void* bitset,
                             const uint32_t* doc_id,
                             uintptr_t n) {
    // `bitset` arrives type-erased through the callback signature; it is
    // treated as a TargetBitmap here.
    auto* target = static_cast<TargetBitmap*>(bitset);
    const auto limit = target->size();
    for (uintptr_t idx = 0; idx != n; ++idx) {
        const auto doc = doc_id[idx];
        // No ordering of doc ids is relied upon, so an out-of-range id is
        // skipped and the scan keeps going rather than stopping early.
        if (doc < limit) {
            (*target)[doc] = true;
        }
    }
}
} // namespace milvus::index

View File

@ -1523,7 +1523,7 @@ ChunkedSegmentSealedImpl::CreateTextIndex(FieldId field_id) {
}
// create index reader.
index->CreateReader();
index->CreateReader(milvus::index::SetBitsetSealed);
// release index writer.
index->Finish();

View File

@ -894,7 +894,7 @@ SegmentGrowingImpl::CreateTextIndex(FieldId field_id) {
"milvus_tokenizer",
field_meta.get_analyzer_params().c_str());
index->Commit();
index->CreateReader();
index->CreateReader(milvus::index::SetBitsetGrowing);
index->RegisterTokenizer("milvus_tokenizer",
field_meta.get_analyzer_params().c_str());
text_indexes_[field_id] = std::move(index);
@ -955,7 +955,7 @@ SegmentGrowingImpl::CreateJSONIndex(FieldId field_id) {
JSON_KEY_STATS_COMMIT_INTERVAL, unique_id.c_str());
index->Commit();
index->CreateReader();
index->CreateReader(milvus::index::SetBitsetGrowing);
json_indexes_[field_id] = std::move(index);
}

View File

@ -2138,7 +2138,7 @@ SegmentSealedImpl::CreateTextIndex(FieldId field_id) {
}
// create index reader.
index->CreateReader();
index->CreateReader(milvus::index::SetBitsetGrowing);
// release index writer.
index->Finish();

View File

@ -37,7 +37,7 @@ main(int argc, char* argv[]) {
text_index.commit();
}
text_index.create_reader();
text_index.create_reader(milvus::index::SetBitsetGrowing);
text_index.register_tokenizer(tokenizer_name.c_str(), analyzer_params);
{

View File

@ -1,3 +1,4 @@
use core::slice;
use std::ffi::CStr;
use std::sync::Arc;
@ -10,6 +11,7 @@ use tantivy::schema::{
};
use tantivy::{doc, Document, Index, IndexWriter, SingleSegmentIndexWriter, UserOperation};
use crate::convert_to_rust_slice;
use crate::data_type::TantivyDataType;
use crate::error::{Result, TantivyBindingError};
@ -196,7 +198,7 @@ impl IndexWriterWrapper {
.to_str()
.map_err(|e| TantivyBindingError::InternalError(e.to_string()))?;
let json_offsets =
unsafe { std::slice::from_raw_parts(*json_offsets, *json_offsets_len) };
unsafe { convert_to_rust_slice!(*json_offsets, *json_offsets_len) };
for offset in json_offsets {
batch.push(UserOperation::Add(doc!(

View File

@ -11,6 +11,7 @@ use crate::{
util::{create_binding, free_binding},
};
#[macro_export]
macro_rules! convert_to_rust_slice {
($arr: expr, $len: expr) => {
match $arr {
@ -127,7 +128,7 @@ pub extern "C" fn tantivy_index_add_int8s(
offset_begin: i64,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i8, &mut (*real)).into() }
}
@ -138,7 +139,7 @@ pub extern "C" fn tantivy_index_add_int8s_by_single_segment_writer(
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
@ -157,7 +158,7 @@ pub extern "C" fn tantivy_index_add_int16s(
offset_begin: i64,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i16, &mut (*real)).into() }
}
@ -168,7 +169,7 @@ pub extern "C" fn tantivy_index_add_int16s_by_single_segment_writer(
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
@ -187,7 +188,7 @@ pub extern "C" fn tantivy_index_add_int32s(
offset_begin: i64,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i32, &mut (*real)).into() }
}
@ -198,7 +199,7 @@ pub extern "C" fn tantivy_index_add_int32s_by_single_segment_writer(
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
@ -217,7 +218,7 @@ pub extern "C" fn tantivy_index_add_int64s(
offset_begin: i64,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_i64, &mut (*real)).into() }
}
@ -229,7 +230,7 @@ pub extern "C" fn tantivy_index_add_int64s_by_single_segment_writer(
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe {
execute_by_single_segment_writer(
@ -249,7 +250,7 @@ pub extern "C" fn tantivy_index_add_f32s(
offset_begin: i64,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f32, &mut (*real)).into() }
}
@ -260,7 +261,7 @@ pub extern "C" fn tantivy_index_add_f32s_by_single_segment_writer(
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
@ -279,7 +280,7 @@ pub extern "C" fn tantivy_index_add_f64s(
offset_begin: i64,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe { execute(arr, offset_begin, IndexWriterWrapper::add_f64, &mut (*real)).into() }
}
@ -290,7 +291,7 @@ pub extern "C" fn tantivy_index_add_f64s_by_single_segment_writer(
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
@ -309,7 +310,7 @@ pub extern "C" fn tantivy_index_add_bools(
offset_begin: i64,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe {
execute(
arr,
@ -330,9 +331,9 @@ pub extern "C" fn tantivy_index_add_json_key_stats_data_by_batch(
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let json_offsets_len = unsafe { slice::from_raw_parts(json_offsets_len, len) };
let json_offsets = unsafe { slice::from_raw_parts(json_offsets, len) };
let keys = unsafe { slice::from_raw_parts(keys, len) };
let json_offsets_len = unsafe { convert_to_rust_slice!(json_offsets_len, len) };
let json_offsets = unsafe { convert_to_rust_slice!(json_offsets, len) };
let keys = unsafe { convert_to_rust_slice!(keys, len) };
unsafe {
(*real)
.add_json_key_stats(keys, json_offsets, json_offsets_len)
@ -347,7 +348,7 @@ pub extern "C" fn tantivy_index_add_bools_by_single_segment_writer(
len: usize,
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
let arr = unsafe { slice::from_raw_parts(array, len) };
let arr = unsafe { convert_to_rust_slice!(array, len) };
unsafe {
execute_by_single_segment_writer(
arr,
@ -392,7 +393,7 @@ pub extern "C" fn tantivy_index_add_multi_int8s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = slice::from_raw_parts(array, len);
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_i8s(arr, offset).into()
}
}
@ -419,7 +420,7 @@ pub extern "C" fn tantivy_index_add_multi_int16s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = slice::from_raw_parts(array, len);
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_i16s(arr, offset).into()
}
}
@ -446,7 +447,7 @@ pub extern "C" fn tantivy_index_add_multi_int32s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = slice::from_raw_parts(array, len);
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_i32s(arr, offset).into()
}
}
@ -473,7 +474,7 @@ pub extern "C" fn tantivy_index_add_multi_int64s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = slice::from_raw_parts(array, len);
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_i64s(arr, offset).into()
}
}
@ -500,7 +501,7 @@ pub extern "C" fn tantivy_index_add_multi_f32s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = slice::from_raw_parts(array, len);
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_f32s(arr, offset).into()
}
}
@ -527,7 +528,7 @@ pub extern "C" fn tantivy_index_add_multi_f64s(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = slice::from_raw_parts(array, len);
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_f64s(arr, offset).into()
}
}
@ -554,7 +555,7 @@ pub extern "C" fn tantivy_index_add_multi_bools(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = slice::from_raw_parts(array, len);
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_bools(arr, offset).into()
}
}
@ -581,7 +582,7 @@ pub extern "C" fn tantivy_index_add_multi_keywords(
) -> RustResult {
let real = ptr as *mut IndexWriterWrapper;
unsafe {
let arr = slice::from_raw_parts(array, len);
let arr = convert_to_rust_slice!(array, len);
(*real).add_multi_keywords(arr, offset).into()
}
}

View File

@ -1,3 +1,5 @@
use crate::convert_to_rust_slice;
use core::slice;
use std::ffi::c_void;
use std::ops::Bound;
use tantivy::{directory::MmapDirectory, Index};
@ -31,6 +33,6 @@ pub fn free_binding<T>(ptr: *mut c_void) {
/// Test-only bitset callback: appends every reported doc id to the `Vec<u32>`
/// that `bitset` points to.
///
/// `bitset` is dereferenced as `*mut Vec<u32>`, and `doc_id`/`len` describe
/// the ids to append.
#[cfg(test)]
pub extern "C" fn set_bitset(bitset: *mut c_void, doc_id: *const u32, len: usize) {
    let bitset = unsafe { &mut *(bitset as *mut Vec<u32>) };
    // Build the slice through `convert_to_rust_slice!` only. A direct
    // `std::slice::from_raw_parts(doc_id, len)` is undefined behavior when
    // `doc_id` is null, even for `len == 0`, so it must not run first.
    // NOTE(review): the macro body is not visible here; presumably it maps a
    // null pointer to an empty slice - confirm.
    let docs = unsafe { convert_to_rust_slice!(doc_id, len) };
    bitset.extend_from_slice(docs);
}

View File

@ -144,10 +144,10 @@ struct TantivyIndexWrapper {
}
// create reader.
void
create_reader() {
create_reader(SetBitsetFn set_bitset) {
if (writer_ != nullptr) {
auto res = RustResultWrapper(tantivy_create_reader_from_writer(
writer_, milvus::index::SetBitset));
auto res = RustResultWrapper(
tantivy_create_reader_from_writer(writer_, set_bitset));
AssertInfo(res.result_->success,
"failed to create reader from writer: {}",
res.result_->error);
@ -155,7 +155,7 @@ struct TantivyIndexWrapper {
} else if (!path_.empty()) {
assert(tantivy_index_exist(path_.c_str()));
auto res = RustResultWrapper(
tantivy_load_index(path_.c_str(), milvus::index::SetBitset));
tantivy_load_index(path_.c_str(), set_bitset));
AssertInfo(res.result_->success,
"failed to load index: {}",
res.result_->error);

View File

@ -31,7 +31,7 @@ main(int argc, char* argv[]) {
text_index.commit();
}
text_index.create_reader();
text_index.create_reader(milvus::index::SetBitsetSealed);
{
auto result = to_set(text_index.match_query("football"));
assert(result.size() == 2);

View File

@ -16617,7 +16617,7 @@ TYPED_TEST(JsonIndexTestFixture, TestJsonIndexUnaryExpr) {
json_index->BuildWithFieldData({json_field});
json_index->finish();
json_index->create_reader();
json_index->create_reader(milvus::index::SetBitsetSealed);
load_index_info.field_id = json_fid.get();
load_index_info.field_type = DataType::JSON;
@ -16745,7 +16745,7 @@ TEST(JsonIndexTest, TestJsonNotEqualExpr) {
json_index->BuildWithFieldData({json_field, json_field2});
json_index->finish();
json_index->create_reader();
json_index->create_reader(milvus::index::SetBitsetSealed);
load_index_info.field_id = json_fid.get();
load_index_info.field_type = DataType::JSON;
@ -16845,7 +16845,7 @@ TEST_P(JsonIndexExistsTest, TestExistsExpr) {
json_index->BuildWithFieldData({json_field});
json_index->finish();
json_index->create_reader();
json_index->create_reader(milvus::index::SetBitsetSealed);
load_index_info.field_id = json_fid.get();
load_index_info.field_type = DataType::JSON;
@ -17021,7 +17021,7 @@ TEST_P(JsonIndexBinaryExprTest, TestBinaryRangeExpr) {
json_index->BuildWithFieldData({json_field});
json_index->finish();
json_index->create_reader();
json_index->create_reader(milvus::index::SetBitsetSealed);
load_index_info.field_id = json_fid.get();
load_index_info.field_type = DataType::JSON;

View File

@ -134,7 +134,7 @@ TEST(JsonIndexTest, TestJsonContains) {
json_field->add_json_data(jsons);
json_index->BuildWithFieldData({json_field});
json_index->finish();
json_index->create_reader();
json_index->create_reader(milvus::index::SetBitsetSealed);
auto segment = segcore::CreateSealedSegment(schema);
segcore::LoadIndexInfo load_index_info;
@ -226,7 +226,7 @@ TEST(JsonIndexTest, TestJsonCast) {
json_field->add_json_data(jsons);
json_index->BuildWithFieldData({json_field});
json_index->finish();
json_index->create_reader();
json_index->create_reader(milvus::index::SetBitsetSealed);
auto segment = segcore::CreateSealedSegment(schema);
segcore::LoadIndexInfo load_index_info;

View File

@ -561,7 +561,7 @@ TEST(GrowingJsonKeyStatsIndexTest, GrowingIndex) {
for (const auto& jsonData : jsonDatas) {
jsons.push_back(milvus::Json(simdjson::padded_string(jsonData)));
}
index->CreateReader();
index->CreateReader(milvus::index::SetBitsetGrowing);
index->AddJSONDatas(jsonDatas.size(), jsonDatas.data(), nullptr, 0);
index->Commit();
index->Reload();

View File

@ -134,7 +134,7 @@ TEST(TextMatch, Index) {
"unique_id",
"milvus_tokenizer",
"{}");
index->CreateReader();
index->CreateReader(milvus::index::SetBitsetGrowing);
index->AddText("football, basketball, pingpang", true, 0);
index->AddText("", false, 1);
index->AddText("swimming, football", true, 2);