mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
feat: more types of matches for ngram (#43081)
Ref https://github.com/milvus-io/milvus/issues/42053 This PR enable ngram to support more kinds of matches such as prefix and postfix match. --------- Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
parent
0aeac94f8a
commit
db91d85dbc
@ -1484,8 +1484,7 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImpl(EvalCtx& context) {
|
||||
fmt::format("match query does not support iterative filter"));
|
||||
}
|
||||
return ExecTextMatch();
|
||||
} else if (expr_->op_type_ == proto::plan::OpType::InnerMatch &&
|
||||
!has_offset_input_ && CanUseNgramIndex(field_id_)) {
|
||||
} else if (CanExecNgramMatch(expr_->op_type_)) {
|
||||
auto res = ExecNgramMatch();
|
||||
// If nullopt is returned, it means the query cannot be
|
||||
// optimized by ngram index. Forward it to the normal path.
|
||||
@ -1933,6 +1932,15 @@ PhyUnaryRangeFilterExpr::ExecTextMatch() {
|
||||
return res;
|
||||
};
|
||||
|
||||
bool
|
||||
PhyUnaryRangeFilterExpr::CanExecNgramMatch(proto::plan::OpType op_type) {
|
||||
return (op_type == proto::plan::OpType::InnerMatch ||
|
||||
op_type == proto::plan::OpType::Match ||
|
||||
op_type == proto::plan::OpType::PrefixMatch ||
|
||||
op_type == proto::plan::OpType::PostfixMatch) &&
|
||||
!has_offset_input_ && CanUseNgramIndex(field_id_);
|
||||
}
|
||||
|
||||
std::optional<VectorPtr>
|
||||
PhyUnaryRangeFilterExpr::ExecNgramMatch() {
|
||||
if (!arg_inited_) {
|
||||
@ -1951,7 +1959,7 @@ PhyUnaryRangeFilterExpr::ExecNgramMatch() {
|
||||
AssertInfo(index != nullptr,
|
||||
"ngram index should not be null, field_id: {}",
|
||||
field_id_.get());
|
||||
auto res_opt = index->InnerMatchQuery(literal, this);
|
||||
auto res_opt = index->ExecuteQuery(literal, expr_->op_type_, this);
|
||||
if (!res_opt.has_value()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
@ -506,6 +506,9 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
|
||||
VectorPtr
|
||||
ExecTextMatch();
|
||||
|
||||
bool
|
||||
CanExecNgramMatch(proto::plan::OpType op_type);
|
||||
|
||||
std::optional<VectorPtr>
|
||||
ExecNgramMatch();
|
||||
|
||||
|
||||
@ -191,122 +191,5 @@ GetValueWithCastNumber(const milvus::proto::plan::GenericValue& value_proto) {
|
||||
}
|
||||
}
|
||||
|
||||
enum class MatchType {
|
||||
ExactMatch,
|
||||
PrefixMatch,
|
||||
PostfixMatch,
|
||||
// The different between InnerMatch and Match is that InnerMatch is used for
|
||||
// %xxx% while Match could be %xxx%xxx%
|
||||
InnerMatch,
|
||||
Match
|
||||
};
|
||||
struct ParsedResult {
|
||||
std::string literal;
|
||||
MatchType type;
|
||||
};
|
||||
|
||||
// Not used now, but may be used in the future for other type of match for ngram index
|
||||
inline std::optional<ParsedResult>
|
||||
parse_ngram_pattern(const std::string& pattern) {
|
||||
if (pattern.empty()) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::vector<size_t> percent_indices;
|
||||
bool was_escaped = false;
|
||||
for (size_t i = 0; i < pattern.length(); ++i) {
|
||||
char c = pattern[i];
|
||||
if (c == '%' && !was_escaped) {
|
||||
percent_indices.push_back(i);
|
||||
} else if (c == '_' && !was_escaped) {
|
||||
// todo(SpadeA): now not support '_'
|
||||
return std::nullopt;
|
||||
}
|
||||
was_escaped = (c == '\\' && !was_escaped);
|
||||
}
|
||||
|
||||
MatchType match_type;
|
||||
size_t core_start = 0;
|
||||
size_t core_length = 0;
|
||||
size_t percent_count = percent_indices.size();
|
||||
|
||||
if (percent_count == 0) {
|
||||
match_type = MatchType::ExactMatch;
|
||||
core_start = 0;
|
||||
core_length = pattern.length();
|
||||
} else if (percent_count == 1) {
|
||||
if (pattern.length() == 1) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
size_t idx = percent_indices[0];
|
||||
// case: %xxx
|
||||
if (idx == 0 && pattern.length() > 1) {
|
||||
match_type = MatchType::PrefixMatch;
|
||||
core_start = 1;
|
||||
core_length = pattern.length() - 1;
|
||||
} else if (idx == pattern.length() - 1 && pattern.length() > 1) {
|
||||
// case: xxx%
|
||||
match_type = MatchType::PostfixMatch;
|
||||
core_start = 0;
|
||||
core_length = pattern.length() - 1;
|
||||
} else {
|
||||
// case: xxx%xxx
|
||||
match_type = MatchType::Match;
|
||||
}
|
||||
} else if (percent_count == 2) {
|
||||
size_t idx1 = percent_indices[0];
|
||||
size_t idx2 = percent_indices[1];
|
||||
if (idx1 == 0 && idx2 == pattern.length() - 1 && pattern.length() > 2) {
|
||||
// case: %xxx%
|
||||
match_type = MatchType::InnerMatch;
|
||||
core_start = 1;
|
||||
core_length = pattern.length() - 2;
|
||||
} else {
|
||||
match_type = MatchType::Match;
|
||||
}
|
||||
} else {
|
||||
match_type = MatchType::Match;
|
||||
}
|
||||
|
||||
if (match_type == MatchType::Match) {
|
||||
// not supported now
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Extract the literal from the pattern
|
||||
std::string_view core_pattern =
|
||||
std::string_view(pattern).substr(core_start, core_length);
|
||||
|
||||
std::string r;
|
||||
r.reserve(2 * core_pattern.size());
|
||||
bool escape_mode = false;
|
||||
for (char c : core_pattern) {
|
||||
if (escape_mode) {
|
||||
if (is_special(c)) {
|
||||
// todo(SpadeA): may not be suitable for ngram? Not use ngram in this case for now.
|
||||
return std::nullopt;
|
||||
}
|
||||
r += c;
|
||||
escape_mode = false;
|
||||
} else {
|
||||
if (c == '\\') {
|
||||
escape_mode = true;
|
||||
} else if (c == '%') {
|
||||
// should be unreachable
|
||||
} else if (c == '_') {
|
||||
// should be unreachable
|
||||
return std::nullopt;
|
||||
} else {
|
||||
if (is_special(c)) {
|
||||
r += '\\';
|
||||
}
|
||||
r += c;
|
||||
}
|
||||
}
|
||||
}
|
||||
return std::optional<ParsedResult>{ParsedResult{std::move(r), match_type}};
|
||||
}
|
||||
|
||||
} // namespace exec
|
||||
} // namespace milvus
|
||||
@ -106,47 +106,168 @@ NgramInvertedIndex::Load(milvus::tracer::TraceContext ctx,
|
||||
}
|
||||
|
||||
std::optional<TargetBitmap>
|
||||
NgramInvertedIndex::InnerMatchQuery(const std::string& literal,
|
||||
exec::SegmentExpr* segment) {
|
||||
NgramInvertedIndex::ExecuteQuery(const std::string& literal,
|
||||
proto::plan::OpType op_type,
|
||||
exec::SegmentExpr* segment) {
|
||||
if (literal.length() < min_gram_) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
switch (op_type) {
|
||||
case proto::plan::OpType::InnerMatch: {
|
||||
auto predicate = [&literal](const std::string_view& data) {
|
||||
return data.find(literal) != std::string::npos;
|
||||
};
|
||||
bool need_post_filter = literal.length() > max_gram_;
|
||||
return ExecuteQueryWithPredicate(
|
||||
literal, segment, predicate, need_post_filter);
|
||||
}
|
||||
case proto::plan::OpType::Match:
|
||||
return MatchQuery(literal, segment);
|
||||
case proto::plan::OpType::PrefixMatch: {
|
||||
auto predicate = [&literal](const std::string_view& data) {
|
||||
return data.length() >= literal.length() &&
|
||||
std::equal(literal.begin(), literal.end(), data.begin());
|
||||
};
|
||||
return ExecuteQueryWithPredicate(literal, segment, predicate, true);
|
||||
}
|
||||
case proto::plan::OpType::PostfixMatch: {
|
||||
auto predicate = [&literal](const std::string_view& data) {
|
||||
return data.length() >= literal.length() &&
|
||||
std::equal(
|
||||
literal.rbegin(), literal.rend(), data.rbegin());
|
||||
};
|
||||
return ExecuteQueryWithPredicate(literal, segment, predicate, true);
|
||||
}
|
||||
default:
|
||||
LOG_WARN("unsupported op type for ngram index: {}", op_type);
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
inline void
|
||||
handle_batch(const std::string_view* data,
|
||||
const int32_t* offsets,
|
||||
const int size,
|
||||
TargetBitmapView res,
|
||||
std::function<bool(const std::string_view&)> predicate) {
|
||||
auto next_off_option = res.find_first();
|
||||
while (next_off_option.has_value()) {
|
||||
auto next_off = next_off_option.value();
|
||||
if (next_off >= size) {
|
||||
return;
|
||||
}
|
||||
if (!predicate(data[next_off])) {
|
||||
res[next_off] = false;
|
||||
}
|
||||
next_off_option = res.find_next(next_off);
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<TargetBitmap>
|
||||
NgramInvertedIndex::ExecuteQueryWithPredicate(
|
||||
const std::string& literal,
|
||||
exec::SegmentExpr* segment,
|
||||
std::function<bool(const std::string_view&)> predicate,
|
||||
bool need_post_filter) {
|
||||
TargetBitmap bitset{static_cast<size_t>(Count())};
|
||||
wrapper_->inner_match_ngram(literal, min_gram_, max_gram_, &bitset);
|
||||
wrapper_->ngram_match_query(literal, min_gram_, max_gram_, &bitset);
|
||||
|
||||
// Post filtering: if the literal length is larger than the max_gram
|
||||
// we need to filter out the bitset
|
||||
if (literal.length() > max_gram_) {
|
||||
auto bitset_off = 0;
|
||||
TargetBitmapView res(bitset);
|
||||
TargetBitmap valid(res.size(), true);
|
||||
TargetBitmapView valid_res(valid.data(), valid.size());
|
||||
TargetBitmapView res(bitset);
|
||||
TargetBitmap valid(res.size(), true);
|
||||
TargetBitmapView valid_res(valid.data(), valid.size());
|
||||
|
||||
auto execute_sub_batch = [&literal](const std::string_view* data,
|
||||
const bool* valid_data,
|
||||
const int32_t* offsets,
|
||||
const int size,
|
||||
TargetBitmapView res,
|
||||
TargetBitmapView valid_res) {
|
||||
auto next_off_option = res.find_first();
|
||||
while (next_off_option.has_value()) {
|
||||
auto next_off = next_off_option.value();
|
||||
if (next_off >= size) {
|
||||
break;
|
||||
}
|
||||
if (data[next_off].find(literal) == std::string::npos) {
|
||||
res[next_off] = false;
|
||||
}
|
||||
next_off_option = res.find_next(next_off);
|
||||
}
|
||||
};
|
||||
if (need_post_filter) {
|
||||
auto execute_batch =
|
||||
[&predicate](
|
||||
const std::string_view* data,
|
||||
// `valid_data` is not used as the results returned by ngram_match_query are all valid
|
||||
const bool* _valid_data,
|
||||
const int32_t* offsets,
|
||||
const int size,
|
||||
TargetBitmapView res,
|
||||
// the same with `valid_data`
|
||||
TargetBitmapView _valid_res) {
|
||||
handle_batch(data, offsets, size, res, predicate);
|
||||
};
|
||||
|
||||
segment->ProcessAllDataChunk<std::string_view>(
|
||||
execute_sub_batch, std::nullptr_t{}, res, valid_res);
|
||||
execute_batch, std::nullptr_t{}, res, valid_res);
|
||||
}
|
||||
|
||||
return std::optional<TargetBitmap>(std::move(bitset));
|
||||
}
|
||||
|
||||
std::vector<std::string>
|
||||
split_by_wildcard(const std::string& literal) {
|
||||
std::vector<std::string> result;
|
||||
std::string r;
|
||||
r.reserve(literal.size());
|
||||
bool escape_mode = false;
|
||||
for (char c : literal) {
|
||||
if (escape_mode) {
|
||||
r += c;
|
||||
escape_mode = false;
|
||||
} else {
|
||||
if (c == '\\') {
|
||||
// consider case "\\%", we should reserve %
|
||||
escape_mode = true;
|
||||
} else if (c == '%' || c == '_') {
|
||||
if (r.length() > 0) {
|
||||
result.push_back(r);
|
||||
r.clear();
|
||||
}
|
||||
} else {
|
||||
r += c;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (r.length() > 0) {
|
||||
result.push_back(r);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::optional<TargetBitmap>
|
||||
NgramInvertedIndex::MatchQuery(const std::string& literal,
|
||||
exec::SegmentExpr* segment) {
|
||||
TargetBitmap bitset{static_cast<size_t>(Count())};
|
||||
auto literals = split_by_wildcard(literal);
|
||||
for (const auto& l : literals) {
|
||||
if (l.length() < min_gram_) {
|
||||
return std::nullopt;
|
||||
}
|
||||
wrapper_->ngram_match_query(l, min_gram_, max_gram_, &bitset);
|
||||
}
|
||||
|
||||
TargetBitmapView res(bitset);
|
||||
TargetBitmap valid(res.size(), true);
|
||||
TargetBitmapView valid_res(valid.data(), valid.size());
|
||||
|
||||
PatternMatchTranslator translator;
|
||||
auto regex_pattern = translator(literal);
|
||||
RegexMatcher matcher(regex_pattern);
|
||||
|
||||
auto predicate = [&matcher](const std::string_view& data) {
|
||||
return matcher(data);
|
||||
};
|
||||
|
||||
auto execute_batch =
|
||||
[&predicate](
|
||||
const std::string_view* data,
|
||||
// `_valid_data` is not used as the results returned by ngram_match_query are all valid
|
||||
const bool* _valid_data,
|
||||
const int32_t* offsets,
|
||||
const int size,
|
||||
TargetBitmapView res,
|
||||
// the same with `_valid_data`
|
||||
TargetBitmapView _valid_res) {
|
||||
handle_batch(data, offsets, size, res, predicate);
|
||||
};
|
||||
segment->ProcessAllDataChunk<std::string_view>(
|
||||
execute_batch, std::nullptr_t{}, res, valid_res);
|
||||
|
||||
return std::optional<TargetBitmap>(std::move(bitset));
|
||||
}
|
||||
|
||||
} // namespace milvus::index
|
||||
|
||||
@ -36,7 +36,21 @@ class NgramInvertedIndex : public InvertedIndexTantivy<std::string> {
|
||||
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
|
||||
|
||||
std::optional<TargetBitmap>
|
||||
InnerMatchQuery(const std::string& literal, exec::SegmentExpr* segment);
|
||||
ExecuteQuery(const std::string& literal,
|
||||
proto::plan::OpType op_type,
|
||||
exec::SegmentExpr* segment);
|
||||
|
||||
private:
|
||||
std::optional<TargetBitmap>
|
||||
ExecuteQueryWithPredicate(
|
||||
const std::string& literal,
|
||||
exec::SegmentExpr* segment,
|
||||
std::function<bool(const std::string_view&)> predicate,
|
||||
bool need_post_filter);
|
||||
|
||||
// Match is something like xxx%xxx%xxx, xxx%xxx, %xxx%xxx, xxx_x etc.
|
||||
std::optional<TargetBitmap>
|
||||
MatchQuery(const std::string& literal, exec::SegmentExpr* segment);
|
||||
|
||||
private:
|
||||
uintptr_t min_gram_{0};
|
||||
|
||||
@ -185,12 +185,10 @@ ChunkedSegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
|
||||
if (auto it = info.index_params.find(index::INDEX_TYPE);
|
||||
it != info.index_params.end() &&
|
||||
it->second == index::NGRAM_INDEX_TYPE) {
|
||||
ngram_indexings_[field_id] =
|
||||
std::move(const_cast<LoadIndexInfo&>(info).cache_index);
|
||||
} else {
|
||||
scalar_indexings_[field_id] =
|
||||
std::move(const_cast<LoadIndexInfo&>(info).cache_index);
|
||||
ngram_fields_.insert(field_id);
|
||||
}
|
||||
scalar_indexings_[field_id] =
|
||||
std::move(const_cast<LoadIndexInfo&>(info).cache_index);
|
||||
|
||||
LoadResourceRequest request =
|
||||
milvus::index::IndexFactory::GetInstance().ScalarIndexLoadResource(
|
||||
@ -633,8 +631,8 @@ ChunkedSegmentSealedImpl::chunk_index_impl(FieldId field_id,
|
||||
PinWrapper<index::NgramInvertedIndex*>
|
||||
ChunkedSegmentSealedImpl::GetNgramIndex(FieldId field_id) const {
|
||||
std::shared_lock lck(mutex_);
|
||||
auto iter = ngram_indexings_.find(field_id);
|
||||
if (iter == ngram_indexings_.end()) {
|
||||
auto iter = scalar_indexings_.find(field_id);
|
||||
if (iter == scalar_indexings_.end()) {
|
||||
return PinWrapper<index::NgramInvertedIndex*>(nullptr);
|
||||
}
|
||||
auto slot = iter->second.get();
|
||||
@ -987,6 +985,7 @@ ChunkedSegmentSealedImpl::ChunkedSegmentSealedImpl(
|
||||
field_data_ready_bitset_(schema->size()),
|
||||
index_ready_bitset_(schema->size()),
|
||||
binlog_index_bitset_(schema->size()),
|
||||
ngram_fields_(schema->size()),
|
||||
scalar_indexings_(schema->size()),
|
||||
insert_record_(*schema, MAX_ROW_COUNT),
|
||||
schema_(schema),
|
||||
@ -1146,6 +1145,7 @@ ChunkedSegmentSealedImpl::ClearData() {
|
||||
index_has_raw_data_.clear();
|
||||
system_ready_count_ = 0;
|
||||
num_rows_ = std::nullopt;
|
||||
ngram_fields_.clear();
|
||||
scalar_indexings_.clear();
|
||||
vector_indexings_.clear();
|
||||
insert_record_.clear();
|
||||
|
||||
@ -120,7 +120,7 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
|
||||
bool
|
||||
HasNgramIndex(FieldId field_id) const override {
|
||||
std::shared_lock lck(mutex_);
|
||||
return ngram_indexings_.find(field_id) != ngram_indexings_.end();
|
||||
return ngram_fields_.find(field_id) != ngram_fields_.end();
|
||||
}
|
||||
|
||||
PinWrapper<index::NgramInvertedIndex*>
|
||||
@ -432,8 +432,8 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
|
||||
// TODO: generate index for scalar
|
||||
std::optional<int64_t> num_rows_;
|
||||
|
||||
// ngram field index
|
||||
std::unordered_map<FieldId, index::CacheIndexBasePtr> ngram_indexings_;
|
||||
// fields that has ngram index
|
||||
std::unordered_set<FieldId> ngram_fields_{};
|
||||
|
||||
// scalar field index
|
||||
std::unordered_map<FieldId, index::CacheIndexBasePtr> scalar_indexings_;
|
||||
|
||||
@ -189,7 +189,7 @@ RustResult tantivy_term_query_keyword(void *ptr, const char *term, void *bitset)
|
||||
|
||||
RustResult tantivy_term_query_keyword_i64(void *ptr, const char *term);
|
||||
|
||||
RustResult tantivy_inner_match_ngram(void *ptr,
|
||||
RustResult tantivy_ngram_match_query(void *ptr,
|
||||
const char *literal,
|
||||
uintptr_t min_gram,
|
||||
uintptr_t max_gram,
|
||||
|
||||
@ -109,7 +109,7 @@ mod tests {
|
||||
let reader = writer.create_reader(set_bitset).unwrap();
|
||||
let mut res: Vec<u32> = vec![];
|
||||
reader
|
||||
.inner_match_ngram("ic", 2, 3, &mut res as *mut _ as *mut c_void)
|
||||
.ngram_match_query("ic", 2, 3, &mut res as *mut _ as *mut c_void)
|
||||
.unwrap();
|
||||
assert_eq!(res, vec![2, 4, 5]);
|
||||
}
|
||||
@ -138,19 +138,19 @@ mod tests {
|
||||
let reader = writer.create_reader(set_bitset).unwrap();
|
||||
let mut res: Vec<u32> = vec![];
|
||||
reader
|
||||
.inner_match_ngram("测试", 2, 3, &mut res as *mut _ as *mut c_void)
|
||||
.ngram_match_query("测试", 2, 3, &mut res as *mut _ as *mut c_void)
|
||||
.unwrap();
|
||||
assert_eq!(res, vec![0, 1, 2, 4]);
|
||||
|
||||
let mut res: Vec<u32> = vec![];
|
||||
reader
|
||||
.inner_match_ngram("m测试", 2, 3, &mut res as *mut _ as *mut c_void)
|
||||
.ngram_match_query("m测试", 2, 3, &mut res as *mut _ as *mut c_void)
|
||||
.unwrap();
|
||||
assert_eq!(res, vec![0, 2]);
|
||||
|
||||
let mut res: Vec<u32> = vec![];
|
||||
reader
|
||||
.inner_match_ngram("需要被测试", 2, 3, &mut res as *mut _ as *mut c_void)
|
||||
.ngram_match_query("需要被测试", 2, 3, &mut res as *mut _ as *mut c_void)
|
||||
.unwrap();
|
||||
assert_eq!(res, vec![4]);
|
||||
}
|
||||
|
||||
@ -300,7 +300,7 @@ impl IndexReaderWrapper {
|
||||
}
|
||||
|
||||
// **Note**: literal length must be larger or equal to min_gram.
|
||||
pub fn inner_match_ngram(
|
||||
pub fn ngram_match_query(
|
||||
&self,
|
||||
literal: &str,
|
||||
min_gram: usize,
|
||||
|
||||
@ -234,7 +234,7 @@ pub extern "C" fn tantivy_term_query_keyword_i64(
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn tantivy_inner_match_ngram(
|
||||
pub extern "C" fn tantivy_ngram_match_query(
|
||||
ptr: *mut c_void,
|
||||
literal: *const c_char,
|
||||
min_gram: usize,
|
||||
@ -247,7 +247,7 @@ pub extern "C" fn tantivy_inner_match_ngram(
|
||||
let now = std::time::Instant::now();
|
||||
unsafe {
|
||||
(*real)
|
||||
.inner_match_ngram(literal, min_gram, max_gram, bitset)
|
||||
.ngram_match_query(literal, min_gram, max_gram, bitset)
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
@ -936,19 +936,19 @@ struct TantivyIndexWrapper {
|
||||
}
|
||||
|
||||
void
|
||||
inner_match_ngram(const std::string& literal,
|
||||
ngram_match_query(const std::string& literal,
|
||||
uintptr_t min_gram,
|
||||
uintptr_t max_gram,
|
||||
void* bitset) {
|
||||
auto array = tantivy_inner_match_ngram(
|
||||
auto array = tantivy_ngram_match_query(
|
||||
reader_, literal.c_str(), min_gram, max_gram, bitset);
|
||||
auto res = RustResultWrapper(array);
|
||||
AssertInfo(res.result_->success,
|
||||
"TantivyIndexWrapper.inner_match_ngram: {}",
|
||||
"TantivyIndexWrapper.ngram_match_query: {}",
|
||||
res.result_->error);
|
||||
AssertInfo(
|
||||
res.result_->value.tag == Value::Tag::None,
|
||||
"TantivyIndexWrapper.inner_match_ngram: invalid result type");
|
||||
"TantivyIndexWrapper.ngram_match_query: invalid result type");
|
||||
}
|
||||
|
||||
// json query
|
||||
|
||||
@ -28,84 +28,6 @@ using namespace milvus::query;
|
||||
using namespace milvus::segcore;
|
||||
using namespace milvus::exec;
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, EmptyString) {
|
||||
auto result = parse_ngram_pattern("");
|
||||
ASSERT_FALSE(result.has_value());
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, ExactMatchSimple) {
|
||||
auto result = parse_ngram_pattern("abc");
|
||||
ASSERT_TRUE(result.has_value());
|
||||
EXPECT_EQ(result->literal, "abc");
|
||||
EXPECT_EQ(result->type, MatchType::ExactMatch);
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, ExactMatchWithEscapedPercent) {
|
||||
auto result = parse_ngram_pattern("ab\\%cd");
|
||||
ASSERT_TRUE(result.has_value());
|
||||
EXPECT_EQ(result->literal, "ab%cd");
|
||||
EXPECT_EQ(result->type, MatchType::ExactMatch);
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, ExactMatchWithEscapedSpecialChar) {
|
||||
auto result = parse_ngram_pattern("a.b");
|
||||
ASSERT_TRUE(result.has_value());
|
||||
EXPECT_EQ(result->literal, "a\\.b");
|
||||
EXPECT_EQ(result->type, MatchType::ExactMatch);
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, PrefixMatchSimple) {
|
||||
auto result = parse_ngram_pattern("%abc");
|
||||
ASSERT_TRUE(result.has_value());
|
||||
EXPECT_EQ(result->literal, "abc");
|
||||
EXPECT_EQ(result->type, MatchType::PrefixMatch);
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, PostfixMatchSimple) {
|
||||
auto result = parse_ngram_pattern("abc%");
|
||||
ASSERT_TRUE(result.has_value());
|
||||
EXPECT_EQ(result->literal, "abc");
|
||||
EXPECT_EQ(result->type, MatchType::PostfixMatch);
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, InnerMatchSimple) {
|
||||
auto result = parse_ngram_pattern("%abc%");
|
||||
ASSERT_TRUE(result.has_value());
|
||||
EXPECT_EQ(result->literal, "abc");
|
||||
EXPECT_EQ(result->type, MatchType::InnerMatch);
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, MatchSinglePercentMiddle) {
|
||||
auto result = parse_ngram_pattern("a%b");
|
||||
ASSERT_FALSE(result.has_value());
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, MatchTypeReturnsNullopt) {
|
||||
EXPECT_FALSE(parse_ngram_pattern("%").has_value());
|
||||
// %a%b (n=2, not %xxx%) -> Match -> nullopt
|
||||
EXPECT_FALSE(parse_ngram_pattern("%a%b").has_value());
|
||||
// a%b%c (n=2, not %xxx%) -> Match -> nullopt
|
||||
EXPECT_FALSE(parse_ngram_pattern("a%b%c").has_value());
|
||||
// %% (n=2, not %xxx% because length is not > 2) -> Match -> nullopt
|
||||
EXPECT_FALSE(parse_ngram_pattern("%%").has_value());
|
||||
// %a%b%c% (n=3) -> Match -> nullopt
|
||||
EXPECT_FALSE(parse_ngram_pattern("%a%b%c%").has_value());
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, UnescapedUnderscoreReturnsNullopt) {
|
||||
EXPECT_FALSE(parse_ngram_pattern("a_b").has_value());
|
||||
EXPECT_FALSE(parse_ngram_pattern("%a_b").has_value());
|
||||
EXPECT_FALSE(parse_ngram_pattern("a_b%").has_value());
|
||||
EXPECT_FALSE(parse_ngram_pattern("%a_b%").has_value());
|
||||
}
|
||||
|
||||
TEST(ConvertToNgramLiteralTest, EscapedUnderscore) {
|
||||
auto result = parse_ngram_pattern("a\\_b");
|
||||
ASSERT_TRUE(result.has_value());
|
||||
EXPECT_EQ(result->literal, "a_b");
|
||||
EXPECT_EQ(result->type, MatchType::ExactMatch);
|
||||
}
|
||||
|
||||
auto
|
||||
generate_field_meta(int64_t collection_id = 1,
|
||||
int64_t partition_id = 2,
|
||||
@ -153,7 +75,9 @@ generate_local_storage_config(const std::string& root_path)
|
||||
void
|
||||
test_ngram_with_data(const boost::container::vector<std::string>& data,
|
||||
const std::string& literal,
|
||||
const std::vector<bool>& expected_result) {
|
||||
proto::plan::OpType op_type,
|
||||
const std::vector<bool>& expected_result,
|
||||
bool forward_to_br = false) {
|
||||
int64_t collection_id = 1;
|
||||
int64_t partition_id = 2;
|
||||
int64_t segment_id = 3;
|
||||
@ -275,9 +199,15 @@ test_ngram_with_data(const boost::container::vector<std::string>& data,
|
||||
8192,
|
||||
0);
|
||||
|
||||
auto bitset = index->InnerMatchQuery(literal, &segment_expr).value();
|
||||
for (size_t i = 0; i < nb; i++) {
|
||||
ASSERT_EQ(bitset[i], expected_result[i]);
|
||||
std::optional<TargetBitmap> bitset_opt =
|
||||
index->ExecuteQuery(literal, op_type, &segment_expr);
|
||||
if (forward_to_br) {
|
||||
ASSERT_TRUE(!bitset_opt.has_value());
|
||||
} else {
|
||||
auto bitset = std::move(bitset_opt.value());
|
||||
for (size_t i = 0; i < nb; i++) {
|
||||
ASSERT_EQ(bitset[i], expected_result[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -318,8 +248,7 @@ test_ngram_with_data(const boost::container::vector<std::string>& data,
|
||||
AppendIndexV2(trace, cload_index_info);
|
||||
UpdateSealedSegmentIndex(segment.get(), cload_index_info);
|
||||
|
||||
auto unary_range_expr =
|
||||
test::GenUnaryRangeExpr(OpType::InnerMatch, literal);
|
||||
auto unary_range_expr = test::GenUnaryRangeExpr(op_type, literal);
|
||||
auto column_info = test::GenColumnInfo(
|
||||
field_id.get(), proto::schema::DataType::VarChar, false, false);
|
||||
unary_range_expr->set_allocated_column_info(column_info);
|
||||
@ -339,39 +268,116 @@ test_ngram_with_data(const boost::container::vector<std::string>& data,
|
||||
|
||||
TEST(NgramIndex, TestNgramWikiEpisode) {
|
||||
boost::container::vector<std::string> data;
|
||||
// not hit
|
||||
data.push_back(
|
||||
"'Indira Davelba Murillo Alvarado (Tegucigalpa, "
|
||||
"the youngest of eight siblings. She attended primary school at the "
|
||||
"Escuela 14 de Julio, and her secondary studies at the Instituto "
|
||||
"school called \"Indi del Bosque\", where she taught the children of "
|
||||
"Honduran women'");
|
||||
// hit
|
||||
data.push_back(
|
||||
"Richmond Green Secondary School is a public secondary school in "
|
||||
"Richmond Hill, Ontario, Canada.");
|
||||
// hit
|
||||
data.push_back(
|
||||
"The Gymnasium in 2002 Gymnasium Philippinum or Philippinum High "
|
||||
"School is an almost 500-year-old secondary school in Marburg, Hesse, "
|
||||
"Germany.");
|
||||
// hit
|
||||
data.push_back(
|
||||
"Sir Winston Churchill Secondary School is a Canadian secondary school "
|
||||
"located in St. Catharines, Ontario.");
|
||||
// not hit
|
||||
data.push_back("Sir Winston Churchill Secondary School");
|
||||
|
||||
std::vector<bool> expected_result{false, true, true, true, false};
|
||||
// within min-max_gram
|
||||
{
|
||||
// inner match
|
||||
std::vector<bool> expected_result{true, true, true, true, true};
|
||||
test_ngram_with_data(
|
||||
data, "ary", proto::plan::OpType::InnerMatch, expected_result);
|
||||
|
||||
test_ngram_with_data(data, "secondary school", expected_result);
|
||||
expected_result = {false, true, false, true, true};
|
||||
test_ngram_with_data(
|
||||
data, "y S", proto::plan::OpType::InnerMatch, expected_result);
|
||||
|
||||
expected_result = {true, true, true, true, false};
|
||||
test_ngram_with_data(
|
||||
data, "y s", proto::plan::OpType::InnerMatch, expected_result);
|
||||
|
||||
// prefix
|
||||
expected_result = {false, false, false, true, true};
|
||||
test_ngram_with_data(
|
||||
data, "Sir", proto::plan::OpType::PrefixMatch, expected_result);
|
||||
|
||||
// postfix
|
||||
expected_result = {false, false, false, false, true};
|
||||
test_ngram_with_data(
|
||||
data, "ool", proto::plan::OpType::PostfixMatch, expected_result);
|
||||
|
||||
// match
|
||||
expected_result = {true, false, false, false, false};
|
||||
test_ngram_with_data(
|
||||
data, "%Alv%y s%", proto::plan::OpType::Match, expected_result);
|
||||
}
|
||||
|
||||
// exceeds max_gram
|
||||
{
|
||||
// inner match
|
||||
std::vector<bool> expected_result{false, true, true, true, false};
|
||||
test_ngram_with_data(data,
|
||||
"secondary school",
|
||||
proto::plan::OpType::InnerMatch,
|
||||
expected_result);
|
||||
|
||||
// prefix
|
||||
expected_result = {false, false, false, true, true};
|
||||
test_ngram_with_data(data,
|
||||
"Sir Winston",
|
||||
proto::plan::OpType::PrefixMatch,
|
||||
expected_result);
|
||||
|
||||
// postfix
|
||||
expected_result = {false, false, true, false, false};
|
||||
test_ngram_with_data(data,
|
||||
"Germany.",
|
||||
proto::plan::OpType::PostfixMatch,
|
||||
expected_result);
|
||||
|
||||
// match
|
||||
expected_result = {true, true, true, true, false};
|
||||
test_ngram_with_data(data,
|
||||
"%secondary%school%",
|
||||
proto::plan::OpType::Match,
|
||||
expected_result);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(NgramIndex, TestNgramAllFalse) {
|
||||
TEST(NgramIndex, TestNgramSimple) {
|
||||
boost::container::vector<std::string> data(10000,
|
||||
"elementary school secondary");
|
||||
|
||||
// all can be hit by ngram tantivy but will be filterred out by the second phase
|
||||
test_ngram_with_data(
|
||||
data, "secondary school", std::vector<bool>(10000, false));
|
||||
test_ngram_with_data(data,
|
||||
"secondary school",
|
||||
proto::plan::OpType::InnerMatch,
|
||||
std::vector<bool>(10000, false));
|
||||
|
||||
test_ngram_with_data(data,
|
||||
"ele",
|
||||
proto::plan::OpType::PrefixMatch,
|
||||
std::vector<bool>(10000, true));
|
||||
|
||||
test_ngram_with_data(data,
|
||||
"%ary%sec%",
|
||||
proto::plan::OpType::Match,
|
||||
std::vector<bool>(10000, true));
|
||||
|
||||
// should be forwarded to brute force
|
||||
test_ngram_with_data(data,
|
||||
"%ary%s%",
|
||||
proto::plan::OpType::Match,
|
||||
std::vector<bool>(10000, true),
|
||||
true);
|
||||
|
||||
test_ngram_with_data(data,
|
||||
"ary",
|
||||
proto::plan::OpType::PostfixMatch,
|
||||
std::vector<bool>(10000, true));
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user