feat: optimize LIKE query with n-gram (#41803)

Ref #42053

This is the first PR for optimizing `LIKE` with an ngram inverted index.
For now, only the VARCHAR data type is supported, and only inner-match
`LIKE` queries (`%xxx%`) are supported.


How to use it:
```
milvus_client = MilvusClient("http://localhost:19530")
schema = milvus_client.create_schema()
...
schema.add_field("content_ngram", DataType.VARCHAR, max_length=10000)
...
index_params = milvus_client.prepare_index_params()
index_params.add_index(field_name="content_ngram", index_type="NGRAM", index_name="ngram_index", min_gram=2, max_gram=3)
milvus_client.create_collection(COLLECTION_NAME, ...)
```
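
Once data is inserted and the collection is loaded, an inner-match `LIKE` filter on the indexed field is what the new path accelerates. A minimal sketch (collection name, schema details, and data are placeholders):
```
milvus_client.insert(COLLECTION_NAME, [{"content_ngram": "the quick brown fox"}])
res = milvus_client.query(
    COLLECTION_NAME,
    filter='content_ngram like "%brown%"',  # InnerMatch: %xxx%
    output_fields=["content_ngram"],
)
```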

`min_gram` and `max_gram` control how documents are tokenized. For
example, with min_gram=2 and max_gram=4, each document is tokenized into
2-grams, 3-grams, and 4-grams.
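
As a quick illustration, here is a rough Python sketch of the tokenization
(not the exact tantivy tokenizer the index uses):
```
def ngrams(text, min_gram, max_gram):
    # emit every n-gram for n in [min_gram, max_gram]
    for n in range(min_gram, max_gram + 1):
        for i in range(len(text) - n + 1):
            yield text[i:i + n]

list(ngrams("milvus", 2, 3))
# ['mi', 'il', 'lv', 'vu', 'us', 'mil', 'ilv', 'lvu', 'vus']
```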

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>
Spade A 2025-07-01 10:08:44 +08:00 committed by GitHub
parent 396120ade5
commit 26ec841feb
40 changed files with 2179 additions and 656 deletions

View File

@ -50,6 +50,7 @@ const char TEXT_LOG_ROOT_PATH[] = "text_log";
const char ITERATIVE_FILTER[] = "iterative_filter";
const char HINTS[] = "hints";
const char JSON_KEY_INDEX_LOG_ROOT_PATH[] = "json_key_index_log";
const char NGRAM_LOG_ROOT_PATH[] = "ngram_log";
const char DEFAULT_PLANNODE_ID[] = "0";
const char DEAFULT_QUERY_ID[] = "0";

View File

@ -305,17 +305,19 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForData(EvalCtx& context) {
TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size);
size_t processed_cursor = 0;
auto execute_sub_batch =
[ lower_inclusive, upper_inclusive, &processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
const T* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
HighPrecisionType val1,
HighPrecisionType val2) {
auto execute_sub_batch = [lower_inclusive,
upper_inclusive,
&processed_cursor,
&bitmap_input]<FilterType filter_type =
FilterType::sequential>(
const T* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
HighPrecisionType val1,
HighPrecisionType val2) {
if (lower_inclusive && upper_inclusive) {
BinaryRangeElementFunc<T, true, true, filter_type> func;
func(val1,
@ -447,22 +449,20 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(EvalCtx& context) {
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
size_t processed_cursor = 0;
auto execute_sub_batch =
[
lower_inclusive,
upper_inclusive,
pointer,
&bitmap_input,
&processed_cursor
]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val1,
ValueType val2) {
auto execute_sub_batch = [lower_inclusive,
upper_inclusive,
pointer,
&bitmap_input,
&processed_cursor]<FilterType filter_type =
FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val1,
ValueType val2) {
if (lower_inclusive && upper_inclusive) {
BinaryRangeElementFuncForJson<ValueType, true, true, filter_type>
func;
@ -550,9 +550,10 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJsonForIndex() {
using GetType = std::conditional_t<std::is_same_v<ValueType, std::string>,
std::string_view,
ValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto real_batch_size =
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
#define BinaryRangeJSONIndexCompare(cmp) \
do { \
@ -852,18 +853,20 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForArray(EvalCtx& context) {
}
size_t processed_cursor = 0;
auto execute_sub_batch =
[ lower_inclusive, upper_inclusive, &processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
const milvus::ArrayView* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val1,
ValueType val2,
int index) {
auto execute_sub_batch = [lower_inclusive,
upper_inclusive,
&processed_cursor,
&bitmap_input]<FilterType filter_type =
FilterType::sequential>(
const milvus::ArrayView* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ValueType val1,
ValueType val2,
int index) {
if (lower_inclusive && upper_inclusive) {
BinaryRangeElementFuncForArray<ValueType, true, true, filter_type>
func;

View File

@ -676,23 +676,34 @@ class SegmentExpr : public Expr {
return processed_size;
}
// If process_all_chunks is true, all chunks will be processed and no inner state will be changed.
template <typename T, typename FUNC, typename... ValTypes>
int64_t
ProcessDataChunksForMultipleChunk(
ProcessMultipleChunksCommon(
FUNC func,
std::function<bool(const milvus::SkipIndex&, FieldId, int)> skip_func,
TargetBitmapView res,
TargetBitmapView valid_res,
bool process_all_chunks,
ValTypes... values) {
int64_t processed_size = 0;
for (size_t i = current_data_chunk_; i < num_data_chunk_; i++) {
size_t start_chunk = process_all_chunks ? 0 : current_data_chunk_;
for (size_t i = start_chunk; i < num_data_chunk_; i++) {
auto data_pos =
(i == current_data_chunk_) ? current_data_chunk_pos_ : 0;
process_all_chunks
? 0
: (i == current_data_chunk_ ? current_data_chunk_pos_ : 0);
// if segment is chunked, type won't be growing
int64_t size = segment_->chunk_size(field_id_, i) - data_pos;
size = std::min(size, batch_size_ - processed_size);
// process a whole chunk if process_all_chunks is true
if (!process_all_chunks) {
size = std::min(size, batch_size_ - processed_size);
}
if (size == 0)
continue; //do not go empty-loop at the bound of the chunk
@ -761,7 +772,8 @@ class SegmentExpr : public Expr {
}
processed_size += size;
if (processed_size >= batch_size_) {
if (!process_all_chunks && processed_size >= batch_size_) {
current_data_chunk_ = i;
current_data_chunk_pos_ = data_pos + size;
break;
@ -771,6 +783,30 @@ class SegmentExpr : public Expr {
return processed_size;
}
template <typename T, typename FUNC, typename... ValTypes>
int64_t
ProcessDataChunksForMultipleChunk(
FUNC func,
std::function<bool(const milvus::SkipIndex&, FieldId, int)> skip_func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
return ProcessMultipleChunksCommon<T>(
func, skip_func, res, valid_res, false, values...);
}
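// Process every chunk in a single call, ignoring batch_size_ and leaving the
// iteration cursors untouched; the ngram post-filter uses this to verify the
// whole segment at once.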
template <typename T, typename FUNC, typename... ValTypes>
int64_t
ProcessAllChunksForMultipleChunk(
FUNC func,
std::function<bool(const milvus::SkipIndex&, FieldId, int)> skip_func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
return ProcessMultipleChunksCommon<T>(
func, skip_func, res, valid_res, true, values...);
}
template <typename T, typename FUNC, typename... ValTypes>
int64_t
ProcessDataChunks(
@ -788,6 +824,22 @@ class SegmentExpr : public Expr {
}
}
template <typename T, typename FUNC, typename... ValTypes>
int64_t
ProcessAllDataChunk(
FUNC func,
std::function<bool(const milvus::SkipIndex&, FieldId, int)> skip_func,
TargetBitmapView res,
TargetBitmapView valid_res,
ValTypes... values) {
if (segment_->is_chunked()) {
return ProcessAllChunksForMultipleChunk<T>(
func, skip_func, res, valid_res, values...);
} else {
PanicInfo(ErrorCode::Unsupported, "unreachable");
}
}
int
ProcessIndexOneChunk(TargetBitmap& result,
TargetBitmap& valid_result,
@ -1169,7 +1221,7 @@ class SegmentExpr : public Expr {
// return batch size, not sure if we should use the data position.
auto real_batch_size =
current_data_chunk_pos_ + batch_size_ > active_count_
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
result.append(
@ -1266,6 +1318,15 @@ class SegmentExpr : public Expr {
return false;
}
bool
CanUseNgramIndex(FieldId field_id) const {
if (segment_->type() != SegmentType::Sealed) {
return false;
}
auto cast_ptr = dynamic_cast<const segcore::SegmentSealed*>(segment_);
return (cast_ptr != nullptr && cast_ptr->HasNgramIndex(field_id));
}
protected:
const segcore::SegmentInternalInterface* segment_;
const FieldId field_id_;
@ -1305,6 +1366,9 @@ class SegmentExpr : public Expr {
// Cache for text match.
std::shared_ptr<TargetBitmap> cached_match_res_{nullptr};
int32_t consistency_level_{0};
// Cache for ngram match.
std::shared_ptr<TargetBitmap> cached_ngram_match_res_{nullptr};
};
bool

View File

@ -219,8 +219,8 @@ PhyJsonContainsFilterExpr::ExecArrayContains(EvalCtx& context) {
int processed_cursor = 0;
auto execute_sub_batch =
[&processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::ArrayView* data,
const bool* valid_data,
const int32_t* offsets,
@ -228,32 +228,32 @@ PhyJsonContainsFilterExpr::ExecArrayContains(EvalCtx& context) {
TargetBitmapView res,
TargetBitmapView valid_res,
const std::shared_ptr<MultiElement>& elements) {
auto executor = [&](size_t i) {
const auto& array = data[i];
for (int j = 0; j < array.length(); ++j) {
if (elements->In(array.template get_data<GetType>(j))) {
return true;
auto executor = [&](size_t i) {
const auto& array = data[i];
for (int j = 0; j < array.length(); ++j) {
if (elements->In(array.template get_data<GetType>(j))) {
return true;
}
}
return false;
};
bool has_bitmap_input = !bitmap_input.empty();
for (int i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
return false;
processed_cursor += size;
};
bool has_bitmap_input = !bitmap_input.empty();
for (int i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
@ -311,8 +311,8 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
size_t processed_cursor = 0;
auto execute_sub_batch =
[&processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
@ -321,40 +321,40 @@ PhyJsonContainsFilterExpr::ExecJsonContains(EvalCtx& context) {
TargetBitmapView valid_res,
const std::string& pointer,
const std::shared_ptr<MultiElement>& elements) {
auto executor = [&](size_t i) {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
auto executor = [&](size_t i) {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
continue;
}
if (elements->In(val.value()) > 0) {
return true;
}
}
return false;
}
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (elements->In(val.value()) > 0) {
return true;
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
return false;
processed_cursor += size;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
@ -388,9 +388,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsByKeyIndex() {
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto real_batch_size =
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::unordered_set<GetType> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
if (!arg_inited_) {
@ -519,8 +520,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
size_t processed_cursor = 0;
auto execute_sub_batch =
[&processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
@ -529,49 +530,49 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
TargetBitmapView valid_res,
const std::string& pointer,
const std::vector<proto::plan::Array>& elements) {
auto executor = [&](size_t i) -> bool {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
for (auto&& it : array) {
auto val = it.get_array();
if (val.error()) {
continue;
auto executor = [&](size_t i) -> bool {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
std::vector<
simdjson::simdjson_result<simdjson::ondemand::value>>
json_array;
json_array.reserve(val.count_elements());
for (auto&& e : val) {
json_array.emplace_back(e);
}
for (auto const& element : elements) {
if (CompareTwoJsonArray(json_array, element)) {
return true;
for (auto&& it : array) {
auto val = it.get_array();
if (val.error()) {
continue;
}
std::vector<
simdjson::simdjson_result<simdjson::ondemand::value>>
json_array;
json_array.reserve(val.count_elements());
for (auto&& e : val) {
json_array.emplace_back(e);
}
for (auto const& element : elements) {
if (CompareTwoJsonArray(json_array, element)) {
return true;
}
}
}
return false;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
return false;
processed_cursor += size;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
@ -600,9 +601,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(EvalCtx& context) {
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsArrayByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto real_batch_size =
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::vector<proto::plan::Array> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
for (auto const& element : expr_->vals_) {
@ -733,8 +735,8 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll(EvalCtx& context) {
int processed_cursor = 0;
auto execute_sub_batch =
[&processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::ArrayView* data,
const bool* valid_data,
const int32_t* offsets,
@ -742,34 +744,34 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll(EvalCtx& context) {
TargetBitmapView res,
TargetBitmapView valid_res,
const std::set<GetType>& elements) {
auto executor = [&](size_t i) {
std::set<GetType> tmp_elements(elements);
// Note: array can only be iterated once
for (int j = 0; j < data[i].length(); ++j) {
tmp_elements.erase(data[i].template get_data<GetType>(j));
if (tmp_elements.size() == 0) {
return true;
auto executor = [&](size_t i) {
std::set<GetType> tmp_elements(elements);
// Note: array can only be iterated once
for (int j = 0; j < data[i].length(); ++j) {
tmp_elements.erase(data[i].template get_data<GetType>(j));
if (tmp_elements.size() == 0) {
return true;
}
}
return tmp_elements.size() == 0;
};
bool has_bitmap_input = !bitmap_input.empty();
for (int i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
return tmp_elements.size() == 0;
processed_cursor += size;
};
bool has_bitmap_input = !bitmap_input.empty();
for (int i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
processed_size =
@ -825,8 +827,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
int processed_cursor = 0;
auto execute_sub_batch =
[&processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
@ -835,43 +837,43 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(EvalCtx& context) {
TargetBitmapView valid_res,
const std::string& pointer,
const std::set<GetType>& elements) {
auto executor = [&](const size_t i) -> bool {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
std::set<GetType> tmp_elements(elements);
// Note: array can only be iterated once
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
auto executor = [&](const size_t i) -> bool {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
std::set<GetType> tmp_elements(elements);
// Note: array can only be iterated once
for (auto&& it : array) {
auto val = it.template get<GetType>();
if (val.error()) {
continue;
}
tmp_elements.erase(val.value());
if (tmp_elements.size() == 0) {
return true;
}
}
return tmp_elements.size() == 0;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
tmp_elements.erase(val.value());
if (tmp_elements.size() == 0) {
return true;
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
return tmp_elements.size() == 0;
processed_cursor += size;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
@ -905,9 +907,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllByKeyIndex() {
std::conditional_t<std::is_same_v<ExprValueType, std::string>,
std::string_view,
ExprValueType>;
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto real_batch_size =
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
std::set<GetType> elements;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
for (auto const& element : expr_->vals_) {
@ -1039,8 +1042,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
int processed_cursor = 0;
auto execute_sub_batch =
[&processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
@ -1050,102 +1053,104 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
const std::string& pointer,
const std::vector<proto::plan::GenericValue>& elements,
const std::unordered_set<int> elements_index) {
auto executor = [&](size_t i) -> bool {
const auto& json = data[i];
auto doc = json.doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
std::unordered_set<int> tmp_elements_index(elements_index);
for (auto&& it : array) {
int i = -1;
for (auto& element : elements) {
i++;
switch (element.val_case()) {
case proto::plan::GenericValue::kBoolVal: {
auto val = it.template get<bool>();
if (val.error()) {
continue;
auto executor = [&](size_t i) -> bool {
const auto& json = data[i];
auto doc = json.doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
std::unordered_set<int> tmp_elements_index(elements_index);
for (auto&& it : array) {
int i = -1;
for (auto& element : elements) {
i++;
switch (element.val_case()) {
case proto::plan::GenericValue::kBoolVal: {
auto val = it.template get<bool>();
if (val.error()) {
continue;
}
if (val.value() == element.bool_val()) {
tmp_elements_index.erase(i);
}
break;
}
if (val.value() == element.bool_val()) {
tmp_elements_index.erase(i);
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
if (val.value() == element.int64_val()) {
tmp_elements_index.erase(i);
}
break;
}
break;
case proto::plan::GenericValue::kFloatVal: {
auto val = it.template get<double>();
if (val.error()) {
continue;
}
if (val.value() == element.float_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kStringVal: {
auto val = it.template get<std::string_view>();
if (val.error()) {
continue;
}
if (val.value() == element.string_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kArrayVal: {
auto val = it.get_array();
if (val.error()) {
continue;
}
if (CompareTwoJsonArray(val,
element.array_val())) {
tmp_elements_index.erase(i);
}
break;
}
default:
PanicInfo(
DataTypeInvalid,
fmt::format("unsupported data type {}",
element.val_case()));
}
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
if (val.value() == element.int64_val()) {
tmp_elements_index.erase(i);
}
break;
if (tmp_elements_index.size() == 0) {
return true;
}
case proto::plan::GenericValue::kFloatVal: {
auto val = it.template get<double>();
if (val.error()) {
continue;
}
if (val.value() == element.float_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kStringVal: {
auto val = it.template get<std::string_view>();
if (val.error()) {
continue;
}
if (val.value() == element.string_val()) {
tmp_elements_index.erase(i);
}
break;
}
case proto::plan::GenericValue::kArrayVal: {
auto val = it.get_array();
if (val.error()) {
continue;
}
if (CompareTwoJsonArray(val, element.array_val())) {
tmp_elements_index.erase(i);
}
break;
}
default:
PanicInfo(DataTypeInvalid,
fmt::format("unsupported data type {}",
element.val_case()));
}
if (tmp_elements_index.size() == 0) {
return true;
}
}
if (tmp_elements_index.size() == 0) {
return true;
return tmp_elements_index.size() == 0;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
}
return tmp_elements_index.size() == 0;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
processed_cursor += size;
};
res[i] = executor(offset);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
@ -1176,9 +1181,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(EvalCtx& context) {
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto real_batch_size =
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto elements = expr_->vals_;
std::set<int> elements_index;
@ -1371,8 +1377,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
size_t processed_cursor = 0;
auto execute_sub_batch =
[&processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
@ -1381,54 +1387,54 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
TargetBitmapView valid_res,
const std::string& pointer,
const std::vector<proto::plan::Array>& elements) {
auto executor = [&](const size_t i) {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
std::unordered_set<int> exist_elements_index;
for (auto&& it : array) {
auto val = it.get_array();
if (val.error()) {
continue;
auto executor = [&](const size_t i) {
auto doc = data[i].doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
std::vector<
simdjson::simdjson_result<simdjson::ondemand::value>>
json_array;
json_array.reserve(val.count_elements());
for (auto&& e : val) {
json_array.emplace_back(e);
}
for (int index = 0; index < elements.size(); ++index) {
if (CompareTwoJsonArray(json_array, elements[index])) {
exist_elements_index.insert(index);
std::unordered_set<int> exist_elements_index;
for (auto&& it : array) {
auto val = it.get_array();
if (val.error()) {
continue;
}
std::vector<
simdjson::simdjson_result<simdjson::ondemand::value>>
json_array;
json_array.reserve(val.count_elements());
for (auto&& e : val) {
json_array.emplace_back(e);
}
for (int index = 0; index < elements.size(); ++index) {
if (CompareTwoJsonArray(json_array, elements[index])) {
exist_elements_index.insert(index);
}
}
if (exist_elements_index.size() == elements.size()) {
return true;
}
}
if (exist_elements_index.size() == elements.size()) {
return true;
return exist_elements_index.size() == elements.size();
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
}
return exist_elements_index.size() == elements.size();
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
processed_cursor += size;
};
res[i] = executor(offset);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
@ -1457,9 +1463,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(EvalCtx& context) {
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsAllArrayByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto real_batch_size =
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
std::vector<proto::plan::Array> elements;
for (auto const& element : expr_->vals_) {
@ -1596,8 +1603,8 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
size_t processed_cursor = 0;
auto execute_sub_batch =
[&processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
@ -1606,94 +1613,96 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
TargetBitmapView valid_res,
const std::string& pointer,
const std::vector<proto::plan::GenericValue>& elements) {
auto executor = [&](const size_t i) {
auto& json = data[i];
auto doc = json.doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
// Note: array can only be iterated once
for (auto&& it : array) {
for (auto const& element : elements) {
switch (element.val_case()) {
case proto::plan::GenericValue::kBoolVal: {
auto val = it.template get<bool>();
if (val.error()) {
continue;
auto executor = [&](const size_t i) {
auto& json = data[i];
auto doc = json.doc();
auto array = doc.at_pointer(pointer).get_array();
if (array.error()) {
return false;
}
// Note: array can only be iterated once
for (auto&& it : array) {
for (auto const& element : elements) {
switch (element.val_case()) {
case proto::plan::GenericValue::kBoolVal: {
auto val = it.template get<bool>();
if (val.error()) {
continue;
}
if (val.value() == element.bool_val()) {
return true;
}
break;
}
if (val.value() == element.bool_val()) {
return true;
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
if (val.value() == element.int64_val()) {
return true;
}
break;
}
break;
case proto::plan::GenericValue::kFloatVal: {
auto val = it.template get<double>();
if (val.error()) {
continue;
}
if (val.value() == element.float_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kStringVal: {
auto val = it.template get<std::string_view>();
if (val.error()) {
continue;
}
if (val.value() == element.string_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kArrayVal: {
auto val = it.get_array();
if (val.error()) {
continue;
}
if (CompareTwoJsonArray(val,
element.array_val())) {
return true;
}
break;
}
default:
PanicInfo(
DataTypeInvalid,
fmt::format("unsupported data type {}",
element.val_case()));
}
case proto::plan::GenericValue::kInt64Val: {
auto val = it.template get<int64_t>();
if (val.error()) {
continue;
}
if (val.value() == element.int64_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kFloatVal: {
auto val = it.template get<double>();
if (val.error()) {
continue;
}
if (val.value() == element.float_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kStringVal: {
auto val = it.template get<std::string_view>();
if (val.error()) {
continue;
}
if (val.value() == element.string_val()) {
return true;
}
break;
}
case proto::plan::GenericValue::kArrayVal: {
auto val = it.get_array();
if (val.error()) {
continue;
}
if (CompareTwoJsonArray(val, element.array_val())) {
return true;
}
break;
}
default:
PanicInfo(DataTypeInvalid,
fmt::format("unsupported data type {}",
element.val_case()));
}
}
}
return false;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
return false;
};
bool has_bitmap_input = !bitmap_input.empty();
for (size_t i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[processed_cursor + i]) {
continue;
}
res[i] = executor(offset);
}
processed_cursor += size;
};
res[i] = executor(offset);
}
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
@ -1722,9 +1731,10 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(EvalCtx& context) {
VectorPtr
PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByKeyIndex() {
auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto real_batch_size =
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
auto elements = expr_->vals_;
if (elements.empty()) {

View File

@ -318,8 +318,9 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray(EvalCtx& context) {
}
int processed_cursor = 0;
auto execute_sub_batch =
[ op_type, &processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
[op_type,
&processed_cursor,
&bitmap_input]<FilterType filter_type = FilterType::sequential>(
const milvus::ArrayView* data,
const bool* valid_data,
const int32_t* offsets,
@ -328,185 +329,186 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray(EvalCtx& context) {
TargetBitmapView valid_res,
ValueType val,
int index) {
switch (op_type) {
case proto::plan::GreaterThan: {
UnaryElementFuncForArray<ValueType,
proto::plan::GreaterThan,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
switch (op_type) {
case proto::plan::GreaterThan: {
UnaryElementFuncForArray<ValueType,
proto::plan::GreaterThan,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::GreaterEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::GreaterEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::LessThan: {
UnaryElementFuncForArray<ValueType,
proto::plan::LessThan,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::LessEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::LessEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::Equal: {
UnaryElementFuncForArray<ValueType,
proto::plan::Equal,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::NotEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::NotEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::PrefixMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::PrefixMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::Match: {
UnaryElementFuncForArray<ValueType,
proto::plan::Match,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::PostfixMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::PostfixMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::InnerMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::InnerMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
default:
PanicInfo(
OpTypeInvalid,
fmt::format(
"unsupported operator type for unary expr: {}",
op_type));
}
case proto::plan::GreaterEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::GreaterEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::LessThan: {
UnaryElementFuncForArray<ValueType,
proto::plan::LessThan,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::LessEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::LessEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::Equal: {
UnaryElementFuncForArray<ValueType,
proto::plan::Equal,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::NotEqual: {
UnaryElementFuncForArray<ValueType,
proto::plan::NotEqual,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::PrefixMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::PrefixMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::Match: {
UnaryElementFuncForArray<ValueType,
proto::plan::Match,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::PostfixMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::PostfixMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
case proto::plan::InnerMatch: {
UnaryElementFuncForArray<ValueType,
proto::plan::InnerMatch,
filter_type>
func;
func(data,
valid_data,
size,
val,
index,
res,
valid_res,
bitmap_input,
processed_cursor,
offsets);
break;
}
default:
PanicInfo(
OpTypeInvalid,
fmt::format("unsupported operator type for unary expr: {}",
op_type));
}
processed_cursor += size;
};
processed_cursor += size;
};
int64_t processed_size;
if (has_offset_input_) {
processed_size =
@ -706,16 +708,18 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(EvalCtx& context) {
} while (false)
int processed_cursor = 0;
auto execute_sub_batch =
[ op_type, pointer, &processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ExprValueType val) {
auto execute_sub_batch = [op_type,
pointer,
&processed_cursor,
&bitmap_input]<FilterType filter_type =
FilterType::sequential>(
const milvus::Json* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
ExprValueType val) {
bool has_bitmap_input = !bitmap_input.empty();
switch (op_type) {
case proto::plan::GreaterThan: {
@ -1480,6 +1484,14 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImpl(EvalCtx& context) {
fmt::format("match query does not support iterative filter"));
}
return ExecTextMatch();
} else if (expr_->op_type_ == proto::plan::OpType::InnerMatch &&
!has_offset_input_ && CanUseNgramIndex(field_id_)) {
auto res = ExecNgramMatch();
// If nullopt is returned, the query cannot be optimized by the
// ngram index; fall through to the normal path below.
if (res.has_value()) {
return res.value();
}
}
if (CanUseIndex<T>() && !has_offset_input_) {
@ -1675,16 +1687,17 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData(EvalCtx& context) {
auto expr_type = expr_->op_type_;
size_t processed_cursor = 0;
auto execute_sub_batch =
[ expr_type, &processed_cursor, &
bitmap_input ]<FilterType filter_type = FilterType::sequential>(
const T* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
IndexInnerType val) {
auto execute_sub_batch = [expr_type,
&processed_cursor,
&bitmap_input]<FilterType filter_type =
FilterType::sequential>(
const T* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res,
IndexInnerType val) {
switch (expr_type) {
case proto::plan::GreaterThan: {
UnaryElementFunc<T, proto::plan::GreaterThan, filter_type> func;
@ -1920,5 +1933,48 @@ PhyUnaryRangeFilterExpr::ExecTextMatch() {
return res;
};
std::optional<VectorPtr>
PhyUnaryRangeFilterExpr::ExecNgramMatch() {
if (!arg_inited_) {
value_arg_.SetValue<std::string>(expr_->val_);
arg_inited_ = true;
}
auto literal = value_arg_.GetValue<std::string>();
TargetBitmap result;
TargetBitmap valid_result;
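// The index is queried once for the whole segment; the resulting bitmap is
// cached and each subsequent batch slices real_batch_size bits from it.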
if (cached_ngram_match_res_ == nullptr) {
auto pinned_index = segment_->GetNgramIndex(field_id_);
auto index = pinned_index.get();
AssertInfo(index != nullptr,
"ngram index should not be null, field_id: {}",
field_id_.get());
auto res_opt = index->InnerMatchQuery(literal, this);
if (!res_opt.has_value()) {
return std::nullopt;
}
auto valid_res = index->IsNotNull();
cached_ngram_match_res_ =
std::make_shared<TargetBitmap>(std::move(res_opt.value()));
cached_index_chunk_valid_res_ = std::move(valid_res);
}
auto real_batch_size =
(current_data_chunk_pos_ + batch_size_ > active_count_)
? active_count_ - current_data_chunk_pos_
: batch_size_;
result.append(
*cached_ngram_match_res_, current_data_chunk_pos_, real_batch_size);
valid_result.append(cached_index_chunk_valid_res_,
current_data_chunk_pos_,
real_batch_size);
current_data_chunk_pos_ += real_batch_size;
return std::optional<VectorPtr>(std::make_shared<ColumnVector>(
std::move(result), std::move(valid_result)));
}
} // namespace exec
} // namespace milvus

View File

@ -18,6 +18,7 @@
#include <fmt/core.h>
#include <optional>
#include <utility>
#include "common/EasyAssert.h"
@ -505,6 +506,9 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
VectorPtr
ExecTextMatch();
std::optional<VectorPtr>
ExecNgramMatch();
std::pair<std::string, std::string>
SplitAtFirstSlashDigit(std::string input);

View File

@ -191,5 +191,122 @@ GetValueWithCastNumber(const milvus::proto::plan::GenericValue& value_proto) {
}
}
enum class MatchType {
ExactMatch,
PrefixMatch,
PostfixMatch,
// The difference between InnerMatch and Match is that InnerMatch is used for
// %xxx% while Match could be %xxx%xxx%
InnerMatch,
Match
};
struct ParsedResult {
std::string literal;
MatchType type;
};
// Not used now, but may be used in the future for other types of match with the ngram index
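// Classify a LIKE pattern by the positions of its unescaped '%' and extract
// the core literal, e.g. "%abc%" -> {literal: "abc", type: InnerMatch}.
// Patterns containing '_' or an interior '%' (MatchType::Match) yield nullopt.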
inline std::optional<ParsedResult>
parse_ngram_pattern(const std::string& pattern) {
if (pattern.empty()) {
return std::nullopt;
}
std::vector<size_t> percent_indices;
bool was_escaped = false;
for (size_t i = 0; i < pattern.length(); ++i) {
char c = pattern[i];
if (c == '%' && !was_escaped) {
percent_indices.push_back(i);
} else if (c == '_' && !was_escaped) {
// todo(SpadeA): '_' is not supported yet
return std::nullopt;
}
was_escaped = (c == '\\' && !was_escaped);
}
MatchType match_type;
size_t core_start = 0;
size_t core_length = 0;
size_t percent_count = percent_indices.size();
if (percent_count == 0) {
match_type = MatchType::ExactMatch;
core_start = 0;
core_length = pattern.length();
} else if (percent_count == 1) {
if (pattern.length() == 1) {
return std::nullopt;
}
size_t idx = percent_indices[0];
// case: %xxx
if (idx == 0 && pattern.length() > 1) {
match_type = MatchType::PrefixMatch;
core_start = 1;
core_length = pattern.length() - 1;
} else if (idx == pattern.length() - 1 && pattern.length() > 1) {
// case: xxx%
match_type = MatchType::PostfixMatch;
core_start = 0;
core_length = pattern.length() - 1;
} else {
// case: xxx%xxx
match_type = MatchType::Match;
}
} else if (percent_count == 2) {
size_t idx1 = percent_indices[0];
size_t idx2 = percent_indices[1];
if (idx1 == 0 && idx2 == pattern.length() - 1 && pattern.length() > 2) {
// case: %xxx%
match_type = MatchType::InnerMatch;
core_start = 1;
core_length = pattern.length() - 2;
} else {
match_type = MatchType::Match;
}
} else {
match_type = MatchType::Match;
}
if (match_type == MatchType::Match) {
// not supported now
return std::nullopt;
}
// Extract the literal from the pattern
std::string_view core_pattern =
std::string_view(pattern).substr(core_start, core_length);
std::string r;
r.reserve(2 * core_pattern.size());
bool escape_mode = false;
for (char c : core_pattern) {
if (escape_mode) {
if (is_special(c)) {
// todo(SpadeA): may not be suitable for ngram? Do not use ngram in this case for now.
return std::nullopt;
}
r += c;
escape_mode = false;
} else {
if (c == '\\') {
escape_mode = true;
} else if (c == '%') {
// should be unreachable
} else if (c == '_') {
// should be unreachable
return std::nullopt;
} else {
if (is_special(c)) {
r += '\\';
}
r += c;
}
}
}
return std::optional<ParsedResult>{ParsedResult{std::move(r), match_type}};
}
} // namespace exec
} // namespace milvus

View File

@ -28,6 +28,7 @@
#include "index/Utils.h"
#include "index/Meta.h"
#include "index/JsonInvertedIndex.h"
#include "index/NgramInvertedIndex.h"
#include "knowhere/utils.h"
#include "index/VectorDiskIndex.h"
@ -66,12 +67,27 @@ IndexFactory::CreatePrimitiveScalarIndex(
return CreateScalarIndexSort<T>(file_manager_context);
}
// template <>
// inline ScalarIndexPtr<bool>
// IndexFactory::CreateScalarIndex(const IndexType& index_type) {
// return CreateBoolIndex();
//}
//
IndexBasePtr
IndexFactory::CreateNgramIndex(
DataType data_type,
const NgramParams& params,
const storage::FileManagerContext& file_manager_context) {
switch (data_type) {
case DataType::VARCHAR:
case DataType::STRING:
return std::make_unique<NgramInvertedIndex>(file_manager_context,
params);
case DataType::JSON:
PanicInfo(
NotImplemented,
fmt::format("building ngram index in json is not implemented"));
default:
PanicInfo(DataTypeInvalid,
fmt::format("invalid data type to build ngram index: {}",
data_type));
}
}
template <>
ScalarIndexPtr<std::string>
@ -345,9 +361,15 @@ IndexFactory::CreatePrimitiveScalarIndex(
// create string index
case DataType::STRING:
case DataType::VARCHAR:
case DataType::VARCHAR: {
auto& ngram_params = create_index_info.ngram_params;
if (ngram_params.has_value()) {
return CreateNgramIndex(
data_type, ngram_params.value(), file_manager_context);
}
return CreatePrimitiveScalarIndex<std::string>(
create_index_info, file_manager_context);
}
default:
PanicInfo(
DataTypeInvalid,

View File

@ -94,6 +94,13 @@ class IndexFactory {
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext());
// Create ngram index
IndexBasePtr
CreateNgramIndex(DataType data_type,
const NgramParams& params,
const storage::FileManagerContext& file_manager_context =
storage::FileManagerContext());
// For types like array, struct, union, etc
IndexBasePtr
CreateCompositeScalarIndex(

View File

@ -20,6 +20,12 @@
namespace milvus::index {
struct NgramParams {
bool loading_index;
uintptr_t min_gram;
uintptr_t max_gram;
};
struct CreateIndexInfo {
DataType field_type;
IndexType index_type;
@ -32,6 +38,7 @@ struct CreateIndexInfo {
JsonCastType json_cast_type{JsonCastType::UNKNOWN};
std::string json_path;
std::string json_cast_function;
std::optional<NgramParams> ngram_params;
};
} // namespace milvus::index

View File

@ -332,7 +332,8 @@ JsonKeyStatsInvertedIndex::Upload(const Config& config) {
index_build_timestamps_.index_build_done_ =
std::chrono::system_clock::now();
LOG_INFO(
"build json key index done for field id:{}, json parse duration: {}s, "
"index build done for json key index, field id:{}, json parse "
"duration: {}s, "
"tantivy document add schedule duration : {}s, "
"tantivy total duration : {}s, "
"total duration : {}s",

View File

@ -52,6 +52,9 @@ constexpr const char* TANTIVY_INDEX_VERSION = "tantivy_index_version";
constexpr uint32_t TANTIVY_INDEX_LATEST_VERSION = 7;
constexpr uint32_t TANTIVY_INDEX_MINIMUM_VERSION = 5;
constexpr const char* INDEX_NON_ENCODING = "index.nonEncoding";
constexpr const char* NGRAM_INDEX_TYPE = "NGRAM";
constexpr const char* MIN_GRAM = "min_gram";
constexpr const char* MAX_GRAM = "max_gram";
// index meta
constexpr const char* COLLECTION_ID = "collection_id";

View File

@ -0,0 +1,153 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include "index/NgramInvertedIndex.h"
#include "exec/expression/Expr.h"
namespace milvus::index {
NgramInvertedIndex::NgramInvertedIndex(const storage::FileManagerContext& ctx,
const NgramParams& params)
: min_gram_(params.min_gram), max_gram_(params.max_gram) {
schema_ = ctx.fieldDataMeta.field_schema;
field_id_ = ctx.fieldDataMeta.field_id;
mem_file_manager_ = std::make_shared<MemFileManager>(ctx);
disk_file_manager_ = std::make_shared<DiskFileManager>(ctx);
if (params.loading_index) {
path_ = disk_file_manager_->GetLocalNgramIndexPrefix();
} else {
path_ = disk_file_manager_->GetLocalTempNgramIndexPrefix();
boost::filesystem::create_directories(path_);
d_type_ = TantivyDataType::Keyword;
std::string field_name =
std::to_string(disk_file_manager_->GetFieldDataMeta().field_id);
wrapper_ = std::make_shared<TantivyIndexWrapper>(
field_name.c_str(), path_.c_str(), min_gram_, max_gram_);
}
}
void
NgramInvertedIndex::BuildWithFieldData(const std::vector<FieldDataPtr>& datas) {
AssertInfo(schema_.data_type() == proto::schema::DataType::String ||
schema_.data_type() == proto::schema::DataType::VarChar,
"schema data type is {}",
schema_.data_type());
index_build_begin_ = std::chrono::system_clock::now();
InvertedIndexTantivy<std::string>::BuildWithFieldData(datas);
}
IndexStatsPtr
NgramInvertedIndex::Upload(const Config& config) {
finish();
auto index_build_end = std::chrono::system_clock::now();
auto index_build_duration =
std::chrono::duration<double>(index_build_end - index_build_begin_)
.count();
LOG_INFO("index build done for ngram index, field id: {}, duration: {}s",
field_id_,
index_build_duration);
return InvertedIndexTantivy<std::string>::Upload(config);
}
void
NgramInvertedIndex::Load(milvus::tracer::TraceContext ctx,
const Config& config) {
auto index_files =
GetValueFromConfig<std::vector<std::string>>(config, INDEX_FILES);
AssertInfo(index_files.has_value(),
"index file paths is empty when load ngram index");
auto files_value = index_files.value();
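// index_null_offset is a sidecar recording the offsets of null rows; load it
// into memory here, then cache the remaining tantivy files to disk.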
auto it = std::find_if(
files_value.begin(), files_value.end(), [](const std::string& file) {
constexpr std::string_view suffix{"/index_null_offset"};
return file.size() >= suffix.size() &&
std::equal(suffix.rbegin(), suffix.rend(), file.rbegin());
});
if (it != files_value.end()) {
std::vector<std::string> file;
file.push_back(*it);
files_value.erase(it);
auto index_datas = mem_file_manager_->LoadIndexToMemory(
file, config[milvus::LOAD_PRIORITY]);
BinarySet binary_set;
AssembleIndexDatas(index_datas, binary_set);
auto index_valid_data = binary_set.GetByName("index_null_offset");
folly::SharedMutex::WriteHolder lock(mutex_);
null_offset_.resize((size_t)index_valid_data->size / sizeof(size_t));
memcpy(null_offset_.data(),
index_valid_data->data.get(),
(size_t)index_valid_data->size);
}
disk_file_manager_->CacheNgramIndexToDisk(files_value,
config[milvus::LOAD_PRIORITY]);
AssertInfo(
tantivy_index_exist(path_.c_str()), "index not exist: {}", path_);
auto load_in_mmap =
GetValueFromConfig<bool>(config, ENABLE_MMAP).value_or(true);
wrapper_ = std::make_shared<TantivyIndexWrapper>(
path_.c_str(), load_in_mmap, milvus::index::SetBitsetSealed);
if (!load_in_mmap) {
// the index is fully loaded in RAM, so the on-disk files can be removed early
disk_file_manager_->RemoveNgramIndexFiles();
}
LOG_INFO(
"load ngram index done for field id:{} with dir:{}", field_id_, path_);
}
std::optional<TargetBitmap>
NgramInvertedIndex::InnerMatchQuery(const std::string& literal,
exec::SegmentExpr* segment) {
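// A literal shorter than min_gram produces no complete ngram token, so the
// index cannot serve the query; return nullopt to fall back to the normal path.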
if (literal.length() < min_gram_) {
return std::nullopt;
}
TargetBitmap bitset{static_cast<size_t>(Count())};
wrapper_->inner_match_ngram(literal, min_gram_, max_gram_, &bitset);
// Post-filtering: when the literal is longer than max_gram, the ngram hits
// are only a superset of the true matches (rows containing every ngram of
// the literal, not necessarily the contiguous literal itself), so verify
// each candidate row against the raw data.
if (literal.length() > max_gram_) {
auto bitset_off = 0;
TargetBitmapView res(bitset);
TargetBitmap valid(res.size(), true);
TargetBitmapView valid_res(valid.data(), valid.size());
auto execute_sub_batch = [&literal](const std::string_view* data,
const bool* valid_data,
const int32_t* offsets,
const int size,
TargetBitmapView res,
TargetBitmapView valid_res) {
auto next_off_option = res.find_first();
while (next_off_option.has_value()) {
auto next_off = next_off_option.value();
if (next_off >= size) {
break;
}
if (data[next_off].find(literal) == std::string::npos) {
res[next_off] = false;
}
next_off_option = res.find_next(next_off);
}
};
segment->ProcessAllDataChunk<std::string_view>(
execute_sub_batch, std::nullptr_t{}, res, valid_res);
}
return std::optional<TargetBitmap>(std::move(bitset));
}
} // namespace milvus::index

View File

@ -0,0 +1,47 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#pragma once
#include <string>
#include <boost/filesystem.hpp>
#include <optional>
#include "index/InvertedIndexTantivy.h"
namespace milvus::exec {
class SegmentExpr;
} // namespace milvus::exec
namespace milvus::index {
class NgramInvertedIndex : public InvertedIndexTantivy<std::string> {
public:
explicit NgramInvertedIndex(const storage::FileManagerContext& ctx,
const NgramParams& params);
IndexStatsPtr
Upload(const Config& config = {}) override;
void
Load(milvus::tracer::TraceContext ctx, const Config& config) override;
void
BuildWithFieldData(const std::vector<FieldDataPtr>& datas) override;
std::optional<TargetBitmap>
InnerMatchQuery(const std::string& literal, exec::SegmentExpr* segment);
private:
uintptr_t min_gram_{0};
uintptr_t max_gram_{0};
int64_t field_id_{0};
std::chrono::time_point<std::chrono::system_clock> index_build_begin_;
};
} // namespace milvus::index

View File

@ -31,8 +31,29 @@ ScalarIndexCreator::ScalarIndexCreator(
const storage::FileManagerContext& file_manager_context)
: config_(config), dtype_(dtype) {
milvus::index::CreateIndexInfo index_info;
if (config.contains("index_type")) {
index_type_ = config.at("index_type").get<std::string>();
if (config.contains(milvus::index::INDEX_TYPE)) {
index_type_ = config.at(milvus::index::INDEX_TYPE).get<std::string>();
if (index_type_ == milvus::index::NGRAM_INDEX_TYPE) {
if (!config.contains(milvus::index::MIN_GRAM) ||
!config.contains(milvus::index::MAX_GRAM)) {
PanicInfo(
milvus::ErrorCode::InvalidParameter,
"Ngram index must specify both min_gram and max_gram");
}
milvus::index::NgramParams ngram_params{};
ngram_params.loading_index = false;
ngram_params.min_gram =
std::stoul(milvus::index::GetValueFromConfig<std::string>(
config, milvus::index::MIN_GRAM)
.value());
ngram_params.max_gram =
std::stoul(milvus::index::GetValueFromConfig<std::string>(
config, milvus::index::MAX_GRAM)
.value());
index_info.ngram_params = std::make_optional(ngram_params);
}
}
    // In the production call chain, the config should contain a value for
    // milvus::index::SCALAR_INDEX_ENGINE_VERSION; value_or(1) covers unit
    // tests that do not set it.
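
For reference, a minimal sketch of the config shape this constructor expects for an ngram index; judging from the `.get<std::string>()` and `std::stoul` calls above, `Config` behaves like `nlohmann::json`, and the gram bounds arrive as strings:

```cpp
#include <nlohmann/json.hpp>

int main() {
    // Illustrative only: ngram index build config as seen by ScalarIndexCreator.
    nlohmann::json config = {
        {"index_type", "NGRAM"},  // milvus::index::INDEX_TYPE
        {"min_gram", "2"},        // milvus::index::MIN_GRAM, parsed via std::stoul
        {"max_gram", "3"},        // milvus::index::MAX_GRAM
    };
    return config.contains("index_type") ? 0 : 1;
}
```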

View File

@ -181,8 +181,15 @@ ChunkedSegmentSealedImpl::LoadScalarIndex(const LoadIndexInfo& info) {
return;
}
scalar_indexings_[field_id] =
std::move(const_cast<LoadIndexInfo&>(info).cache_index);
if (auto it = info.index_params.find(index::INDEX_TYPE);
it != info.index_params.end() &&
it->second == index::NGRAM_INDEX_TYPE) {
ngram_indexings_[field_id] =
std::move(const_cast<LoadIndexInfo&>(info).cache_index);
} else {
scalar_indexings_[field_id] =
std::move(const_cast<LoadIndexInfo&>(info).cache_index);
}
LoadResourceRequest request =
milvus::index::IndexFactory::GetInstance().ScalarIndexLoadResource(
@ -598,15 +605,36 @@ ChunkedSegmentSealedImpl::chunk_view_by_offsets(
PinWrapper<const index::IndexBase*>
ChunkedSegmentSealedImpl::chunk_index_impl(FieldId field_id,
int64_t chunk_id) const {
std::shared_lock lck(mutex_);
AssertInfo(scalar_indexings_.find(field_id) != scalar_indexings_.end(),
"Cannot find scalar_indexing with field_id: " +
std::to_string(field_id.get()));
auto slot = scalar_indexings_.at(field_id);
lck.unlock();
auto ca = SemiInlineGet(slot->PinCells({0}));
auto index = ca->get_cell_of(0);
return PinWrapper<const index::IndexBase*>(ca, index);
}
PinWrapper<index::NgramInvertedIndex*>
ChunkedSegmentSealedImpl::GetNgramIndex(FieldId field_id) const {
std::shared_lock lck(mutex_);
auto iter = ngram_indexings_.find(field_id);
if (iter == ngram_indexings_.end()) {
return PinWrapper<index::NgramInvertedIndex*>(nullptr);
}
auto slot = iter->second.get();
lck.unlock();
auto ca = SemiInlineGet(slot->PinCells({0}));
auto index = dynamic_cast<index::NgramInvertedIndex*>(ca->get_cell_of(0));
AssertInfo(index != nullptr,
"ngram index cache is corrupted, field_id: {}",
field_id.get());
return PinWrapper<index::NgramInvertedIndex*>(ca, index);
}
int64_t
ChunkedSegmentSealedImpl::get_row_count() const {
std::shared_lock lck(mutex_);

View File

@ -118,6 +118,15 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
return iter->second.get();
}
bool
HasNgramIndex(FieldId field_id) const override {
std::shared_lock lck(mutex_);
return ngram_indexings_.find(field_id) != ngram_indexings_.end();
}
PinWrapper<index::NgramInvertedIndex*>
GetNgramIndex(FieldId field_id) const override;
// TODO(tiered storage 1): should return a PinWrapper
void
BulkGetJsonData(FieldId field_id,
@ -424,6 +433,9 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
// TODO: generate index for scalar
std::optional<int64_t> num_rows_;
// ngram field index
std::unordered_map<FieldId, index::CacheIndexBasePtr> ngram_indexings_;
// scalar field index
std::unordered_map<FieldId, index::CacheIndexBasePtr> scalar_indexings_;
// vector field index

View File

@ -535,4 +535,11 @@ SegmentInternalInterface::GetJsonKeyIndex(FieldId field_id) const {
}
return iter->second.get();
}
// Only sealed segments have an ngram index; this base implementation returns a null wrapper.
PinWrapper<index::NgramInvertedIndex*>
SegmentInternalInterface::GetNgramIndex(FieldId field_id) const {
return PinWrapper<index::NgramInvertedIndex*>(nullptr);
}
} // namespace milvus::segcore

View File

@ -40,6 +40,7 @@
#include "index/JsonKeyStatsInvertedIndex.h"
#include "segcore/ConcurrentVector.h"
#include "segcore/InsertRecord.h"
#include "index/NgramInvertedIndex.h"
namespace milvus::segcore {
@ -150,6 +151,9 @@ class SegmentInterface {
const int64_t* offsets,
int64_t count) const = 0;
virtual PinWrapper<index::NgramInvertedIndex*>
GetNgramIndex(FieldId field_id) const = 0;
virtual void
LazyCheckSchema(SchemaPtr sch) = 0;
@ -361,6 +365,9 @@ class SegmentInternalInterface : public SegmentInterface {
virtual index::JsonKeyStatsInvertedIndex*
GetJsonKeyIndex(FieldId field_id) const override;
virtual PinWrapper<index::NgramInvertedIndex*>
GetNgramIndex(FieldId field_id) const override;
public:
virtual void
vector_search(SearchInfo& search_info,

View File

@ -26,6 +26,7 @@
#include "segcore/InsertRecord.h"
#include "segcore/SegmentInterface.h"
#include "segcore/Types.h"
#include "index/NgramInvertedIndex.h"
namespace milvus::segcore {
@ -103,6 +104,12 @@ class SegmentSealed : public SegmentInternalInterface {
FieldId field_id,
std::unique_ptr<index::JsonKeyStatsInvertedIndex> index) = 0;
virtual bool
HasNgramIndex(FieldId field_id) const = 0;
virtual PinWrapper<index::NgramInvertedIndex*>
GetNgramIndex(FieldId field_id) const override = 0;
SegmentType
type() const override {
return SegmentType::Sealed;

View File

@ -348,6 +348,28 @@ AppendIndexV2(CTraceContext c_trace, CLoadIndexInfo c_load_index_info) {
index_info.metric_type = index_params.at("metric_type");
}
if (index_info.index_type == milvus::index::NGRAM_INDEX_TYPE) {
        AssertInfo(index_params.find(milvus::index::MIN_GRAM) !=
                       index_params.end(),
                   "min_gram is not specified for ngram index");
        AssertInfo(index_params.find(milvus::index::MAX_GRAM) !=
                       index_params.end(),
                   "max_gram is not specified for ngram index");
// get min_gram and max_gram and convert to uintptr_t
milvus::index::NgramParams ngram_params{};
ngram_params.loading_index = true;
ngram_params.min_gram =
std::stoul(milvus::index::GetValueFromConfig<std::string>(
config, milvus::index::MIN_GRAM)
.value());
ngram_params.max_gram =
std::stoul(milvus::index::GetValueFromConfig<std::string>(
config, milvus::index::MAX_GRAM)
.value());
index_info.ngram_params = std::make_optional(ngram_params);
}
// init file manager
milvus::storage::FieldDataMeta field_meta{
load_index_info->collection_id,

View File

@ -58,6 +58,7 @@ DiskFileManagerImpl::~DiskFileManagerImpl() {
RemoveIndexFiles();
RemoveTextLogFiles();
RemoveJsonKeyIndexFiles();
RemoveNgramIndexFiles();
}
bool
@ -317,6 +318,16 @@ DiskFileManagerImpl::CacheJsonKeyIndexToDisk(
priority);
}
void
DiskFileManagerImpl::CacheNgramIndexToDisk(
const std::vector<std::string>& remote_files,
milvus::proto::common::LoadPriority priority) {
return CacheIndexToDiskInternal(
remote_files,
[this]() { return GetLocalNgramIndexPrefix(); },
priority);
}
template <typename DataType>
std::string
DiskFileManagerImpl::CacheRawDataToDisk(const Config& config) {
@ -527,6 +538,13 @@ DiskFileManagerImpl::RemoveJsonKeyIndexFiles() {
local_chunk_manager->RemoveDir(GetLocalJsonKeyIndexPrefix());
}
void
DiskFileManagerImpl::RemoveNgramIndexFiles() {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
local_chunk_manager->RemoveDir(GetLocalNgramIndexPrefix());
}
template <DataType T>
bool
WriteOptFieldIvfDataImpl(
@ -803,6 +821,30 @@ DiskFileManagerImpl::GetRemoteJsonKeyLogPrefix() {
field_meta_.field_id);
}
std::string
DiskFileManagerImpl::GetLocalNgramIndexPrefix() {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
return GenNgramIndexPrefix(local_chunk_manager,
index_meta_.build_id,
index_meta_.index_version,
field_meta_.segment_id,
field_meta_.field_id,
false);
}
std::string
DiskFileManagerImpl::GetLocalTempNgramIndexPrefix() {
auto local_chunk_manager =
LocalChunkManagerSingleton::GetInstance().GetChunkManager();
return GenNgramIndexPrefix(local_chunk_manager,
index_meta_.build_id,
index_meta_.index_version,
field_meta_.segment_id,
field_meta_.field_id,
true);
}
std::string
DiskFileManagerImpl::GetLocalRawDataObjectPrefix() {
auto local_chunk_manager =

View File

@ -88,6 +88,14 @@ class DiskFileManagerImpl : public FileManagerImpl {
std::string
GetRemoteJsonKeyLogPrefix();
// Used when loading the index: remote index files are cached under this local prefix dir.
std::string
GetLocalNgramIndexPrefix();
// Used when building the index before upload: temporary local prefix dir for the index files.
std::string
GetLocalTempNgramIndexPrefix();
std::string
GetLocalRawDataObjectPrefix();
@ -113,6 +121,10 @@ class DiskFileManagerImpl : public FileManagerImpl {
CacheJsonKeyIndexToDisk(const std::vector<std::string>& remote_files,
milvus::proto::common::LoadPriority priority);
void
CacheNgramIndexToDisk(const std::vector<std::string>& remote_files,
milvus::proto::common::LoadPriority priority);
void
RemoveIndexFiles();
@ -122,6 +134,9 @@ class DiskFileManagerImpl : public FileManagerImpl {
void
RemoveJsonKeyIndexFiles();
void
RemoveNgramIndexFiles();
void
AddBatchIndexFiles(const std::string& local_file_name,
const std::vector<int64_t>& local_file_offsets,

View File

@ -653,6 +653,26 @@ GenRemoteJsonKeyIndexPathPrefix(ChunkManagerPtr cm,
segment_id,
field_id);
}
std::string
GenNgramIndexPrefix(ChunkManagerPtr cm,
int64_t build_id,
int64_t index_version,
int64_t segment_id,
int64_t field_id,
bool is_temp) {
boost::filesystem::path prefix = cm->GetRootPath();
if (is_temp) {
prefix = prefix / TEMP;
}
boost::filesystem::path path = std::string(NGRAM_LOG_ROOT_PATH);
boost::filesystem::path path1 =
GenIndexPathIdentifier(build_id, index_version, segment_id, field_id);
return (prefix / path / path1).string();
}
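
Assuming GenIndexPathIdentifier joins build_id, index_version, segment_id and field_id in that order (an assumption; its definition is not shown in this diff), the generated prefix looks roughly like this sketch:

```cpp
#include <boost/filesystem.hpp>
#include <iostream>

int main() {
    // Illustrative values only.
    boost::filesystem::path prefix = "/var/lib/milvus";  // cm->GetRootPath()
    boost::filesystem::path path = "ngram_log";          // NGRAM_LOG_ROOT_PATH
    boost::filesystem::path path1 = "4000/10000/3/101";  // build/version/segment/field
    std::cout << (prefix / path / path1).string() << "\n";
    // -> /var/lib/milvus/ngram_log/4000/10000/3/101
    //    (prefixed with the TEMP dir when is_temp is true)
    return 0;
}
```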
std::string
GenFieldRawDataPathPrefix(ChunkManagerPtr cm,
int64_t segment_id,

View File

@ -133,6 +133,14 @@ GenRemoteJsonKeyIndexPathPrefix(ChunkManagerPtr cm,
int64_t segment_id,
int64_t field_id);
std::string
GenNgramIndexPrefix(ChunkManagerPtr cm,
int64_t build_id,
int64_t index_version,
int64_t segment_id,
int64_t field_id,
bool is_temp);
std::string
GenFieldRawDataPathPrefix(ChunkManagerPtr cm,
int64_t segment_id,
int64_t field_id);

View File

@ -113,6 +113,13 @@ RustResult tantivy_create_json_key_stats_writer(const char *field_name,
uintptr_t overall_memory_budget_in_bytes,
bool in_ram);
RustResult tantivy_create_ngram_writer(const char *field_name,
const char *path,
uintptr_t min_gram,
uintptr_t max_gram,
uintptr_t num_threads,
uintptr_t overall_memory_budget_in_bytes);
RustResult tantivy_load_index(const char *path, bool load_in_mmap, SetBitsetFn set_bitset);
void tantivy_free_index_reader(void *ptr);
@ -182,6 +189,12 @@ RustResult tantivy_term_query_keyword(void *ptr, const char *term, void *bitset)
RustResult tantivy_term_query_keyword_i64(void *ptr, const char *term);
RustResult tantivy_inner_match_ngram(void *ptr,
const char *literal,
uintptr_t min_gram,
uintptr_t max_gram,
void *bitset);
RustResult tantivy_lower_bound_range_query_keyword(void *ptr,
const char *lower_bound,
bool inclusive,

View File

@ -0,0 +1,157 @@
use std::sync::Arc;
use tantivy::schema::{Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::{NgramTokenizer, TextAnalyzer};
use tantivy::Index;
use crate::error::Result;
use crate::index_writer::IndexWriterWrapper;
use crate::index_writer_v7::IndexWriterWrapperImpl;
const NGRAM_TOKENIZER: &str = "ngram";
fn build_ngram_schema(field_name: &str) -> (Schema, Field) {
let mut schema_builder = Schema::builder();
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer(NGRAM_TOKENIZER)
.set_fieldnorms(false)
.set_index_option(IndexRecordOption::Basic);
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
let field = schema_builder.add_text_field(field_name, text_options);
schema_builder.enable_user_specified_doc_id();
(schema_builder.build(), field)
}
impl IndexWriterWrapper {
    // Create an ngram index writer. Unlike the text writer, no
    // `tantivy_index_version` is taken: the ngram writer is always built
    // with index version 7 (the latest).
pub(crate) fn create_ngram_writer(
field_name: &str,
path: &str,
min_gram: usize,
max_gram: usize,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> Result<IndexWriterWrapper> {
        let tokenizer =
            TextAnalyzer::builder(NgramTokenizer::new(min_gram, max_gram, false)?)
.dynamic()
.build();
let (schema, field) = build_ngram_schema(field_name);
let index = Index::create_in_dir(path, schema).unwrap();
index.tokenizers().register(NGRAM_TOKENIZER, tokenizer);
let index_writer = index
.writer_with_num_threads(num_threads, overall_memory_budget_in_bytes)
.unwrap();
Ok(IndexWriterWrapper::V7(IndexWriterWrapperImpl {
field,
index_writer,
index: Arc::new(index),
enable_user_specified_doc_id: true,
id_field: None,
}))
}
}
#[cfg(test)]
mod tests {
use std::ffi::c_void;
use tempfile::TempDir;
use crate::{index_writer::IndexWriterWrapper, util::set_bitset};
#[test]
fn test_create_ngram_writer() {
let dir = TempDir::new().unwrap();
let _ = IndexWriterWrapper::create_ngram_writer(
"test",
dir.path().to_str().unwrap(),
1,
2,
1,
15000000,
)
.unwrap();
}
#[test]
fn test_ngram_writer() {
let dir = TempDir::new().unwrap();
let mut writer = IndexWriterWrapper::create_ngram_writer(
"test",
dir.path().to_str().unwrap(),
2,
3,
1,
15000000,
)
.unwrap();
writer.add("university", Some(0)).unwrap();
writer.add("anthropology", Some(1)).unwrap();
writer.add("economics", Some(2)).unwrap();
writer.add("history", Some(3)).unwrap();
writer.add("victoria", Some(4)).unwrap();
writer.add("basics", Some(5)).unwrap();
writer.add("economiCs", Some(6)).unwrap();
writer.commit().unwrap();
let reader = writer.create_reader(set_bitset).unwrap();
let mut res: Vec<u32> = vec![];
reader
.inner_match_ngram("ic", 2, 3, &mut res as *mut _ as *mut c_void)
.unwrap();
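        // "economics" (2), "victoria" (4) and "basics" (5) contain "ic";
        // "economiCs" (6) is missed: no lowercasing filter is registered,
        // so the ngram matching here is case-sensitive.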
assert_eq!(res, vec![2, 4, 5]);
}
#[test]
fn test_ngram_writer_chinese() {
let dir = TempDir::new().unwrap();
let mut writer = IndexWriterWrapper::create_ngram_writer(
"test",
dir.path().to_str().unwrap(),
2,
3,
1,
15000000,
)
.unwrap();
writer.add("ngram测试", Some(0)).unwrap();
writer.add("测试ngram", Some(1)).unwrap();
writer.add("测试ngram测试", Some(2)).unwrap();
writer.add("你好世界", Some(3)).unwrap();
writer.add("ngram需要被测试", Some(4)).unwrap();
writer.commit().unwrap();
let reader = writer.create_reader(set_bitset).unwrap();
let mut res: Vec<u32> = vec![];
reader
.inner_match_ngram("测试", 2, 3, &mut res as *mut _ as *mut c_void)
.unwrap();
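        // Each CJK character counts as a single character for gram
        // boundaries, so "测试" is itself a 2-gram; doc 3 ("你好世界")
        // never contains it.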
assert_eq!(res, vec![0, 1, 2, 4]);
let mut res: Vec<u32> = vec![];
reader
.inner_match_ngram("m测试", 2, 3, &mut res as *mut _ as *mut c_void)
.unwrap();
assert_eq!(res, vec![0, 2]);
let mut res: Vec<u32> = vec![];
reader
.inner_match_ngram("需要被测试", 2, 3, &mut res as *mut _ as *mut c_void)
.unwrap();
assert_eq!(res, vec![4]);
}
}

View File

@ -0,0 +1,37 @@
use std::ffi::c_char;
use std::ffi::CStr;
use crate::array::RustResult;
use crate::cstr_to_str;
use crate::index_writer::IndexWriterWrapper;
use crate::log::init_log;
use crate::util::create_binding;
#[no_mangle]
pub extern "C" fn tantivy_create_ngram_writer(
field_name: *const c_char,
path: *const c_char,
min_gram: usize,
max_gram: usize,
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> RustResult {
init_log();
let field_name_str = cstr_to_str!(field_name);
let path_str = cstr_to_str!(path);
match IndexWriterWrapper::create_ngram_writer(
field_name_str,
path_str,
min_gram,
max_gram,
num_threads,
overall_memory_budget_in_bytes,
) {
Ok(index_writer_wrapper) => RustResult::from_ptr(create_binding(index_writer_wrapper)),
Err(err) => RustResult::from_error(format!(
"create ngram writer failed with error: {}",
err.to_string(),
)),
}
}

View File

@ -1,10 +1,12 @@
use log::info;
use std::ffi::c_void;
use std::ops::Bound;
use std::sync::Arc;
use tantivy::fastfield::FastValue;
use tantivy::query::{ExistsQuery, Query, RangeQuery, RegexQuery, TermQuery};
use tantivy::query::{BooleanQuery, ExistsQuery, Query, RangeQuery, RegexQuery, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::tokenizer::{NgramTokenizer, TokenStream, Tokenizer};
use tantivy::{Index, IndexReader, ReloadPolicy, Term};
use crate::bitset_wrapper::BitsetWrapper;
@ -297,6 +299,41 @@ impl IndexReaderWrapper {
self.search_i64(&q)
}
// **Note**: literal length must be greater than or equal to min_gram.
pub fn inner_match_ngram(
&self,
literal: &str,
min_gram: usize,
max_gram: usize,
bitset: *mut c_void,
) -> Result<()> {
// The literal length must be greater than or equal to min_gram.
assert!(
    literal.chars().count() >= min_gram,
    "literal length must be greater than or equal to min_gram. literal: {}, min_gram: {}",
    literal,
    min_gram
);
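// Fast path: a literal no longer than max_gram was indexed verbatim as one
// of the grams, so a single term query is exact.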
if literal.chars().count() <= max_gram {
return self.term_query_keyword(literal, bitset);
}
// The literal is longer than max_gram: split it into max_gram-grams and
// intersect the term queries, e.g. "university" with max_gram = 6 yields
// "univer", "nivers", "iversi", "versit", "ersity".
let mut term_queries: Vec<Box<dyn Query>> = vec![];
let mut tokenizer = NgramTokenizer::new(max_gram, max_gram, false).unwrap();
let mut token_stream = tokenizer.token_stream(literal);
token_stream.process(&mut |token| {
let term = Term::from_field_text(self.field, &token.text);
term_queries.push(Box::new(TermQuery::new(term, IndexRecordOption::Basic)));
});
let query = BooleanQuery::intersection(term_queries);
self.search(&query, bitset)
}
pub fn lower_bound_range_query_keyword(
&self,
lower_bound: &str,

View File

@ -233,6 +233,25 @@ pub extern "C" fn tantivy_term_query_keyword_i64(
unsafe { (*real).term_query_keyword_i64(term).into() }
}
#[no_mangle]
pub extern "C" fn tantivy_inner_match_ngram(
ptr: *mut c_void,
literal: *const c_char,
min_gram: usize,
max_gram: usize,
bitset: *mut c_void,
) -> RustResult {
let real = ptr as *mut IndexReaderWrapper;
let literal = cstr_to_str!(literal);
unsafe {
(*real)
.inner_match_ngram(literal, min_gram, max_gram, bitset)
.into()
}
}
#[no_mangle]
pub extern "C" fn tantivy_lower_bound_range_query_keyword(
ptr: *mut c_void,

View File

@ -9,6 +9,8 @@ mod error;
mod hashmap_c;
mod index_json_key_stats_writer;
mod index_json_key_stats_writer_c;
mod index_ngram_writer;
mod index_ngram_writer_c;
mod index_reader;
mod index_reader_c;
mod index_reader_text;

View File

@ -177,6 +177,29 @@ struct TantivyIndexWrapper {
path_ = std::string(path);
}
// create index writer for ngram
TantivyIndexWrapper(const char* field_name,
const char* path,
uintptr_t min_gram,
uintptr_t max_gram,
uintptr_t num_threads = DEFAULT_NUM_THREADS,
uintptr_t overall_memory_budget_in_bytes =
DEFAULT_OVERALL_MEMORY_BUDGET_IN_BYTES) {
auto res = RustResultWrapper(
tantivy_create_ngram_writer(field_name,
path,
min_gram,
max_gram,
num_threads,
overall_memory_budget_in_bytes));
AssertInfo(res.result_->success,
"failed to create ngram writer: {}",
res.result_->error);
writer_ = res.result_->value.ptr._0;
path_ = std::string(path);
}
// create reader.
void
create_reader(SetBitsetFn set_bitset) {
@ -912,6 +935,22 @@ struct TantivyIndexWrapper {
"TantivyIndexWrapper.phrase_match_query: invalid result type");
}
void
inner_match_ngram(const std::string& literal,
uintptr_t min_gram,
uintptr_t max_gram,
void* bitset) {
auto array = tantivy_inner_match_ngram(
reader_, literal.c_str(), min_gram, max_gram, bitset);
auto res = RustResultWrapper(array);
AssertInfo(res.result_->success,
"TantivyIndexWrapper.inner_match_ngram: {}",
res.result_->error);
AssertInfo(
res.result_->value.tag == Value::Tag::None,
"TantivyIndexWrapper.inner_match_ngram: invalid result type");
}
// json query
template <typename T>
void

View File

@ -106,6 +106,7 @@ set(MILVUS_TEST_FILES
test_thread_pool.cpp
test_json_flat_index.cpp
test_vector_array.cpp
test_ngram_query.cpp
)
if ( INDEX_ENGINE STREQUAL "cardinal" )

View File

@ -0,0 +1,377 @@
// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License
#include <gtest/gtest.h>
#include <string>
#include <random>
#include "common/Schema.h"
#include "test_utils/GenExprProto.h"
#include "query/PlanProto.h"
#include "query/ExecPlanNodeVisitor.h"
#include "expr/ITypeExpr.h"
#include "test_utils/storage_test_utils.h"
#include "index/IndexFactory.h"
#include "index/NgramInvertedIndex.h"
#include "segcore/load_index_c.h"
using namespace milvus;
using namespace milvus::query;
using namespace milvus::segcore;
using namespace milvus::exec;
TEST(ConvertToNgramLiteralTest, EmptyString) {
auto result = parse_ngram_pattern("");
ASSERT_FALSE(result.has_value());
}
TEST(ConvertToNgramLiteralTest, ExactMatchSimple) {
auto result = parse_ngram_pattern("abc");
ASSERT_TRUE(result.has_value());
EXPECT_EQ(result->literal, "abc");
EXPECT_EQ(result->type, MatchType::ExactMatch);
}
TEST(ConvertToNgramLiteralTest, ExactMatchWithEscapedPercent) {
auto result = parse_ngram_pattern("ab\\%cd");
ASSERT_TRUE(result.has_value());
EXPECT_EQ(result->literal, "ab%cd");
EXPECT_EQ(result->type, MatchType::ExactMatch);
}
TEST(ConvertToNgramLiteralTest, ExactMatchWithEscapedSpecialChar) {
auto result = parse_ngram_pattern("a.b");
ASSERT_TRUE(result.has_value());
EXPECT_EQ(result->literal, "a\\.b");
EXPECT_EQ(result->type, MatchType::ExactMatch);
}
TEST(ConvertToNgramLiteralTest, PrefixMatchSimple) {
auto result = parse_ngram_pattern("%abc");
ASSERT_TRUE(result.has_value());
EXPECT_EQ(result->literal, "abc");
EXPECT_EQ(result->type, MatchType::PrefixMatch);
}
TEST(ConvertToNgramLiteralTest, PostfixMatchSimple) {
auto result = parse_ngram_pattern("abc%");
ASSERT_TRUE(result.has_value());
EXPECT_EQ(result->literal, "abc");
EXPECT_EQ(result->type, MatchType::PostfixMatch);
}
TEST(ConvertToNgramLiteralTest, InnerMatchSimple) {
auto result = parse_ngram_pattern("%abc%");
ASSERT_TRUE(result.has_value());
EXPECT_EQ(result->literal, "abc");
EXPECT_EQ(result->type, MatchType::InnerMatch);
}
TEST(ConvertToNgramLiteralTest, MatchSinglePercentMiddle) {
auto result = parse_ngram_pattern("a%b");
ASSERT_FALSE(result.has_value());
}
TEST(ConvertToNgramLiteralTest, MatchTypeReturnsNullopt) {
EXPECT_FALSE(parse_ngram_pattern("%").has_value());
// %a%b (n=2, not %xxx%) -> Match -> nullopt
EXPECT_FALSE(parse_ngram_pattern("%a%b").has_value());
// a%b%c (n=2, not %xxx%) -> Match -> nullopt
EXPECT_FALSE(parse_ngram_pattern("a%b%c").has_value());
// %% (n=2, not %xxx% because length is not > 2) -> Match -> nullopt
EXPECT_FALSE(parse_ngram_pattern("%%").has_value());
// %a%b%c% (n=3) -> Match -> nullopt
EXPECT_FALSE(parse_ngram_pattern("%a%b%c%").has_value());
}
TEST(ConvertToNgramLiteralTest, UnescapedUnderscoreReturnsNullopt) {
EXPECT_FALSE(parse_ngram_pattern("a_b").has_value());
EXPECT_FALSE(parse_ngram_pattern("%a_b").has_value());
EXPECT_FALSE(parse_ngram_pattern("a_b%").has_value());
EXPECT_FALSE(parse_ngram_pattern("%a_b%").has_value());
}
TEST(ConvertToNgramLiteralTest, EscapedUnderscore) {
auto result = parse_ngram_pattern("a\\_b");
ASSERT_TRUE(result.has_value());
EXPECT_EQ(result->literal, "a_b");
EXPECT_EQ(result->type, MatchType::ExactMatch);
}
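// Summary of parse_ngram_pattern semantics exercised above:
//   "abc"   -> ExactMatch   ("abc")
//   "%abc"  -> PrefixMatch  ("abc")
//   "abc%"  -> PostfixMatch ("abc")
//   "%abc%" -> InnerMatch   ("abc")
//   "%", "%%", "a%b", and any unescaped "_" -> nullopt (not handled by the
//   ngram path); escaped "\%" / "\_" are treated as literal characters.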
auto
generate_field_meta(int64_t collection_id = 1,
int64_t partition_id = 2,
int64_t segment_id = 3,
int64_t field_id = 101,
DataType data_type = DataType::NONE,
DataType element_type = DataType::NONE,
bool nullable = false) -> storage::FieldDataMeta {
auto meta = storage::FieldDataMeta{
.collection_id = collection_id,
.partition_id = partition_id,
.segment_id = segment_id,
.field_id = field_id,
};
meta.field_schema.set_data_type(
static_cast<proto::schema::DataType>(data_type));
meta.field_schema.set_element_type(
static_cast<proto::schema::DataType>(element_type));
meta.field_schema.set_nullable(nullable);
return meta;
}
auto
generate_index_meta(int64_t segment_id = 3,
int64_t field_id = 101,
int64_t index_build_id = 1000,
int64_t index_version = 10000) -> storage::IndexMeta {
return storage::IndexMeta{
.segment_id = segment_id,
.field_id = field_id,
.build_id = index_build_id,
.index_version = index_version,
};
}
auto
generate_local_storage_config(const std::string& root_path)
-> storage::StorageConfig {
auto ret = storage::StorageConfig{};
ret.storage_type = "local";
ret.root_path = root_path;
return ret;
}
void
test_ngram_with_data(const boost::container::vector<std::string>& data,
const std::string& literal,
const std::vector<bool>& expected_result) {
int64_t collection_id = 1;
int64_t partition_id = 2;
int64_t segment_id = 3;
int64_t index_build_id = 4000;
int64_t index_version = 4000;
int64_t index_id = 5000;
auto schema = std::make_shared<Schema>();
auto field_id = schema->AddDebugField("ngram", DataType::VARCHAR);
auto field_meta = generate_field_meta(collection_id,
partition_id,
segment_id,
field_id.get(),
DataType::VARCHAR,
DataType::NONE,
false);
auto index_meta = generate_index_meta(
segment_id, field_id.get(), index_build_id, index_version);
std::string root_path = "/tmp/test-inverted-index/";
auto storage_config = generate_local_storage_config(root_path);
auto cm = CreateChunkManager(storage_config);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> distrib(1, 100);
size_t nb = data.size();
auto field_data = storage::CreateFieldData(DataType::VARCHAR, false);
field_data->FillFieldData(data.data(), data.size());
auto segment = CreateSealedSegment(schema);
auto field_data_info = PrepareSingleFieldInsertBinlog(collection_id,
partition_id,
segment_id,
field_id.get(),
{field_data},
cm);
segment->LoadFieldData(field_data_info);
auto payload_reader =
std::make_shared<milvus::storage::PayloadReader>(field_data);
storage::InsertData insert_data(payload_reader);
insert_data.SetFieldDataMeta(field_meta);
insert_data.SetTimestamps(0, 100);
auto serialized_bytes = insert_data.Serialize(storage::Remote);
auto get_binlog_path = [=](int64_t log_id) {
return fmt::format("{}/{}/{}/{}/{}",
collection_id,
partition_id,
segment_id,
field_id.get(),
log_id);
};
auto log_path = get_binlog_path(0);
auto cm_w = ChunkManagerWrapper(cm);
cm_w.Write(log_path, serialized_bytes.data(), serialized_bytes.size());
storage::FileManagerContext ctx(field_meta, index_meta, cm);
std::vector<std::string> index_files;
{
Config config;
config["index_type"] = milvus::index::INVERTED_INDEX_TYPE;
config["insert_files"] = std::vector<std::string>{log_path};
auto ngram_params = index::NgramParams{
.loading_index = false,
.min_gram = 2,
.max_gram = 4,
};
auto index =
std::make_shared<index::NgramInvertedIndex>(ctx, ngram_params);
index->Build(config);
auto create_index_result = index->Upload();
auto memSize = create_index_result->GetMemSize();
auto serializedSize = create_index_result->GetSerializedSize();
ASSERT_GT(memSize, 0);
ASSERT_GT(serializedSize, 0);
index_files = create_index_result->GetIndexFiles();
}
{
index::CreateIndexInfo index_info{};
index_info.index_type = milvus::index::INVERTED_INDEX_TYPE;
index_info.field_type = DataType::VARCHAR;
Config config;
config[milvus::index::INDEX_FILES] = index_files;
config[milvus::LOAD_PRIORITY] =
milvus::proto::common::LoadPriority::HIGH;
auto ngram_params = index::NgramParams{
.loading_index = true,
.min_gram = 2,
.max_gram = 4,
};
auto index =
std::make_unique<index::NgramInvertedIndex>(ctx, ngram_params);
index->Load(milvus::tracer::TraceContext{}, config);
auto cnt = index->Count();
ASSERT_EQ(cnt, nb);
exec::SegmentExpr segment_expr(std::move(std::vector<exec::ExprPtr>{}),
"SegmentExpr",
segment.get(),
field_id,
{},
DataType::VARCHAR,
nb,
8192,
0);
auto bitset = index->InnerMatchQuery(literal, &segment_expr).value();
for (size_t i = 0; i < nb; i++) {
ASSERT_EQ(bitset[i], expected_result[i]);
}
}
{
std::map<std::string, std::string> index_params{
{milvus::index::INDEX_TYPE, milvus::index::NGRAM_INDEX_TYPE},
{milvus::index::MIN_GRAM, "2"},
{milvus::index::MAX_GRAM, "4"},
{milvus::LOAD_PRIORITY, "HIGH"},
};
milvus::segcore::LoadIndexInfo load_index_info{
.collection_id = collection_id,
.partition_id = partition_id,
.segment_id = segment_id,
.field_id = field_id.get(),
.field_type = DataType::VARCHAR,
.enable_mmap = true,
.mmap_dir_path = "/tmp/test-ngram-index-mmap-dir",
.index_id = index_id,
.index_build_id = index_build_id,
.index_version = index_version,
.index_params = index_params,
.index_files = index_files,
.schema = field_meta.field_schema,
.index_size = 1024 * 1024 * 1024,
};
uint8_t trace_id[16] = {0};
uint8_t span_id[8] = {0};
trace_id[0] = 1;
span_id[0] = 2;
CTraceContext trace{
.traceID = trace_id,
.spanID = span_id,
.traceFlags = 0,
};
auto cload_index_info = static_cast<CLoadIndexInfo>(&load_index_info);
AppendIndexV2(trace, cload_index_info);
UpdateSealedSegmentIndex(segment.get(), cload_index_info);
auto unary_range_expr =
test::GenUnaryRangeExpr(OpType::InnerMatch, literal);
auto column_info = test::GenColumnInfo(
field_id.get(), proto::schema::DataType::VarChar, false, false);
unary_range_expr->set_allocated_column_info(column_info);
auto expr = test::GenExpr();
expr->set_allocated_unary_range_expr(unary_range_expr);
auto parser = ProtoParser(schema);
auto typed_expr = parser.ParseExprs(*expr);
auto parsed = std::make_shared<plan::FilterBitsNode>(
DEFAULT_PLANNODE_ID, typed_expr);
BitsetType final;
final = ExecuteQueryExpr(parsed, segment.get(), nb, MAX_TIMESTAMP);
for (size_t i = 0; i < nb; i++) {
ASSERT_EQ(final[i], expected_result[i]);
}
}
}
TEST(NgramIndex, TestNgramWikiEpisode) {
boost::container::vector<std::string> data;
// not hit
data.push_back(
"'Indira Davelba Murillo Alvarado (Tegucigalpa, "
"the youngest of eight siblings. She attended primary school at the "
"Escuela 14 de Julio, and her secondary studies at the Instituto "
"school called \"Indi del Bosque\", where she taught the children of "
"Honduran women'");
// hit
data.push_back(
"Richmond Green Secondary School is a public secondary school in "
"Richmond Hill, Ontario, Canada.");
// hit
data.push_back(
"The Gymnasium in 2002 Gymnasium Philippinum or Philippinum High "
"School is an almost 500-year-old secondary school in Marburg, Hesse, "
"Germany.");
// hit
data.push_back(
"Sir Winston Churchill Secondary School is a Canadian secondary school "
"located in St. Catharines, Ontario.");
// not hit
data.push_back("Sir Winston Churchill Secondary School");
std::vector<bool> expected_result{false, true, true, true, false};
test_ngram_with_data(data, "secondary school", expected_result);
}
TEST(NgramIndex, TestNgramAllFalse) {
boost::container::vector<std::string> data(10000,
"elementary school secondary");
    // every row is hit by the tantivy ngram query but filtered out by the second (verification) phase
test_ngram_with_data(
data, "secondary school", std::vector<bool>(10000, false));
}

View File

@ -195,8 +195,8 @@ func (si *statsInspector) enableBM25() bool {
}
func needDoTextIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
if !(isFlush(segment) && segment.GetLevel() != datapb.SegmentLevel_L0 &&
segment.GetIsSorted()) {
if !isFlush(segment) || segment.GetLevel() == datapb.SegmentLevel_L0 ||
!segment.GetIsSorted() {
return false
}
@ -212,12 +212,15 @@ func needDoTextIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
}
func needDoJsonKeyIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
if !(isFlush(segment) && segment.GetLevel() != datapb.SegmentLevel_L0 &&
segment.GetIsSorted()) {
if !isFlush(segment) || segment.GetLevel() == datapb.SegmentLevel_L0 ||
!segment.GetIsSorted() {
return false
}
for _, fieldID := range fieldIDs {
if segment.GetJsonKeyStats() == nil {
return true
}
if segment.GetJsonKeyStats()[fieldID] == nil {
return true
}

View File

@ -1162,6 +1162,94 @@ func Test_parseIndexParams(t *testing.T) {
})
}
func Test_ngram_parseIndexParams(t *testing.T) {
t.Run("valid ngram index params", func(t *testing.T) {
cit := &createIndexTask{
req: &milvuspb.CreateIndexRequest{
ExtraParams: []*commonpb.KeyValuePair{
{Key: common.IndexTypeKey, Value: "NGRAM"},
{Key: common.IndexParamsKey, Value: "{\"min_gram\": \"2\", \"max_gram\": \"3\"}"},
},
},
fieldSchema: &schemapb.FieldSchema{
FieldID: 101, Name: "FieldID", DataType: schemapb.DataType_VarChar,
},
}
err := cit.parseIndexParams(context.TODO())
assert.NoError(t, err)
assert.ElementsMatch(t, []*commonpb.KeyValuePair{
{Key: common.IndexTypeKey, Value: "NGRAM"},
{Key: indexparamcheck.MinGramKey, Value: "2"},
{Key: indexparamcheck.MaxGramKey, Value: "3"},
}, cit.newIndexParams)
assert.Empty(t, cit.newTypeParams)
})
t.Run("ngram on non varchar field", func(t *testing.T) {
cit := &createIndexTask{
req: &milvuspb.CreateIndexRequest{
ExtraParams: []*commonpb.KeyValuePair{
{Key: common.IndexTypeKey, Value: "NGRAM"},
{Key: common.IndexParamsKey, Value: "{\"min_gram\": \"2\", \"max_gram\": \"3\"}"},
},
},
fieldSchema: &schemapb.FieldSchema{
FieldID: 101, Name: "FieldInt", DataType: schemapb.DataType_Int64,
},
}
err := cit.parseIndexParams(context.TODO())
assert.Error(t, err)
})
t.Run("ngram missing params", func(t *testing.T) {
cit := &createIndexTask{
req: &milvuspb.CreateIndexRequest{
ExtraParams: []*commonpb.KeyValuePair{
{Key: common.IndexTypeKey, Value: "NGRAM"},
{Key: common.IndexParamsKey, Value: "{\"min_gram\": \"2\"}"},
},
},
fieldSchema: &schemapb.FieldSchema{
FieldID: 101, Name: "FieldID", DataType: schemapb.DataType_VarChar,
},
}
err := cit.parseIndexParams(context.TODO())
assert.Error(t, err)
})
t.Run("ngram non-integer params", func(t *testing.T) {
cit := &createIndexTask{
req: &milvuspb.CreateIndexRequest{
ExtraParams: []*commonpb.KeyValuePair{
{Key: common.IndexTypeKey, Value: "NGRAM"},
{Key: common.IndexParamsKey, Value: "{\"min_gram\": \"a\", \"max_gram\": \"3\"}"},
},
},
fieldSchema: &schemapb.FieldSchema{
FieldID: 101, Name: "FieldID", DataType: schemapb.DataType_VarChar,
},
}
err := cit.parseIndexParams(context.TODO())
assert.Error(t, err)
})
t.Run("ngram invalid range", func(t *testing.T) {
cit := &createIndexTask{
req: &milvuspb.CreateIndexRequest{
ExtraParams: []*commonpb.KeyValuePair{
{Key: common.IndexTypeKey, Value: "NGRAM"},
{Key: common.IndexParamsKey, Value: "{\"min_gram\": \"5\", \"max_gram\": \"3\"}"},
},
},
fieldSchema: &schemapb.FieldSchema{
FieldID: 101, Name: "FieldID", DataType: schemapb.DataType_VarChar,
},
}
err := cit.parseIndexParams(context.TODO())
assert.Error(t, err)
})
}
func Test_wrapUserIndexParams(t *testing.T) {
params := wrapUserIndexParams("L2")
assert.Equal(t, 2, len(params))

View File

@ -796,7 +796,7 @@ func (loader *segmentLoader) loadSealedSegment(ctx context.Context, loadInfo *qu
log := log.Ctx(ctx).With(zap.Int64("segmentID", segment.ID()))
tr := timerecord.NewTimeRecorder("segmentLoader.loadSealedSegment")
log.Info("Start loading fields...",
// zap.Int64s("indexedFields", lo.Keys(indexedFieldInfos)),
zap.Int("indexedFields count", len(indexedFieldInfos)),
zap.Int64s("indexed text fields", lo.Keys(textIndexes)),
zap.Int64s("unindexed text fields", lo.Keys(unindexedTextFields)),
zap.Int64s("indexed json key fields", lo.Keys(jsonKeyStats)),
@ -1744,6 +1744,10 @@ func (loader *segmentLoader) LoadJSONIndex(ctx context.Context,
return merr.WrapErrParameterInvalid("LocalSegment", fmt.Sprintf("%T", seg))
}
if len(loadInfo.GetJsonKeyStatsLogs()) == 0 {
return nil
}
collection := segment.GetCollection()
schemaHelper, _ := typeutil.CreateSchemaHelper(collection.Schema())

View File

@ -58,6 +58,7 @@ func (mgr *indexCheckerMgrImpl) registerIndexChecker() {
mgr.checkers[IndexHybrid] = newHYBRIDChecker()
mgr.checkers["marisa-trie"] = newTRIEChecker()
mgr.checkers[AutoIndex] = newAUTOINDEXChecker()
mgr.checkers[IndexNGRAM] = newNgramIndexChecker()
}
func newIndexCheckerMgr() *indexCheckerMgrImpl {

View File

@ -33,6 +33,7 @@ const (
IndexBitmap IndexType = "BITMAP"
IndexHybrid IndexType = "HYBRID" // BITMAP + INVERTED
IndexINVERTED IndexType = "INVERTED"
IndexNGRAM IndexType = "NGRAM"
AutoIndex IndexType = "AUTOINDEX"
)

View File

@ -0,0 +1,60 @@
package indexparamcheck
import (
"fmt"
"strconv"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
const (
MinGramKey = "min_gram"
MaxGramKey = "max_gram"
)
type NgramIndexChecker struct {
scalarIndexChecker
}
func newNgramIndexChecker() *NgramIndexChecker {
return &NgramIndexChecker{}
}
func (c *NgramIndexChecker) CheckTrain(dataType schemapb.DataType, params map[string]string) error {
if dataType != schemapb.DataType_VarChar {
// TODO(SpadeA): we may support JSON in the future
return merr.WrapErrParameterInvalidMsg("Ngram index can only be created on VARCHAR field")
}
minGramStr, minGramExist := params[MinGramKey]
maxGramStr, maxGramExist := params[MaxGramKey]
if !minGramExist || !maxGramExist {
return merr.WrapErrParameterInvalidMsg("Ngram index must specify both min_gram and max_gram")
}
minGram, err := strconv.Atoi(minGramStr)
if err != nil {
return merr.WrapErrParameterInvalidMsg("min_gram for Ngram index must be an integer, got: %s", minGramStr)
}
maxGram, err := strconv.Atoi(maxGramStr)
if err != nil {
return merr.WrapErrParameterInvalidMsg("max_gram for Ngram index must be an integer, got: %s", maxGramStr)
}
if minGram <= 0 || maxGram <= 0 || minGram > maxGram {
return merr.WrapErrParameterInvalidMsg("invalid min_gram or max_gram value for Ngram index, min_gram: %d, max_gram: %d", minGram, maxGram)
}
return c.scalarIndexChecker.CheckTrain(dataType, params)
}
func (c *NgramIndexChecker) CheckValidDataType(indexType IndexType, field *schemapb.FieldSchema) error {
dType := field.GetDataType()
if !typeutil.IsStringType(dType) {
return fmt.Errorf("ngram index can only be created on VARCHAR field")
}
return nil
}