Spade A 26ec841feb
feat: optimize Like query with n-gram (#41803)
Ref #42053

This is the first PR for optimizing `LIKE` with ngram inverted index.
Now, only VARCHAR data type is supported and only InnerMatch LIKE
(%xxx%) query is supported.


How to use it:
```
milvus_client = MilvusClient("http://localhost:19530")
schema = milvus_client.create_schema()
...
schema.add_field("content_ngram", DataType.VARCHAR, max_length=10000)
...
index_params = milvus_client.prepare_index_params()
index_params.add_index(field_name="content_ngram", index_type="NGRAM", index_name="ngram_index", min_gram=2, max_gram=3)
milvus_client.create_collection(COLLECTION_NAME, ...)
```

min_gram and max_gram controls how we tokenize the documents. For
example, for min_gram=2 and max_gram=4, we will tokenize each document
with 2-gram, 3-gram and 4-gram.

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>
2025-07-01 10:08:44 +08:00

523 lines
20 KiB
C++

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fmt/core.h>
#include <optional>
#include <utility>
#include "common/EasyAssert.h"
#include "common/Types.h"
#include "common/Vector.h"
#include "exec/expression/Expr.h"
#include "exec/expression/Element.h"
#include "index/Meta.h"
#include "index/ScalarIndex.h"
#include "segcore/SegmentInterface.h"
#include "query/Utils.h"
#include "common/RegexQuery.h"
namespace milvus {
namespace exec {
template <typename T, FilterType filter_type>
struct UnaryElementFuncForMatch {
using IndexInnerType =
std::conditional_t<std::is_same_v<T, std::string_view>, std::string, T>;
void
operator()(const T* src,
size_t size,
IndexInnerType val,
TargetBitmapView res,
const TargetBitmap& bitmap_input,
int start_cursor,
const int32_t* offsets = nullptr) {
PatternMatchTranslator translator;
auto regex_pattern = translator(val);
RegexMatcher matcher(regex_pattern);
bool has_bitmap_input = !bitmap_input.empty();
for (int i = 0; i < size; ++i) {
if (has_bitmap_input && !bitmap_input[i + start_cursor]) {
continue;
}
if constexpr (filter_type == FilterType::random) {
res[i] = matcher(src[offsets ? offsets[i] : i]);
} else {
res[i] = matcher(src[i]);
}
}
}
};
template <typename T, proto::plan::OpType op, FilterType filter_type>
struct UnaryElementFunc {
using IndexInnerType =
std::conditional_t<std::is_same_v<T, std::string_view>, std::string, T>;
void
operator()(const T* src,
size_t size,
IndexInnerType val,
TargetBitmapView res,
const TargetBitmap& bitmap_input,
size_t start_cursor,
const int32_t* offsets = nullptr) {
bool has_bitmap_input = !bitmap_input.empty();
if constexpr (op == proto::plan::OpType::Match) {
UnaryElementFuncForMatch<T, filter_type> func;
func(src, size, val, res, bitmap_input, start_cursor, offsets);
return;
}
// This is the original code, which is kept for the documentation purposes
// also, for iterative filter
if constexpr (filter_type == FilterType::random) {
for (int i = 0; i < size; ++i) {
auto offset = (offsets != nullptr) ? offsets[i] : i;
if constexpr (op == proto::plan::OpType::Equal) {
res[i] = src[offset] == val;
} else if constexpr (op == proto::plan::OpType::NotEqual) {
res[i] = src[offset] != val;
} else if constexpr (op == proto::plan::OpType::GreaterThan) {
res[i] = src[offset] > val;
} else if constexpr (op == proto::plan::OpType::LessThan) {
res[i] = src[offset] < val;
} else if constexpr (op == proto::plan::OpType::GreaterEqual) {
res[i] = src[offset] >= val;
} else if constexpr (op == proto::plan::OpType::LessEqual) {
res[i] = src[offset] <= val;
} else if constexpr (op == proto::plan::OpType::PrefixMatch ||
op == proto::plan::OpType::PostfixMatch ||
op == proto::plan::OpType::InnerMatch) {
res[i] = milvus::query::Match(src[offset], val, op);
} else {
PanicInfo(OpTypeInvalid,
"unsupported op_type:{} for UnaryElementFunc",
op);
}
}
return;
}
if (has_bitmap_input) {
if constexpr (std::is_same_v<T, std::string_view> ||
std::is_same_v<T, std::string>) {
for (int i = 0; i < size; ++i) {
if (!bitmap_input[i + start_cursor]) {
continue;
}
if constexpr (op == proto::plan::OpType::Equal) {
res[i] = src[i] == val;
} else if constexpr (op == proto::plan::OpType::NotEqual) {
res[i] = src[i] != val;
} else if constexpr (op ==
proto::plan::OpType::GreaterThan) {
res[i] = src[i] > val;
} else if constexpr (op == proto::plan::OpType::LessThan) {
res[i] = src[i] < val;
} else if constexpr (op ==
proto::plan::OpType::GreaterEqual) {
res[i] = src[i] >= val;
} else if constexpr (op == proto::plan::OpType::LessEqual) {
res[i] = src[i] <= val;
} else if constexpr (op ==
proto::plan::OpType::PrefixMatch ||
op == proto::plan::OpType::
PostfixMatch ||
op ==
proto::plan::OpType::InnerMatch) {
res[i] = milvus::query::Match(src[i], val, op);
} else {
PanicInfo(OpTypeInvalid,
"unsupported op_type:{} for UnaryElementFunc",
op);
}
}
return;
}
}
if constexpr (op == proto::plan::OpType::PrefixMatch ||
op == proto::plan::OpType::PostfixMatch ||
op == proto::plan::OpType::InnerMatch) {
for (int i = 0; i < size; ++i) {
res[i] = milvus::query::Match(src[i], val, op);
}
return;
}
if constexpr (op == proto::plan::OpType::Equal) {
res.inplace_compare_val<T, milvus::bitset::CompareOpType::EQ>(
src, size, val);
} else if constexpr (op == proto::plan::OpType::NotEqual) {
res.inplace_compare_val<T, milvus::bitset::CompareOpType::NE>(
src, size, val);
} else if constexpr (op == proto::plan::OpType::GreaterThan) {
res.inplace_compare_val<T, milvus::bitset::CompareOpType::GT>(
src, size, val);
} else if constexpr (op == proto::plan::OpType::LessThan) {
res.inplace_compare_val<T, milvus::bitset::CompareOpType::LT>(
src, size, val);
} else if constexpr (op == proto::plan::OpType::GreaterEqual) {
res.inplace_compare_val<T, milvus::bitset::CompareOpType::GE>(
src, size, val);
} else if constexpr (op == proto::plan::OpType::LessEqual) {
res.inplace_compare_val<T, milvus::bitset::CompareOpType::LE>(
src, size, val);
} else {
PanicInfo(OpTypeInvalid,
"unsupported op_type:{} for UnaryElementFunc",
op);
}
}
};
#define UnaryArrayCompare(cmp) \
do { \
if constexpr (std::is_same_v<GetType, proto::plan::Array>) { \
res[i] = false; \
} else { \
if (index >= src[i].length()) { \
res[i] = false; \
continue; \
} \
auto array_data = src[i].template get_data<GetType>(index); \
res[i] = (cmp); \
} \
} while (false)
template <typename ValueType, proto::plan::OpType op, FilterType filter_type>
struct UnaryElementFuncForArray {
using GetType = std::conditional_t<std::is_same_v<ValueType, std::string>,
std::string_view,
ValueType>;
void
operator()(const ArrayView* src,
const bool* valid_data,
size_t size,
ValueType val,
int index,
TargetBitmapView res,
TargetBitmapView valid_res,
const TargetBitmap& bitmap_input,
size_t start_cursor,
const int32_t* offsets = nullptr) {
bool has_bitmap_input = !bitmap_input.empty();
for (int i = 0; i < size; ++i) {
auto offset = i;
if constexpr (filter_type == FilterType::random) {
offset = (offsets) ? offsets[i] : i;
}
if (valid_data != nullptr && !valid_data[offset]) {
res[i] = valid_res[i] = false;
continue;
}
if (has_bitmap_input && !bitmap_input[i + start_cursor]) {
continue;
}
if constexpr (op == proto::plan::OpType::Equal) {
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = src[offset].is_same_array(val);
} else {
if (index >= src[offset].length()) {
res[i] = false;
continue;
}
auto array_data =
src[offset].template get_data<GetType>(index);
res[i] = array_data == val;
}
} else if constexpr (op == proto::plan::OpType::NotEqual) {
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = !src[offset].is_same_array(val);
} else {
if (index >= src[offset].length()) {
res[i] = false;
continue;
}
auto array_data =
src[offset].template get_data<GetType>(index);
res[i] = array_data != val;
}
} else if constexpr (op == proto::plan::OpType::GreaterThan) {
UnaryArrayCompare(array_data > val);
} else if constexpr (op == proto::plan::OpType::LessThan) {
UnaryArrayCompare(array_data < val);
} else if constexpr (op == proto::plan::OpType::GreaterEqual) {
UnaryArrayCompare(array_data >= val);
} else if constexpr (op == proto::plan::OpType::LessEqual) {
UnaryArrayCompare(array_data <= val);
} else if constexpr (op == proto::plan::OpType::PrefixMatch ||
op == proto::plan::OpType::PostfixMatch ||
op == proto::plan::OpType::InnerMatch) {
UnaryArrayCompare(milvus::query::Match(array_data, val, op));
} else if constexpr (op == proto::plan::OpType::Match) {
if constexpr (std::is_same_v<GetType, proto::plan::Array>) {
res[i] = false;
} else {
if (index >= src[offset].length()) {
res[i] = false;
continue;
}
PatternMatchTranslator translator;
auto regex_pattern = translator(val);
RegexMatcher matcher(regex_pattern);
auto array_data =
src[offset].template get_data<GetType>(index);
res[i] = matcher(array_data);
}
} else {
PanicInfo(OpTypeInvalid,
"unsupported op_type:{} for "
"UnaryElementFuncForArray",
op);
}
}
}
};
template <typename T>
struct UnaryIndexFuncForMatch {
using IndexInnerType =
std::conditional_t<std::is_same_v<T, std::string_view>, std::string, T>;
using Index = index::ScalarIndex<IndexInnerType>;
TargetBitmap
operator()(Index* index, IndexInnerType val, proto::plan::OpType op) {
AssertInfo(op == proto::plan::OpType::Match ||
op == proto::plan::OpType::PostfixMatch ||
op == proto::plan::OpType::InnerMatch ||
op == proto::plan::OpType::PrefixMatch,
"op must be one of the following: Match, PrefixMatch, "
"PostfixMatch, InnerMatch");
if constexpr (std::is_same_v<T, std::string> ||
std::is_same_v<T, std::string_view>) {
if (index->SupportPatternMatch()) {
return index->PatternMatch(val, op);
}
if (!index->HasRawData()) {
PanicInfo(Unsupported,
"index don't support regex query and don't have "
"raw data");
}
// retrieve raw data to do brute force query, may be very slow.
auto cnt = index->Count();
TargetBitmap res(cnt);
if (op == proto::plan::OpType::InnerMatch ||
op == proto::plan::OpType::PostfixMatch ||
op == proto::plan::OpType::PrefixMatch) {
for (int64_t i = 0; i < cnt; i++) {
auto raw = index->Reverse_Lookup(i);
if (!raw.has_value()) {
res[i] = false;
continue;
}
res[i] = milvus::query::Match(raw.value(), val, op);
}
return res;
} else {
PatternMatchTranslator translator;
auto regex_pattern = translator(val);
RegexMatcher matcher(regex_pattern);
for (int64_t i = 0; i < cnt; i++) {
auto raw = index->Reverse_Lookup(i);
if (!raw.has_value()) {
res[i] = false;
continue;
}
res[i] = matcher(raw.value());
}
return res;
}
}
PanicInfo(ErrorCode::Unsupported,
"UnaryIndexFuncForMatch is only supported on string types");
}
};
template <typename T, proto::plan::OpType op>
struct UnaryIndexFunc {
using IndexInnerType =
std::conditional_t<std::is_same_v<T, std::string_view>, std::string, T>;
using Index = index::ScalarIndex<IndexInnerType>;
TargetBitmap
operator()(Index* index, IndexInnerType val) {
if constexpr (op == proto::plan::OpType::Equal) {
return index->In(1, &val);
} else if constexpr (op == proto::plan::OpType::NotEqual) {
return index->NotIn(1, &val);
} else if constexpr (op == proto::plan::OpType::GreaterThan) {
return index->Range(val, OpType::GreaterThan);
} else if constexpr (op == proto::plan::OpType::LessThan) {
return index->Range(val, OpType::LessThan);
} else if constexpr (op == proto::plan::OpType::GreaterEqual) {
return index->Range(val, OpType::GreaterEqual);
} else if constexpr (op == proto::plan::OpType::LessEqual) {
return index->Range(val, OpType::LessEqual);
} else if constexpr (op == proto::plan::OpType::PrefixMatch ||
op == proto::plan::OpType::Match ||
op == proto::plan::OpType::PostfixMatch ||
op == proto::plan::OpType::InnerMatch) {
UnaryIndexFuncForMatch<T> func;
return func(index, val, op);
} else {
PanicInfo(
OpTypeInvalid,
fmt::format("unsupported op_type:{} for UnaryIndexFunc", op));
}
}
};
class PhyUnaryRangeFilterExpr : public SegmentExpr {
public:
PhyUnaryRangeFilterExpr(
const std::vector<std::shared_ptr<Expr>>& input,
const std::shared_ptr<const milvus::expr::UnaryRangeFilterExpr>& expr,
const std::string& name,
const segcore::SegmentInternalInterface* segment,
int64_t active_count,
int64_t batch_size,
int32_t consistency_level)
: SegmentExpr(std::move(input),
name,
segment,
expr->column_.field_id_,
expr->column_.nested_path_,
FromValCase(expr->val_.val_case()),
active_count,
batch_size,
consistency_level),
expr_(expr) {
}
void
Eval(EvalCtx& context, VectorPtr& result) override;
bool
SupportOffsetInput() override {
if (expr_->op_type_ == proto::plan::OpType::TextMatch ||
expr_->op_type_ == proto::plan::OpType::PhraseMatch) {
return false;
}
return true;
}
std::string
ToString() const {
return fmt::format("{}", expr_->ToString());
}
std::optional<milvus::expr::ColumnInfo>
GetColumnInfo() const override {
return expr_->column_;
}
bool
IsSource() const override {
return true;
}
std::shared_ptr<const milvus::expr::UnaryRangeFilterExpr>
GetLogicalExpr() {
return expr_;
}
proto::plan::OpType
GetOpType() {
return expr_->op_type_;
}
FieldId
GetFieldId() {
return expr_->column_.field_id_;
}
DataType
GetFieldType() {
return expr_->column_.data_type_;
}
private:
template <typename T>
VectorPtr
ExecRangeVisitorImpl(EvalCtx& context);
template <typename T>
VectorPtr
ExecRangeVisitorImplForIndex();
template <typename T>
VectorPtr
ExecRangeVisitorImplForData(EvalCtx& context);
template <typename ExprValueType>
VectorPtr
ExecRangeVisitorImplJson(EvalCtx& context);
template <typename ExprValueType>
VectorPtr
ExecRangeVisitorImplJsonForIndex();
template <typename ExprValueType>
VectorPtr
ExecRangeVisitorImplArray(EvalCtx& context);
template <typename T>
VectorPtr
ExecRangeVisitorImplArrayForIndex(EvalCtx& context);
template <typename T>
VectorPtr
ExecArrayEqualForIndex(EvalCtx& context, bool reverse);
// Check overflow and cache result for performace
template <typename T>
ColumnVectorPtr
PreCheckOverflow(OffsetVector* input = nullptr);
template <typename T>
bool
CanUseIndex();
template <typename T>
bool
CanUseIndexForArray();
bool
CanUseIndexForJson(DataType val_type);
VectorPtr
ExecTextMatch();
std::optional<VectorPtr>
ExecNgramMatch();
std::pair<std::string, std::string>
SplitAtFirstSlashDigit(std::string input);
private:
std::shared_ptr<const milvus::expr::UnaryRangeFilterExpr> expr_;
int64_t overflow_check_pos_{0};
bool arg_inited_{false};
SingleElement value_arg_;
};
} // namespace exec
} // namespace milvus