milvus/pkg/proto/plan.proto
Spade A 0114bd1dc9
feat: support match operator family (#46518)
issue: https://github.com/milvus-io/milvus/issues/46517
ref: https://github.com/milvus-io/milvus/issues/42148

This PR adds support for the match operator family. For now it works only
with struct arrays and brute-force search.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
- Core invariant: match operators only target struct-array element-level
predicates and assume callers provide a correct row_start so element
indices form a contiguous range; IArrayOffsets implementations convert
row-level bitmaps/rows (starting at row_start) into element-level
bitmaps or a contiguous element-offset vector used by brute-force
evaluation.

- New capability added: end-to-end support for MATCH_* semantics
(match_any, match_all, match_least, match_most, match_exact) — parser
(grammar + proto), planner (ParseMatchExprs), expr model
(expr::MatchExpr), compilation (Expr→PhyMatchFilterExpr), execution
(PhyMatchFilterExpr::Eval uses element offsets/bitmaps), and unit tests
(MatchExprTest + parser tests). Implementation currently works for
struct-array inputs and uses brute-force element counting via
RowBitsetToElementOffsets/RowBitsetToElementBitset.

- Logic removed or simplified and why: removed the ad-hoc
DocBitsetToElementOffsets helper and consolidated offset/bitset
derivation into IArrayOffsets::RowBitsetToElementOffsets and a
row_start-aware RowBitsetToElementBitset, and removed EvalCtx overloads
that embedded ExprSet (now EvalCtx(exec_ctx, offset_input)). This
centralizes array-layout logic in ArrayOffsets and removes duplicated
offset conversion and EvalCtx variants that were redundant for
element-level evaluation.

- No data loss / no behavior regression: persistent formats are
unchanged (no proto storage or on-disk layout changed); callers were
updated to supply row_start and now route through the centralized
ArrayOffsets APIs which still use the authoritative
row_to_element_start_ mapping, preserving exact element index mappings.
Eval logic changes are limited to in-memory plumbing (how
offsets/bitmaps are produced and how EvalCtx is constructed); expression
evaluation still invokes exprs_->Eval where needed, so existing behavior
and stored data remain intact.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>
2025-12-29 11:03:26 +08:00

379 lines
8.4 KiB
Protocol Buffer

syntax = "proto3";
package milvus.proto.plan;
option go_package = "github.com/milvus-io/milvus/pkg/v2/proto/planpb";
import "schema.proto";
import "common.proto";
// OpType enumerates the comparison and matching operators that the predicate
// messages in this file carry in their `op` / `compare_op` fields.
enum OpType {
  Invalid = 0;
  GreaterThan = 1;
  GreaterEqual = 2;
  LessThan = 3;
  LessEqual = 4;
  Equal = 5;
  NotEqual = 6;
  // startsWith
  PrefixMatch = 7;
  // endsWith
  PostfixMatch = 8;
  // like
  Match = 9;
  // for case 1 < a < b
  Range = 10;
  // TODO:: used for term expr
  In = 11;
  NotIn = 12;
  // text match
  TextMatch = 13;
  // phrase match
  PhraseMatch = 14;
  // substring (e.g., "%value%")
  InnerMatch = 15;
}
// ArithOpType enumerates the arithmetic operators referenced by the
// arithmetic expression messages in this file (their `arith_op` / `op`
// fields).
enum ArithOpType {
  Unknown = 0;
  Add = 1;
  Sub = 2;
  Mul = 3;
  Div = 4;
  Mod = 5;
  ArrayLength = 6;
}
// VectorType is the enum carried in VectorANNS.vector_type.
// NOTE: value 0 is BinaryVector, so an unset vector_type reads back as
// BinaryVector (proto3 enum default).
enum VectorType {
  BinaryVector = 0;
  FloatVector = 1;
  Float16Vector = 2;
  BFloat16Vector = 3;
  SparseFloatVector = 4;
  Int8Vector = 5;

  // EmbList* variants.
  EmbListFloatVector = 6;
  EmbListFloat16Vector = 7;
  EmbListBFloat16Vector = 8;
  EmbListInt8Vector = 9;
  EmbListBinaryVector = 10;
}
// GenericValue holds one literal value. The variants live in a oneof, so at
// most one of them is set at a time.
message GenericValue {
  oneof val {
    bool bool_val = 1;
    int64 int64_val = 2;
    // Declared as double despite the field name.
    double float_val = 3;
    string string_val = 4;
    // Nested array of GenericValue (see message Array).
    Array array_val = 5;
  }
}
// Array is a list of GenericValue entries plus type information about them.
message Array {
  // The array entries, in order.
  repeated GenericValue array = 1;
  // Presumably true when every entry has the same type -- confirm with
  // the producer of this message.
  bool same_type = 2;
  // Element data type, expressed with the schema package's DataType enum.
  schema.DataType element_type = 3;
}
// SearchIteratorV2Info is the optional iterator payload embedded in
// QueryInfo.search_iterator_v2_info.
message SearchIteratorV2Info {
  string token = 1;
  uint32 batch_size = 2;
  // Declared `optional`, so "not set" is distinguishable from 0.
  optional float last_bound = 3;
}
// QueryInfo carries the search parameters attached to a VectorANNS node
// (see VectorANNS.query_info).
message QueryInfo {
  // Field number 2 is not used by any field in this message. It is reserved
  // so that a future field cannot pick it up by accident.
  // NOTE(review): presumably a removed field -- confirm against history.
  reserved 2;

  int64 topk = 1;
  string metric_type = 3;
  string search_params = 4;
  int64 round_decimal = 5;
  int64 group_by_field_id = 6;
  bool materialized_view_involved = 7;
  int64 group_size = 8;
  bool strict_group_size = 9;
  double bm25_avgdl = 10;
  int64 query_field_id = 11;
  string hints = 12;
  // Declared `optional`, so presence of the iterator info can be tested.
  optional SearchIteratorV2Info search_iterator_v2_info = 13;
  string json_path = 14;
  schema.DataType json_type = 15;
  bool strict_cast = 16;
}
// ColumnInfo identifies a column referenced by an expression and records
// schema facts about it.
message ColumnInfo {
  int64 field_id = 1;
  schema.DataType data_type = 2;
  bool is_primary_key = 3;
  bool is_autoID = 4;
  // Path segments below the column.
  // NOTE(review): presumably for JSON/nested access -- confirm.
  repeated string nested_path = 5;
  bool is_partition_key = 6;
  schema.DataType element_type = 7;
  bool is_clustering_key = 8;
  bool nullable = 9;
  bool is_element_level = 10;
}
// ColumnExpr is an expression node that consists solely of a column
// reference.
message ColumnExpr {
  ColumnInfo info = 1;
}
// ExistsExpr is an expression node whose only operand is a column
// reference.
message ExistsExpr {
  ColumnInfo info = 1;
}
// ValueExpr is an expression node wrapping a literal value, with an
// accompanying template variable name.
message ValueExpr {
  GenericValue value = 1;
  // NOTE(review): presumably names the template placeholder that supplies
  // `value` when the expression is a template -- confirm.
  string template_variable_name = 2;
}
// UnaryRangeExpr pairs one column with an operator and one value
// (column_info `op` value).
message UnaryRangeExpr {
  ColumnInfo column_info = 1;
  OpType op = 2;
  GenericValue value = 3;
  string template_variable_name = 4;
  // Additional values beyond `value`.
  repeated GenericValue extra_values = 5;
}
// BinaryRangeExpr bounds one column by a lower and an upper value; each
// bound has its own inclusive flag and template variable name.
message BinaryRangeExpr {
  ColumnInfo column_info = 1;

  // Inclusive flags for the two bounds.
  bool lower_inclusive = 2;
  bool upper_inclusive = 3;

  // The bound values.
  GenericValue lower_value = 4;
  GenericValue upper_value = 5;

  // Template variable names for the two bounds.
  string lower_template_variable_name = 6;
  string upper_template_variable_name = 7;
}
// CallExpr is a function call: a function name plus its argument
// expressions.
message CallExpr {
  string function_name = 1;
  // Arguments, each itself an Expr.
  repeated Expr function_parameters = 2;
}
// CompareExpr relates two columns through an OpType operator.
message CompareExpr {
  ColumnInfo left_column_info = 1;
  ColumnInfo right_column_info = 2;
  OpType op = 3;
}
// TermExpr pairs a column with a list of values.
message TermExpr {
  ColumnInfo column_info = 1;
  repeated GenericValue values = 2;
  bool is_in_field = 3;
  string template_variable_name = 4;
}
// JSONContainsExpr is a containment predicate over a column; `op` selects
// which contains-variant applies to `elements`.
message JSONContainsExpr {
  enum JSONOp {
    // invalid
    Invalid = 0;
    // json_contains | array_contains
    Contains = 1;
    // json_contains_all | array_contains_all
    ContainsAll = 2;
    // json_contains_any | array_contains_any
    ContainsAny = 3;
  }

  ColumnInfo column_info = 1;
  repeated GenericValue elements = 2;
  JSONOp op = 3;
  bool elements_same_type = 4;
  string template_variable_name = 5;
}
// NullExpr is a null-ness predicate on a column; `op` chooses between
// IsNull and IsNotNull.
message NullExpr {
  enum NullOp {
    Invalid = 0;
    IsNull = 1;
    IsNotNull = 2;
  }

  ColumnInfo column_info = 1;
  NullOp op = 2;
}
// GISFunctionFilterExpr applies a GIS operator to a column, using a WKT
// string as the other operand.
message GISFunctionFilterExpr {
  enum GISOp {
    Invalid = 0;
    Equals = 1;
    Touches = 2;
    Overlaps = 3;
    Crosses = 4;
    Contains = 5;
    Intersects = 6;
    Within = 7;
    DWithin = 8;
    // Named STIsValid rather than IsValid: a value called IsValid would
    // clash with the IsValid() method that proto code generation emits by
    // default.
    STIsValid = 9;
  }

  ColumnInfo column_info = 1;
  string wkt_string = 2;
  GISOp op = 3;
  // Distance parameter for DWithin
  double distance = 4;
}
// UnaryExpr applies a unary operator (currently only Not) to one child
// expression.
message UnaryExpr {
  enum UnaryOp {
    Invalid = 0;
    Not = 1;
  }

  UnaryOp op = 1;
  Expr child = 2;
}
// BinaryExpr joins two child expressions with a logical operator
// (LogicalAnd or LogicalOr).
message BinaryExpr {
  enum BinaryOp {
    Invalid = 0;
    LogicalAnd = 1;
    LogicalOr = 2;
  }

  BinaryOp op = 1;
  Expr left = 2;
  Expr right = 3;
}
// BinaryArithOp describes an arithmetic operation with a column on one
// side and a literal as the right operand.
message BinaryArithOp {
  ColumnInfo column_info = 1;
  ArithOpType arith_op = 2;
  GenericValue right_operand = 3;
}
// BinaryArithExpr is an arithmetic operation whose two operands are
// themselves expressions.
message BinaryArithExpr {
  Expr left = 1;
  Expr right = 2;
  ArithOpType op = 3;
}
// BinaryArithOpEvalRangeExpr combines an arithmetic step with a comparison:
// (column_info `arith_op` right_operand) `op` value.
message BinaryArithOpEvalRangeExpr {
  // Arithmetic part.
  ColumnInfo column_info = 1;
  ArithOpType arith_op = 2;
  GenericValue right_operand = 3;

  // Comparison part.
  OpType op = 4;
  GenericValue value = 5;

  // Template variable names for `right_operand` and `value` respectively.
  string operand_template_variable_name = 6;
  string value_template_variable_name = 7;
}
// RandomSampleExpr couples a sampling factor with a predicate expression.
message RandomSampleExpr {
  // NOTE(review): valid range is not visible here -- presumably a fraction
  // in (0, 1]; confirm.
  float sample_factor = 1;
  Expr predicate = 2;
}
// ElementFilterExpr bundles an element-level expression, the name of a
// struct, and a further predicate expression.
message ElementFilterExpr {
  Expr element_expr = 1;
  string struct_name = 2;
  Expr predicate = 3;
}
// MatchType selects which match operation a MatchExpr performs on a struct
// array. "N" below refers to MatchExpr.count.
// NOTE: value 0 is MatchAll, so an unset match_type reads back as MatchAll
// (proto3 enum default).
enum MatchType {
  // All elements must match the predicate
  MatchAll = 0;
  // At least one element matches the predicate
  MatchAny = 1;
  // At least N elements match the predicate
  MatchLeast = 2;
  // At most N elements match the predicate
  MatchMost = 3;
  // Exactly N elements match the predicate
  MatchExact = 4;
}
// MatchExpr is a match-family predicate over the elements of a struct
// array field.
message MatchExpr {
  // The struct array field name (e.g., struct_array)
  string struct_name = 1;
  // The condition expression using $[field] syntax
  // (e.g., $[intField] == 1 && $[strField] == "aaa")
  Expr predicate = 2;
  // Type of match operation
  MatchType match_type = 3;
  // The count parameter (N) for the N-based match types:
  // MatchLeast, MatchMost and MatchExact.
  int64 count = 4;
}
// AlwaysTrueExpr has no fields; its presence in Expr.expr is the entire
// payload.
message AlwaysTrueExpr {
}
// Interval is a calendar/time span broken into separate components, each
// stored as an int64.
message Interval {
  // Calendar components.
  int64 years = 1;
  int64 months = 2;
  int64 days = 3;

  // Clock components.
  int64 hours = 4;
  int64 minutes = 5;
  int64 seconds = 6;
}
// TimestamptzArithCompareExpr represents
//   (timestamptz_column arith_op interval) compare_op compare_value
// i.e. the operation (timestamp_col + interval) OP iso_string.
message TimestamptzArithCompareExpr {
  ColumnInfo timestamptz_column = 1;
  // ADD or SUB
  ArithOpType arith_op = 2;
  Interval interval = 3;
  OpType compare_op = 4;
  GenericValue compare_value = 5;
}
// Expr is the expression tree node. Exactly one concrete expression kind is
// stored in the `expr` oneof; `is_template` is a separate flag outside it.
message Expr {
  oneof expr {
    TermExpr term_expr = 1;
    UnaryExpr unary_expr = 2;
    BinaryExpr binary_expr = 3;
    CompareExpr compare_expr = 4;
    UnaryRangeExpr unary_range_expr = 5;
    BinaryRangeExpr binary_range_expr = 6;
    BinaryArithOpEvalRangeExpr binary_arith_op_eval_range_expr = 7;
    BinaryArithExpr binary_arith_expr = 8;
    ValueExpr value_expr = 9;
    ColumnExpr column_expr = 10;
    ExistsExpr exists_expr = 11;
    AlwaysTrueExpr always_true_expr = 12;
    JSONContainsExpr json_contains_expr = 13;
    CallExpr call_expr = 14;
    NullExpr null_expr = 15;
    RandomSampleExpr random_sample_expr = 16;
    GISFunctionFilterExpr gisfunction_filter_expr = 17;
    TimestamptzArithCompareExpr timestamptz_arith_compare_expr = 18;
    ElementFilterExpr element_filter_expr = 19;
    // Number 20 belongs to is_template below, hence the jump to 21.
    MatchExpr match_expr = 21;
  }

  bool is_template = 20;
}
// VectorANNS is the vector-search plan node (see PlanNode.vector_anns): the
// target vector field, a predicate expression and the search parameters.
message VectorANNS {
  VectorType vector_type = 1;
  int64 field_id = 2;
  Expr predicates = 3;
  QueryInfo query_info = 4;
  // always "$0"
  string placeholder_tag = 5;
}
// QueryPlanNode is the query plan node (see PlanNode.query): a predicate
// expression plus a count flag and a limit.
message QueryPlanNode {
  Expr predicates = 1;
  bool is_count = 2;
  int64 limit = 3;
}
// FunctionType is the kind of a ScoreFunction (see ScoreFunction.type).
enum FunctionType {
  FunctionTypeWeight = 0;
  FunctionTypeRandom = 1;
}
// FunctionMode decides how the boost score is calculated when there are
// scores from multiple boost functions.
enum FunctionMode {
  FunctionModeMultiply = 0;
  FunctionModeSum = 1;
}
// BoostMode decides how the final score is calculated from the original
// score and the boost score.
enum BoostMode {
  BoostModeMultiply = 0;
  BoostModeSum = 1;
}
// ScoreFunction describes one scorer attached to a plan (see
// PlanNode.scorers): a filter expression, a weight, a function type and
// free-form key/value parameters.
message ScoreFunction {
  Expr filter = 1;
  float weight = 2;
  FunctionType type = 3;
  repeated common.KeyValuePair params = 4;
}
// ScoreOption groups the two score-combination modes: BoostMode and
// FunctionMode.
message ScoreOption {
  BoostMode boost_mode = 1;
  FunctionMode function_mode = 2;
}
// PlanOption holds plan-wide switches; currently a single flag.
message PlanOption {
  bool expr_use_json_stats = 1;
}
// PlanNode is the top-level plan message. The `node` oneof holds the plan
// body; the remaining fields apply regardless of which variant is set.
message PlanNode {
  oneof node {
    VectorANNS vector_anns = 1;
    // deprecated, use query instead.
    Expr predicates = 2;
    QueryPlanNode query = 4;
  }

  // Number 3 sits outside the oneof, which is why `node` skips from 2 to 4.
  repeated int64 output_field_ids = 3;
  repeated string dynamic_fields = 5;
  repeated ScoreFunction scorers = 6;
  PlanOption plan_options = 7;
  ScoreOption score_option = 8;
  // Declared `optional`, so an unset namespace is distinguishable from "".
  optional string namespace = 9;
}