mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-28 22:45:26 +08:00
issue: https://github.com/milvus-io/milvus/issues/42148

For a vector field inside a STRUCT, since a STRUCT can only appear as the element type of an ARRAY field, the vector field in STRUCT is effectively an array of vectors, i.e. an embedding list. Milvus already supports searching embedding lists with metrics whose names start with the prefix MAX_SIM_.

This PR allows Milvus to search embeddings inside an embedding list using the same metrics as normal embedding fields. Each embedding in the list is treated as an independent vector and participates in ANN search.

Further, since STRUCT may contain scalar fields that are highly related to the embedding field, this PR introduces an element-level filter expression to refine search results. The grammar of the element-level filter is:

    element_filter(structFieldName, $[subFieldName] == 3)

where $[subFieldName] refers to the value of subFieldName in each element of the STRUCT array structFieldName.

It can be combined with existing filter expressions, for example:

    "varcharField == 'aaa' && element_filter(struct_field, $[struct_int] == 3)"

A full example:

```
struct_schema = milvus_client.create_struct_field_schema()
struct_schema.add_field("struct_str", DataType.VARCHAR, max_length=65535)
struct_schema.add_field("struct_int", DataType.INT32)
struct_schema.add_field("struct_float_vec", DataType.FLOAT_VECTOR, dim=EMBEDDING_DIM)
schema.add_field(
    "struct_field",
    datatype=DataType.ARRAY,
    element_type=DataType.STRUCT,
    struct_schema=struct_schema,
    max_capacity=1000,
)
...
filter = "varcharField == 'aaa' && element_filter(struct_field, $[struct_int] == 3 && $[struct_str] == 'abc')"
res = milvus_client.search(
    COLLECTION_NAME,
    data=query_embeddings,
    limit=10,
    anns_field="struct_field[struct_float_vec]",
    filter=filter,
    output_fields=["struct_field[struct_int]", "varcharField"],
)
```

TODO:
1. When an `element_filter` expression is used, a regular filter expression must also be present. Remove this restriction.
2. Implement `element_filter` expressions in the `query`.

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
362 lines
7.6 KiB
Protocol Buffer
362 lines
7.6 KiB
Protocol Buffer
// Plan / expression schema, proto3 syntax.
syntax = "proto3";

// Proto package; types declared below are addressed as milvus.proto.plan.<Name>.
package milvus.proto.plan;

// Go import path of the generated code.
option go_package = "github.com/milvus-io/milvus/pkg/v2/proto/planpb";

// Provides schema.DataType, used by several messages below.
import "schema.proto";
// Provides common.KeyValuePair, used by ScoreFunction.params.
import "common.proto";
|
|
|
|
// Comparison / matching operators. Referenced by the `op` fields of
// UnaryRangeExpr, CompareExpr and BinaryArithOpEvalRangeExpr, and by
// TimestamptzArithCompareExpr.compare_op.
enum OpType {
  // Zero value.
  Invalid = 0;
  GreaterThan = 1;
  GreaterEqual = 2;
  LessThan = 3;
  LessEqual = 4;
  Equal = 5;
  NotEqual = 6;
  // startsWith
  PrefixMatch = 7;
  // endsWith
  PostfixMatch = 8;
  // like
  Match = 9;
  // for case 1 < a < b
  Range = 10;
  // TODO:: used for term expr
  In = 11;
  NotIn = 12;
  // text match
  TextMatch = 13;
  // phrase match
  PhraseMatch = 14;
  // substring (e.g., "%value%")
  InnerMatch = 15;
}
|
|
|
|
// Arithmetic operators. Referenced by BinaryArithOp.arith_op,
// BinaryArithExpr.op, BinaryArithOpEvalRangeExpr.arith_op and
// TimestamptzArithCompareExpr.arith_op.
enum ArithOpType {
  // Zero value.
  Unknown = 0;
  Add = 1;
  Sub = 2;
  Mul = 3;
  Div = 4;
  Mod = 5;
  // NOTE(review): presumably the length of an array column — confirm.
  ArrayLength = 6;
}
|
|
|
|
// Kind of vector targeted by a search; carried in VectorANNS.vector_type.
// Note that the zero value here is BinaryVector, not an "unspecified" marker.
enum VectorType {
  BinaryVector = 0;
  FloatVector = 1;
  Float16Vector = 2;
  BFloat16Vector = 3;
  SparseFloatVector = 4;
  Int8Vector = 5;

  // NOTE(review): the EmbList* values presumably denote embedding-list
  // (array-of-vectors) variants of the corresponding types above — confirm.
  EmbListFloatVector = 6;
  EmbListFloat16Vector = 7;
  EmbListBFloat16Vector = 8;
  EmbListInt8Vector = 9;
  EmbListBinaryVector = 10;
}
|
|
|
|
// A single literal value used inside expressions. Exactly one member of
// `val` is set at a time (oneof semantics).
message GenericValue {
  oneof val {
    bool bool_val = 1;
    int64 int64_val = 2;
    // Declared as double despite the `float_val` name.
    double float_val = 3;
    string string_val = 4;
    // Nested list of values; see message Array.
    Array array_val = 5;
  }
}
|
|
|
|
// A list of GenericValue entries, usable as GenericValue.array_val.
message Array {
  // The entries of the list.
  repeated GenericValue array = 1;
  // NOTE(review): presumably true when every entry holds the same kind of
  // value — confirm against the producer.
  bool same_type = 2;
  // Element data type, expressed with schema.DataType.
  schema.DataType element_type = 3;
}
|
|
|
|
// Search-iterator (v2) state; attached to a search through
// QueryInfo.search_iterator_v2_info.
message SearchIteratorV2Info {
  string token = 1;
  uint32 batch_size = 2;
  // Declared `optional`, so an unset value is distinguishable from 0.
  optional float last_bound = 3;
}
|
|
|
|
// Search parameters attached to a vector search via VectorANNS.query_info.
message QueryInfo {
  // Field number 2 is not assigned to any field of this message. It is
  // reserved so that a future field cannot pick it up by accident.
  // NOTE(review): presumably 2 belonged to a field that was removed — confirm.
  reserved 2;

  int64 topk = 1;
  string metric_type = 3;
  string search_params = 4;
  int64 round_decimal = 5;
  int64 group_by_field_id = 6;
  bool materialized_view_involved = 7;
  int64 group_size = 8;
  bool strict_group_size = 9;
  double bm25_avgdl = 10;
  int64 query_field_id = 11;
  string hints = 12;
  // Declared `optional`, so presence can be checked explicitly.
  optional SearchIteratorV2Info search_iterator_v2_info = 13;
  string json_path = 14;
  schema.DataType json_type = 15;
  bool strict_cast = 16;
}
|
|
|
|
// Describes the column (field) an expression operates on. Embedded by most
// predicate messages in this file.
message ColumnInfo {
  int64 field_id = 1;
  // Column data type, expressed with schema.DataType.
  schema.DataType data_type = 2;
  bool is_primary_key = 3;
  bool is_autoID = 4;
  // NOTE(review): presumably the path segments into a nested (e.g. JSON)
  // value — confirm.
  repeated string nested_path = 5;
  bool is_partition_key = 6;
  // Element data type, expressed with schema.DataType.
  schema.DataType element_type = 7;
  bool is_clustering_key = 8;
  bool nullable = 9;
  // NOTE(review): presumably set when the column is addressed per element of
  // a STRUCT array (see ElementFilterExpr) — confirm.
  bool is_element_level = 10;
}
|
|
|
|
// Expression node that consists of a column reference only.
message ColumnExpr {
  // The referenced column.
  ColumnInfo info = 1;
}
|
|
|
|
// "Exists" expression node; carries only the column it applies to.
message ExistsExpr {
  // The column being tested.
  ColumnInfo info = 1;
}
|
|
|
|
// Expression node holding a literal value.
message ValueExpr {
  // The literal.
  GenericValue value = 1;
  // NOTE(review): presumably the name of the template variable that supplies
  // `value` when the expression is a template (see Expr.is_template) — confirm.
  string template_variable_name = 2;
}
|
|
|
|
// Predicate that applies one OpType operator to a column and a value.
message UnaryRangeExpr {
  // Column operand.
  ColumnInfo column_info = 1;
  // Operator to apply.
  OpType op = 2;
  // Value operand.
  GenericValue value = 3;
  // NOTE(review): presumably names the template variable standing in for
  // `value` — confirm.
  string template_variable_name = 4;
  // Additional values; how they are used is not visible in this file.
  repeated GenericValue extra_values = 5;
}
|
|
|
|
// Predicate bounding a column between a lower and an upper value.
message BinaryRangeExpr {
  // Column operand.
  ColumnInfo column_info = 1;

  // Inclusiveness flags for the lower and upper bound respectively.
  bool lower_inclusive = 2;
  bool upper_inclusive = 3;

  // The two bound values.
  GenericValue lower_value = 4;
  GenericValue upper_value = 5;

  // NOTE(review): presumably the template variable names standing in for
  // lower_value / upper_value — confirm.
  string lower_template_variable_name = 6;
  string upper_template_variable_name = 7;
}
|
|
|
|
// Function-call expression node: a function name plus its arguments.
message CallExpr {
  // Name of the function being called.
  string function_name = 1;
  // Arguments, each an arbitrary Expr.
  repeated Expr function_parameters = 2;
}
|
|
|
|
// Predicate comparing two columns with an OpType operator.
message CompareExpr {
  // Left-hand column.
  ColumnInfo left_column_info = 1;
  // Right-hand column.
  ColumnInfo right_column_info = 2;
  // Operator applied between the two columns.
  OpType op = 3;
}
|
|
|
|
// Term predicate: a column together with a list of values.
message TermExpr {
  // Column operand.
  ColumnInfo column_info = 1;
  // The value list.
  repeated GenericValue values = 2;
  // Meaning not visible in this file.
  bool is_in_field = 3;
  // NOTE(review): presumably names the template variable standing in for
  // `values` — confirm.
  string template_variable_name = 4;
}
|
|
|
|
// Containment predicate for JSON / array columns.
message JSONContainsExpr {
  // Which containment check to perform.
  enum JSONOp {
    // 0: invalid
    Invalid = 0;
    // 1: json_contains | array_contains
    Contains = 1;
    // 2: json_contains_all | array_contains_all
    ContainsAll = 2;
    // 3: json_contains_any | array_contains_any
    ContainsAny = 3;
  }

  // Column operand.
  ColumnInfo column_info = 1;
  // Elements to look for.
  repeated GenericValue elements = 2;
  // Selected containment operator.
  JSONOp op = 3;
  // NOTE(review): presumably true when all `elements` share one type — confirm.
  bool elements_same_type = 4;
  // NOTE(review): presumably names the template variable standing in for
  // `elements` — confirm.
  string template_variable_name = 5;
}
|
|
|
|
// Null-test predicate on a column.
message NullExpr {
  // Which null test to apply.
  enum NullOp {
    // Zero value.
    Invalid = 0;
    IsNull = 1;
    IsNotNull = 2;
  }

  // Column being tested.
  ColumnInfo column_info = 1;
  // Selected null test.
  NullOp op = 2;
}
|
|
|
|
// GIS predicate: applies a GISOp to a column and a WKT string.
message GISFunctionFilterExpr {
  // Available GIS operators.
  enum GISOp {
    // Zero value.
    Invalid = 0;
    Equals = 1;
    Touches = 2;
    Overlaps = 3;
    Crosses = 4;
    Contains = 5;
    Intersects = 6;
    Within = 7;
    DWithin = 8;
    // Named STIsValid rather than IsValid to avoid a naming conflict with the
    // proto-generated IsValid() method.
    STIsValid = 9;
  }

  // Column operand.
  ColumnInfo column_info = 1;
  // WKT string operand.
  string wkt_string = 2;
  // Selected GIS operator.
  GISOp op = 3;
  // Distance parameter for DWithin
  double distance = 4;
}
|
|
|
|
// Expression node applying a unary operator to one child expression.
message UnaryExpr {
  // Available unary operators.
  enum UnaryOp {
    // Zero value.
    Invalid = 0;
    Not = 1;
  }

  // Selected operator.
  UnaryOp op = 1;
  // Operand expression.
  Expr child = 2;
}
|
|
|
|
// Expression node joining two child expressions with a logical operator.
message BinaryExpr {
  // Available logical operators.
  enum BinaryOp {
    // Zero value.
    Invalid = 0;
    LogicalAnd = 1;
    LogicalOr = 2;
  }

  // Selected operator.
  BinaryOp op = 1;
  // Left operand.
  Expr left = 2;
  // Right operand.
  Expr right = 3;
}
|
|
|
|
// Arithmetic operation between a column and a literal right operand.
message BinaryArithOp {
  // Column operand.
  ColumnInfo column_info = 1;
  // Arithmetic operator.
  ArithOpType arith_op = 2;
  // Literal right-hand operand.
  GenericValue right_operand = 3;
}
|
|
|
|
// Arithmetic expression node over two arbitrary child expressions.
message BinaryArithExpr {
  // Left operand.
  Expr left = 1;
  // Right operand.
  Expr right = 2;
  // Arithmetic operator.
  ArithOpType op = 3;
}
|
|
|
|
// Predicate combining an arithmetic step (column, arith_op, right_operand)
// with a comparison (op, value).
message BinaryArithOpEvalRangeExpr {
  // Column operand of the arithmetic step.
  ColumnInfo column_info = 1;
  // Arithmetic operator.
  ArithOpType arith_op = 2;
  // Literal right-hand operand of the arithmetic step.
  GenericValue right_operand = 3;
  // Comparison operator.
  OpType op = 4;
  // Value the comparison is made against.
  GenericValue value = 5;

  // NOTE(review): presumably the template variable names standing in for
  // right_operand / value — confirm.
  string operand_template_variable_name = 6;
  string value_template_variable_name = 7;
}
|
|
|
|
// Random-sampling expression node.
message RandomSampleExpr {
  // Sampling factor; valid range is not stated in this file.
  float sample_factor = 1;
  // Nested predicate expression.
  Expr predicate = 2;
}
|
|
|
|
// Element-level filter expression node (Expr.element_filter_expr).
// NOTE(review): presumably mirrors the filter syntax
// `element_filter(structFieldName, <condition on $[subFieldName]>)`, with
// element_expr the per-element condition, struct_name the STRUCT array field
// and predicate the accompanying regular filter — confirm against the planner.
message ElementFilterExpr {
  Expr element_expr = 1;
  string struct_name = 2;
  Expr predicate = 3;
}
|
|
|
|
// Field-less expression node; selected through Expr.always_true_expr.
message AlwaysTrueExpr {
}
|
|
|
|
// A time interval split into calendar / clock components.
// Used by TimestamptzArithCompareExpr.interval.
message Interval {
  int64 years = 1;
  int64 months = 2;
  int64 days = 3;
  int64 hours = 4;
  int64 minutes = 5;
  int64 seconds = 6;
}
|
|
|
|
// Expression type for the operation:
//   (timestamp_col + interval) OP iso_string
message TimestamptzArithCompareExpr {
  // The timestamp column on the left of the arithmetic step.
  ColumnInfo timestamptz_column = 1;
  // ADD or SUB
  ArithOpType arith_op = 2;
  // Interval applied to the column.
  Interval interval = 3;
  // Comparison operator (the "OP" in the form above).
  OpType compare_op = 4;
  // Value compared against (the "iso_string" in the form above).
  GenericValue compare_value = 5;
}
|
|
|
|
// Root expression node. Exactly one concrete expression kind is set in the
// `expr` oneof; the messages are recursive through fields of type Expr.
message Expr {
  oneof expr {
    TermExpr term_expr = 1;
    UnaryExpr unary_expr = 2;
    BinaryExpr binary_expr = 3;
    CompareExpr compare_expr = 4;
    UnaryRangeExpr unary_range_expr = 5;
    BinaryRangeExpr binary_range_expr = 6;
    BinaryArithOpEvalRangeExpr binary_arith_op_eval_range_expr = 7;
    BinaryArithExpr binary_arith_expr = 8;
    ValueExpr value_expr = 9;
    ColumnExpr column_expr = 10;
    ExistsExpr exists_expr = 11;
    AlwaysTrueExpr always_true_expr = 12;
    JSONContainsExpr json_contains_expr = 13;
    CallExpr call_expr = 14;
    NullExpr null_expr = 15;
    RandomSampleExpr random_sample_expr = 16;
    GISFunctionFilterExpr gisfunction_filter_expr = 17;
    TimestamptzArithCompareExpr timestamptz_arith_compare_expr = 18;
    ElementFilterExpr element_filter_expr = 19;
  }

  // Lives outside the oneof.
  // NOTE(review): presumably marks an expression whose values are supplied
  // through template variables — confirm.
  bool is_template = 20;
}
|
|
|
|
// Vector search plan node; selected through PlanNode.vector_anns.
message VectorANNS {
  // Kind of vector being searched.
  VectorType vector_type = 1;
  // ID of the field being searched.
  int64 field_id = 2;
  // Filter expression attached to the search.
  Expr predicates = 3;
  // Search parameters.
  QueryInfo query_info = 4;
  // always be "$0"
  string placeholder_tag = 5;
}
|
|
|
|
// Query plan node; selected through PlanNode.query.
message QueryPlanNode {
  // Filter expression of the query.
  Expr predicates = 1;
  // NOTE(review): presumably marks a count-style query — confirm.
  bool is_count = 2;
  int64 limit = 3;
}
|
|
|
|
// Kind of score function; carried in ScoreFunction.type.
// FunctionTypeWeight is the zero value.
enum FunctionType {
  FunctionTypeWeight = 0;
  FunctionTypeRandom = 1;
}
|
|
|
|
// FunctionMode decides how the boost score is calculated
// from multiple boost function scores.
// FunctionModeMultiply is the zero value.
enum FunctionMode {
  FunctionModeMultiply = 0;
  FunctionModeSum = 1;
}
|
|
|
|
// BoostMode decides how the final score is calculated
// from the origin score and the boost score.
// BoostModeMultiply is the zero value.
enum BoostMode {
  BoostModeMultiply = 0;
  BoostModeSum = 1;
}
|
|
|
|
// One scorer entry; PlanNode.scorers holds a list of these.
message ScoreFunction {
  // Filter expression associated with this scorer.
  Expr filter = 1;
  float weight = 2;
  // Kind of score function.
  FunctionType type = 3;
  // Free-form key/value parameters.
  repeated common.KeyValuePair params = 4;
}
|
|
|
|
// Score combination settings; carried in PlanNode.score_option.
message ScoreOption {
  // How the origin score and the boost score are combined (see BoostMode).
  BoostMode boost_mode = 1;
  // How multiple boost function scores are combined (see FunctionMode).
  FunctionMode function_mode = 2;
}
|
|
|
|
// Plan-level options; carried in PlanNode.plan_options.
message PlanOption {
  // NOTE(review): presumably lets expression evaluation use JSON stats —
  // confirm.
  bool expr_use_json_stats = 1;
}
|
|
|
|
// Top-level plan message. The `node` oneof selects the kind of plan; the
// remaining fields sit alongside it.
message PlanNode {
  oneof node {
    // Vector search plan.
    VectorANNS vector_anns = 1;
    // deprecated, use query instead.
    Expr predicates = 2;
    // Query plan.
    QueryPlanNode query = 4;
  }

  // IDs of the fields to output.
  repeated int64 output_field_ids = 3;
  repeated string dynamic_fields = 5;
  // Score functions attached to the plan.
  repeated ScoreFunction scorers = 6;
  PlanOption plan_options = 7;
  ScoreOption score_option = 8;
  // Declared `optional`, so an unset namespace is distinguishable from "".
  optional string namespace = 9;
}
|