mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
related: #36380 <!-- This is an auto-generated comment: release notes by coderabbit.ai --> - Core invariant: aggregation is centralized and schema-aware — all aggregate functions are created via the exec Aggregate registry (milvus::exec::Aggregate) and validated by ValidateAggFieldType, use a single in-memory accumulator layout (Accumulator/RowContainer) and grouping primitives (GroupingSet, HashTable, VectorHasher), ensuring consistent typing, null semantics and offsets across planner → exec → reducer conversion paths (toAggregateInfo, Aggregate::create, GroupingSet, AggResult converters). - Removed / simplified logic: removed ad‑hoc count/group-by and reducer code (CountNode/PhyCountNode, GroupByNode/PhyGroupByNode, cntReducer and its tests) and consolidated into a unified AggregationNode → PhyAggregationNode + GroupingSet + HashTable execution path and centralized reducers (MilvusAggReducer, InternalAggReducer, SegcoreAggReducer). AVG now implemented compositionally (SUM + COUNT) rather than a bespoke operator, eliminating duplicate implementations. - Why this does NOT cause data loss or regressions: existing data-access and serialization paths are preserved and explicitly validated — bulk_subscript / bulk_script_field_data and FieldData creation are used for output materialization; converters (InternalResult2AggResult ↔ AggResult2internalResult, SegcoreResults2AggResult ↔ AggResult2segcoreResult) enforce shape/type/row-count validation; proxy and plan-level checks (MatchAggregationExpression, translateOutputFields, ValidateAggFieldType, translateGroupByFieldIds) reject unsupported inputs (ARRAY/JSON, unsupported datatypes) early. Empty-result generation and explicit error returns guard against silent corruption. 
- New capability and scope: end-to-end GROUP BY and aggregation support added across the stack — proto (plan.proto, RetrieveRequest fields group_by_field_ids/aggregates), planner nodes (AggregationNode, ProjectNode, SearchGroupByNode), exec operators (PhyAggregationNode, PhyProjectNode) and aggregation core (Aggregate implementations: Sum/Count/Min/Max, SimpleNumericAggregate, RowContainer, GroupingSet, HashTable) plus proxy/querynode reducers and tests — enabling grouped and global aggregation (sum, count, min, max, avg via sum+count) with schema-aware validation and reduction. <!-- end of auto-generated comment: release notes by coderabbit.ai --> Signed-off-by: MrPresent-Han <chun.han@gmail.com> Co-authored-by: MrPresent-Han <chun.han@gmail.com>
394 lines
8.6 KiB
Protocol Buffer
394 lines
8.6 KiB
Protocol Buffer
syntax = "proto3";
|
|
package milvus.proto.plan;
|
|
|
|
option go_package = "github.com/milvus-io/milvus/pkg/v2/proto/planpb";
|
|
import "schema.proto";
|
|
import "common.proto";
|
|
|
|
// Operators referenced by the comparison, range and match expressions in
// this file. Invalid is the zero value, so it is what an unset field reads as.
enum OpType {
  Invalid = 0;
  GreaterThan = 1;
  GreaterEqual = 2;
  LessThan = 3;
  LessEqual = 4;
  Equal = 5;
  NotEqual = 6;
  // startsWith
  PrefixMatch = 7;
  // endsWith
  PostfixMatch = 8;
  // like
  Match = 9;
  // for case 1 < a < b
  Range = 10;
  // TODO:: used for term expr
  In = 11;
  NotIn = 12;
  // text match
  TextMatch = 13;
  // phrase match
  PhraseMatch = 14;
  // substring (e.g., "%value%")
  InnerMatch = 15;
}
|
|
|
|
// Arithmetic operators used by the arithmetic expression messages.
// Unknown is the zero value, so it is what an unset field reads as.
enum ArithOpType {
  Unknown = 0;
  Add = 1;
  Sub = 2;
  Mul = 3;
  Div = 4;
  Mod = 5;
  ArrayLength = 6;
}
|
|
|
|
// Vector kinds selectable in VectorANNS.vector_type.
// NOTE: BinaryVector is the zero value, so an unset vector_type reads back as
// BinaryVector; there is no dedicated "unspecified" entry.
enum VectorType {
  BinaryVector = 0;
  FloatVector = 1;
  Float16Vector = 2;
  BFloat16Vector = 3;
  SparseFloatVector = 4;
  Int8Vector = 5;
  EmbListFloatVector = 6;
  EmbListFloat16Vector = 7;
  EmbListBFloat16Vector = 8;
  EmbListInt8Vector = 9;
  EmbListBinaryVector = 10;
}
|
|
|
|
// A single literal value; exactly one member of `val` is set at a time.
message GenericValue {
  oneof val {
    bool bool_val = 1;
    int64 int64_val = 2;
    // Despite the field name, this is carried as a 64-bit double.
    double float_val = 3;
    string string_val = 4;
    // Nested list of values (Array itself holds GenericValue elements).
    Array array_val = 5;
  }
}
|
|
|
|
// A list of GenericValue elements plus type metadata about them.
message Array {
  // The elements of the array.
  repeated GenericValue array = 1;
  // Presumably set when every element has the same type -- confirm with the
  // producer of this message.
  bool same_type = 2;
  // Element data type as declared in schema.proto.
  schema.DataType element_type = 3;
}
|
|
|
|
// Parameters attached to QueryInfo.search_iterator_v2_info.
message SearchIteratorV2Info {
  string token = 1;
  uint32 batch_size = 2;
  // Declared `optional`, so presence is tracked explicitly: an unset bound is
  // distinguishable from a bound of 0.
  optional float last_bound = 3;
}
|
|
|
|
// Search settings carried by VectorANNS.query_info.
message QueryInfo {
  // Field number 2 is not used by any field of this message. Reserve it so a
  // future field cannot pick it up and be misread by peers that may still
  // associate an older meaning with that number.
  reserved 2;

  int64 topk = 1;
  string metric_type = 3;
  string search_params = 4;
  int64 round_decimal = 5;
  int64 group_by_field_id = 6;
  bool materialized_view_involved = 7;
  int64 group_size = 8;
  bool strict_group_size = 9;
  double bm25_avgdl = 10;
  int64 query_field_id = 11;
  string hints = 12;
  // Declared `optional`, so presence of the iterator info is explicit.
  optional SearchIteratorV2Info search_iterator_v2_info = 13;
  string json_path = 14;
  schema.DataType json_type = 15;
  bool strict_cast = 16;
}
|
|
|
|
// Describes the column (field) an expression refers to.
message ColumnInfo {
  int64 field_id = 1;
  // Data type as declared in schema.proto.
  schema.DataType data_type = 2;
  bool is_primary_key = 3;
  // NOTE: the mixed-case name is part of the generated API and JSON mapping;
  // do not rename.
  bool is_autoID = 4;
  repeated string nested_path = 5;
  bool is_partition_key = 6;
  schema.DataType element_type = 7;
  bool is_clustering_key = 8;
  bool nullable = 9;
  bool is_element_level = 10;
}
|
|
|
|
// Expression node that wraps a single column reference.
message ColumnExpr {
  ColumnInfo info = 1;
}
|
|
|
|
// Exists-expression node; carries only the column it applies to.
message ExistsExpr {
  ColumnInfo info = 1;
}
|
|
|
|
// Expression node holding a literal value and/or a template variable name.
message ValueExpr {
  GenericValue value = 1;
  // Presumably names the template variable that supplies `value` when the
  // expression is templated (see Expr.is_template) -- confirm with the planner.
  string template_variable_name = 2;
}
|
|
|
|
// Applies one OpType operator between a column and a value.
message UnaryRangeExpr {
  ColumnInfo column_info = 1;
  OpType op = 2;
  GenericValue value = 3;
  string template_variable_name = 4;
  // Additional values beyond `value`; which operators consume them is not
  // visible in this file.
  repeated GenericValue extra_values = 5;
}
|
|
|
|
// Range condition on a column with a lower and an upper value, each with its
// own inclusiveness flag and template variable name.
message BinaryRangeExpr {
  ColumnInfo column_info = 1;
  bool lower_inclusive = 2;
  bool upper_inclusive = 3;
  GenericValue lower_value = 4;
  GenericValue upper_value = 5;
  string lower_template_variable_name = 6;
  string upper_template_variable_name = 7;
}
|
|
|
|
// Function call: a function name plus its argument expressions.
message CallExpr {
  string function_name = 1;
  // Arguments are full Expr trees, so calls can nest.
  repeated Expr function_parameters = 2;
}
|
|
|
|
// Applies an OpType operator between two columns.
message CompareExpr {
  ColumnInfo left_column_info = 1;
  ColumnInfo right_column_info = 2;
  OpType op = 3;
}
|
|
|
|
// Term condition: a column together with a list of values.
message TermExpr {
  ColumnInfo column_info = 1;
  repeated GenericValue values = 2;
  bool is_in_field = 3;
  string template_variable_name = 4;
}
|
|
|
|
// Containment condition on a JSON or array column.
message JSONContainsExpr {
  ColumnInfo column_info = 1;
  repeated GenericValue elements = 2;

  // Which containment check to perform.
  enum JSONOp {
    // invalid
    Invalid = 0;
    // json_contains | array_contains
    Contains = 1;
    // json_contains_all | array_contains_all
    ContainsAll = 2;
    // json_contains_any | array_contains_any
    ContainsAny = 3;
  }

  JSONOp op = 3;
  bool elements_same_type = 4;
  string template_variable_name = 5;
}
|
|
|
|
// Null check on a column.
message NullExpr {
  ColumnInfo column_info = 1;

  // Direction of the check; Invalid is the zero value.
  enum NullOp {
    Invalid = 0;
    IsNull = 1;
    IsNotNull = 2;
  }

  NullOp op = 2;
}
|
|
|
|
// Spatial filter: applies a GIS operator between a column and a WKT string.
message GISFunctionFilterExpr {
  ColumnInfo column_info = 1;
  string wkt_string = 2;

  // Supported spatial operators; Invalid is the zero value.
  enum GISOp {
    Invalid = 0;
    Equals = 1;
    Touches = 2;
    Overlaps = 3;
    Crosses = 4;
    Contains = 5;
    Intersects = 6;
    Within = 7;
    DWithin = 8;
    // Named STIsValid rather than IsValid to avoid a naming conflict with the
    // proto-generated IsValid() method.
    STIsValid = 9;
  }

  GISOp op = 3;
  // Distance parameter for DWithin
  double distance = 4;
}
|
|
|
|
// Unary operator applied to one child expression.
message UnaryExpr {
  // Invalid is the zero value; Not is the only real operator defined.
  enum UnaryOp {
    Invalid = 0;
    Not = 1;
  }

  UnaryOp op = 1;
  Expr child = 2;
}
|
|
|
|
// Logical combination of two child expressions.
message BinaryExpr {
  // Invalid is the zero value.
  enum BinaryOp {
    Invalid = 0;
    LogicalAnd = 1;
    LogicalOr = 2;
  }

  BinaryOp op = 1;
  Expr left = 2;
  Expr right = 3;
}
|
|
|
|
// Arithmetic operation between a column and a literal right operand.
message BinaryArithOp {
  ColumnInfo column_info = 1;
  ArithOpType arith_op = 2;
  GenericValue right_operand = 3;
}
|
|
|
|
// Arithmetic operation between two sub-expressions.
message BinaryArithExpr {
  Expr left = 1;
  Expr right = 2;
  ArithOpType op = 3;
}
|
|
|
|
// Combines an arithmetic step (column `arith_op` right_operand) with a
// comparison step (`op` against `value`). Each literal has its own template
// variable name.
message BinaryArithOpEvalRangeExpr {
  ColumnInfo column_info = 1;
  ArithOpType arith_op = 2;
  GenericValue right_operand = 3;
  OpType op = 4;
  GenericValue value = 5;
  string operand_template_variable_name = 6;
  string value_template_variable_name = 7;
}
|
|
|
|
// Random-sample node: a sampling factor plus a predicate expression.
message RandomSampleExpr {
  // Valid range is not stated in this file -- confirm with the consumer.
  float sample_factor = 1;
  Expr predicate = 2;
}
|
|
|
|
// Element-level filter: an element expression, the struct name it targets,
// and a further predicate expression.
message ElementFilterExpr {
  Expr element_expr = 1;
  string struct_name = 2;
  Expr predicate = 3;
}
|
|
|
|
// MatchType selects the kind of match operation for struct array queries.
// NOTE: MatchAll is the zero value, so an unset match_type reads as MatchAll.
enum MatchType {
  // All elements must match the predicate
  MatchAll = 0;
  // At least one element matches the predicate
  MatchAny = 1;
  // At least N elements match the predicate
  MatchLeast = 2;
  // At most N elements match the predicate
  MatchMost = 3;
  // Exactly N elements match the predicate
  MatchExact = 4;
}
|
|
|
|
// Match condition over the elements of a struct array field.
message MatchExpr {
  // The struct array field name (e.g., struct_array)
  string struct_name = 1;
  // The condition expression using $[field] syntax
  // (e.g., $[intField] == 1 && $[strField] == "aaa")
  Expr predicate = 2;
  // Type of match operation
  MatchType match_type = 3;
  // For MatchLeast/MatchMost: the count parameter (N). MatchExact is also
  // described in terms of N in MatchType, so it presumably reads this field
  // as well -- confirm with the evaluator.
  int64 count = 4;
}
|
|
|
|
// Marker expression with no fields; selected through Expr.always_true_expr.
message AlwaysTrueExpr {
}
|
|
|
|
// Calendar/time interval split into components; used by
// TimestamptzArithCompareExpr.interval.
message Interval {
  int64 years = 1;
  int64 months = 2;
  int64 days = 3;
  int64 hours = 4;
  int64 minutes = 5;
  int64 seconds = 6;
}
|
|
|
|
// Expression type for the operation: (timestamp_col + interval) OP iso_string
message TimestamptzArithCompareExpr {
  ColumnInfo timestamptz_column = 1;
  // ADD or SUB
  ArithOpType arith_op = 2;
  Interval interval = 3;
  OpType compare_op = 4;
  // Right-hand side of the comparison (the iso_string in the formula above).
  GenericValue compare_value = 5;
}
|
|
|
|
// Root expression node: exactly one concrete expression kind is set in `expr`.
message Expr {
  oneof expr {
    TermExpr term_expr = 1;
    UnaryExpr unary_expr = 2;
    BinaryExpr binary_expr = 3;
    CompareExpr compare_expr = 4;
    UnaryRangeExpr unary_range_expr = 5;
    BinaryRangeExpr binary_range_expr = 6;
    BinaryArithOpEvalRangeExpr binary_arith_op_eval_range_expr = 7;
    BinaryArithExpr binary_arith_expr = 8;
    ValueExpr value_expr = 9;
    ColumnExpr column_expr = 10;
    ExistsExpr exists_expr = 11;
    AlwaysTrueExpr always_true_expr = 12;
    JSONContainsExpr json_contains_expr = 13;
    CallExpr call_expr = 14;
    NullExpr null_expr = 15;
    RandomSampleExpr random_sample_expr = 16;
    GISFunctionFilterExpr gisfunction_filter_expr = 17;
    TimestamptzArithCompareExpr timestamptz_arith_compare_expr = 18;
    ElementFilterExpr element_filter_expr = 19;
    // Number 20 belongs to is_template below, so this member uses 21.
    MatchExpr match_expr = 21;
  }

  // Lives outside the oneof, so it can be set alongside any expression kind.
  bool is_template = 20;
}
|
|
|
|
// Vector search plan node: target vector field, filter predicates and the
// search settings.
message VectorANNS {
  VectorType vector_type = 1;
  int64 field_id = 2;
  Expr predicates = 3;
  QueryInfo query_info = 4;
  // always "$0"
  string placeholder_tag = 5;
}
|
|
|
|
// Aggregate functions selectable in Aggregate.op.
// NOTE: `sum` is the zero value, so an Aggregate whose op is left unset reads
// back as sum; there is no dedicated "unspecified" entry.
enum AggregateOp {
  sum = 0;
  count = 1;
  avg = 2;
  min = 3;
  max = 4;
}
|
|
|
|
// One aggregate to compute: the function and the field it is applied to.
message Aggregate {
  AggregateOp op = 1;
  int64 field_id = 2;
}
|
|
|
|
// Query (non-vector-search) plan node: filter, limit, and optional grouping
// and aggregation.
message QueryPlanNode {
  Expr predicates = 1;
  bool is_count = 2;
  int64 limit = 3;
  // IDs of the fields to group by.
  repeated int64 group_by_field_ids = 4;
  // Aggregates to compute.
  repeated Aggregate aggregates = 5;
}
|
|
|
|
// Kind of score function (see ScoreFunction.type).
// NOTE: FunctionTypeWeight is the zero value and therefore the default.
enum FunctionType {
  FunctionTypeWeight = 0;
  FunctionTypeRandom = 1;
}
|
|
|
|
// FunctionMode decides how the boost score is calculated
// from multiple boost function scores.
// NOTE: FunctionModeMultiply is the zero value and therefore the default.
enum FunctionMode {
  FunctionModeMultiply = 0;
  FunctionModeSum = 1;
}
|
|
|
|
// BoostMode decides how the final score is calculated
// from the origin score and the boost score.
// NOTE: BoostModeMultiply is the zero value and therefore the default.
enum BoostMode {
  BoostModeMultiply = 0;
  BoostModeSum = 1;
}
|
|
|
|
// One scorer entry (see PlanNode.scorers): a filter expression, a weight,
// the function kind and free-form key/value parameters.
message ScoreFunction {
  Expr filter = 1;
  float weight = 2;
  FunctionType type = 3;
  // Key/value pairs as declared in common.proto.
  repeated common.KeyValuePair params = 4;
}
|
|
|
|
// Score combination settings (see PlanNode.score_option).
message ScoreOption {
  BoostMode boost_mode = 1;
  FunctionMode function_mode = 2;
}
|
|
|
|
// Plan-wide switches (see PlanNode.plan_options).
message PlanOption {
  bool expr_use_json_stats = 1;
}
|
|
|
|
// Top-level plan: one of a vector search, a bare predicate (deprecated) or a
// query node, plus output and scoring settings shared by all of them.
message PlanNode {
  oneof node {
    VectorANNS vector_anns = 1;
    // deprecated, use query instead.
    Expr predicates = 2;
    QueryPlanNode query = 4;
  }

  // Number 3 sits outside the oneof; the oneof members use 1, 2 and 4.
  repeated int64 output_field_ids = 3;
  repeated string dynamic_fields = 5;
  repeated ScoreFunction scorers = 6;
  PlanOption plan_options = 7;
  ScoreOption score_option = 8;
  // Declared `optional`, so an unset namespace is distinguishable from "".
  optional string namespace = 9;
}
|