milvus/pkg/proto/plan.proto
Chun Han b7ee93fc52
feat: support query aggregation (#36380) (#44394)
related: #36380

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
- Core invariant: aggregation is centralized and schema-aware — all
aggregate functions are created via the exec Aggregate registry
(milvus::exec::Aggregate) and validated by ValidateAggFieldType, use a
single in-memory accumulator layout (Accumulator/RowContainer) and
grouping primitives (GroupingSet, HashTable, VectorHasher), ensuring
consistent typing, null semantics and offsets across planner → exec →
reducer conversion paths (toAggregateInfo, Aggregate::create,
GroupingSet, AggResult converters).

- Removed / simplified logic: removed ad‑hoc count/group-by and reducer
code (CountNode/PhyCountNode, GroupByNode/PhyGroupByNode, cntReducer and
its tests) and consolidated into a unified AggregationNode →
PhyAggregationNode + GroupingSet + HashTable execution path and
centralized reducers (MilvusAggReducer, InternalAggReducer,
SegcoreAggReducer). AVG is now implemented compositionally (SUM + COUNT)
rather than as a bespoke operator, eliminating duplicate implementations.

- Why this does NOT cause data loss or regressions: existing data-access
and serialization paths are preserved and explicitly validated —
bulk_subscript / bulk_script_field_data and FieldData creation are used
for output materialization; converters (InternalResult2AggResult ↔
AggResult2internalResult, SegcoreResults2AggResult ↔
AggResult2segcoreResult) enforce shape/type/row-count validation; proxy
and plan-level checks (MatchAggregationExpression,
translateOutputFields, ValidateAggFieldType, translateGroupByFieldIds)
reject unsupported inputs (ARRAY/JSON, unsupported datatypes) early.
Empty-result generation and explicit error returns guard against silent
corruption.

- New capability and scope: end-to-end GROUP BY and aggregation support
added across the stack — proto (plan.proto, RetrieveRequest fields
group_by_field_ids/aggregates), planner nodes (AggregationNode,
ProjectNode, SearchGroupByNode), exec operators (PhyAggregationNode,
PhyProjectNode) and aggregation core (Aggregate implementations:
Sum/Count/Min/Max, SimpleNumericAggregate, RowContainer, GroupingSet,
HashTable) plus proxy/querynode reducers and tests — enabling grouped
and global aggregation (sum, count, min, max, avg via sum+count) with
schema-aware validation and reduction.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

Signed-off-by: MrPresent-Han <chun.han@gmail.com>
Co-authored-by: MrPresent-Han <chun.han@gmail.com>
2026-01-06 16:29:25 +08:00

394 lines
8.6 KiB
Protocol Buffer

syntax = "proto3";
package milvus.proto.plan;
option go_package = "github.com/milvus-io/milvus/pkg/v2/proto/planpb";
import "schema.proto";
import "common.proto";
// OpType lists the operators carried by the expression messages in this
// file (UnaryRangeExpr.op, CompareExpr.op, BinaryArithOpEvalRangeExpr.op and
// TimestamptzArithCompareExpr.compare_op).
enum OpType {
  // Zero value.
  Invalid = 0;
  GreaterThan = 1;
  GreaterEqual = 2;
  LessThan = 3;
  LessEqual = 4;
  Equal = 5;
  NotEqual = 6;
  // startsWith
  PrefixMatch = 7;
  // endsWith
  PostfixMatch = 8;
  // like
  Match = 9;
  // For the case 1 < a < b.
  Range = 10;
  // TODO: used for term expr.
  In = 11;
  NotIn = 12;
  // Text match.
  TextMatch = 13;
  // Phrase match.
  PhraseMatch = 14;
  // Substring match (e.g., "%value%").
  InnerMatch = 15;
}
// ArithOpType lists the arithmetic operators referenced by BinaryArithOp,
// BinaryArithExpr, BinaryArithOpEvalRangeExpr and
// TimestamptzArithCompareExpr.
enum ArithOpType {
  // Zero value.
  Unknown = 0;
  Add = 1;
  Sub = 2;
  Mul = 3;
  Div = 4;
  Mod = 5;
  ArrayLength = 6;
}
// VectorType identifies the kind of vector targeted by a VectorANNS node
// (see VectorANNS.vector_type).
// Note: the zero value is BinaryVector, so an unset field reads as
// BinaryVector.
enum VectorType {
  BinaryVector = 0;
  FloatVector = 1;
  Float16Vector = 2;
  BFloat16Vector = 3;
  SparseFloatVector = 4;
  Int8Vector = 5;
  EmbListFloatVector = 6;
  EmbListFloat16Vector = 7;
  EmbListBFloat16Vector = 8;
  EmbListInt8Vector = 9;
  EmbListBinaryVector = 10;
}
// GenericValue holds a single literal value; at most one member of `val`
// is set.
message GenericValue {
  oneof val {
    bool bool_val = 1;
    int64 int64_val = 2;
    // Named float_val but stored as a double.
    double float_val = 3;
    string string_val = 4;
    // Nested list of values; see Array.
    Array array_val = 5;
  }
}
// Array is a list of GenericValue items together with type metadata.
message Array {
  // The elements of the array.
  repeated GenericValue array = 1;
  // NOTE(review): presumably true when every element has the same type —
  // confirm against the producer.
  bool same_type = 2;
  // NOTE(review): presumably the shared element type when same_type is
  // set — confirm.
  schema.DataType element_type = 3;
}
// SearchIteratorV2Info is attached to QueryInfo through the optional
// search_iterator_v2_info field.
message SearchIteratorV2Info {
  string token = 1;
  uint32 batch_size = 2;
  // Declared `optional`, so presence is tracked explicitly: an unset bound
  // is distinguishable from a bound of 0.
  optional float last_bound = 3;
}
// QueryInfo carries the search parameters attached to a VectorANNS node
// (see VectorANNS.query_info).
message QueryInfo {
  // Field number 2 is not assigned to any field of this message. Reserve it
  // so that a future field cannot pick it up by accident.
  // NOTE(review): presumably it belonged to a removed field — confirm
  // against the file history.
  reserved 2;

  int64 topk = 1;
  string metric_type = 3;
  string search_params = 4;
  int64 round_decimal = 5;
  int64 group_by_field_id = 6;
  bool materialized_view_involved = 7;
  int64 group_size = 8;
  bool strict_group_size = 9;
  double bm25_avgdl = 10;
  int64 query_field_id = 11;
  string hints = 12;
  // Declared `optional`, so presence is tracked explicitly.
  optional SearchIteratorV2Info search_iterator_v2_info = 13;
  string json_path = 14;
  schema.DataType json_type = 15;
  bool strict_cast = 16;
}
// ColumnInfo describes a column referenced by the expression messages in
// this file: its field id, data type and a set of boolean flags.
message ColumnInfo {
  int64 field_id = 1;
  schema.DataType data_type = 2;
  bool is_primary_key = 3;
  bool is_autoID = 4;
  // NOTE(review): presumably the path into a nested value of the column —
  // confirm.
  repeated string nested_path = 5;
  bool is_partition_key = 6;
  // NOTE(review): presumably the element type for array-like columns —
  // confirm.
  schema.DataType element_type = 7;
  bool is_clustering_key = 8;
  bool nullable = 9;
  bool is_element_level = 10;
}
// ColumnExpr is an expression node that wraps a single ColumnInfo.
message ColumnExpr {
  ColumnInfo info = 1;
}
// ExistsExpr is an expression node that wraps the ColumnInfo it applies to.
message ExistsExpr {
  ColumnInfo info = 1;
}
// ValueExpr is an expression node holding a literal value and/or the name
// of a template variable.
message ValueExpr {
  GenericValue value = 1;
  // NOTE(review): presumably names the template variable that supplies
  // `value` when the expression is a template — confirm.
  string template_variable_name = 2;
}
// UnaryRangeExpr pairs one column with an OpType operator and a value.
message UnaryRangeExpr {
  ColumnInfo column_info = 1;
  OpType op = 2;
  GenericValue value = 3;
  string template_variable_name = 4;
  // Additional values beyond `value`.
  // NOTE(review): which operators consume these is not visible here —
  // confirm.
  repeated GenericValue extra_values = 5;
}
// BinaryRangeExpr describes a column bounded by a lower and an upper value,
// each with its own inclusive flag and template variable name.
message BinaryRangeExpr {
  ColumnInfo column_info = 1;
  bool lower_inclusive = 2;
  bool upper_inclusive = 3;
  GenericValue lower_value = 4;
  GenericValue upper_value = 5;
  string lower_template_variable_name = 6;
  string upper_template_variable_name = 7;
}
// CallExpr is a function call: a function name plus its parameters, each of
// which is itself an Expr.
message CallExpr {
  string function_name = 1;
  repeated Expr function_parameters = 2;
}
// CompareExpr relates two columns through an OpType operator.
message CompareExpr {
  ColumnInfo left_column_info = 1;
  ColumnInfo right_column_info = 2;
  OpType op = 3;
}
// TermExpr pairs a column with a list of values.
message TermExpr {
  ColumnInfo column_info = 1;
  repeated GenericValue values = 2;
  // NOTE(review): meaning not visible in this file — confirm with the
  // planner.
  bool is_in_field = 3;
  string template_variable_name = 4;
}
// JSONContainsExpr is a "contains" test of a column against a list of
// elements; the flavour of the test is selected by `op`.
message JSONContainsExpr {
  // JSONOp selects the flavour of the contains test.
  enum JSONOp {
    // Invalid.
    Invalid = 0;
    // json_contains | array_contains
    Contains = 1;
    // json_contains_all | array_contains_all
    ContainsAll = 2;
    // json_contains_any | array_contains_any
    ContainsAny = 3;
  }

  ColumnInfo column_info = 1;
  repeated GenericValue elements = 2;
  JSONOp op = 3;
  bool elements_same_type = 4;
  string template_variable_name = 5;
}
// NullExpr applies a NullOp (IsNull / IsNotNull) to a column.
message NullExpr {
  // NullOp selects which null test is applied.
  enum NullOp {
    Invalid = 0;
    IsNull = 1;
    IsNotNull = 2;
  }

  ColumnInfo column_info = 1;
  NullOp op = 2;
}
// GISFunctionFilterExpr applies a GISOp to a column, with a WKT string and
// (for DWithin) a distance as operands.
message GISFunctionFilterExpr {
  // GISOp selects the spatial operator.
  enum GISOp {
    Invalid = 0;
    Equals = 1;
    Touches = 2;
    Overlaps = 3;
    Crosses = 4;
    Contains = 5;
    Intersects = 6;
    Within = 7;
    DWithin = 8;
    // Named STIsValid rather than IsValid: a value called IsValid would
    // conflict with the IsValid() method that proto code generation emits
    // by default.
    STIsValid = 9;
  }

  ColumnInfo column_info = 1;
  string wkt_string = 2;
  GISOp op = 3;
  // Distance parameter for DWithin.
  double distance = 4;
}
// UnaryExpr applies a unary operator (currently only Not) to one child
// expression.
message UnaryExpr {
  // UnaryOp lists the supported unary operators.
  enum UnaryOp {
    Invalid = 0;
    Not = 1;
  }

  UnaryOp op = 1;
  Expr child = 2;
}
// BinaryExpr joins two child expressions with a logical operator.
message BinaryExpr {
  // BinaryOp lists the supported logical connectives.
  enum BinaryOp {
    Invalid = 0;
    LogicalAnd = 1;
    LogicalOr = 2;
  }

  BinaryOp op = 1;
  Expr left = 2;
  Expr right = 3;
}
// BinaryArithOp describes an arithmetic operation with a column on one side
// and a literal right operand on the other.
message BinaryArithOp {
  ColumnInfo column_info = 1;
  ArithOpType arith_op = 2;
  GenericValue right_operand = 3;
}
// BinaryArithExpr describes an arithmetic operation whose two operands are
// themselves expressions.
message BinaryArithExpr {
  Expr left = 1;
  Expr right = 2;
  ArithOpType op = 3;
}
// BinaryArithOpEvalRangeExpr combines an arithmetic step
// (column_info, arith_op, right_operand) with a comparison step (op, value).
// NOTE(review): presumably evaluated as
// `(column arith_op right_operand) op value` — confirm with the executor.
message BinaryArithOpEvalRangeExpr {
  ColumnInfo column_info = 1;
  ArithOpType arith_op = 2;
  GenericValue right_operand = 3;
  OpType op = 4;
  GenericValue value = 5;
  // Template variable names for `right_operand` and `value` respectively
  // (going by the field names — confirm).
  string operand_template_variable_name = 6;
  string value_template_variable_name = 7;
}
// RandomSampleExpr pairs a sample factor with a predicate expression.
message RandomSampleExpr {
  // NOTE(review): valid range and exact meaning are not visible in this
  // file — confirm.
  float sample_factor = 1;
  Expr predicate = 2;
}
// ElementFilterExpr carries an element-level expression, the name of a
// struct, and a further predicate expression.
// NOTE(review): how element_expr and predicate combine is not visible in
// this file — confirm.
message ElementFilterExpr {
  Expr element_expr = 1;
  string struct_name = 2;
  Expr predicate = 3;
}
// MatchType selects the kind of match operation performed by a MatchExpr on
// struct array queries.
enum MatchType {
  // All elements must match the predicate.
  MatchAll = 0;
  // At least one element matches the predicate.
  MatchAny = 1;
  // At least N elements match the predicate.
  MatchLeast = 2;
  // At most N elements match the predicate.
  MatchMost = 3;
  // Exactly N elements match the predicate.
  MatchExact = 4;
}
// MatchExpr is a match operation over the elements of a struct array field.
message MatchExpr {
  // The struct array field name (e.g., struct_array).
  string struct_name = 1;
  // The condition expression using $[field] syntax
  // (e.g., $[intField] == 1 && $[strField] == "aaa").
  Expr predicate = 2;
  // Type of match operation.
  MatchType match_type = 3;
  // For MatchLeast/MatchMost: the count parameter (N).
  int64 count = 4;
}
// AlwaysTrueExpr is an empty message; it appears as the always_true_expr
// member of the Expr.expr oneof.
message AlwaysTrueExpr {}
// Interval is a time span broken into calendar and clock components. It is
// used as the interval operand of TimestamptzArithCompareExpr.
message Interval {
  int64 years = 1;
  int64 months = 2;
  int64 days = 3;
  int64 hours = 4;
  int64 minutes = 5;
  int64 seconds = 6;
}
// TimestamptzArithCompareExpr is the expression type for the operation
// (timestamp_col + interval) OP iso_string.
message TimestamptzArithCompareExpr {
  ColumnInfo timestamptz_column = 1;
  // ADD or SUB.
  ArithOpType arith_op = 2;
  Interval interval = 3;
  OpType compare_op = 4;
  GenericValue compare_value = 5;
}
// Expr is the union of all expression node types defined in this file; at
// most one member of the `expr` oneof is set.
message Expr {
  oneof expr {
    TermExpr term_expr = 1;
    UnaryExpr unary_expr = 2;
    BinaryExpr binary_expr = 3;
    CompareExpr compare_expr = 4;
    UnaryRangeExpr unary_range_expr = 5;
    BinaryRangeExpr binary_range_expr = 6;
    BinaryArithOpEvalRangeExpr binary_arith_op_eval_range_expr = 7;
    BinaryArithExpr binary_arith_expr = 8;
    ValueExpr value_expr = 9;
    ColumnExpr column_expr = 10;
    ExistsExpr exists_expr = 11;
    AlwaysTrueExpr always_true_expr = 12;
    JSONContainsExpr json_contains_expr = 13;
    CallExpr call_expr = 14;
    NullExpr null_expr = 15;
    RandomSampleExpr random_sample_expr = 16;
    GISFunctionFilterExpr gisfunction_filter_expr = 17;
    TimestamptzArithCompareExpr timestamptz_arith_compare_expr = 18;
    ElementFilterExpr element_filter_expr = 19;
    // Number 20 is taken by is_template below, so the oneof continues at 21.
    MatchExpr match_expr = 21;
  }

  // Sits outside the oneof.
  // NOTE(review): presumably marks an expression that still contains
  // template variables — confirm.
  bool is_template = 20;
}
// VectorANNS is the vector-search variant of PlanNode.node. It names the
// target field and vector type and carries the filter predicates and the
// QueryInfo parameters.
message VectorANNS {
  VectorType vector_type = 1;
  int64 field_id = 2;
  Expr predicates = 3;
  QueryInfo query_info = 4;
  // Always "$0".
  string placeholder_tag = 5;
}
// AggregateOp lists the aggregate functions selectable in Aggregate.op.
// Note: the zero value is `sum`, so an unset op reads as sum.
enum AggregateOp {
  sum = 0;
  count = 1;
  avg = 2;
  min = 3;
  max = 4;
}
// Aggregate is one aggregate request: an AggregateOp together with a field
// id.
message Aggregate {
  AggregateOp op = 1;
  // NOTE(review): presumably the id of the field being aggregated —
  // confirm.
  int64 field_id = 2;
}
// QueryPlanNode is the query variant of PlanNode.node. It carries the
// filter predicates, a limit, and the group-by and aggregate specification.
message QueryPlanNode {
  Expr predicates = 1;
  bool is_count = 2;
  int64 limit = 3;
  // Ids of the fields to group by.
  repeated int64 group_by_field_ids = 4;
  // Aggregates to compute; see Aggregate.
  repeated Aggregate aggregates = 5;
}
// FunctionType identifies the kind of a ScoreFunction (see
// ScoreFunction.type).
enum FunctionType {
  FunctionTypeWeight = 0;
  FunctionTypeRandom = 1;
}
// FunctionMode decides how the boost score is calculated when there are
// multiple boost function scores.
enum FunctionMode {
  FunctionModeMultiply = 0;
  FunctionModeSum = 1;
}
// BoostMode decides how the final score is calculated from the original
// score and the boost score.
enum BoostMode {
  BoostModeMultiply = 0;
  BoostModeSum = 1;
}
// ScoreFunction is one entry of PlanNode.scorers: a filter expression, a
// weight, a function type and free-form key/value parameters.
message ScoreFunction {
  Expr filter = 1;
  float weight = 2;
  FunctionType type = 3;
  repeated common.KeyValuePair params = 4;
}
// ScoreOption bundles the two score-combination modes; see BoostMode and
// FunctionMode.
message ScoreOption {
  BoostMode boost_mode = 1;
  FunctionMode function_mode = 2;
}
// PlanOption holds plan-wide switches (see PlanNode.plan_options).
message PlanOption {
  bool expr_use_json_stats = 1;
}
// PlanNode is the top-level plan message. The `node` oneof selects the plan
// variant; the remaining fields sit outside the oneof.
message PlanNode {
  oneof node {
    VectorANNS vector_anns = 1;
    // Deprecated, use query instead.
    Expr predicates = 2;
    QueryPlanNode query = 4;
  }

  repeated int64 output_field_ids = 3;
  repeated string dynamic_fields = 5;
  repeated ScoreFunction scorers = 6;
  PlanOption plan_options = 7;
  ScoreOption score_option = 8;
  // Declared `optional`, so an unset namespace is distinguishable from an
  // empty string.
  optional string namespace = 9;
}