mirror of https://gitee.com/milvus-io/milvus.git
related: #36380

Release notes (auto-generated by coderabbit.ai):

- Core invariant: aggregation is centralized and schema-aware — all aggregate functions are created via the exec Aggregate registry (milvus::exec::Aggregate) and validated by ValidateAggFieldType, and they share a single in-memory accumulator layout (Accumulator/RowContainer) and grouping primitives (GroupingSet, HashTable, VectorHasher), ensuring consistent typing, null semantics and offsets across the planner → exec → reducer conversion paths (toAggregateInfo, Aggregate::create, GroupingSet, AggResult converters).
- Removed / simplified logic: removed ad-hoc count/group-by and reducer code (CountNode/PhyCountNode, GroupByNode/PhyGroupByNode, cntReducer and its tests) and consolidated it into a unified AggregationNode → PhyAggregationNode + GroupingSet + HashTable execution path with centralized reducers (MilvusAggReducer, InternalAggReducer, SegcoreAggReducer). AVG is now implemented compositionally (SUM + COUNT) rather than as a bespoke operator, eliminating duplicate implementations.
- Why this does NOT cause data loss or regressions: existing data-access and serialization paths are preserved and explicitly validated — bulk_subscript / bulk_script_field_data and FieldData creation are used for output materialization; converters (InternalResult2AggResult ↔ AggResult2internalResult, SegcoreResults2AggResult ↔ AggResult2segcoreResult) enforce shape/type/row-count validation; proxy and plan-level checks (MatchAggregationExpression, translateOutputFields, ValidateAggFieldType, translateGroupByFieldIds) reject unsupported inputs (ARRAY/JSON, unsupported datatypes) early. Empty-result generation and explicit error returns guard against silent corruption.
- New capability and scope: end-to-end GROUP BY and aggregation support added across the stack — proto (plan.proto, RetrieveRequest fields group_by_field_ids/aggregates), planner nodes (AggregationNode, ProjectNode, SearchGroupByNode), exec operators (PhyAggregationNode, PhyProjectNode) and the aggregation core (Aggregate implementations: Sum/Count/Min/Max, SimpleNumericAggregate, RowContainer, GroupingSet, HashTable) plus proxy/querynode reducers and tests — enabling grouped and global aggregation (sum, count, min, max, avg via sum+count) with schema-aware validation and reduction.

Signed-off-by: MrPresent-Han <chun.han@gmail.com>
Co-authored-by: MrPresent-Han <chun.han@gmail.com>
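The compositional AVG described above can be pictured with a short sketch (illustrative only; this is not the exec code, and the function name is made up): each node or segment contributes a partial (sum, count) pair, and the division happens once after reduction, which is why no dedicated AVG operator is needed.

// reduceAvg is a hypothetical sketch of AVG-as-SUM+COUNT: partialSums[i] and
// partialCounts[i] are the SUM and COUNT partials produced by one node/segment.
func reduceAvg(partialSums []float64, partialCounts []int64) (float64, bool) {
	var sum float64
	var cnt int64
	for i := range partialSums {
		sum += partialSums[i]
		cnt += partialCounts[i]
	}
	if cnt == 0 {
		return 0, false // empty group: report "no value" rather than 0
	}
	return sum / float64(cnt), true
}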
83 lines · 2.7 KiB · Go
package segments

import (
	"context"

	"github.com/samber/lo"

	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/util/segcore"
	"github.com/milvus-io/milvus/pkg/v2/common"
	"github.com/milvus-io/milvus/pkg/v2/proto/internalpb"
	"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
	"github.com/milvus-io/milvus/pkg/v2/proto/segcorepb"
	"github.com/milvus-io/milvus/pkg/v2/util/merr"
	"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)

// internalReducer merges per-querynode retrieve results into a single
// internalpb.RetrieveResults.
type internalReducer interface {
	Reduce(context.Context, []*internalpb.RetrieveResults) (*internalpb.RetrieveResults, error)
}

// CreateInternalReducer picks the reducer for a query: any aggregate or
// GROUP BY field routes the request to the aggregation reducer; otherwise
// the default limit-based reducer is used.
func CreateInternalReducer(req *querypb.QueryRequest, schema *schemapb.CollectionSchema) internalReducer {
	if len(req.GetReq().GetAggregates()) > 0 || len(req.GetReq().GetGroupByFieldIds()) > 0 {
		return NewInternalAggReducer(req.GetReq().GetGroupByFieldIds(), req.GetReq().GetAggregates(), req.GetReq().GetLimit(), schema)
	}
	return newDefaultLimitReducer(req, schema)
}
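
// A minimal usage sketch (hypothetical field ID; the Aggregates payload is
// omitted for brevity): a request carrying a group-by field is enough to
// switch from the default limit reducer to the aggregation reducer.
//
//	req := &querypb.QueryRequest{Req: &internalpb.RetrieveRequest{
//		GroupByFieldIds: []int64{101}, // hypothetical field ID
//		Limit:           100,
//	}}
//	reducer := CreateInternalReducer(req, schema)
//	merged, err := reducer.Reduce(ctx, partialResults)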

// segCoreReducer merges per-segment retrieve results, with access to the
// segments and the retrieve plan for materializing output fields.
type segCoreReducer interface {
	Reduce(context.Context, []*segcorepb.RetrieveResults, []Segment, *segcore.RetrievePlan) (*segcorepb.RetrieveResults, error)
}

// CreateSegCoreReducer mirrors CreateInternalReducer at the segment level:
// aggregation or GROUP BY selects the segcore aggregation reducer, everything
// else falls back to the default limit-based reducer.
func CreateSegCoreReducer(req *querypb.QueryRequest, schema *schemapb.CollectionSchema, manager *Manager) segCoreReducer {
	if len(req.GetReq().GetGroupByFieldIds()) > 0 || len(req.GetReq().GetAggregates()) > 0 {
		return NewSegcoreAggReducer(req.GetReq().GetGroupByFieldIds(), req.GetReq().GetAggregates(), req.GetReq().GetLimit(), schema)
	}

	return newDefaultLimitReducerSegcore(req, schema, manager)
}
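
// passThroughSegReducer is a hypothetical reducer, included only to
// illustrate the segCoreReducer contract; real reducers merge rows across
// segments, deduplicate by primary key and enforce limits. Not used anywhere.
type passThroughSegReducer struct{}

func (passThroughSegReducer) Reduce(_ context.Context, results []*segcorepb.RetrieveResults, _ []Segment, _ *segcore.RetrievePlan) (*segcorepb.RetrieveResults, error) {
	if len(results) == 0 {
		return &segcorepb.RetrieveResults{}, nil
	}
	return results[0], nil // keep the first segment's rows verbatim
}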

// TimestampedRetrieveResult pairs a retrieve result with the per-row
// timestamps extracted from its system timestamp field, so reducers can
// compare row versions without re-scanning the field data.
type TimestampedRetrieveResult[T interface {
	typeutil.ResultWithID
	GetFieldsData() []*schemapb.FieldData
}] struct {
	Result     T
	Timestamps []int64
}

func (r *TimestampedRetrieveResult[T]) GetIds() *schemapb.IDs {
	return r.Result.GetIds()
}

func (r *TimestampedRetrieveResult[T]) GetHasMoreResult() bool {
	return r.Result.GetHasMoreResult()
}

func (r *TimestampedRetrieveResult[T]) GetTimestamps() []int64 {
	return r.Timestamps
}

// NewTimestampedRetrieveResult extracts the system timestamp column from the
// result's field data and validates that it aligns 1:1 with the IDs.
func NewTimestampedRetrieveResult[T interface {
	typeutil.ResultWithID
	GetFieldsData() []*schemapb.FieldData
}](result T) (*TimestampedRetrieveResult[T], error) {
	tsField, has := lo.Find(result.GetFieldsData(), func(fd *schemapb.FieldData) bool {
		return fd.GetFieldId() == common.TimeStampField
	})
	if !has {
		return nil, merr.WrapErrServiceInternal("RetrieveResult does not have timestamp field")
	}
	timestamps := tsField.GetScalars().GetLongData().GetData()
	idSize := typeutil.GetSizeOfIDs(result.GetIds())

	if idSize != len(timestamps) {
		return nil, merr.WrapErrServiceInternal("id length is not equal to timestamp length")
	}

	return &TimestampedRetrieveResult[T]{
		Result:     result,
		Timestamps: timestamps,
	}, nil
}
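
// A small usage sketch (IDs and timestamps are made-up values): construction
// succeeds only when the timestamp column exists and matches the ID count.
//
//	res := &segcorepb.RetrieveResults{
//		Ids: &schemapb.IDs{IdField: &schemapb.IDs_IntId{
//			IntId: &schemapb.LongArray{Data: []int64{1, 2}},
//		}},
//		FieldsData: []*schemapb.FieldData{{
//			FieldId: common.TimeStampField,
//			Field: &schemapb.FieldData_Scalars{Scalars: &schemapb.ScalarField{
//				Data: &schemapb.ScalarField_LongData{
//					LongData: &schemapb.LongArray{Data: []int64{10, 11}},
//				},
//			}},
//		}},
//	}
//	wrapped, err := NewTimestampedRetrieveResult(res)
//	// err == nil; wrapped.GetTimestamps() -> [10, 11]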