package segmentutil

import (
	"go.uber.org/zap"

	"github.com/milvus-io/milvus/pkg/v2/log"
	"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
	"github.com/milvus-io/milvus/pkg/v2/proto/internalpb"
)

// ReCalcRowCount re-calculates the number of rows of `seg` from its binlog entry
// counts and, when that count is positive and differs from the recorded meta value,
// writes the corrected value into `segCloned`.
// Note that `segCloned` must be a cloned copy of `seg`.
func ReCalcRowCount(seg, segCloned *datapb.SegmentInfo) {
	// `seg` is only read here; the correction is applied to its clone.
	if newCount := CalcRowCountFromBinLog(seg); newCount != seg.GetNumOfRows() && newCount > 0 {
		log.Warn("segment row number meta inconsistent with bin log row count and will be corrected",
			zap.Int64("segmentID", seg.GetID()),
			zap.Int64("segment meta row count (wrong)", seg.GetNumOfRows()),
			zap.Int64("segment bin log row count (correct)", newCount))
		// Update the corrected row count on the clone.
		segCloned.NumOfRows = newCount
	}
}
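
// The sketch below (not part of the original file) illustrates the intended call
// pattern, assuming the caller clones the SegmentInfo first, e.g. via proto.Clone
// from google.golang.org/protobuf/proto: only the clone receives the corrected
// row count, while the original stays untouched.
//
//	seg := &datapb.SegmentInfo{
//		ID:        1,
//		NumOfRows: 100, // stale meta value
//		Binlogs: []*datapb.FieldBinlog{{
//			Binlogs: []*datapb.Binlog{{EntriesNum: 60}, {EntriesNum: 60}},
//		}},
//	}
//	segCloned := proto.Clone(seg).(*datapb.SegmentInfo)
//	ReCalcRowCount(seg, segCloned)
//	// seg.GetNumOfRows() is still 100; segCloned.GetNumOfRows() is now 120.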

// CalcRowCountFromBinLog calculates the number of rows of a segment from its binlogs.
// It returns -1 if any binlog reports a non-positive entries num, since the total
// would be unreliable in that case.
func CalcRowCountFromBinLog(seg *datapb.SegmentInfo) int64 {
	var rowCt int64
	if len(seg.GetBinlogs()) > 0 {
		for _, ct := range seg.GetBinlogs()[0].GetBinlogs() {
			rowCt += ct.GetEntriesNum()
			// The segment contains a stale binlog with an incorrect entries num,
			// so the accumulated count cannot be trusted.
			if ct.GetEntriesNum() <= 0 {
				return -1
			}
		}
	}
	return rowCt
}
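
// A minimal sketch (not part of the original file) of the -1 sentinel: only the
// first field's binlogs are inspected, and once any entry reports a non-positive
// EntriesNum, the partial sum is discarded and -1 is returned.
//
//	seg := &datapb.SegmentInfo{
//		Binlogs: []*datapb.FieldBinlog{{
//			Binlogs: []*datapb.Binlog{{EntriesNum: 100}, {EntriesNum: 0}},
//		}},
//	}
//	_ = CalcRowCountFromBinLog(seg) // returns -1, not 100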

// CalcDelRowCountFromDeltaLog calculates the number of deleted rows of an L0 segment
// from its delta logs.
func CalcDelRowCountFromDeltaLog(seg *datapb.SegmentInfo) int64 {
	var rowCt int64
	if len(seg.GetDeltalogs()) > 0 {
		for _, dls := range seg.GetDeltalogs() {
			for _, dl := range dls.GetBinlogs() {
				rowCt += dl.GetEntriesNum()
			}
		}
	}
	return rowCt
}
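
// A brief sketch (not part of the original file): unlike the binlog path above,
// every delta log group is summed and there is no -1 sentinel, so the result is
// simply the total EntriesNum across all delta logs.
//
//	seg := &datapb.SegmentInfo{
//		Deltalogs: []*datapb.FieldBinlog{
//			{Binlogs: []*datapb.Binlog{{EntriesNum: 3}, {EntriesNum: 4}}},
//			{Binlogs: []*datapb.Binlog{{EntriesNum: 5}}},
//		},
//	}
//	_ = CalcDelRowCountFromDeltaLog(seg) // returns 12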

// MergeRequestCost merges the costs of a request. The costs may come from different
// workers in the same channel or from different channels in the same collection;
// for now we simply keep the cost with the highest response time.
func MergeRequestCost(requestCosts []*internalpb.CostAggregation) *internalpb.CostAggregation {
	var result *internalpb.CostAggregation
	for _, cost := range requestCosts {
		if cost == nil {
			continue
		}
		if result == nil || result.ResponseTime < cost.ResponseTime {
			result = cost
		}
	}

	return result
}
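
// A short sketch (not part of the original file) of the selection rule: nil entries
// are skipped and the single CostAggregation with the largest ResponseTime is kept
// as-is; the individual fields are not combined.
//
//	costs := []*internalpb.CostAggregation{
//		nil,
//		{ResponseTime: 10},
//		{ResponseTime: 25},
//	}
//	merged := MergeRequestCost(costs) // merged.ResponseTime == 25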