enhance: Replace PrimaryKey slice with PrimaryKeys to save memory (#37127)

Related to #35303

A slice of `storage.PrimaryKey` carries extra interface overhead for each
element, which can lead to significant memory usage when the delta row
count is large.

This PR replaces the `PrimaryKey` slice with the `PrimaryKeys` interface to
remove that per-element interface cost.
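
For context, a minimal standalone sketch (simplified types and names, not the actual Milvus definitions) of the per-element cost described above: each entry of a `[]PrimaryKey` holds a two-word interface header plus a pointer to a separately allocated key object, whereas a typed container keeps the raw values in one contiguous slice.

```go
package main

import "fmt"

// PrimaryKey and Int64PrimaryKey stand in for the interface-based representation.
type PrimaryKey interface{ Value() int64 }

type Int64PrimaryKey struct{ V int64 }

func (pk *Int64PrimaryKey) Value() int64 { return pk.V }

// Int64PrimaryKeys mirrors the idea of the new PrimaryKeys container:
// one []int64 instead of one interface value plus one heap object per key.
type Int64PrimaryKeys struct{ values []int64 }

func (pks *Int64PrimaryKeys) AppendRaw(vs ...int64) { pks.values = append(pks.values, vs...) }
func (pks *Int64PrimaryKeys) Get(i int) int64       { return pks.values[i] }
func (pks *Int64PrimaryKeys) Len() int              { return len(pks.values) }

func main() {
	const n = 1_000_000

	// Interface-based: a ~16-byte slice entry plus an ~8-byte pointee per key.
	asInterfaces := make([]PrimaryKey, 0, n)
	// Typed container: ~8 bytes per key in a single backing array.
	asRaw := &Int64PrimaryKeys{values: make([]int64, 0, n)}

	for i := int64(0); i < n; i++ {
		asInterfaces = append(asInterfaces, &Int64PrimaryKey{V: i})
		asRaw.AppendRaw(i)
	}
	fmt.Println(len(asInterfaces), asRaw.Len())
}
```

Besides the smaller footprint, the typed container avoids one heap allocation per key, which matters when delta logs contain millions of deletes.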

---------

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
congqixia 2024-10-28 10:29:30 +08:00 committed by GitHub
parent 9d16b972ea
commit 7774b7275e
9 changed files with 124 additions and 56 deletions

View File

@@ -1509,8 +1509,18 @@ func (s *DelegatorDataSuite) TestSyncTargetVersion() {
 func (s *DelegatorDataSuite) TestLevel0Deletions() {
 	delegator := s.delegator
 	partitionID := int64(10)
-	partitionDeleteData := storage.NewDeleteData([]storage.PrimaryKey{storage.NewInt64PrimaryKey(1)}, []storage.Timestamp{100})
-	allPartitionDeleteData := storage.NewDeleteData([]storage.PrimaryKey{storage.NewInt64PrimaryKey(2)}, []storage.Timestamp{101})
+	partitionDelPks := storage.NewInt64PrimaryKeys(1)
+	partitionDelPks.AppendRaw(1)
+	allPartitionDelPks := storage.NewInt64PrimaryKeys(1)
+	allPartitionDelPks.AppendRaw(2)
+	partitionDeleteData := &storage.DeltaData{
+		DeletePks:        partitionDelPks,
+		DeleteTimestamps: []storage.Timestamp{100},
+	}
+	allPartitionDeleteData := &storage.DeltaData{
+		DeletePks:        allPartitionDelPks,
+		DeleteTimestamps: []storage.Timestamp{101},
+	}
 	schema := segments.GenTestCollectionSchema("test_stop", schemapb.DataType_Int64, true)
 	collection := segments.NewCollection(1, schema, nil, &querypb.LoadMetaInfo{
@@ -1539,29 +1549,29 @@ func (s *DelegatorDataSuite) TestLevel0Deletions() {
 	l0Global.LoadDeltaData(context.TODO(), allPartitionDeleteData)
 	pks, _ := delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
-	s.True(pks[0].EQ(partitionDeleteData.Pks[0]))
+	s.True(pks[0].EQ(partitionDeleteData.DeletePks.Get(0)))
 	pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
 	s.Empty(pks)
 	delegator.segmentManager.Put(context.TODO(), segments.SegmentTypeSealed, l0Global)
 	pks, _ = delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
-	s.ElementsMatch(pks, []storage.PrimaryKey{partitionDeleteData.Pks[0], allPartitionDeleteData.Pks[0]})
+	s.ElementsMatch(pks, []storage.PrimaryKey{partitionDeleteData.DeletePks.Get(0), allPartitionDeleteData.DeletePks.Get(0)})
 	bfs := pkoracle.NewBloomFilterSet(3, l0.Partition(), commonpb.SegmentState_Sealed)
-	bfs.UpdateBloomFilter(allPartitionDeleteData.Pks)
+	bfs.UpdateBloomFilter([]storage.PrimaryKey{allPartitionDeleteData.DeletePks.Get(0)})
 	pks, _ = delegator.GetLevel0Deletions(partitionID, bfs)
 	// bf filtered segment
 	s.Equal(len(pks), 1)
-	s.True(pks[0].EQ(allPartitionDeleteData.Pks[0]))
+	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
 	delegator.segmentManager.Remove(context.TODO(), l0.ID(), querypb.DataScope_All)
 	pks, _ = delegator.GetLevel0Deletions(partitionID, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
-	s.True(pks[0].EQ(allPartitionDeleteData.Pks[0]))
+	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
 	pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))
-	s.True(pks[0].EQ(allPartitionDeleteData.Pks[0]))
+	s.True(pks[0].EQ(allPartitionDeleteData.DeletePks.Get(0)))
 	delegator.segmentManager.Remove(context.TODO(), l0Global.ID(), querypb.DataScope_All)
 	pks, _ = delegator.GetLevel0Deletions(partitionID+1, pkoracle.NewCandidateKey(l0.ID(), l0.Partition(), segments.SegmentTypeGrowing))

View File

@@ -822,7 +822,7 @@ func (_c *MockSegment_Level_Call) RunAndReturn(run func() datapb.SegmentLevel) *
 }
 // LoadDeltaData provides a mock function with given fields: ctx, deltaData
-func (_m *MockSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeleteData) error {
+func (_m *MockSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error {
 	ret := _m.Called(ctx, deltaData)
 	if len(ret) == 0 {
@@ -830,7 +830,7 @@ func (_m *MockSegment) LoadDeltaData(ctx context.Context, deltaData *storage.Del
 	}
 	var r0 error
-	if rf, ok := ret.Get(0).(func(context.Context, *storage.DeleteData) error); ok {
+	if rf, ok := ret.Get(0).(func(context.Context, *storage.DeltaData) error); ok {
 		r0 = rf(ctx, deltaData)
 	} else {
 		r0 = ret.Error(0)
@@ -846,14 +846,14 @@ type MockSegment_LoadDeltaData_Call struct {
 // LoadDeltaData is a helper method to define mock.On call
 //   - ctx context.Context
-//   - deltaData *storage.DeleteData
+//   - deltaData *storage.DeltaData
 func (_e *MockSegment_Expecter) LoadDeltaData(ctx interface{}, deltaData interface{}) *MockSegment_LoadDeltaData_Call {
 	return &MockSegment_LoadDeltaData_Call{Call: _e.mock.On("LoadDeltaData", ctx, deltaData)}
 }
-func (_c *MockSegment_LoadDeltaData_Call) Run(run func(ctx context.Context, deltaData *storage.DeleteData)) *MockSegment_LoadDeltaData_Call {
+func (_c *MockSegment_LoadDeltaData_Call) Run(run func(ctx context.Context, deltaData *storage.DeltaData)) *MockSegment_LoadDeltaData_Call {
 	_c.Call.Run(func(args mock.Arguments) {
-		run(args[0].(context.Context), args[1].(*storage.DeleteData))
+		run(args[0].(context.Context), args[1].(*storage.DeltaData))
 	})
 	return _c
 }
@@ -863,7 +863,7 @@ func (_c *MockSegment_LoadDeltaData_Call) Return(_a0 error) *MockSegment_LoadDel
 	return _c
 }
-func (_c *MockSegment_LoadDeltaData_Call) RunAndReturn(run func(context.Context, *storage.DeleteData) error) *MockSegment_LoadDeltaData_Call {
+func (_c *MockSegment_LoadDeltaData_Call) RunAndReturn(run func(context.Context, *storage.DeltaData) error) *MockSegment_LoadDeltaData_Call {
 	_c.Call.Return(run)
 	return _c
 }

View File

@@ -1018,9 +1018,9 @@ func (s *LocalSegment) AddFieldDataInfo(ctx context.Context, rowCount int64, fie
 	return nil
 }
-func (s *LocalSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeleteData) error {
-	pks, tss := deltaData.Pks, deltaData.Tss
-	rowNum := deltaData.RowCount
+func (s *LocalSegment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error {
+	pks, tss := deltaData.DeletePks, deltaData.DeleteTimestamps
+	rowNum := deltaData.DelRowCount
 	if !s.ptrLock.RLockIf(state.IsNotReleased) {
 		return merr.WrapErrSegmentNotLoaded(s.ID(), "segment released")
@@ -1033,31 +1033,9 @@ func (s *LocalSegment) LoadDeltaData(ctx context.Context, deltaData *storage.Del
 		zap.Int64("segmentID", s.ID()),
 	)
-	pkType := pks[0].Type()
-	ids := &schemapb.IDs{}
-	switch pkType {
-	case schemapb.DataType_Int64:
-		int64Pks := make([]int64, len(pks))
-		for index, pk := range pks {
-			int64Pks[index] = pk.(*storage.Int64PrimaryKey).Value
-		}
-		ids.IdField = &schemapb.IDs_IntId{
-			IntId: &schemapb.LongArray{
-				Data: int64Pks,
-			},
-		}
-	case schemapb.DataType_VarChar:
-		varCharPks := make([]string, len(pks))
-		for index, pk := range pks {
-			varCharPks[index] = pk.(*storage.VarCharPrimaryKey).Value
-		}
-		ids.IdField = &schemapb.IDs_StrId{
-			StrId: &schemapb.StringArray{
-				Data: varCharPks,
-			},
-		}
-	default:
-		return fmt.Errorf("invalid data type of primary keys")
+	ids, err := storage.ParsePrimaryKeysBatch2IDs(pks)
+	if err != nil {
+		return err
 	}
 	idsBlob, err := proto.Marshal(ids)

View File

@@ -78,7 +78,7 @@ type Segment interface {
 	// Modification related
 	Insert(ctx context.Context, rowIDs []int64, timestamps []typeutil.Timestamp, record *segcorepb.InsertRecord) error
 	Delete(ctx context.Context, primaryKeys []storage.PrimaryKey, timestamps []typeutil.Timestamp) error
-	LoadDeltaData(ctx context.Context, deltaData *storage.DeleteData) error
+	LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error
 	LastDeltaTimestamp() uint64
 	Release(ctx context.Context, opts ...releaseOption)

View File

@@ -151,12 +151,14 @@ func (s *L0Segment) Delete(ctx context.Context, primaryKeys []storage.PrimaryKey
 	return merr.WrapErrIoFailedReason("delete not supported for L0 segment")
 }
-func (s *L0Segment) LoadDeltaData(ctx context.Context, deltaData *storage.DeleteData) error {
+func (s *L0Segment) LoadDeltaData(ctx context.Context, deltaData *storage.DeltaData) error {
 	s.dataGuard.Lock()
 	defer s.dataGuard.Unlock()
-	s.pks = append(s.pks, deltaData.Pks...)
-	s.tss = append(s.tss, deltaData.Tss...)
+	for i := 0; i < deltaData.DeletePks.Len(); i++ {
+		s.pks = append(s.pks, deltaData.DeletePks.Get(i))
+	}
+	s.tss = append(s.tss, deltaData.DeleteTimestamps...)
 	return nil
 }

View File

@@ -1207,9 +1207,23 @@ func (loader *segmentLoader) LoadDeltaLogs(ctx context.Context, segment Segment,
 	rowNums := lo.SumBy(blobs, func(blob *storage.Blob) int64 {
 		return blob.RowNum
 	})
-	deltaData := &storage.DeleteData{
-		Pks: make([]storage.PrimaryKey, 0, rowNums),
-		Tss: make([]uint64, 0, rowNums),
+	var deltaData *storage.DeltaData
+	collection := loader.manager.Collection.Get(segment.Collection())
+	helper, _ := typeutil.CreateSchemaHelper(collection.Schema())
+	pkField, _ := helper.GetPrimaryKeyField()
+	switch pkField.DataType {
+	case schemapb.DataType_Int64:
+		deltaData = &storage.DeltaData{
+			DeletePks:        storage.NewInt64PrimaryKeys(int(rowNums)),
+			DeleteTimestamps: make([]uint64, 0, rowNums),
+		}
+	case schemapb.DataType_VarChar:
+		deltaData = &storage.DeltaData{
+			DeletePks:        storage.NewVarcharPrimaryKeys(int(rowNums)),
+			DeleteTimestamps: make([]uint64, 0, rowNums),
+		}
 	}
 	reader, err := storage.CreateDeltalogReader(blobs)
@@ -1226,7 +1240,9 @@ func (loader *segmentLoader) LoadDeltaLogs(ctx context.Context, segment Segment,
 			return err
 		}
 		dl := reader.Value()
-		deltaData.Append(dl.Pk, dl.Ts)
+		deltaData.DeletePks.MustAppend(dl.Pk)
+		deltaData.DeleteTimestamps = append(deltaData.DeleteTimestamps, dl.Ts)
+		deltaData.DelRowCount++
 	}
 	err = segment.LoadDeltaData(ctx, deltaData)
@@ -1234,7 +1250,7 @@ func (loader *segmentLoader) LoadDeltaLogs(ctx context.Context, segment Segment,
 		return err
 	}
-	log.Info("load delta logs done", zap.Int64("deleteCount", deltaData.RowCount))
+	log.Info("load delta logs done", zap.Int64("deleteCount", deltaData.DelRowCount))
 	return nil
 }

View File

@@ -34,14 +34,14 @@ var parserPool = &fastjson.ParserPool{}
 // DeltaData stores delta data
 // currently only delete tuples are stored
 type DeltaData struct {
-	pkType schemapb.DataType
+	PkType schemapb.DataType
 	// delete tuples
-	delPks PrimaryKeys
-	delTss []Timestamp
+	DeletePks        PrimaryKeys
+	DeleteTimestamps []Timestamp
 	// stats
-	delRowCount int64
-	memSize     int64
+	DelRowCount int64
+	MemSize     int64
 }
 type DeleteLog struct {

View File

@@ -23,6 +23,7 @@ import (
 	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
 	"github.com/milvus-io/milvus/pkg/log"
+	"github.com/milvus-io/milvus/pkg/util/merr"
 )
 type PrimaryKey interface {
@@ -350,6 +351,33 @@ func ParseIDs2PrimaryKeys(ids *schemapb.IDs) []PrimaryKey {
 	return ret
 }
+func ParsePrimaryKeysBatch2IDs(pks PrimaryKeys) (*schemapb.IDs, error) {
+	ret := &schemapb.IDs{}
+	if pks.Len() == 0 {
+		return ret, nil
+	}
+	switch pks.Type() {
+	case schemapb.DataType_Int64:
+		int64Pks := pks.(*Int64PrimaryKeys)
+		ret.IdField = &schemapb.IDs_IntId{
+			IntId: &schemapb.LongArray{
+				Data: int64Pks.values,
+			},
+		}
+	case schemapb.DataType_VarChar:
+		varcharPks := pks.(*VarcharPrimaryKeys)
+		ret.IdField = &schemapb.IDs_StrId{
+			StrId: &schemapb.StringArray{
+				Data: varcharPks.values,
+			},
+		}
+	default:
+		return nil, merr.WrapErrServiceInternal("parsing unsupported pk type", pks.Type().String())
+	}
+	return ret, nil
+}
 func ParsePrimaryKeys2IDs(pks []PrimaryKey) *schemapb.IDs {
 	ret := &schemapb.IDs{}
 	if len(pks) == 0 {

View File

@@ -177,3 +177,37 @@ func TestParsePrimaryKeysAndIDs(t *testing.T) {
 		assert.ElementsMatch(t, c.pks, testPks)
 	}
 }
+type badPks struct {
+	PrimaryKeys
+}
+func (pks *badPks) Type() schemapb.DataType {
+	return schemapb.DataType_None
+}
+func TestParsePrimaryKeysBatch2IDs(t *testing.T) {
+	t.Run("success_cases", func(t *testing.T) {
+		intPks := NewInt64PrimaryKeys(3)
+		intPks.AppendRaw(1, 2, 3)
+		ids, err := ParsePrimaryKeysBatch2IDs(intPks)
+		assert.NoError(t, err)
+		assert.ElementsMatch(t, []int64{1, 2, 3}, ids.GetIntId().GetData())
+		strPks := NewVarcharPrimaryKeys(3)
+		strPks.AppendRaw("1", "2", "3")
+		ids, err = ParsePrimaryKeysBatch2IDs(strPks)
+		assert.NoError(t, err)
+		assert.ElementsMatch(t, []string{"1", "2", "3"}, ids.GetStrId().GetData())
+	})
+	t.Run("unsupport_type", func(t *testing.T) {
+		intPks := NewInt64PrimaryKeys(3)
+		intPks.AppendRaw(1, 2, 3)
+		_, err := ParsePrimaryKeysBatch2IDs(&badPks{PrimaryKeys: intPks})
+		assert.Error(t, err)
+	})
+}