mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-28 22:45:26 +08:00
issue: #46352

- Add GenerateChannelBalancedPrimaryKeys function supporting Int64 and VarChar PK types with even distribution across channels
- Add GenerateBalancedInt64PKs and GenerateBalancedVarCharPKs helper functions using murmur3 and crc32 hash algorithms respectively
- Add PrimaryKeyConfig struct to configure PK generation in InsertAndFlush
- Update InsertAndFlush to use channel balanced PKs instead of random hash keys for better test coverage
- Add comprehensive unit tests including end-to-end validation with typeutil.HashPK2Channels to verify exact channel distribution

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

* **Tests**
  * Enhanced integration tests with configurable primary-key sequencing across insert batches.
  * Added utilities to generate channel-balanced primary keys for integer and string types.
  * Expanded test coverage validating balanced distribution, uniqueness, continuation, and large-scale behavior across multiple channels.

<sub>✏️ Tip: You can customize this high-level summary in your review settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: wayblink <anyang.wang@zilliz.com>
Signed-off-by: Wei Liu <wei.liu@zilliz.com>
440 lines
13 KiB
Go
440 lines
13 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package integration
|
|
|
|
import (
|
|
"context"
|
|
"encoding/binary"
|
|
"fmt"
|
|
"hash/crc32"
|
|
"time"
|
|
|
|
"github.com/spaolacci/murmur3"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/testutils"
|
|
)
|
|
|
|
func (s *MiniClusterSuite) WaitForFlush(ctx context.Context, segIDs []int64, flushTs uint64, dbName, collectionName string) {
|
|
flushed := func() bool {
|
|
resp, err := s.Cluster.MilvusClient.GetFlushState(ctx, &milvuspb.GetFlushStateRequest{
|
|
SegmentIDs: segIDs,
|
|
FlushTs: flushTs,
|
|
DbName: dbName,
|
|
CollectionName: collectionName,
|
|
})
|
|
if err != nil {
|
|
return false
|
|
}
|
|
return resp.GetFlushed()
|
|
}
|
|
for !flushed() {
|
|
select {
|
|
case <-ctx.Done():
|
|
s.FailNow("failed to wait for flush until ctx done")
|
|
return
|
|
default:
|
|
time.Sleep(500 * time.Millisecond)
|
|
}
|
|
}
|
|
}
|
|
|
|
func NewInt64FieldData(fieldName string, numRows int) *schemapb.FieldData {
|
|
return &schemapb.FieldData{
|
|
Type: schemapb.DataType_Int64,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_Scalars{
|
|
Scalars: &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_LongData{
|
|
LongData: &schemapb.LongArray{
|
|
Data: GenerateInt64Array(numRows, 0),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func NewInt64FieldDataWithStart(fieldName string, numRows int, start int64) *schemapb.FieldData {
|
|
return &schemapb.FieldData{
|
|
Type: schemapb.DataType_Int64,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_Scalars{
|
|
Scalars: &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_LongData{
|
|
LongData: &schemapb.LongArray{
|
|
Data: GenerateInt64Array(numRows, start),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func NewInt64FieldDataNullableWithStart(fieldName string, numRows, start int) *schemapb.FieldData {
|
|
validData, num := GenerateBoolArray(numRows)
|
|
return &schemapb.FieldData{
|
|
Type: schemapb.DataType_Int64,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_Scalars{
|
|
Scalars: &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_LongData{
|
|
LongData: &schemapb.LongArray{
|
|
Data: GenerateInt64Array(num, int64(start)),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
ValidData: validData,
|
|
}
|
|
}
|
|
|
|
func NewInt64SameFieldData(fieldName string, numRows int, value int64) *schemapb.FieldData {
|
|
return &schemapb.FieldData{
|
|
Type: schemapb.DataType_Int64,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_Scalars{
|
|
Scalars: &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_LongData{
|
|
LongData: &schemapb.LongArray{
|
|
Data: GenerateSameInt64Array(numRows, value),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func NewVarCharSameFieldData(fieldName string, numRows int, value string) *schemapb.FieldData {
|
|
return &schemapb.FieldData{
|
|
Type: schemapb.DataType_String,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_Scalars{
|
|
Scalars: &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_StringData{
|
|
StringData: &schemapb.StringArray{
|
|
Data: GenerateSameStringArray(numRows, value),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func NewVarCharFieldData(fieldName string, numRows int, nullable bool) *schemapb.FieldData {
|
|
numValid := numRows
|
|
if nullable {
|
|
numValid = numRows / 2
|
|
}
|
|
return &schemapb.FieldData{
|
|
Type: schemapb.DataType_String,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_Scalars{
|
|
Scalars: &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_StringData{
|
|
StringData: &schemapb.StringArray{
|
|
Data: testutils.GenerateStringArray(numValid),
|
|
// Data: testutils.GenerateStringArray(numRows),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
ValidData: testutils.GenerateBoolArray(numRows),
|
|
}
|
|
}
|
|
|
|
func NewStringFieldData(fieldName string, numRows int) *schemapb.FieldData {
|
|
return testutils.NewStringFieldData(fieldName, numRows)
|
|
}
|
|
|
|
// note: unlike testutils's NewGeometryFieldData ,integration's NewGeometryFieldData generate wkt string bytes
|
|
func NewGeometryFieldData(fieldName string, numRows int) *schemapb.FieldData {
|
|
return testutils.NewGeometryFieldDataWktFormat(fieldName, numRows)
|
|
}
|
|
|
|
func NewFloatVectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
|
|
return testutils.NewFloatVectorFieldData(fieldName, numRows, dim)
|
|
}
|
|
|
|
func NewFloat16VectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
|
|
return testutils.NewFloat16VectorFieldData(fieldName, numRows, dim)
|
|
}
|
|
|
|
func NewBFloat16VectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
|
|
return testutils.NewBFloat16VectorFieldData(fieldName, numRows, dim)
|
|
}
|
|
|
|
func NewBinaryVectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
|
|
return testutils.NewBinaryVectorFieldData(fieldName, numRows, dim)
|
|
}
|
|
|
|
func NewSparseFloatVectorFieldData(fieldName string, numRows int) *schemapb.FieldData {
|
|
return testutils.NewSparseFloatVectorFieldData(fieldName, numRows)
|
|
}
|
|
|
|
func NewInt8VectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
|
|
return testutils.NewInt8VectorFieldData(fieldName, numRows, dim)
|
|
}
|
|
|
|
func NewStructArrayFieldData(schema *schemapb.StructArrayFieldSchema, fieldName string, numRow int, dim int) *schemapb.FieldData {
|
|
fieldData := &schemapb.FieldData{
|
|
Type: schemapb.DataType_ArrayOfStruct,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_StructArrays{
|
|
StructArrays: &schemapb.StructArrayField{
|
|
Fields: testutils.GenerateArrayOfStructArray(schema, numRow, dim),
|
|
},
|
|
},
|
|
}
|
|
return fieldData
|
|
}
|
|
|
|
// GenerateInt64Array returns a slice of numRows consecutive integers:
// start, start+1, ..., start+numRows-1.
func GenerateInt64Array(numRows int, start int64) []int64 {
	values := make([]int64, numRows)
	for idx := range values {
		values[idx] = start + int64(idx)
	}
	return values
}
|
|
|
|
// GenerateSameInt64Array returns a slice of length numRows in which every
// element equals value.
func GenerateSameInt64Array(numRows int, value int64) []int64 {
	values := make([]int64, numRows)
	for idx := range values {
		values[idx] = value
	}
	return values
}
|
|
|
|
// GenerateBoolArray returns a slice of numRows booleans that is true at even
// indices and false at odd ones, together with the number of true entries.
func GenerateBoolArray(numRows int) ([]bool, int) {
	flags := make([]bool, numRows)
	trueCount := 0
	for idx := range flags {
		if idx%2 != 0 {
			// Odd positions keep the zero value (false).
			continue
		}
		flags[idx] = true
		trueCount++
	}
	return flags, trueCount
}
|
|
|
|
// GenerateSameStringArray returns a slice of length numRows in which every
// element equals value.
func GenerateSameStringArray(numRows int, value string) []string {
	values := make([]string, numRows)
	for idx := range values {
		values[idx] = value
	}
	return values
}
|
|
|
|
func GenerateSparseFloatArray(numRows int) *schemapb.SparseFloatArray {
|
|
return testutils.GenerateSparseFloatVectors(numRows)
|
|
}
|
|
|
|
func GenerateHashKeys(numRows int) []uint32 {
|
|
return testutils.GenerateHashKeys(numRows)
|
|
}
|
|
|
|
// GenerateChannelBalancedPrimaryKeys generates primary keys that are evenly distributed across channels.
|
|
// It supports both Int64 and VarChar primary key types.
|
|
// For Int64: uses murmur3 hash (same as typeutil.Hash32Int64)
|
|
// For VarChar: uses crc32 hash (same as typeutil.HashString2Uint32)
|
|
// startPK specifies where to begin searching for PKs.
|
|
// Returns the FieldData and the next startPK for subsequent calls.
|
|
func GenerateChannelBalancedPrimaryKeys(fieldName string, fieldType schemapb.DataType, numRows int, numChannels int, startPK int64) (*schemapb.FieldData, int64) {
|
|
switch fieldType {
|
|
case schemapb.DataType_Int64:
|
|
pks, nextPK := GenerateBalancedInt64PKs(numRows, numChannels, startPK)
|
|
return &schemapb.FieldData{
|
|
Type: schemapb.DataType_Int64,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_Scalars{
|
|
Scalars: &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_LongData{
|
|
LongData: &schemapb.LongArray{
|
|
Data: pks,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}, nextPK
|
|
case schemapb.DataType_VarChar, schemapb.DataType_String:
|
|
pks, nextIndex := GenerateBalancedVarCharPKs(numRows, numChannels, int(startPK))
|
|
return &schemapb.FieldData{
|
|
Type: schemapb.DataType_VarChar,
|
|
FieldName: fieldName,
|
|
Field: &schemapb.FieldData_Scalars{
|
|
Scalars: &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_StringData{
|
|
StringData: &schemapb.StringArray{
|
|
Data: pks,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}, int64(nextIndex)
|
|
default:
|
|
panic(fmt.Sprintf("not supported primary key type: %s", fieldType))
|
|
}
|
|
}
|
|
|
|
// GenerateBalancedInt64PKs generates int64 primary keys that are evenly distributed across channels.
|
|
// This ensures each channel receives exactly numRows/numChannels items based on PK hash values.
|
|
// The function searches for PKs that hash to each channel to achieve exact distribution.
|
|
// startPK specifies where to begin searching for PKs.
|
|
// Returns the generated PKs and the next startPK for subsequent calls.
|
|
func GenerateBalancedInt64PKs(numRows int, numChannels int, startPK int64) ([]int64, int64) {
|
|
if numChannels <= 0 {
|
|
numChannels = 1
|
|
}
|
|
|
|
// Calculate how many items each channel should receive
|
|
baseCount := numRows / numChannels
|
|
remainder := numRows % numChannels
|
|
|
|
// Collect PKs for each channel
|
|
channelPKs := make([][]int64, numChannels)
|
|
targetCounts := make([]int, numChannels)
|
|
for ch := 0; ch < numChannels; ch++ {
|
|
targetCounts[ch] = baseCount
|
|
if ch < remainder {
|
|
targetCounts[ch]++
|
|
}
|
|
channelPKs[ch] = make([]int64, 0, targetCounts[ch])
|
|
}
|
|
|
|
// Search for PKs that hash to each channel
|
|
var lastPK int64
|
|
for pk := startPK; ; pk++ {
|
|
lastPK = pk
|
|
// Calculate which channel this PK would go to
|
|
hash := hashInt64ForChannel(pk)
|
|
ch := int(hash % uint32(numChannels))
|
|
|
|
if len(channelPKs[ch]) < targetCounts[ch] {
|
|
channelPKs[ch] = append(channelPKs[ch], pk)
|
|
|
|
// Check if all channels have enough PKs
|
|
done := true
|
|
for ch := 0; ch < numChannels; ch++ {
|
|
if len(channelPKs[ch]) < targetCounts[ch] {
|
|
done = false
|
|
break
|
|
}
|
|
}
|
|
if done {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Combine all PKs
|
|
result := make([]int64, 0, numRows)
|
|
for ch := 0; ch < numChannels; ch++ {
|
|
result = append(result, channelPKs[ch]...)
|
|
}
|
|
|
|
return result, lastPK + 1
|
|
}
|
|
|
|
// hashInt64ForChannel computes the hash value for channel assignment.
|
|
// This mirrors the logic in typeutil.Hash32Int64 and HashPK2Channels.
|
|
func hashInt64ForChannel(v int64) uint32 {
|
|
// Must match the behavior of typeutil.Hash32Int64
|
|
// which uses common.Endian (binary.LittleEndian)
|
|
b := make([]byte, 8)
|
|
binary.LittleEndian.PutUint64(b, uint64(v))
|
|
|
|
// Use murmur3 hash (same as typeutil.Hash32Bytes)
|
|
h := murmur3.New32()
|
|
h.Write(b)
|
|
return h.Sum32() & 0x7fffffff
|
|
}
|
|
|
|
// GenerateBalancedVarCharPKs generates varchar primary keys that are evenly
// distributed across channels.
//
// Candidate keys have the form "pk_<index>", with index scanned upward from
// startIndex. Each candidate is mapped to channel
// hashVarCharForChannel(pk) % numChannels and kept only while that channel is
// still below its quota. Every channel's quota is numRows/numChannels, and
// the first numRows%numChannels channels take one extra key, so the quotas
// sum to exactly numRows.
//
// A non-positive numChannels is treated as 1. A non-positive numRows yields
// an empty slice and returns startIndex unchanged.
//
// The returned keys are grouped by channel (channel 0 first), in increasing
// index order within each channel. The second result is one past the last
// index examined and can be passed as startIndex to a subsequent call.
func GenerateBalancedVarCharPKs(numRows int, numChannels int, startIndex int) ([]string, int) {
	if numChannels <= 0 {
		numChannels = 1
	}
	// With nothing to generate every quota is zero, so the search below would
	// never see a quota being filled and would spin forever (and a negative
	// numRows would make the allocations panic). Bail out early instead.
	if numRows <= 0 {
		return []string{}, startIndex
	}

	// Calculate how many items each channel should receive.
	baseCount := numRows / numChannels
	remainder := numRows % numChannels

	channelPKs := make([][]string, numChannels)
	targetCounts := make([]int, numChannels)
	for ch := range channelPKs {
		targetCounts[ch] = baseCount
		if ch < remainder {
			targetCounts[ch]++
		}
		channelPKs[ch] = make([]string, 0, targetCounts[ch])
	}

	// Search for PKs that hash to each channel. Tracking the number of keys
	// still missing avoids rescanning every channel after each accepted key.
	remaining := numRows
	index := startIndex
	for ; remaining > 0; index++ {
		pk := fmt.Sprintf("pk_%d", index)
		ch := int(hashVarCharForChannel(pk) % uint32(numChannels))
		if len(channelPKs[ch]) < targetCounts[ch] {
			channelPKs[ch] = append(channelPKs[ch], pk)
			remaining--
		}
	}

	// Combine all PKs, channel by channel.
	result := make([]string, 0, numRows)
	for _, pks := range channelPKs {
		result = append(result, pks...)
	}

	// index was advanced once past the last examined candidate by the loop.
	return result, index
}

// hashVarCharForChannel computes the hash value used to assign a varchar
// primary key to a channel.
//
// Only the first 100 bytes of v are hashed, using the IEEE CRC-32 checksum.
// This is intended to mirror typeutil.HashString2Uint32 (which is not visible
// from this file); the two must stay in sync for the balancing to be
// meaningful.
func hashVarCharForChannel(v string) uint32 {
	const maxHashedBytes = 100

	if len(v) > maxHashedBytes {
		v = v[:maxHashedBytes]
	}
	return crc32.ChecksumIEEE([]byte(v))
}
|