From 692dcebac64eabbee67a1defecc421102b3bb997 Mon Sep 17 00:00:00 2001 From: smellthemoon <64083300+smellthemoon@users.noreply.github.com> Date: Fri, 2 Feb 2024 16:11:08 +0800 Subject: [PATCH] =?UTF-8?q?enhance:=20support=20varchar=20autoid=20when=20?= =?UTF-8?q?bulkinsert=EF=BC=88#30377=EF=BC=89=20(#30448)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit related pr: #30377 Signed-off-by: lixinguo Co-authored-by: lixinguo --- internal/util/importutil/import_util.go | 34 ++++++++----- internal/util/importutil/numpy_parser_test.go | 49 ++++++++++++++++++- 2 files changed, 70 insertions(+), 13 deletions(-) diff --git a/internal/util/importutil/import_util.go b/internal/util/importutil/import_util.go index 58b2849436..aad7c22527 100644 --- a/internal/util/importutil/import_util.go +++ b/internal/util/importutil/import_util.go @@ -1038,20 +1038,30 @@ func splitFieldsData(collectionInfo *CollectionInfo, fieldsData BlockData, shard autoIDRange := make([]int64, 0) if primaryKey.GetAutoID() { log.Info("generating auto-id", zap.Int("rowCount", rowCount), zap.Int64("rowIDBegin", rowIDBegin)) - if primaryKey.GetDataType() != schemapb.DataType_Int64 { - log.Warn("primary key field is auto-generated but the field type is not int64") - return nil, fmt.Errorf("primary key field is auto-generated but the field type is not int64") - } + if primaryKey.GetDataType() == schemapb.DataType_Int64 { + primaryDataArr := &storage.Int64FieldData{ + Data: make([]int64, 0, rowCount), + } + for i := rowIDBegin; i < rowIDEnd; i++ { + primaryDataArr.Data = append(primaryDataArr.Data, i) + } - primaryDataArr := &storage.Int64FieldData{ - Data: make([]int64, 0, rowCount), - } - for i := rowIDBegin; i < rowIDEnd; i++ { - primaryDataArr.Data = append(primaryDataArr.Data, i) - } + fieldsData[primaryKey.GetFieldID()] = primaryDataArr + autoIDRange = append(autoIDRange, rowIDBegin, rowIDEnd) + } else if primaryKey.GetDataType() == schemapb.DataType_VarChar { + primaryDataArr := &storage.StringFieldData{ + Data: make([]string, 0, rowCount), + } + for i := rowIDBegin; i < rowIDEnd; i++ { + primaryDataArr.Data = append(primaryDataArr.Data, strconv.FormatInt(i, 10)) + } - fieldsData[primaryKey.GetFieldID()] = primaryDataArr - autoIDRange = append(autoIDRange, rowIDBegin, rowIDEnd) + fieldsData[primaryKey.GetFieldID()] = primaryDataArr + autoIDRange = append(autoIDRange, rowIDBegin, rowIDEnd) + } else { + log.Warn("unsupported primary key type", zap.Int("type", int(primaryKey.GetDataType()))) + return nil, merr.WrapErrParameterInvalidMsg(fmt.Sprintf("unsupported primary key type %d, primary key should be int64 or varchar", primaryKey.GetDataType())) + } } // if the primary key is not auto-gernerate and user doesn't provide, return error diff --git a/internal/util/importutil/numpy_parser_test.go b/internal/util/importutil/numpy_parser_test.go index 656831395b..4d0aca5b1d 100644 --- a/internal/util/importutil/numpy_parser_test.go +++ b/internal/util/importutil/numpy_parser_test.go @@ -30,6 +30,7 @@ import ( "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/internal/storage" "github.com/milvus-io/milvus/pkg/common" + "github.com/milvus-io/milvus/pkg/util/merr" "github.com/milvus-io/milvus/pkg/util/timerecord" ) @@ -879,7 +880,7 @@ func Test_NumpyParserSplitFieldsData(t *testing.T) { parser.rowIDAllocator = newIDAllocator(ctx, t, nil) }) - t.Run("primary key auto-generated", func(t *testing.T) { + t.Run("int64 primary key auto-generated", func(t *testing.T) { parser.collectionInfo.resetSchema(createNumpySchema()) schema := findSchema(parser.collectionInfo.Schema, schemapb.DataType_Int64) schema.AutoID = true @@ -906,6 +907,52 @@ func Test_NumpyParserSplitFieldsData(t *testing.T) { schema.AutoID = false }) + t.Run("varchar primary key auto-generated", func(t *testing.T) { + parser.collectionInfo.resetSchema(createNumpySchema()) + schema := findSchema(parser.collectionInfo.Schema, schemapb.DataType_Int64) + schema.IsPartitionKey = false + schema = findSchema(parser.collectionInfo.Schema, schemapb.DataType_VarChar) + schema.AutoID = true + parser.collectionInfo.PrimaryKey = schema + + partitionID := int64(1) + fieldsData := createFieldsData(sampleSchema(), 0, baseTimestamp) + shards := createShardsData(sampleSchema(), fieldsData, 2, []int64{partitionID}) + segmentData := genFieldsDataFunc() + parser.autoIDRange, err = splitFieldsData(parser.collectionInfo, segmentData, shards, parser.rowIDAllocator) + assert.NoError(t, err) + assert.NotEmpty(t, parser.autoIDRange) + + totalNum := 0 + for i := 0; i < int(parser.collectionInfo.ShardNum); i++ { + totalNum += shards[i][partitionID][106].RowNum() + } + assert.Equal(t, segmentData[106].RowNum(), totalNum) + + // target field data is nil + shards[0][partitionID][105] = nil + parser.autoIDRange, err = splitFieldsData(parser.collectionInfo, segmentData, shards, parser.rowIDAllocator) + assert.Error(t, err) + + schema.AutoID = false + }) + + t.Run("not support primary key type auto-generated", func(t *testing.T) { + parser.collectionInfo.resetSchema(createNumpySchema()) + schema := findSchema(parser.collectionInfo.Schema, schemapb.DataType_Int64) + schema.IsPartitionKey = false + schema = findSchema(parser.collectionInfo.Schema, schemapb.DataType_Bool) + schema.AutoID = true + parser.collectionInfo.PrimaryKey = schema + + partitionID := int64(1) + fieldsData := createFieldsData(sampleSchema(), 0, baseTimestamp) + shards := createShardsData(sampleSchema(), fieldsData, 2, []int64{partitionID}) + segmentData := genFieldsDataFunc() + parser.autoIDRange, err = splitFieldsData(parser.collectionInfo, segmentData, shards, parser.rowIDAllocator) + assert.ErrorIs(t, err, merr.ErrParameterInvalid) + }) + t.Run("has dynamic field", func(t *testing.T) { schema := &schemapb.CollectionSchema{ Name: "schema",