fix: csv/json import with STRUCT adapts concatenated struct name (#45000)

After https://github.com/milvus-io/milvus/pull/44557, the field name in
STRUCT field becomes STRUCT_NAME[FIELD_NAME]
This PR makes the import logic account for that change.

issue: https://github.com/milvus-io/milvus/issues/45006
ref: https://github.com/milvus-io/milvus/issues/42148

TODO: Parquet handling is much more complex than CSV/JSON, so it will be
addressed in a separate PR.

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
This commit is contained in:
Spade A 2025-10-24 10:22:15 +08:00 committed by GitHub
parent 6494c75d31
commit d8591f9548
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 58 additions and 37 deletions

View File

@ -1751,7 +1751,7 @@ func TestProxy(t *testing.T) {
assert.Equal(t, commonpb.ErrorCode_Success, resp.ErrorCode) assert.Equal(t, commonpb.ErrorCode_Success, resp.ErrorCode)
}) })
fieldName := ConcatStructFieldName(structField, subFieldFVec) fieldName := typeutil.ConcatStructFieldName(structField, subFieldFVec)
wg.Add(1) wg.Add(1)
t.Run("create index for embedding list field", func(t *testing.T) { t.Run("create index for embedding list field", func(t *testing.T) {
defer wg.Done() defer wg.Done()

View File

@ -83,10 +83,6 @@ const (
var logger = log.L().WithOptions(zap.Fields(zap.String("role", typeutil.ProxyRole))) var logger = log.L().WithOptions(zap.Fields(zap.String("role", typeutil.ProxyRole)))
func ConcatStructFieldName(structName string, fieldName string) string {
return fmt.Sprintf("%s[%s]", structName, fieldName)
}
// transformStructFieldNames transforms struct field names to structName[fieldName] format // transformStructFieldNames transforms struct field names to structName[fieldName] format
// This ensures global uniqueness while allowing same field names across different structs // This ensures global uniqueness while allowing same field names across different structs
func transformStructFieldNames(schema *schemapb.CollectionSchema) error { func transformStructFieldNames(schema *schemapb.CollectionSchema) error {
@ -94,7 +90,7 @@ func transformStructFieldNames(schema *schemapb.CollectionSchema) error {
structName := structArrayField.Name structName := structArrayField.Name
for _, field := range structArrayField.Fields { for _, field := range structArrayField.Fields {
// Create transformed name: structName[fieldName] // Create transformed name: structName[fieldName]
newName := ConcatStructFieldName(structName, field.Name) newName := typeutil.ConcatStructFieldName(structName, field.Name)
field.Name = newName field.Name = newName
} }
} }
@ -1835,7 +1831,7 @@ func checkAndFlattenStructFieldData(schema *schemapb.CollectionSchema, insertMsg
structName, expectedArrayLen, currentArrayLen, subField.FieldName) structName, expectedArrayLen, currentArrayLen, subField.FieldName)
} }
transformedFieldName := ConcatStructFieldName(structName, subField.FieldName) transformedFieldName := typeutil.ConcatStructFieldName(structName, subField.FieldName)
subFieldCopy := &schemapb.FieldData{ subFieldCopy := &schemapb.FieldData{
FieldName: transformedFieldName, FieldName: transformedFieldName,
FieldId: subField.FieldId, FieldId: subField.FieldId,

View File

@ -159,7 +159,7 @@ func NewRowParser(schema *schemapb.CollectionSchema, header []string, nullkey st
// we reconstruct it to be handled by handleField as: // we reconstruct it to be handled by handleField as:
// //
// {"sub-field1": "[1, 2]", "sub-field2": "[[1.0, 2.0], [3.0, 4.0]]"} // {"sub-field1": "[1, 2]", "sub-field2": "[[1.0, 2.0], [3.0, 4.0]]"}
func (r *rowParser) reconstructArrayForStructArray(subFieldsMap map[string]*schemapb.FieldSchema, raw string) (map[string]string, error) { func (r *rowParser) reconstructArrayForStructArray(structName string, subFieldsMap map[string]*schemapb.FieldSchema, raw string) (map[string]string, error) {
// Parse the JSON array string // Parse the JSON array string
var rows []any var rows []any
dec := json.NewDecoder(strings.NewReader(raw)) dec := json.NewDecoder(strings.NewReader(raw))
@ -175,20 +175,21 @@ func (r *rowParser) reconstructArrayForStructArray(subFieldsMap map[string]*sche
return nil, merr.WrapErrImportFailed(fmt.Sprintf("invalid element in StructArray, expect map[string]any but got type %T", elem)) return nil, merr.WrapErrImportFailed(fmt.Sprintf("invalid element in StructArray, expect map[string]any but got type %T", elem))
} }
for key, value := range row { for key, value := range row {
field, ok := subFieldsMap[key] fieldName := typeutil.ConcatStructFieldName(structName, key)
field, ok := subFieldsMap[fieldName]
if !ok { if !ok {
return nil, merr.WrapErrImportFailed(fmt.Sprintf("field %s not found", key)) return nil, merr.WrapErrImportFailed(fmt.Sprintf("field %s not found", fieldName))
} }
strVal, ok := value.(string) strVal, ok := value.(string)
if !ok { if !ok {
return nil, merr.WrapErrImportFailed(fmt.Sprintf("invalid value type for field %s, expect string but got %T", key, value)) return nil, merr.WrapErrImportFailed(fmt.Sprintf("invalid value type for field %s, expect string but got %T", fieldName, value))
} }
data, err := r.parseEntity(field, strVal, true) data, err := r.parseEntity(field, strVal, true)
if err != nil { if err != nil {
return nil, err return nil, err
} }
buf[key] = append(buf[key], data) buf[fieldName] = append(buf[fieldName], data)
} }
} }
@ -215,7 +216,7 @@ func (r *rowParser) Parse(strArr []string) (Row, error) {
// read values from csv file // read values from csv file
for index, value := range strArr { for index, value := range strArr {
if subFieldsMap, ok := r.structArrays[r.header[index]]; ok { if subFieldsMap, ok := r.structArrays[r.header[index]]; ok {
values, err := r.reconstructArrayForStructArray(subFieldsMap, value) values, err := r.reconstructArrayForStructArray(r.header[index], subFieldsMap, value)
if err != nil { if err != nil {
return nil, err return nil, err
} }

View File

@ -75,7 +75,7 @@ func (suite *RowParserSuite) createAllTypesSchema() *schemapb.CollectionSchema {
Fields: []*schemapb.FieldSchema{ Fields: []*schemapb.FieldSchema{
{ {
FieldID: 111, FieldID: 111,
Name: "sub_float_vector", Name: "struct_array[sub_float_vector]",
DataType: schemapb.DataType_ArrayOfVector, DataType: schemapb.DataType_ArrayOfVector,
ElementType: schemapb.DataType_FloatVector, ElementType: schemapb.DataType_FloatVector,
TypeParams: []*commonpb.KeyValuePair{ TypeParams: []*commonpb.KeyValuePair{
@ -91,7 +91,7 @@ func (suite *RowParserSuite) createAllTypesSchema() *schemapb.CollectionSchema {
}, },
{ {
FieldID: 112, FieldID: 112,
Name: "sub_str", Name: "struct_array[sub_str]",
DataType: schemapb.DataType_Array, DataType: schemapb.DataType_Array,
ElementType: schemapb.DataType_VarChar, ElementType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{ TypeParams: []*commonpb.KeyValuePair{
@ -649,19 +649,21 @@ func (suite *RowParserSuite) runValid(c *testCase) {
// For each sub-field in the struct array // For each sub-field in the struct array
for _, subField := range structArray.GetFields() { for _, subField := range structArray.GetFields() {
subFieldName := subField.GetName()
originalSubFieldName := subFieldName[len(structArray.GetName())+1 : len(subFieldName)-1]
val, ok := row[subField.GetFieldID()] val, ok := row[subField.GetFieldID()]
suite.True(ok, "Sub-field %s should exist in row", subField.GetName()) suite.True(ok, "Sub-field %s should exist in row", subFieldName)
// Validate based on sub-field type // Validate based on sub-field type
switch subField.GetDataType() { switch subField.GetDataType() {
case schemapb.DataType_ArrayOfVector: case schemapb.DataType_ArrayOfVector:
vf, ok := val.(*schemapb.VectorField) vf, ok := val.(*schemapb.VectorField)
suite.True(ok, "Sub-field %s should be a VectorField", subField.GetName()) suite.True(ok, "Sub-field %s should be a VectorField", subFieldName)
// Extract expected vectors from struct array data // Extract expected vectors from struct array data
var expectedVectors [][]float32 var expectedVectors [][]float32
for _, elem := range structArrayData { for _, elem := range structArrayData {
if vecStr, ok := elem[subField.GetName()].(string); ok { if vecStr, ok := elem[originalSubFieldName].(string); ok {
var vec []float32 var vec []float32
err := json.Unmarshal([]byte(vecStr), &vec) err := json.Unmarshal([]byte(vecStr), &vec)
suite.NoError(err) suite.NoError(err)
@ -678,12 +680,12 @@ func (suite *RowParserSuite) runValid(c *testCase) {
case schemapb.DataType_Array: case schemapb.DataType_Array:
sf, ok := val.(*schemapb.ScalarField) sf, ok := val.(*schemapb.ScalarField)
suite.True(ok, "Sub-field %s should be a ScalarField", subField.GetName()) suite.True(ok, "Sub-field %s should be a ScalarField", subFieldName)
// Extract expected values from struct array data // Extract expected values from struct array data
var expectedValues []string var expectedValues []string
for _, elem := range structArrayData { for _, elem := range structArrayData {
if v, ok := elem[subField.GetName()].(string); ok { if v, ok := elem[originalSubFieldName].(string); ok {
expectedValues = append(expectedValues, v) expectedValues = append(expectedValues, v)
} }
} }

View File

@ -178,8 +178,20 @@ func (r *rowParser) Parse(raw any) (Row, error) {
row := make(Row) row := make(Row)
dynamicValues := make(map[string]any) dynamicValues := make(map[string]any)
handleField := func(key string, value any) error { handleField := func(structName string, key string, value any) error {
if fieldID, ok := r.name2FieldID[key]; ok { var fieldID int64
var found bool
if structName != "" {
// Transform to structName[fieldName] format
transformedKey := typeutil.ConcatStructFieldName(structName, key)
fieldID, found = r.name2FieldID[transformedKey]
} else {
// For regular fields, lookup directly
fieldID, found = r.name2FieldID[key]
}
if found {
data, err := r.parseEntity(fieldID, value) data, err := r.parseEntity(fieldID, value)
if err != nil { if err != nil {
return err return err
@ -215,12 +227,14 @@ func (r *rowParser) Parse(raw any) (Row, error) {
} }
for subKey, subValue := range values { for subKey, subValue := range values {
if err := handleField(subKey, subValue); err != nil { // Pass struct name for sub-fields
if err := handleField(key, subKey, subValue); err != nil {
return nil, err return nil, err
} }
} }
} else { } else {
if err := handleField(key, value); err != nil { // Pass empty string for regular fields
if err := handleField("", key, value); err != nil {
return nil, err return nil, err
} }
} }

View File

@ -66,7 +66,7 @@ func (suite *RowParserSuite) createAllTypesSchema() *schemapb.CollectionSchema {
Fields: []*schemapb.FieldSchema{ Fields: []*schemapb.FieldSchema{
{ {
FieldID: 111, FieldID: 111,
Name: "sub_float_vector", Name: "struct_array[sub_float_vector]",
DataType: schemapb.DataType_ArrayOfVector, DataType: schemapb.DataType_ArrayOfVector,
ElementType: schemapb.DataType_FloatVector, ElementType: schemapb.DataType_FloatVector,
TypeParams: []*commonpb.KeyValuePair{ TypeParams: []*commonpb.KeyValuePair{
@ -82,7 +82,7 @@ func (suite *RowParserSuite) createAllTypesSchema() *schemapb.CollectionSchema {
}, },
{ {
FieldID: 112, FieldID: 112,
Name: "sub_str", Name: "struct_array[sub_str]",
DataType: schemapb.DataType_Array, DataType: schemapb.DataType_Array,
ElementType: schemapb.DataType_VarChar, ElementType: schemapb.DataType_VarChar,
TypeParams: []*commonpb.KeyValuePair{ TypeParams: []*commonpb.KeyValuePair{

View File

@ -2456,3 +2456,9 @@ func IsBm25FunctionInputField(coll *schemapb.CollectionSchema, field *schemapb.F
} }
return false return false
} }
// ConcatStructFieldName transforms a struct field name into the
// structName[fieldName] format. This ensures global uniqueness of field
// names while allowing the same field name to appear in different structs.
func ConcatStructFieldName(structName string, fieldName string) string {
	// Plain concatenation is the idiomatic form for a fixed small number of
	// pieces and avoids fmt.Sprintf's format parsing and interface boxing.
	return structName + "[" + fieldName + "]"
}

View File

@ -28,11 +28,11 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/proxy"
"github.com/milvus-io/milvus/pkg/v2/common" "github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/log" "github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/util/merr" "github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/metric" "github.com/milvus-io/milvus/pkg/v2/util/metric"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
"github.com/milvus-io/milvus/tests/integration" "github.com/milvus-io/milvus/tests/integration"
) )
@ -170,7 +170,7 @@ func (s *ArrayStructDataNodeSuite) loadCollection(collectionName string) {
log.Info("=========================Index created for float vector=========================") log.Info("=========================Index created for float vector=========================")
s.WaitForIndexBuilt(context.TODO(), collectionName, integration.FloatVecField) s.WaitForIndexBuilt(context.TODO(), collectionName, integration.FloatVecField)
subFieldName := proxy.ConcatStructFieldName(integration.StructArrayField, integration.StructSubFloatVecField) subFieldName := typeutil.ConcatStructFieldName(integration.StructArrayField, integration.StructSubFloatVecField)
createIndexResult, err := c.MilvusClient.CreateIndex(context.TODO(), &milvuspb.CreateIndexRequest{ createIndexResult, err := c.MilvusClient.CreateIndex(context.TODO(), &milvuspb.CreateIndexRequest{
DbName: dbName, DbName: dbName,
CollectionName: collectionName, CollectionName: collectionName,
@ -317,7 +317,7 @@ func (s *ArrayStructDataNodeSuite) query(collectionName string) {
topk := 10 topk := 10
roundDecimal := -1 roundDecimal := -1
subFieldName := proxy.ConcatStructFieldName(integration.StructArrayField, integration.StructSubFloatVecField) subFieldName := typeutil.ConcatStructFieldName(integration.StructArrayField, integration.StructSubFloatVecField)
params := integration.GetSearchParams(integration.IndexHNSW, metric.MaxSim) params := integration.GetSearchParams(integration.IndexHNSW, metric.MaxSim)
searchReq := integration.ConstructEmbeddingListSearchRequest("", collectionName, expr, searchReq := integration.ConstructEmbeddingListSearchRequest("", collectionName, expr,
subFieldName, schemapb.DataType_FloatVector, []string{integration.StructArrayField}, metric.MaxSim, params, nq, s.dim, topk, roundDecimal) subFieldName, schemapb.DataType_FloatVector, []string{integration.StructArrayField}, metric.MaxSim, params, nq, s.dim, topk, roundDecimal)

View File

@ -27,11 +27,11 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/proxy"
"github.com/milvus-io/milvus/pkg/v2/common" "github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/util/funcutil" "github.com/milvus-io/milvus/pkg/v2/util/funcutil"
"github.com/milvus-io/milvus/pkg/v2/util/metric" "github.com/milvus-io/milvus/pkg/v2/util/metric"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable" "github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
"github.com/milvus-io/milvus/tests/integration" "github.com/milvus-io/milvus/tests/integration"
) )
@ -199,7 +199,7 @@ func (s *TestArrayStructSuite) run() {
s.WaitForIndexBuiltWithDB(ctx, s.dbName, collection, vecFieldName) s.WaitForIndexBuiltWithDB(ctx, s.dbName, collection, vecFieldName)
subFieldName := proxy.ConcatStructFieldName(structFieldName, structSubVecFieldName) subFieldName := typeutil.ConcatStructFieldName(structFieldName, structSubVecFieldName)
// create index for struct sub-vector field // create index for struct sub-vector field
createIndexResult, err := s.Cluster.MilvusClient.CreateIndex(ctx, &milvuspb.CreateIndexRequest{ createIndexResult, err := s.Cluster.MilvusClient.CreateIndex(ctx, &milvuspb.CreateIndexRequest{
DbName: s.dbName, DbName: s.dbName,

View File

@ -30,7 +30,6 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/internal/proxy"
"github.com/milvus-io/milvus/internal/util/importutilv2" "github.com/milvus-io/milvus/internal/util/importutilv2"
"github.com/milvus-io/milvus/pkg/v2/common" "github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/log" "github.com/milvus-io/milvus/pkg/v2/log"
@ -40,6 +39,7 @@ import (
"github.com/milvus-io/milvus/pkg/v2/util/merr" "github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/metric" "github.com/milvus-io/milvus/pkg/v2/util/metric"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable" "github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
"github.com/milvus-io/milvus/tests/integration" "github.com/milvus-io/milvus/tests/integration"
) )
@ -97,7 +97,7 @@ func (s *BulkInsertSuite) PrepareSourceCollection(dim int, dmlGroup *DMLGroup) *
s.NoError(merr.CheckRPCCall(createIndexStatus, err)) s.NoError(merr.CheckRPCCall(createIndexStatus, err))
s.WaitForIndexBuilt(ctx, collectionName, integration.FloatVecField) s.WaitForIndexBuilt(ctx, collectionName, integration.FloatVecField)
name := proxy.ConcatStructFieldName(integration.StructArrayField, integration.StructSubFloatVecField) name := typeutil.ConcatStructFieldName(integration.StructArrayField, integration.StructSubFloatVecField)
createIndexResult, err := c.MilvusClient.CreateIndex(context.TODO(), &milvuspb.CreateIndexRequest{ createIndexResult, err := c.MilvusClient.CreateIndex(context.TODO(), &milvuspb.CreateIndexRequest{
CollectionName: collectionName, CollectionName: collectionName,
FieldName: name, FieldName: name,

View File

@ -200,9 +200,11 @@ func (s *BulkInsertSuite) runForStructArray() {
s.NoError(err) s.NoError(err)
s.Equal(int32(0), createCollectionStatus.GetCode()) s.Equal(int32(0), createCollectionStatus.GetCode())
// adjust struct field name // Note: when `CreateCollection`, the field name in Struct will be transformed to `structName[fieldName]` format
schema.StructArrayFields[0].Fields[0].Name = "struct_with_vector_array[vector_array_field]" // such as struct_with_vector_array[vector_array_field]. But we use the schema which is not transformed to generate
schema.StructArrayFields[0].Fields[1].Name = "struct_with_vector_array[scalar_array_field]" // test data. This is expected because user will not generate data with the transformed field name.
schema.StructArrayFields[0].Fields[0].Name = "vector_array_field"
schema.StructArrayFields[0].Fields[1].Name = "scalar_array_field"
var files []*internalpb.ImportFile var files []*internalpb.ImportFile
@ -299,7 +301,7 @@ func (s *BulkInsertSuite) runForStructArray() {
} }
func (s *BulkInsertSuite) TestImportWithVectorArray() { func (s *BulkInsertSuite) TestImportWithVectorArray() {
fileTypeArr := []importutilv2.FileType{importutilv2.CSV, importutilv2.Parquet, importutilv2.JSON} fileTypeArr := []importutilv2.FileType{importutilv2.CSV, importutilv2.JSON}
for _, fileType := range fileTypeArr { for _, fileType := range fileTypeArr {
s.fileType = fileType s.fileType = fileType
s.vecType = schemapb.DataType_FloatVector s.vecType = schemapb.DataType_FloatVector