mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
issue: #43427 This PR's main goal is to merge #37417 into milvus 2.5 without conflicts. # Main Goals 1. Create and describe collections with geospatial type 2. Insert geospatial data into the insert binlog 3. Load segments containing geospatial data into memory 4. Enable query and search to display geospatial data 5. Support using GIS functions like ST_EQUALS in query 6. Support R-Tree index for geometry type # Solution 1. **Add Type**: Modify the Milvus core by adding a Geospatial type in both the C++ and Go code layers, defining the Geospatial data structure and the corresponding interfaces. 2. **Dependency Libraries**: Introduce necessary geospatial data processing libraries. In the C++ source code, use Conan package management to include the GDAL library. In the Go source code, add the go-geom library to the go.mod file. 3. **Protocol Interface**: Revise the Milvus protocol to provide mechanisms for Geospatial message serialization and deserialization. 4. **Data Pipeline**: Facilitate interaction between the client and proxy using the WKT format for geospatial data. The proxy will convert all data into WKB format for downstream processing, providing column data interfaces, segment encapsulation, segment loading, payload writing, and cache block management. 5. **Query Operators**: Implement simple display and support for filter queries. Initially, focus on filtering based on spatial relationships for a single column of geospatial literal values, providing parsing and execution for query expressions. Currently only brute-force search is supported. 6. **Client Modification**: Enable the client to handle user input for geospatial data and facilitate end-to-end testing. See the corresponding modification in pymilvus. --------- Signed-off-by: Yinwei Li <yinwei.li@zilliz.com> Signed-off-by: Cai Zhang <cai.zhang@zilliz.com> Co-authored-by: ZhuXi <150327960+Yinwei-Yu@users.noreply.github.com>
1250 lines
44 KiB
Go
1250 lines
44 KiB
Go
package testutil
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"math/rand"
|
|
"strconv"
|
|
|
|
"github.com/apache/arrow/go/v17/arrow"
|
|
"github.com/apache/arrow/go/v17/arrow/array"
|
|
"github.com/apache/arrow/go/v17/arrow/memory"
|
|
"github.com/samber/lo"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
|
"github.com/milvus-io/milvus/internal/json"
|
|
"github.com/milvus-io/milvus/internal/storage"
|
|
"github.com/milvus-io/milvus/pkg/v2/common"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/testutils"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
|
)
|
|
|
|
// testMaxVarCharLength is the max-length type param written onto VarChar
// fields by ConstructCollectionSchemaByDataType.
const testMaxVarCharLength = 100
|
|
|
|
func ConstructCollectionSchemaWithKeys(collectionName string,
|
|
fieldName2DataType map[string]schemapb.DataType,
|
|
primaryFieldName string,
|
|
partitionKeyFieldName string,
|
|
clusteringKeyFieldName string,
|
|
autoID bool,
|
|
dim int,
|
|
) *schemapb.CollectionSchema {
|
|
schema := ConstructCollectionSchemaByDataType(collectionName,
|
|
fieldName2DataType,
|
|
primaryFieldName,
|
|
autoID,
|
|
dim)
|
|
for _, field := range schema.Fields {
|
|
if field.Name == partitionKeyFieldName {
|
|
field.IsPartitionKey = true
|
|
}
|
|
if field.Name == clusteringKeyFieldName {
|
|
field.IsClusteringKey = true
|
|
}
|
|
}
|
|
|
|
return schema
|
|
}
|
|
|
|
func ConstructCollectionSchemaByDataType(collectionName string,
|
|
fieldName2DataType map[string]schemapb.DataType,
|
|
primaryFieldName string,
|
|
autoID bool,
|
|
dim int,
|
|
) *schemapb.CollectionSchema {
|
|
fieldsSchema := make([]*schemapb.FieldSchema, 0)
|
|
fieldIdx := int64(0)
|
|
for fieldName, dataType := range fieldName2DataType {
|
|
fieldSchema := &schemapb.FieldSchema{
|
|
Name: fieldName,
|
|
DataType: dataType,
|
|
FieldID: fieldIdx,
|
|
}
|
|
fieldIdx += 1
|
|
if typeutil.IsVectorType(dataType) {
|
|
fieldSchema.TypeParams = []*commonpb.KeyValuePair{
|
|
{
|
|
Key: common.DimKey,
|
|
Value: strconv.Itoa(dim),
|
|
},
|
|
}
|
|
}
|
|
if dataType == schemapb.DataType_VarChar {
|
|
fieldSchema.TypeParams = []*commonpb.KeyValuePair{
|
|
{
|
|
Key: common.MaxLengthKey,
|
|
Value: strconv.Itoa(testMaxVarCharLength),
|
|
},
|
|
}
|
|
}
|
|
if fieldName == primaryFieldName {
|
|
fieldSchema.IsPrimaryKey = true
|
|
fieldSchema.AutoID = autoID
|
|
}
|
|
|
|
fieldsSchema = append(fieldsSchema, fieldSchema)
|
|
}
|
|
|
|
return &schemapb.CollectionSchema{
|
|
Name: collectionName,
|
|
Fields: fieldsSchema,
|
|
}
|
|
}
|
|
|
|
// randomString returns a string of the given length whose characters are
// drawn, one rand.Intn call per character, from the ASCII letters a-z and A-Z.
func randomString(length int) string {
	const alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

	// The alphabet is pure ASCII, so bytes and runes are interchangeable here.
	out := make([]byte, length)
	for pos := 0; pos < length; pos++ {
		out[pos] = alphabet[rand.Intn(len(alphabet))]
	}
	return string(out)
}
|
|
|
|
func CreateInsertData(schema *schemapb.CollectionSchema, rows int, nullPercent ...int) (*storage.InsertData, error) {
|
|
insertData, err := storage.NewInsertData(schema)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
allFields := typeutil.GetAllFieldSchemas(schema)
|
|
for _, f := range allFields {
|
|
if f.GetAutoID() || f.IsFunctionOutput {
|
|
continue
|
|
}
|
|
switch f.GetDataType() {
|
|
case schemapb.DataType_Bool:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateBoolArray(rows))
|
|
case schemapb.DataType_Int8:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateInt8Array(rows))
|
|
case schemapb.DataType_Int16:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateInt16Array(rows))
|
|
case schemapb.DataType_Int32:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateInt32Array(rows))
|
|
case schemapb.DataType_Int64:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateInt64Array(rows))
|
|
case schemapb.DataType_Float:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateFloat32Array(rows))
|
|
case schemapb.DataType_Double:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateFloat64Array(rows))
|
|
case schemapb.DataType_BinaryVector:
|
|
dim, err := typeutil.GetDim(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
insertData.Data[f.FieldID] = &storage.BinaryVectorFieldData{
|
|
Data: testutils.GenerateBinaryVectors(rows, int(dim)),
|
|
Dim: int(dim),
|
|
}
|
|
case schemapb.DataType_FloatVector:
|
|
dim, err := typeutil.GetDim(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
insertData.Data[f.GetFieldID()] = &storage.FloatVectorFieldData{
|
|
Data: testutils.GenerateFloatVectors(rows, int(dim)),
|
|
Dim: int(dim),
|
|
}
|
|
case schemapb.DataType_Float16Vector:
|
|
dim, err := typeutil.GetDim(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
insertData.Data[f.FieldID] = &storage.Float16VectorFieldData{
|
|
Data: testutils.GenerateFloat16Vectors(rows, int(dim)),
|
|
Dim: int(dim),
|
|
}
|
|
case schemapb.DataType_BFloat16Vector:
|
|
dim, err := typeutil.GetDim(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
insertData.Data[f.FieldID] = &storage.BFloat16VectorFieldData{
|
|
Data: testutils.GenerateBFloat16Vectors(rows, int(dim)),
|
|
Dim: int(dim),
|
|
}
|
|
case schemapb.DataType_SparseFloatVector:
|
|
data, dim := testutils.GenerateSparseFloatVectorsData(rows)
|
|
insertData.Data[f.FieldID] = &storage.SparseFloatVectorFieldData{
|
|
SparseFloatArray: schemapb.SparseFloatArray{
|
|
Contents: data,
|
|
Dim: dim,
|
|
},
|
|
}
|
|
case schemapb.DataType_Int8Vector:
|
|
dim, err := typeutil.GetDim(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
insertData.Data[f.FieldID] = &storage.Int8VectorFieldData{
|
|
Data: testutils.GenerateInt8Vectors(rows, int(dim)),
|
|
Dim: int(dim),
|
|
}
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateStringArray(rows))
|
|
case schemapb.DataType_JSON:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateJSONArray(rows))
|
|
case schemapb.DataType_Geometry:
|
|
// wkb bytes array
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateGeometryArray(rows))
|
|
case schemapb.DataType_Array:
|
|
switch f.GetElementType() {
|
|
case schemapb.DataType_Bool:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateArrayOfBoolArray(rows))
|
|
case schemapb.DataType_Int8, schemapb.DataType_Int16, schemapb.DataType_Int32:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateArrayOfIntArray(rows))
|
|
case schemapb.DataType_Int64:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateArrayOfLongArray(rows))
|
|
case schemapb.DataType_Float:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateArrayOfFloatArray(rows))
|
|
case schemapb.DataType_Double:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateArrayOfDoubleArray(rows))
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateArrayOfStringArray(rows))
|
|
}
|
|
case schemapb.DataType_ArrayOfVector:
|
|
dim, err := typeutil.GetDim(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
switch f.GetElementType() {
|
|
case schemapb.DataType_FloatVector:
|
|
insertData.Data[f.FieldID].AppendDataRows(testutils.GenerateArrayOfFloatVectorArray(rows, int(dim)))
|
|
default:
|
|
panic(fmt.Sprintf("unimplemented data type: %s", f.GetElementType().String()))
|
|
}
|
|
|
|
default:
|
|
panic(fmt.Sprintf("unsupported data type: %s", f.GetDataType().String()))
|
|
}
|
|
if f.GetNullable() {
|
|
if len(nullPercent) > 1 {
|
|
return nil, merr.WrapErrParameterInvalidMsg("the length of nullPercent is wrong")
|
|
}
|
|
if len(nullPercent) == 0 || nullPercent[0] == 50 {
|
|
insertData.Data[f.FieldID].AppendValidDataRows(testutils.GenerateBoolArray(rows))
|
|
} else if len(nullPercent) == 1 && nullPercent[0] == 100 {
|
|
insertData.Data[f.FieldID].AppendValidDataRows(make([]bool, rows))
|
|
} else if len(nullPercent) == 1 && nullPercent[0] == 0 {
|
|
validData := make([]bool, rows)
|
|
for i := range validData {
|
|
validData[i] = true
|
|
}
|
|
insertData.Data[f.FieldID].AppendValidDataRows(validData)
|
|
} else {
|
|
return nil, merr.WrapErrParameterInvalidMsg(fmt.Sprintf("not support the number of nullPercent(%d)", nullPercent))
|
|
}
|
|
}
|
|
}
|
|
return insertData, nil
|
|
}
|
|
|
|
func CreateFieldWithDefaultValue(dataType schemapb.DataType, id int64, nullable bool) (*schemapb.FieldSchema, error) {
|
|
field := &schemapb.FieldSchema{
|
|
FieldID: 102,
|
|
Name: dataType.String(),
|
|
DataType: dataType,
|
|
TypeParams: []*commonpb.KeyValuePair{
|
|
{
|
|
Key: common.MaxLengthKey,
|
|
Value: "128",
|
|
},
|
|
{
|
|
Key: common.MaxCapacityKey,
|
|
Value: "128",
|
|
},
|
|
},
|
|
Nullable: nullable,
|
|
}
|
|
|
|
switch field.GetDataType() {
|
|
case schemapb.DataType_Bool:
|
|
field.DefaultValue = &schemapb.ValueField{
|
|
Data: &schemapb.ValueField_BoolData{
|
|
BoolData: ([]bool{true, false})[rand.Intn(2)],
|
|
},
|
|
}
|
|
case schemapb.DataType_Int8, schemapb.DataType_Int16, schemapb.DataType_Int32:
|
|
field.DefaultValue = &schemapb.ValueField{
|
|
Data: &schemapb.ValueField_IntData{
|
|
IntData: ([]int32{1, 10, 100, 1000})[rand.Intn(4)],
|
|
},
|
|
}
|
|
case schemapb.DataType_Int64:
|
|
field.DefaultValue = &schemapb.ValueField{
|
|
Data: &schemapb.ValueField_LongData{
|
|
LongData: rand.Int63(),
|
|
},
|
|
}
|
|
case schemapb.DataType_Float:
|
|
field.DefaultValue = &schemapb.ValueField{
|
|
Data: &schemapb.ValueField_FloatData{
|
|
FloatData: rand.Float32(),
|
|
},
|
|
}
|
|
case schemapb.DataType_Double:
|
|
field.DefaultValue = &schemapb.ValueField{
|
|
Data: &schemapb.ValueField_DoubleData{
|
|
DoubleData: rand.Float64(),
|
|
},
|
|
}
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
|
field.DefaultValue = &schemapb.ValueField{
|
|
Data: &schemapb.ValueField_StringData{
|
|
StringData: randomString(10),
|
|
},
|
|
}
|
|
default:
|
|
msg := fmt.Sprintf("type (%s) not support default_value", field.GetDataType().String())
|
|
return nil, merr.WrapErrParameterInvalidMsg(msg)
|
|
}
|
|
return field, nil
|
|
}
|
|
|
|
func BuildSparseVectorData(mem *memory.GoAllocator, contents [][]byte, arrowType arrow.DataType) (arrow.Array, error) {
|
|
if arrowType == nil || arrowType.ID() == arrow.STRING {
|
|
// build sparse vector as JSON-format string
|
|
builder := array.NewStringBuilder(mem)
|
|
rows := len(contents)
|
|
jsonBytesData := make([][]byte, 0)
|
|
for i := 0; i < rows; i++ {
|
|
rowVecData := contents[i]
|
|
mapData := typeutil.SparseFloatBytesToMap(rowVecData)
|
|
// convert to JSON format
|
|
jsonBytes, err := json.Marshal(mapData)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
jsonBytesData = append(jsonBytesData, jsonBytes)
|
|
}
|
|
builder.AppendValues(lo.Map(jsonBytesData, func(bs []byte, _ int) string {
|
|
return string(bs)
|
|
}), nil)
|
|
return builder.NewStringArray(), nil
|
|
} else if arrowType.ID() == arrow.STRUCT {
|
|
// build sparse vector as parquet struct
|
|
stType, _ := arrowType.(*arrow.StructType)
|
|
indicesField, ok1 := stType.FieldByName("indices")
|
|
valuesField, ok2 := stType.FieldByName("values")
|
|
if !ok1 || !ok2 {
|
|
return nil, merr.WrapErrParameterInvalidMsg("Indices type or values type is missed for sparse vector")
|
|
}
|
|
|
|
indicesList, ok1 := indicesField.Type.(*arrow.ListType)
|
|
valuesList, ok2 := valuesField.Type.(*arrow.ListType)
|
|
if !ok1 || !ok2 {
|
|
return nil, merr.WrapErrParameterInvalidMsg("Indices type and values type of sparse vector should be list")
|
|
}
|
|
indexType := indicesList.Elem().ID()
|
|
valueType := valuesList.Elem().ID()
|
|
|
|
fields := []arrow.Field{indicesField, valuesField}
|
|
structType := arrow.StructOf(fields...)
|
|
builder := array.NewStructBuilder(mem, structType)
|
|
indicesBuilder := builder.FieldBuilder(0).(*array.ListBuilder)
|
|
valuesBuilder := builder.FieldBuilder(1).(*array.ListBuilder)
|
|
|
|
// The array.Uint32Builder/array.Int64Builder/array.Float32Builder/array.Float64Builder
|
|
// are derived from array.Builder, but array.Builder doesn't have Append() interface
|
|
// To call array.Uint32Builder.Value(uint32), we need to explicitly cast the indicesBuilder.ValueBuilder()
|
|
// to array.Uint32Builder
|
|
// So, we declare two methods here to avoid type casting in the "for" loop
|
|
type AppendIndex func(index uint32)
|
|
type AppendValue func(value float32)
|
|
|
|
var appendIndexFunc AppendIndex
|
|
switch indexType {
|
|
case arrow.INT32:
|
|
indicesArrayBuilder := indicesBuilder.ValueBuilder().(*array.Int32Builder)
|
|
appendIndexFunc = func(index uint32) {
|
|
indicesArrayBuilder.Append((int32)(index))
|
|
}
|
|
case arrow.UINT32:
|
|
indicesArrayBuilder := indicesBuilder.ValueBuilder().(*array.Uint32Builder)
|
|
appendIndexFunc = func(index uint32) {
|
|
indicesArrayBuilder.Append(index)
|
|
}
|
|
case arrow.INT64:
|
|
indicesArrayBuilder := indicesBuilder.ValueBuilder().(*array.Int64Builder)
|
|
appendIndexFunc = func(index uint32) {
|
|
indicesArrayBuilder.Append((int64)(index))
|
|
}
|
|
case arrow.UINT64:
|
|
indicesArrayBuilder := indicesBuilder.ValueBuilder().(*array.Uint64Builder)
|
|
appendIndexFunc = func(index uint32) {
|
|
indicesArrayBuilder.Append((uint64)(index))
|
|
}
|
|
default:
|
|
msg := fmt.Sprintf("Not able to write this type (%s) for sparse vector index", indexType.String())
|
|
return nil, merr.WrapErrImportFailed(msg)
|
|
}
|
|
|
|
var appendValueFunc AppendValue
|
|
switch valueType {
|
|
case arrow.FLOAT32:
|
|
valuesArrayBuilder := valuesBuilder.ValueBuilder().(*array.Float32Builder)
|
|
appendValueFunc = func(value float32) {
|
|
valuesArrayBuilder.Append(value)
|
|
}
|
|
case arrow.FLOAT64:
|
|
valuesArrayBuilder := valuesBuilder.ValueBuilder().(*array.Float64Builder)
|
|
appendValueFunc = func(value float32) {
|
|
valuesArrayBuilder.Append((float64)(value))
|
|
}
|
|
default:
|
|
msg := fmt.Sprintf("Not able to write this type (%s) for sparse vector index", indexType.String())
|
|
return nil, merr.WrapErrImportFailed(msg)
|
|
}
|
|
|
|
for i := 0; i < len(contents); i++ {
|
|
builder.Append(true)
|
|
indicesBuilder.Append(true)
|
|
valuesBuilder.Append(true)
|
|
rowVecData := contents[i]
|
|
elemCount := len(rowVecData) / 8
|
|
for j := 0; j < elemCount; j++ {
|
|
appendIndexFunc(common.Endian.Uint32(rowVecData[j*8:]))
|
|
appendValueFunc(math.Float32frombits(common.Endian.Uint32(rowVecData[j*8+4:])))
|
|
}
|
|
}
|
|
return builder.NewStructArray(), nil
|
|
}
|
|
|
|
return nil, merr.WrapErrParameterInvalidMsg("Invalid arrow data type for sparse vector")
|
|
}
|
|
|
|
func BuildArrayData(schema *schemapb.CollectionSchema, insertData *storage.InsertData, useNullType bool) ([]arrow.Array, error) {
|
|
mem := memory.NewGoAllocator()
|
|
// Get all fields including struct sub-fields
|
|
allFields := typeutil.GetAllFieldSchemas(schema)
|
|
// Filter out auto-generated and function output fields
|
|
fields := lo.Filter(allFields, func(field *schemapb.FieldSchema, _ int) bool {
|
|
return !(field.GetIsPrimaryKey() && field.GetAutoID()) && !field.GetIsFunctionOutput()
|
|
})
|
|
|
|
columns := make([]arrow.Array, 0, len(fields))
|
|
for _, field := range fields {
|
|
fieldID := field.GetFieldID()
|
|
dataType := field.GetDataType()
|
|
elementType := field.GetElementType()
|
|
if field.GetNullable() && useNullType {
|
|
columns = append(columns, array.NewNull(insertData.Data[fieldID].RowNum()))
|
|
continue
|
|
}
|
|
switch dataType {
|
|
case schemapb.DataType_Bool:
|
|
builder := array.NewBooleanBuilder(mem)
|
|
boolData := insertData.Data[fieldID].(*storage.BoolFieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.BoolFieldData).ValidData
|
|
builder.AppendValues(boolData, validData)
|
|
|
|
columns = append(columns, builder.NewBooleanArray())
|
|
case schemapb.DataType_Int8:
|
|
builder := array.NewInt8Builder(mem)
|
|
int8Data := insertData.Data[fieldID].(*storage.Int8FieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.Int8FieldData).ValidData
|
|
builder.AppendValues(int8Data, validData)
|
|
columns = append(columns, builder.NewInt8Array())
|
|
case schemapb.DataType_Int16:
|
|
builder := array.NewInt16Builder(mem)
|
|
int16Data := insertData.Data[fieldID].(*storage.Int16FieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.Int16FieldData).ValidData
|
|
builder.AppendValues(int16Data, validData)
|
|
columns = append(columns, builder.NewInt16Array())
|
|
case schemapb.DataType_Int32:
|
|
builder := array.NewInt32Builder(mem)
|
|
int32Data := insertData.Data[fieldID].(*storage.Int32FieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.Int32FieldData).ValidData
|
|
builder.AppendValues(int32Data, validData)
|
|
columns = append(columns, builder.NewInt32Array())
|
|
case schemapb.DataType_Int64:
|
|
builder := array.NewInt64Builder(mem)
|
|
int64Data := insertData.Data[fieldID].(*storage.Int64FieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.Int64FieldData).ValidData
|
|
builder.AppendValues(int64Data, validData)
|
|
columns = append(columns, builder.NewInt64Array())
|
|
case schemapb.DataType_Float:
|
|
builder := array.NewFloat32Builder(mem)
|
|
floatData := insertData.Data[fieldID].(*storage.FloatFieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.FloatFieldData).ValidData
|
|
builder.AppendValues(floatData, validData)
|
|
columns = append(columns, builder.NewFloat32Array())
|
|
case schemapb.DataType_Double:
|
|
builder := array.NewFloat64Builder(mem)
|
|
doubleData := insertData.Data[fieldID].(*storage.DoubleFieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.DoubleFieldData).ValidData
|
|
builder.AppendValues(doubleData, validData)
|
|
columns = append(columns, builder.NewFloat64Array())
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
|
builder := array.NewStringBuilder(mem)
|
|
stringData := insertData.Data[fieldID].(*storage.StringFieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.StringFieldData).ValidData
|
|
builder.AppendValues(stringData, validData)
|
|
columns = append(columns, builder.NewStringArray())
|
|
case schemapb.DataType_BinaryVector:
|
|
builder := array.NewListBuilder(mem, &arrow.Uint8Type{})
|
|
dim := insertData.Data[fieldID].(*storage.BinaryVectorFieldData).Dim
|
|
binVecData := insertData.Data[fieldID].(*storage.BinaryVectorFieldData).Data
|
|
rowBytes := dim / 8
|
|
rows := len(binVecData) / rowBytes
|
|
offsets := make([]int32, 0, rows)
|
|
valid := make([]bool, 0)
|
|
for i := 0; i < rows; i++ {
|
|
offsets = append(offsets, int32(i*rowBytes))
|
|
valid = append(valid, true)
|
|
}
|
|
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(binVecData, nil)
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_FloatVector:
|
|
builder := array.NewListBuilder(mem, &arrow.Float32Type{})
|
|
dim := insertData.Data[fieldID].(*storage.FloatVectorFieldData).Dim
|
|
floatVecData := insertData.Data[fieldID].(*storage.FloatVectorFieldData).Data
|
|
rows := len(floatVecData) / dim
|
|
offsets := make([]int32, 0, rows)
|
|
valid := make([]bool, 0, rows)
|
|
for i := 0; i < rows; i++ {
|
|
offsets = append(offsets, int32(i*dim))
|
|
valid = append(valid, true)
|
|
}
|
|
builder.ValueBuilder().(*array.Float32Builder).AppendValues(floatVecData, nil)
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_Float16Vector:
|
|
builder := array.NewListBuilder(mem, &arrow.Uint8Type{})
|
|
dim := insertData.Data[fieldID].(*storage.Float16VectorFieldData).Dim
|
|
float16VecData := insertData.Data[fieldID].(*storage.Float16VectorFieldData).Data
|
|
rowBytes := dim * 2
|
|
rows := len(float16VecData) / rowBytes
|
|
offsets := make([]int32, 0, rows)
|
|
valid := make([]bool, 0, rows)
|
|
for i := 0; i < rows; i++ {
|
|
offsets = append(offsets, int32(i*rowBytes))
|
|
valid = append(valid, true)
|
|
}
|
|
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(float16VecData, nil)
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_BFloat16Vector:
|
|
builder := array.NewListBuilder(mem, &arrow.Uint8Type{})
|
|
dim := insertData.Data[fieldID].(*storage.BFloat16VectorFieldData).Dim
|
|
bfloat16VecData := insertData.Data[fieldID].(*storage.BFloat16VectorFieldData).Data
|
|
rowBytes := dim * 2
|
|
rows := len(bfloat16VecData) / rowBytes
|
|
offsets := make([]int32, 0, rows)
|
|
valid := make([]bool, 0, rows)
|
|
for i := 0; i < rows; i++ {
|
|
offsets = append(offsets, int32(i*rowBytes))
|
|
valid = append(valid, true)
|
|
}
|
|
builder.ValueBuilder().(*array.Uint8Builder).AppendValues(bfloat16VecData, nil)
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_SparseFloatVector:
|
|
contents := insertData.Data[fieldID].(*storage.SparseFloatVectorFieldData).GetContents()
|
|
arr, err := BuildSparseVectorData(mem, contents, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
columns = append(columns, arr)
|
|
case schemapb.DataType_Int8Vector:
|
|
builder := array.NewListBuilder(mem, &arrow.Int8Type{})
|
|
dim := insertData.Data[fieldID].(*storage.Int8VectorFieldData).Dim
|
|
int8VecData := insertData.Data[fieldID].(*storage.Int8VectorFieldData).Data
|
|
rows := len(int8VecData) / dim
|
|
offsets := make([]int32, 0, rows)
|
|
valid := make([]bool, 0, rows)
|
|
for i := 0; i < rows; i++ {
|
|
offsets = append(offsets, int32(i*dim))
|
|
valid = append(valid, true)
|
|
}
|
|
builder.ValueBuilder().(*array.Int8Builder).AppendValues(int8VecData, nil)
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_JSON:
|
|
builder := array.NewStringBuilder(mem)
|
|
jsonData := insertData.Data[fieldID].(*storage.JSONFieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.JSONFieldData).ValidData
|
|
builder.AppendValues(lo.Map(jsonData, func(bs []byte, _ int) string {
|
|
return string(bs)
|
|
}), validData)
|
|
columns = append(columns, builder.NewStringArray())
|
|
case schemapb.DataType_Geometry:
|
|
builder := array.NewStringBuilder(mem)
|
|
wkbData := insertData.Data[fieldID].(*storage.GeometryFieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.GeometryFieldData).ValidData
|
|
builder.AppendValues(lo.Map(wkbData, func(bs []byte, _ int) string {
|
|
return string(bs)
|
|
}), validData)
|
|
columns = append(columns, builder.NewStringArray())
|
|
case schemapb.DataType_Array:
|
|
data := insertData.Data[fieldID].(*storage.ArrayFieldData).Data
|
|
validData := insertData.Data[fieldID].(*storage.ArrayFieldData).ValidData
|
|
rows := len(data)
|
|
offsets := make([]int32, 0, rows)
|
|
valid := make([]bool, 0, rows)
|
|
currOffset := int32(0)
|
|
|
|
switch elementType {
|
|
case schemapb.DataType_Bool:
|
|
builder := array.NewListBuilder(mem, &arrow.BooleanType{})
|
|
valueBuilder := builder.ValueBuilder().(*array.BooleanBuilder)
|
|
for i := 0; i < rows; i++ {
|
|
if field.GetNullable() && !validData[i] {
|
|
offsets = append(offsets, currOffset)
|
|
valid = append(valid, false)
|
|
} else {
|
|
boolData := data[i].Data.(*schemapb.ScalarField_BoolData).BoolData.GetData()
|
|
valueBuilder.AppendValues(boolData, nil)
|
|
offsets = append(offsets, currOffset)
|
|
currOffset = currOffset + int32(len(boolData))
|
|
valid = append(valid, true)
|
|
}
|
|
}
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_Int8:
|
|
builder := array.NewListBuilder(mem, &arrow.Int8Type{})
|
|
valueBuilder := builder.ValueBuilder().(*array.Int8Builder)
|
|
for i := 0; i < rows; i++ {
|
|
if field.GetNullable() && !validData[i] {
|
|
offsets = append(offsets, currOffset)
|
|
valid = append(valid, false)
|
|
} else {
|
|
intData := data[i].Data.(*schemapb.ScalarField_IntData).IntData.GetData()
|
|
int8Data := make([]int8, 0)
|
|
for j := 0; j < len(intData); j++ {
|
|
int8Data = append(int8Data, int8(intData[j]))
|
|
}
|
|
valueBuilder.AppendValues(int8Data, nil)
|
|
offsets = append(offsets, currOffset)
|
|
currOffset = currOffset + int32(len(int8Data))
|
|
valid = append(valid, true)
|
|
}
|
|
}
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_Int16:
|
|
builder := array.NewListBuilder(mem, &arrow.Int16Type{})
|
|
valueBuilder := builder.ValueBuilder().(*array.Int16Builder)
|
|
for i := 0; i < rows; i++ {
|
|
if field.GetNullable() && !validData[i] {
|
|
offsets = append(offsets, currOffset)
|
|
valid = append(valid, false)
|
|
} else {
|
|
intData := data[i].Data.(*schemapb.ScalarField_IntData).IntData.GetData()
|
|
int16Data := make([]int16, 0)
|
|
for j := 0; j < len(intData); j++ {
|
|
int16Data = append(int16Data, int16(intData[j]))
|
|
}
|
|
valueBuilder.AppendValues(int16Data, nil)
|
|
offsets = append(offsets, currOffset)
|
|
currOffset = currOffset + int32(len(int16Data))
|
|
valid = append(valid, true)
|
|
}
|
|
}
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_Int32:
|
|
builder := array.NewListBuilder(mem, &arrow.Int32Type{})
|
|
valueBuilder := builder.ValueBuilder().(*array.Int32Builder)
|
|
for i := 0; i < rows; i++ {
|
|
if field.GetNullable() && !validData[i] {
|
|
offsets = append(offsets, currOffset)
|
|
valid = append(valid, false)
|
|
} else {
|
|
intData := data[i].Data.(*schemapb.ScalarField_IntData).IntData.GetData()
|
|
valueBuilder.AppendValues(intData, nil)
|
|
offsets = append(offsets, currOffset)
|
|
currOffset = currOffset + int32(len(intData))
|
|
valid = append(valid, true)
|
|
}
|
|
}
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_Int64:
|
|
builder := array.NewListBuilder(mem, &arrow.Int64Type{})
|
|
valueBuilder := builder.ValueBuilder().(*array.Int64Builder)
|
|
for i := 0; i < rows; i++ {
|
|
if field.GetNullable() && !validData[i] {
|
|
offsets = append(offsets, currOffset)
|
|
valid = append(valid, false)
|
|
} else {
|
|
longData := data[i].Data.(*schemapb.ScalarField_LongData).LongData.GetData()
|
|
valueBuilder.AppendValues(longData, nil)
|
|
offsets = append(offsets, currOffset)
|
|
currOffset = currOffset + int32(len(longData))
|
|
valid = append(valid, true)
|
|
}
|
|
}
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_Float:
|
|
builder := array.NewListBuilder(mem, &arrow.Float32Type{})
|
|
valueBuilder := builder.ValueBuilder().(*array.Float32Builder)
|
|
for i := 0; i < rows; i++ {
|
|
if field.GetNullable() && !validData[i] {
|
|
offsets = append(offsets, currOffset)
|
|
valid = append(valid, false)
|
|
} else {
|
|
floatData := data[i].Data.(*schemapb.ScalarField_FloatData).FloatData.GetData()
|
|
valueBuilder.AppendValues(floatData, nil)
|
|
offsets = append(offsets, currOffset)
|
|
currOffset = currOffset + int32(len(floatData))
|
|
valid = append(valid, true)
|
|
}
|
|
}
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_Double:
|
|
builder := array.NewListBuilder(mem, &arrow.Float64Type{})
|
|
valueBuilder := builder.ValueBuilder().(*array.Float64Builder)
|
|
for i := 0; i < rows; i++ {
|
|
if field.GetNullable() && !validData[i] {
|
|
offsets = append(offsets, currOffset)
|
|
valid = append(valid, false)
|
|
} else {
|
|
doubleData := data[i].Data.(*schemapb.ScalarField_DoubleData).DoubleData.GetData()
|
|
valueBuilder.AppendValues(doubleData, nil)
|
|
offsets = append(offsets, currOffset)
|
|
currOffset = currOffset + int32(len(doubleData))
|
|
valid = append(valid, true)
|
|
}
|
|
}
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
case schemapb.DataType_VarChar, schemapb.DataType_String:
|
|
builder := array.NewListBuilder(mem, &arrow.StringType{})
|
|
valueBuilder := builder.ValueBuilder().(*array.StringBuilder)
|
|
for i := 0; i < rows; i++ {
|
|
if field.GetNullable() && !validData[i] {
|
|
offsets = append(offsets, currOffset)
|
|
valid = append(valid, false)
|
|
} else {
|
|
stringData := data[i].Data.(*schemapb.ScalarField_StringData).StringData.GetData()
|
|
valueBuilder.AppendValues(stringData, nil)
|
|
offsets = append(offsets, currOffset)
|
|
currOffset = currOffset + int32(len(stringData))
|
|
valid = append(valid, true)
|
|
}
|
|
}
|
|
builder.AppendValues(offsets, valid)
|
|
columns = append(columns, builder.NewListArray())
|
|
}
|
|
case schemapb.DataType_ArrayOfVector:
|
|
data := insertData.Data[fieldID].(*storage.VectorArrayFieldData).Data
|
|
rows := len(data)
|
|
|
|
switch elementType {
|
|
case schemapb.DataType_FloatVector:
|
|
// ArrayOfVector is flattened in Arrow - just a list of floats
|
|
// where total floats = dim * num_vectors
|
|
builder := array.NewListBuilder(mem, &arrow.Float32Type{})
|
|
valueBuilder := builder.ValueBuilder().(*array.Float32Builder)
|
|
|
|
for i := 0; i < rows; i++ {
|
|
vectorArray := data[i].GetFloatVector()
|
|
if vectorArray == nil || len(vectorArray.GetData()) == 0 {
|
|
builder.AppendNull()
|
|
continue
|
|
}
|
|
builder.Append(true)
|
|
// Append all flattened vector data
|
|
valueBuilder.AppendValues(vectorArray.GetData(), nil)
|
|
}
|
|
columns = append(columns, builder.NewListArray())
|
|
default:
|
|
return nil, fmt.Errorf("unsupported element type in VectorArray: %s", elementType.String())
|
|
}
|
|
}
|
|
}
|
|
return columns, nil
|
|
}
|
|
|
|
// reconstructStructArrayForJSON reconstructs struct array data for JSON format
|
|
// Returns an array of maps where each element represents a struct
|
|
func reconstructStructArrayForJSON(structField *schemapb.StructArrayFieldSchema, insertData *storage.InsertData, rowIndex int) ([]map[string]any, error) {
|
|
subFields := structField.GetFields()
|
|
if len(subFields) == 0 {
|
|
return []map[string]any{}, nil
|
|
}
|
|
|
|
// Determine the array length from the first sub-field's data
|
|
var arrayLen int
|
|
for _, subField := range subFields {
|
|
if fieldData, ok := insertData.Data[subField.GetFieldID()]; ok {
|
|
rowData := fieldData.GetRow(rowIndex)
|
|
if rowData == nil {
|
|
continue
|
|
}
|
|
|
|
switch subField.GetDataType() {
|
|
case schemapb.DataType_Array:
|
|
if scalarField, ok := rowData.(*schemapb.ScalarField); ok {
|
|
switch subField.GetElementType() {
|
|
case schemapb.DataType_Bool:
|
|
if data := scalarField.GetBoolData(); data != nil {
|
|
arrayLen = len(data.GetData())
|
|
}
|
|
case schemapb.DataType_Int8, schemapb.DataType_Int16, schemapb.DataType_Int32:
|
|
if data := scalarField.GetIntData(); data != nil {
|
|
arrayLen = len(data.GetData())
|
|
}
|
|
case schemapb.DataType_Int64:
|
|
if data := scalarField.GetLongData(); data != nil {
|
|
arrayLen = len(data.GetData())
|
|
}
|
|
case schemapb.DataType_Float:
|
|
if data := scalarField.GetFloatData(); data != nil {
|
|
arrayLen = len(data.GetData())
|
|
}
|
|
case schemapb.DataType_Double:
|
|
if data := scalarField.GetDoubleData(); data != nil {
|
|
arrayLen = len(data.GetData())
|
|
}
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
|
if data := scalarField.GetStringData(); data != nil {
|
|
arrayLen = len(data.GetData())
|
|
}
|
|
}
|
|
}
|
|
case schemapb.DataType_ArrayOfVector:
|
|
if vectorField, ok := rowData.(*schemapb.VectorField); ok {
|
|
switch subField.GetElementType() {
|
|
case schemapb.DataType_FloatVector:
|
|
if data := vectorField.GetFloatVector(); data != nil {
|
|
dim, _ := typeutil.GetDim(subField)
|
|
if dim > 0 {
|
|
arrayLen = len(data.GetData()) / int(dim)
|
|
}
|
|
}
|
|
case schemapb.DataType_BinaryVector:
|
|
if data := vectorField.GetBinaryVector(); data != nil {
|
|
dim, _ := typeutil.GetDim(subField)
|
|
if dim > 0 {
|
|
bytesPerVector := int(dim) / 8
|
|
arrayLen = len(data) / bytesPerVector
|
|
}
|
|
}
|
|
case schemapb.DataType_Float16Vector:
|
|
if data := vectorField.GetFloat16Vector(); data != nil {
|
|
dim, _ := typeutil.GetDim(subField)
|
|
if dim > 0 {
|
|
bytesPerVector := int(dim) * 2
|
|
arrayLen = len(data) / bytesPerVector
|
|
}
|
|
}
|
|
case schemapb.DataType_BFloat16Vector:
|
|
if data := vectorField.GetBfloat16Vector(); data != nil {
|
|
dim, _ := typeutil.GetDim(subField)
|
|
if dim > 0 {
|
|
bytesPerVector := int(dim) * 2
|
|
arrayLen = len(data) / bytesPerVector
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if arrayLen > 0 {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build the struct array
|
|
structArray := make([]map[string]any, arrayLen)
|
|
for j := 0; j < arrayLen; j++ {
|
|
structElem := make(map[string]any)
|
|
|
|
for _, subField := range subFields {
|
|
if fieldData, ok := insertData.Data[subField.GetFieldID()]; ok {
|
|
rowData := fieldData.GetRow(rowIndex)
|
|
if rowData == nil {
|
|
continue
|
|
}
|
|
|
|
// Extract the j-th element
|
|
switch subField.GetDataType() {
|
|
case schemapb.DataType_Array:
|
|
if scalarField, ok := rowData.(*schemapb.ScalarField); ok {
|
|
switch subField.GetElementType() {
|
|
case schemapb.DataType_Bool:
|
|
if data := scalarField.GetBoolData(); data != nil && j < len(data.GetData()) {
|
|
structElem[subField.GetName()] = data.GetData()[j]
|
|
}
|
|
case schemapb.DataType_Int8, schemapb.DataType_Int16, schemapb.DataType_Int32:
|
|
if data := scalarField.GetIntData(); data != nil && j < len(data.GetData()) {
|
|
structElem[subField.GetName()] = data.GetData()[j]
|
|
}
|
|
case schemapb.DataType_Int64:
|
|
if data := scalarField.GetLongData(); data != nil && j < len(data.GetData()) {
|
|
structElem[subField.GetName()] = data.GetData()[j]
|
|
}
|
|
case schemapb.DataType_Float:
|
|
if data := scalarField.GetFloatData(); data != nil && j < len(data.GetData()) {
|
|
structElem[subField.GetName()] = data.GetData()[j]
|
|
}
|
|
case schemapb.DataType_Double:
|
|
if data := scalarField.GetDoubleData(); data != nil && j < len(data.GetData()) {
|
|
structElem[subField.GetName()] = data.GetData()[j]
|
|
}
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
|
if data := scalarField.GetStringData(); data != nil && j < len(data.GetData()) {
|
|
structElem[subField.GetName()] = data.GetData()[j]
|
|
}
|
|
}
|
|
}
|
|
case schemapb.DataType_ArrayOfVector:
|
|
if vectorField, ok := rowData.(*schemapb.VectorField); ok {
|
|
switch subField.GetElementType() {
|
|
case schemapb.DataType_FloatVector:
|
|
if data := vectorField.GetFloatVector(); data != nil {
|
|
dim, _ := typeutil.GetDim(subField)
|
|
if dim > 0 {
|
|
startIdx := j * int(dim)
|
|
endIdx := startIdx + int(dim)
|
|
if endIdx <= len(data.GetData()) {
|
|
structElem[subField.GetName()] = data.GetData()[startIdx:endIdx]
|
|
}
|
|
}
|
|
}
|
|
case schemapb.DataType_BinaryVector:
|
|
if data := vectorField.GetBinaryVector(); data != nil {
|
|
dim, _ := typeutil.GetDim(subField)
|
|
if dim > 0 {
|
|
bytesPerVector := int(dim) / 8
|
|
startIdx := j * bytesPerVector
|
|
endIdx := startIdx + bytesPerVector
|
|
if endIdx <= len(data) {
|
|
structElem[subField.GetName()] = data[startIdx:endIdx]
|
|
}
|
|
}
|
|
}
|
|
case schemapb.DataType_Float16Vector:
|
|
if data := vectorField.GetFloat16Vector(); data != nil {
|
|
dim, _ := typeutil.GetDim(subField)
|
|
if dim > 0 {
|
|
bytesPerVector := int(dim) * 2
|
|
startIdx := j * bytesPerVector
|
|
endIdx := startIdx + bytesPerVector
|
|
if endIdx <= len(data) {
|
|
// Convert Float16 bytes to float32 for JSON representation
|
|
structElem[subField.GetName()] = typeutil.Float16BytesToFloat32Vector(data[startIdx:endIdx])
|
|
}
|
|
}
|
|
}
|
|
case schemapb.DataType_BFloat16Vector:
|
|
if data := vectorField.GetBfloat16Vector(); data != nil {
|
|
dim, _ := typeutil.GetDim(subField)
|
|
if dim > 0 {
|
|
bytesPerVector := int(dim) * 2
|
|
startIdx := j * bytesPerVector
|
|
endIdx := startIdx + bytesPerVector
|
|
if endIdx <= len(data) {
|
|
// Convert BFloat16 bytes to float32 for JSON representation
|
|
structElem[subField.GetName()] = typeutil.BFloat16BytesToFloat32Vector(data[startIdx:endIdx])
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
structArray[j] = structElem
|
|
}
|
|
|
|
return structArray, nil
|
|
}
|
|
|
|
func CreateInsertDataRowsForJSON(schema *schemapb.CollectionSchema, insertData *storage.InsertData) ([]map[string]any, error) {
|
|
fieldIDToField := lo.KeyBy(schema.GetFields(), func(field *schemapb.FieldSchema) int64 {
|
|
return field.GetFieldID()
|
|
})
|
|
|
|
// Track which field IDs belong to struct array sub-fields
|
|
structSubFieldIDs := make(map[int64]bool)
|
|
for _, structField := range schema.GetStructArrayFields() {
|
|
for _, subField := range structField.GetFields() {
|
|
structSubFieldIDs[subField.GetFieldID()] = true
|
|
}
|
|
}
|
|
|
|
rowNum := insertData.GetRowNum()
|
|
rows := make([]map[string]any, 0, rowNum)
|
|
for i := 0; i < rowNum; i++ {
|
|
data := make(map[int64]interface{})
|
|
|
|
// First process regular fields
|
|
for fieldID, v := range insertData.Data {
|
|
// Skip if this is a sub-field of a struct array
|
|
if structSubFieldIDs[fieldID] {
|
|
continue
|
|
}
|
|
|
|
field, ok := fieldIDToField[fieldID]
|
|
if !ok {
|
|
continue
|
|
}
|
|
|
|
dataType := field.GetDataType()
|
|
elemType := field.GetElementType()
|
|
if field.GetAutoID() || field.IsFunctionOutput {
|
|
continue
|
|
}
|
|
if v.GetRow(i) == nil {
|
|
data[fieldID] = nil
|
|
continue
|
|
}
|
|
switch dataType {
|
|
case schemapb.DataType_Array:
|
|
switch elemType {
|
|
case schemapb.DataType_Bool:
|
|
data[fieldID] = v.GetRow(i).(*schemapb.ScalarField).GetBoolData().GetData()
|
|
case schemapb.DataType_Int8, schemapb.DataType_Int16, schemapb.DataType_Int32:
|
|
data[fieldID] = v.GetRow(i).(*schemapb.ScalarField).GetIntData().GetData()
|
|
case schemapb.DataType_Int64:
|
|
data[fieldID] = v.GetRow(i).(*schemapb.ScalarField).GetLongData().GetData()
|
|
case schemapb.DataType_Float:
|
|
data[fieldID] = v.GetRow(i).(*schemapb.ScalarField).GetFloatData().GetData()
|
|
case schemapb.DataType_Double:
|
|
data[fieldID] = v.GetRow(i).(*schemapb.ScalarField).GetDoubleData().GetData()
|
|
case schemapb.DataType_String, schemapb.DataType_VarChar:
|
|
data[fieldID] = v.GetRow(i).(*schemapb.ScalarField).GetStringData().GetData()
|
|
}
|
|
case schemapb.DataType_ArrayOfVector:
|
|
panic("unreachable")
|
|
case schemapb.DataType_JSON:
|
|
data[fieldID] = string(v.GetRow(i).([]byte))
|
|
case schemapb.DataType_BinaryVector:
|
|
bytes := v.GetRow(i).([]byte)
|
|
ints := make([]int, 0, len(bytes))
|
|
for _, b := range bytes {
|
|
ints = append(ints, int(b))
|
|
}
|
|
data[fieldID] = ints
|
|
case schemapb.DataType_Float16Vector:
|
|
bytes := v.GetRow(i).([]byte)
|
|
data[fieldID] = typeutil.Float16BytesToFloat32Vector(bytes)
|
|
case schemapb.DataType_BFloat16Vector:
|
|
bytes := v.GetRow(i).([]byte)
|
|
data[fieldID] = typeutil.BFloat16BytesToFloat32Vector(bytes)
|
|
case schemapb.DataType_SparseFloatVector:
|
|
bytes := v.GetRow(i).([]byte)
|
|
data[fieldID] = typeutil.SparseFloatBytesToMap(bytes)
|
|
default:
|
|
data[fieldID] = v.GetRow(i)
|
|
}
|
|
}
|
|
|
|
// Now process struct array fields - reconstruct the nested structure
|
|
for _, structField := range schema.GetStructArrayFields() {
|
|
structArray, err := reconstructStructArrayForJSON(structField, insertData, i)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data[structField.GetFieldID()] = structArray
|
|
}
|
|
|
|
// Convert field IDs to field names
|
|
row := make(map[string]any)
|
|
for fieldID, value := range data {
|
|
if field, ok := fieldIDToField[fieldID]; ok {
|
|
row[field.GetName()] = value
|
|
} else {
|
|
// Check if it's a struct array field
|
|
for _, structField := range schema.GetStructArrayFields() {
|
|
if structField.GetFieldID() == fieldID {
|
|
row[structField.GetName()] = value
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
rows = append(rows, row)
|
|
}
|
|
|
|
return rows, nil
|
|
}
|
|
|
|
// reconstructStructArrayForCSV reconstructs struct array data for CSV format
|
|
// Returns a JSON string where each sub-field value is also a JSON string
|
|
func reconstructStructArrayForCSV(structField *schemapb.StructArrayFieldSchema, insertData *storage.InsertData, rowIndex int) (string, error) {
|
|
// Use the JSON reconstruction function to get the struct array
|
|
structArray, err := reconstructStructArrayForJSON(structField, insertData, rowIndex)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
// Convert to CSV format: each sub-field value needs to be JSON-encoded
|
|
csvArray := make([]map[string]string, len(structArray))
|
|
for i, elem := range structArray {
|
|
csvElem := make(map[string]string)
|
|
for key, value := range elem {
|
|
// Convert each value to JSON string for CSV
|
|
jsonBytes, err := json.Marshal(value)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
csvElem[key] = string(jsonBytes)
|
|
}
|
|
csvArray[i] = csvElem
|
|
}
|
|
|
|
// Convert the entire struct array to JSON string
|
|
jsonBytes, err := json.Marshal(csvArray)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return string(jsonBytes), nil
|
|
}
|
|
|
|
func CreateInsertDataForCSV(schema *schemapb.CollectionSchema, insertData *storage.InsertData, nullkey string) ([][]string, error) {
|
|
rowNum := insertData.GetRowNum()
|
|
csvData := make([][]string, 0, rowNum+1)
|
|
|
|
// Build header - regular fields and struct array fields (not sub-fields)
|
|
header := make([]string, 0)
|
|
|
|
// Track which field IDs belong to struct array sub-fields
|
|
structSubFieldIDs := make(map[int64]bool)
|
|
for _, structField := range schema.GetStructArrayFields() {
|
|
for _, subField := range structField.GetFields() {
|
|
structSubFieldIDs[subField.GetFieldID()] = true
|
|
}
|
|
}
|
|
|
|
// Add regular fields to header (excluding struct array sub-fields)
|
|
allFields := typeutil.GetAllFieldSchemas(schema)
|
|
fields := lo.Filter(allFields, func(field *schemapb.FieldSchema, _ int) bool {
|
|
return !field.GetAutoID() && !field.IsFunctionOutput && !structSubFieldIDs[field.GetFieldID()]
|
|
})
|
|
nameToFields := lo.KeyBy(fields, func(field *schemapb.FieldSchema) string {
|
|
name := field.GetName()
|
|
header = append(header, name)
|
|
return name
|
|
})
|
|
|
|
// Build map for struct array fields for quick lookup
|
|
structArrayFields := make(map[string]*schemapb.StructArrayFieldSchema)
|
|
for _, structField := range schema.GetStructArrayFields() {
|
|
structArrayFields[structField.GetName()] = structField
|
|
header = append(header, structField.GetName())
|
|
}
|
|
|
|
csvData = append(csvData, header)
|
|
|
|
for i := 0; i < rowNum; i++ {
|
|
data := make([]string, 0)
|
|
for _, name := range header {
|
|
if structArrayField, ok := structArrayFields[name]; ok {
|
|
structArrayData, err := reconstructStructArrayForCSV(structArrayField, insertData, i)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, structArrayData)
|
|
continue
|
|
}
|
|
|
|
// Handle regular field
|
|
field := nameToFields[name]
|
|
value := insertData.Data[field.FieldID]
|
|
dataType := field.GetDataType()
|
|
elemType := field.GetElementType()
|
|
// deal with null value
|
|
if field.GetNullable() && value.GetRow(i) == nil {
|
|
data = append(data, nullkey)
|
|
continue
|
|
}
|
|
switch dataType {
|
|
case schemapb.DataType_Array:
|
|
var arr any
|
|
switch elemType {
|
|
case schemapb.DataType_Bool:
|
|
arr = value.GetRow(i).(*schemapb.ScalarField).GetBoolData().GetData()
|
|
case schemapb.DataType_Int8, schemapb.DataType_Int16, schemapb.DataType_Int32:
|
|
arr = value.GetRow(i).(*schemapb.ScalarField).GetIntData().GetData()
|
|
case schemapb.DataType_Int64:
|
|
arr = value.GetRow(i).(*schemapb.ScalarField).GetLongData().GetData()
|
|
case schemapb.DataType_Float:
|
|
arr = value.GetRow(i).(*schemapb.ScalarField).GetFloatData().GetData()
|
|
case schemapb.DataType_Double:
|
|
arr = value.GetRow(i).(*schemapb.ScalarField).GetDoubleData().GetData()
|
|
case schemapb.DataType_String:
|
|
arr = value.GetRow(i).(*schemapb.ScalarField).GetStringData().GetData()
|
|
}
|
|
j, err := json.Marshal(arr)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, string(j))
|
|
case schemapb.DataType_JSON:
|
|
data = append(data, string(value.GetRow(i).([]byte)))
|
|
case schemapb.DataType_FloatVector:
|
|
vec := value.GetRow(i).([]float32)
|
|
j, err := json.Marshal(vec)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, string(j))
|
|
case schemapb.DataType_BinaryVector:
|
|
bytes := value.GetRow(i).([]byte)
|
|
vec := make([]int, 0, len(bytes))
|
|
for _, b := range bytes {
|
|
vec = append(vec, int(b))
|
|
}
|
|
j, err := json.Marshal(vec)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, string(j))
|
|
case schemapb.DataType_Float16Vector:
|
|
bytes := value.GetRow(i).([]byte)
|
|
vec := typeutil.Float16BytesToFloat32Vector(bytes)
|
|
j, err := json.Marshal(vec)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, string(j))
|
|
case schemapb.DataType_BFloat16Vector:
|
|
bytes := value.GetRow(i).([]byte)
|
|
vec := typeutil.BFloat16BytesToFloat32Vector(bytes)
|
|
j, err := json.Marshal(vec)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, string(j))
|
|
case schemapb.DataType_SparseFloatVector:
|
|
bytes := value.GetRow(i).([]byte)
|
|
m := typeutil.SparseFloatBytesToMap(bytes)
|
|
j, err := json.Marshal(m)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, string(j))
|
|
case schemapb.DataType_Int8Vector:
|
|
vec := value.GetRow(i).([]int8)
|
|
j, err := json.Marshal(vec)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, string(j))
|
|
case schemapb.DataType_ArrayOfVector:
|
|
// ArrayOfVector should not appear as a top-level field
|
|
// It can only be a sub-field in struct arrays
|
|
panic("ArrayOfVector cannot be a top-level field")
|
|
default:
|
|
str := fmt.Sprintf("%v", value.GetRow(i))
|
|
data = append(data, str)
|
|
}
|
|
}
|
|
csvData = append(csvData, data)
|
|
}
|
|
|
|
return csvData, nil
|
|
}
|