mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
related: #45993 Add nullable vector support in import utility layer Key changes: ImportV2 util: - Add nullable vector types (FloatVector, Float16Vector, BFloat16Vector, BinaryVector, SparseFloatVector, Int8Vector) to AppendNullableDefaultFieldsData() - Add tests for nullable vector field data appending CSV/JSON/Numpy readers: - Add nullPercent parameter to test data generation for better null coverage - Mark vector fields as nullable in test schemas - Add test cases for nullable vector field parsing - Refactor tests to use loop-based approach with 0%, 50%, 100% null percentages Parquet field reader: - Add ReadNullableBinaryData() for nullable BinaryVector/Float16Vector/BFloat16Vector - Add ReadNullableFloatVectorData() for nullable FloatVector - Add ReadNullableSparseFloatVectorData() for nullable SparseFloatVector - Add ReadNullableInt8VectorData() for nullable Int8Vector - Add ReadNullableStructData() for generic nullable struct data - Update Next() to use nullable read methods when field is nullable - Add null data validation for non-nullable fields <!-- This is an auto-generated comment: release notes by coderabbit.ai --> - Core invariant: import must preserve per-row alignment and validity for every field — nullable vector fields are expected to be encoded with per-row validity masks and all readers/writers must emit arrays aligned to original input rows (null entries represented explicitly). 
- New feature & scope: adds end-to-end nullable-vector support in the import utility layer — AppendNullableDefaultFieldsData in internal/datanode/importv2/util.go now appends nil placeholders for nullable vectors (FloatVector, Float16Vector, BFloat16Vector, BinaryVector, SparseFloatVector, Int8Vector); parquet reader (internal/util/importutilv2/parquet/field_reader.go) adds ReadNullableBinaryData, ReadNullableFloatVectorData, ReadNullableSparseFloatVectorData, ReadNullableInt8VectorData, ReadNullableStructData and routes nullable branches to these helpers; CSV/JSON/Numpy readers and test utilities updated to generate and validate 0/50/100% null scenarios and mark vector fields as nullable in test schemas. - Logic removed / simplified: eliminates ad-hoc "parameter-invalid" rejections for nullable vectors inside FieldReader.Next by centralizing nullable handling into ReadNullable* helpers and shared validators (getArrayDataNullable, checkNullableVectorAlignWithDim/checkNullableVectorAligned), simplifying control flow and removing scattered special-case checks. - No data loss / no regression (concrete code paths): nulls are preserved end-to-end — AppendNullableDefaultFieldsData explicitly inserts nil entries per null row (datanode import append path); ReadNullable*Data helpers return both data and []bool validity masks so callers in field_reader.go and downstream readers receive exact per-row validity; testutil.BuildSparseVectorData was extended to accept validData so sparse vectors are materialized only for valid rows while null rows are represented as missing. These concrete paths ensure null rows are represented rather than dropped, preventing data loss or behavioral regression. <!-- end of auto-generated comment: release notes by coderabbit.ai --> Signed-off-by: marcelo-cjl <marcelo.chen@zilliz.com>
2336 lines
70 KiB
Go
2336 lines
70 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package parquet
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
|
|
"github.com/apache/arrow/go/v17/arrow"
|
|
"github.com/apache/arrow/go/v17/arrow/array"
|
|
"github.com/apache/arrow/go/v17/parquet/pqarrow"
|
|
"github.com/cockroachdb/errors"
|
|
"github.com/samber/lo"
|
|
"golang.org/x/exp/constraints"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
|
"github.com/milvus-io/milvus/internal/json"
|
|
"github.com/milvus-io/milvus/internal/storage"
|
|
"github.com/milvus-io/milvus/internal/util/importutilv2/common"
|
|
"github.com/milvus-io/milvus/internal/util/nullutil"
|
|
pkgcommon "github.com/milvus-io/milvus/pkg/v2/common"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/parameterutil"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/timestamptz"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
|
)
|
|
|
|
// FieldReader reads one parquet column and converts its arrow chunks into
// Milvus field data for a single schema field.
type FieldReader struct {
	columnIndex  int                   // position of this column in the parquet file
	columnReader *pqarrow.ColumnReader // arrow-backed reader for this column

	dim            int                   // vector dimension; 1 for scalar and sparse-vector fields
	field          *schemapb.FieldSchema // schema of the field being read
	sparseIsString bool                  // true when sparse vectors are stored as JSON-format strings, false for parquet structs

	// timezone is the collection's default timezone
	timezone string

	// structReader is non-nil when Struct Array field exists
	structReader *StructFieldReader
}
|
|
|
|
func NewFieldReader(ctx context.Context, reader *pqarrow.FileReader, columnIndex int, field *schemapb.FieldSchema, timezone string) (*FieldReader, error) {
|
|
columnReader, err := reader.GetColumn(ctx, columnIndex)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var dim int64 = 1
|
|
if typeutil.IsVectorType(field.GetDataType()) && !typeutil.IsSparseFloatVectorType(field.GetDataType()) {
|
|
dim, err = typeutil.GetDim(field)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
// set a flag here to know whether a sparse vector is stored as JSON-format string or parquet struct
|
|
// because we don't intend to check it every time the Next() is called
|
|
sparseIsString := true
|
|
if field.GetDataType() == schemapb.DataType_SparseFloatVector {
|
|
_, sparseIsString = IsValidSparseVectorSchema(columnReader.Field().Type)
|
|
}
|
|
|
|
cr := &FieldReader{
|
|
columnIndex: columnIndex,
|
|
columnReader: columnReader,
|
|
dim: int(dim),
|
|
field: field,
|
|
sparseIsString: sparseIsString,
|
|
timezone: timezone,
|
|
}
|
|
return cr, nil
|
|
}
|
|
|
|
// Next reads up to count rows from the parquet column and returns
// (data, validData, error). For non-nullable fields without a default value
// validData is nil; for nullable or defaulted fields the type-specific
// ReadNullable* helper returns a per-row validity mask alongside the data.
// A (nil, nil, nil) result means the column is exhausted.
func (c *FieldReader) Next(count int64) (any, any, error) {
	// Check if this FieldReader wraps a StructFieldReader
	if c.structReader != nil {
		return c.structReader.Next(count)
	}

	switch c.field.GetDataType() {
	case schemapb.DataType_Bool:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			return ReadNullableBoolData(c, count)
		}
		data, err := ReadBoolData(c, count)
		return data, nil, err
	case schemapb.DataType_Int8:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			return ReadNullableIntegerOrFloatData[int8](c, count)
		}
		data, err := ReadIntegerOrFloatData[int8](c, count)
		return data, nil, err
	case schemapb.DataType_Int16:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			return ReadNullableIntegerOrFloatData[int16](c, count)
		}
		data, err := ReadIntegerOrFloatData[int16](c, count)
		return data, nil, err
	case schemapb.DataType_Int32:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			return ReadNullableIntegerOrFloatData[int32](c, count)
		}
		data, err := ReadIntegerOrFloatData[int32](c, count)
		return data, nil, err
	case schemapb.DataType_Int64:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			return ReadNullableIntegerOrFloatData[int64](c, count)
		}
		data, err := ReadIntegerOrFloatData[int64](c, count)
		return data, nil, err
	case schemapb.DataType_Float:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			data, validData, err := ReadNullableIntegerOrFloatData[float32](c, count)
			if err != nil {
				return nil, nil, err
			}
			if data == nil {
				return nil, nil, nil
			}
			// reject NaN/Inf values before handing the batch downstream
			return data, validData, typeutil.VerifyFloats32(data.([]float32))
		}
		data, err := ReadIntegerOrFloatData[float32](c, count)
		if err != nil {
			return nil, nil, err
		}
		if data == nil {
			return nil, nil, nil
		}
		return data, nil, typeutil.VerifyFloats32(data.([]float32))
	case schemapb.DataType_Double:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			data, validData, err := ReadNullableIntegerOrFloatData[float64](c, count)
			if err != nil {
				return nil, nil, err
			}
			if data == nil {
				return nil, nil, nil
			}
			// reject NaN/Inf values before handing the batch downstream
			return data, validData, typeutil.VerifyFloats64(data.([]float64))
		}
		data, err := ReadIntegerOrFloatData[float64](c, count)
		if err != nil {
			return nil, nil, err
		}
		if data == nil {
			return nil, nil, nil
		}
		return data, nil, typeutil.VerifyFloats64(data.([]float64))
	case schemapb.DataType_VarChar, schemapb.DataType_String:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			return ReadNullableStringData(c, count)
		}
		data, err := ReadStringData(c, count, true)
		return data, nil, err
	case schemapb.DataType_JSON:
		// json has not support default_value
		if c.field.GetNullable() {
			return ReadNullableJSONData(c, count)
		}
		data, err := ReadJSONData(c, count)
		return data, nil, err
	case schemapb.DataType_Geometry:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			return ReadNullableGeometryData(c, count)
		}
		data, err := ReadGeometryData(c, count)
		return data, nil, err
	case schemapb.DataType_Timestamptz:
		if c.field.GetNullable() || c.field.GetDefaultValue() != nil {
			return ReadNullableTimestamptzData(c, count)
		}
		data, err := ReadTimestamptzData(c, count)
		return data, nil, err
	case schemapb.DataType_BinaryVector, schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector:
		// vector not support default_value
		if c.field.GetNullable() {
			return ReadNullableBinaryData(c, count)
		}
		data, err := ReadBinaryData(c, count)
		return data, nil, err
	case schemapb.DataType_FloatVector:
		if c.field.GetNullable() {
			return ReadNullableFloatVectorData(c, count)
		}
		arrayData, err := ReadIntegerOrFloatArrayData[float32](c, count)
		if err != nil {
			return nil, nil, err
		}
		if arrayData == nil {
			return nil, nil, nil
		}
		// flatten per-row vectors into one contiguous []float32
		vectors := lo.Flatten(arrayData.([][]float32))
		return vectors, nil, nil
	case schemapb.DataType_SparseFloatVector:
		if c.field.GetNullable() {
			return ReadNullableSparseFloatVectorData(c, count)
		}
		data, err := ReadSparseFloatVectorData(c, count)
		return data, nil, err
	case schemapb.DataType_Int8Vector:
		if c.field.GetNullable() {
			return ReadNullableInt8VectorData(c, count)
		}
		arrayData, err := ReadIntegerOrFloatArrayData[int8](c, count)
		if err != nil {
			return nil, nil, err
		}
		if arrayData == nil {
			return nil, nil, nil
		}
		// flatten per-row vectors into one contiguous []int8
		vectors := lo.Flatten(arrayData.([][]int8))
		return vectors, nil, nil
	case schemapb.DataType_Array:
		// array has not supported default_value
		if c.field.GetNullable() {
			return ReadNullableArrayData(c, count)
		}
		data, err := ReadArrayData(c, count)
		return data, nil, err
	case schemapb.DataType_ArrayOfVector:
		// nullable vector arrays are not supported yet
		if c.field.GetNullable() {
			return nil, nil, merr.WrapErrParameterInvalidMsg("not support nullable in vector")
		}
		data, err := ReadVectorArrayData(c, count)
		return data, nil, err
	default:
		return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type '%s' for field '%s'",
			c.field.GetDataType().String(), c.field.GetName()))
	}
}
|
|
|
|
func ReadBoolData(pcr *FieldReader, count int64) (any, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data := make([]bool, 0, count)
|
|
for _, chunk := range chunked.Chunks() {
|
|
dataNums := chunk.Data().Len()
|
|
if chunk.NullN() > 0 {
|
|
return nil, WrapNullRowErr(pcr.field)
|
|
}
|
|
boolReader, ok := chunk.(*array.Boolean)
|
|
|
|
if !ok {
|
|
return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, boolReader.Value(i))
|
|
}
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil
|
|
}
|
|
return data, nil
|
|
}
|
|
|
|
func fillWithDefaultValueImpl[T any](array []T, value T, validData []bool, field *schemapb.FieldSchema) (any, []bool, error) {
|
|
rowNum := len(validData)
|
|
for i, v := range validData {
|
|
if !v {
|
|
array[i] = value
|
|
}
|
|
}
|
|
if !typeutil.IsVectorType(field.GetDataType()) {
|
|
if field.GetNullable() {
|
|
for i := range validData {
|
|
validData[i] = true
|
|
}
|
|
} else {
|
|
validData = []bool{}
|
|
}
|
|
}
|
|
|
|
err := nullutil.CheckValidData(validData, field, rowNum)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
return array, validData, nil
|
|
}
|
|
|
|
func ReadNullableBoolData(pcr *FieldReader, count int64) (any, []bool, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data := make([]bool, 0, count)
|
|
validData := make([]bool, 0, count)
|
|
for _, chunk := range chunked.Chunks() {
|
|
dataNums := chunk.Data().Len()
|
|
boolReader, ok := chunk.(*array.Boolean)
|
|
if !ok {
|
|
// the chunk type may be *array.Null if the data in chunk is all null
|
|
_, ok := chunk.(*array.Null)
|
|
if !ok {
|
|
return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
validData = append(validData, make([]bool, dataNums)...)
|
|
data = append(data, make([]bool, dataNums)...)
|
|
} else {
|
|
validData = append(validData, bytesToValidData(dataNums, boolReader.NullBitmapBytes())...)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, boolReader.Value(i))
|
|
}
|
|
}
|
|
}
|
|
if len(data) != len(validData) {
|
|
return nil, nil, merr.WrapErrParameterInvalid(len(data), len(validData), "length of data is not equal to length of valid_data")
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil, nil
|
|
}
|
|
if pcr.field.GetDefaultValue() != nil {
|
|
defaultValue := pcr.field.GetDefaultValue().GetBoolData()
|
|
return fillWithDefaultValueImpl(data, defaultValue, validData, pcr.field)
|
|
}
|
|
return data, validData, nil
|
|
}
|
|
|
|
func ReadIntegerOrFloatData[T constraints.Integer | constraints.Float](pcr *FieldReader, count int64) (any, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data := make([]T, 0, count)
|
|
for _, chunk := range chunked.Chunks() {
|
|
dataNums := chunk.Data().Len()
|
|
if chunk.NullN() > 0 {
|
|
return nil, WrapNullRowErr(pcr.field)
|
|
}
|
|
switch chunk.DataType().ID() {
|
|
case arrow.INT8:
|
|
int8Reader := chunk.(*array.Int8)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(int8Reader.Value(i)))
|
|
}
|
|
case arrow.INT16:
|
|
int16Reader := chunk.(*array.Int16)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(int16Reader.Value(i)))
|
|
}
|
|
case arrow.INT32:
|
|
int32Reader := chunk.(*array.Int32)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(int32Reader.Value(i)))
|
|
}
|
|
case arrow.INT64:
|
|
int64Reader := chunk.(*array.Int64)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(int64Reader.Value(i)))
|
|
}
|
|
case arrow.FLOAT32:
|
|
float32Reader := chunk.(*array.Float32)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(float32Reader.Value(i)))
|
|
}
|
|
case arrow.FLOAT64:
|
|
float64Reader := chunk.(*array.Float64)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(float64Reader.Value(i)))
|
|
}
|
|
default:
|
|
return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil
|
|
}
|
|
return data, nil
|
|
}
|
|
|
|
func ReadNullableIntegerOrFloatData[T constraints.Integer | constraints.Float](pcr *FieldReader, count int64) (any, []bool, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data := make([]T, 0, count)
|
|
validData := make([]bool, 0, count)
|
|
for _, chunk := range chunked.Chunks() {
|
|
dataNums := chunk.Data().Len()
|
|
switch chunk.DataType().ID() {
|
|
case arrow.INT8:
|
|
int8Reader := chunk.(*array.Int8)
|
|
validData = append(validData, bytesToValidData(dataNums, int8Reader.NullBitmapBytes())...)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(int8Reader.Value(i)))
|
|
}
|
|
case arrow.INT16:
|
|
int16Reader := chunk.(*array.Int16)
|
|
validData = append(validData, bytesToValidData(dataNums, int16Reader.NullBitmapBytes())...)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(int16Reader.Value(i)))
|
|
}
|
|
case arrow.INT32:
|
|
int32Reader := chunk.(*array.Int32)
|
|
validData = append(validData, bytesToValidData(dataNums, int32Reader.NullBitmapBytes())...)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(int32Reader.Value(i)))
|
|
}
|
|
case arrow.INT64:
|
|
int64Reader := chunk.(*array.Int64)
|
|
validData = append(validData, bytesToValidData(dataNums, int64Reader.NullBitmapBytes())...)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(int64Reader.Value(i)))
|
|
}
|
|
case arrow.FLOAT32:
|
|
float32Reader := chunk.(*array.Float32)
|
|
validData = append(validData, bytesToValidData(dataNums, float32Reader.NullBitmapBytes())...)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(float32Reader.Value(i)))
|
|
}
|
|
case arrow.FLOAT64:
|
|
float64Reader := chunk.(*array.Float64)
|
|
validData = append(validData, bytesToValidData(dataNums, float64Reader.NullBitmapBytes())...)
|
|
for i := 0; i < dataNums; i++ {
|
|
data = append(data, T(float64Reader.Value(i)))
|
|
}
|
|
case arrow.NULL:
|
|
// the chunk type may be *array.Null if the data in chunk is all null
|
|
validData = append(validData, make([]bool, dataNums)...)
|
|
data = append(data, make([]T, dataNums)...)
|
|
default:
|
|
return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
}
|
|
if len(data) != len(validData) {
|
|
return nil, nil, merr.WrapErrParameterInvalid(len(data), len(validData), "length of data is not equal to length of valid_data")
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil, nil
|
|
}
|
|
|
|
if pcr.field.GetDefaultValue() != nil {
|
|
defaultValue, err := nullutil.GetDefaultValue(pcr.field)
|
|
if err != nil {
|
|
// won't happen
|
|
return nil, nil, err
|
|
}
|
|
return fillWithDefaultValueImpl(data, defaultValue.(T), validData, pcr.field)
|
|
}
|
|
return data, validData, nil
|
|
}
|
|
|
|
// This method returns a []map[string]arrow.Array
|
|
// map[string]arrow.Array represents a struct
|
|
// For example 1:
|
|
//
|
|
// struct {
|
|
// name string
|
|
// age int
|
|
// }
|
|
//
|
|
// The ReadStructData() will return a list like:
|
|
//
|
|
// [
|
|
// {"name": ["a", "b", "c"], "age": [4, 5, 6]},
|
|
// {"name": ["e", "f"], "age": [7, 8]}
|
|
// ]
|
|
//
|
|
// Value type of "name" is array.String, value type of "age" is array.Int32
|
|
// The length of the list is equal to the length of chunked.Chunks()
|
|
//
|
|
// For sparse vector, the map[string]arrow.Array is like {"indices": array.List, "values": array.List}
|
|
// For example 2:
|
|
//
|
|
// struct {
|
|
// indices []uint32
|
|
// values []float32
|
|
// }
|
|
//
|
|
// The ReadStructData() will return a list like:
|
|
//
|
|
// [
|
|
// {"indices": [[1, 2, 3], [4, 5], [6, 7]], "values": [[0.1, 0.2, 0.3], [0.4, 0.5], [0.6, 0.7]]},
|
|
// {"indices": [[8], [9, 10]], "values": [[0.8], [0.9, 1.0]]}
|
|
// ]
|
|
//
|
|
// Value type of "indices" is array.List, element type is array.Uint32
|
|
// Value type of "values" is array.List, element type is array.Float32
|
|
// The length of the list is equal to the length of chunked.Chunks()
|
|
func ReadStructData(pcr *FieldReader, count int64) ([]map[string]arrow.Array, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data := make([]map[string]arrow.Array, 0, count)
|
|
for _, chunk := range chunked.Chunks() {
|
|
structReader, ok := chunk.(*array.Struct)
|
|
if structReader.NullN() > 0 {
|
|
return nil, WrapNullRowErr(pcr.field)
|
|
}
|
|
if !ok {
|
|
return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
|
|
structType := structReader.DataType().(*arrow.StructType)
|
|
st := make(map[string]arrow.Array)
|
|
for k, field := range structType.Fields() {
|
|
st[field.Name] = structReader.Field(k)
|
|
}
|
|
data = append(data, st)
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil
|
|
}
|
|
return data, nil
|
|
}
|
|
|
|
func ReadNullableStructData(pcr *FieldReader, count int64) ([]map[string]arrow.Array, []bool, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data := make([]map[string]arrow.Array, 0, count)
|
|
validData := make([]bool, 0, count)
|
|
|
|
for _, chunk := range chunked.Chunks() {
|
|
structReader, ok := chunk.(*array.Struct)
|
|
if !ok {
|
|
return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
|
|
structType := structReader.DataType().(*arrow.StructType)
|
|
rows := structReader.Len()
|
|
// Sparse storage: only store valid rows' data
|
|
for i := 0; i < rows; i++ {
|
|
validData = append(validData, !structReader.IsNull(i))
|
|
if !structReader.IsNull(i) {
|
|
st := make(map[string]arrow.Array)
|
|
for k, field := range structType.Fields() {
|
|
st[field.Name] = structReader.Field(k)
|
|
}
|
|
data = append(data, st)
|
|
}
|
|
}
|
|
}
|
|
if len(data) == 0 && len(validData) == 0 {
|
|
return nil, nil, nil
|
|
}
|
|
return data, validData, nil
|
|
}
|
|
|
|
func ReadStringData(pcr *FieldReader, count int64, isVarcharField bool) (any, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data := make([]string, 0, count)
|
|
var maxLength int64
|
|
if isVarcharField {
|
|
maxLength, err = parameterutil.GetMaxLength(pcr.field)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
for _, chunk := range chunked.Chunks() {
|
|
dataNums := chunk.Data().Len()
|
|
if chunk.NullN() > 0 {
|
|
return nil, WrapNullRowErr(pcr.field)
|
|
}
|
|
stringReader, ok := chunk.(*array.String)
|
|
if !ok {
|
|
return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
for i := 0; i < dataNums; i++ {
|
|
value := stringReader.Value(i)
|
|
if isVarcharField {
|
|
if err = common.CheckValidString(value, maxLength, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
data = append(data, value)
|
|
}
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil
|
|
}
|
|
return data, nil
|
|
}
|
|
|
|
// readRawStringDataFromParquet handles the low-level logic of reading string chunks
|
|
// from the Parquet column, extracting data, validity mask, and performing VARCHAR length checks.
|
|
// It returns the raw string data and the corresponding validity mask.
|
|
func readRawStringDataFromParquet(pcr *FieldReader, count int64) ([]string, []bool, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
dataType := pcr.field.GetDataType()
|
|
data := make([]string, 0, count)
|
|
validData := make([]bool, 0, count)
|
|
var maxLength int64
|
|
isVarcharField := typeutil.IsStringType(dataType)
|
|
if isVarcharField {
|
|
maxLength, err = parameterutil.GetMaxLength(pcr.field)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
}
|
|
for _, chunk := range chunked.Chunks() {
|
|
dataNums := chunk.Data().Len()
|
|
stringReader, ok := chunk.(*array.String)
|
|
if !ok {
|
|
// the chunk type may be *array.Null if the data in chunk is all null
|
|
_, ok := chunk.(*array.Null)
|
|
if !ok {
|
|
return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
validData = append(validData, make([]bool, dataNums)...)
|
|
data = append(data, make([]string, dataNums)...)
|
|
} else {
|
|
validData = append(validData, bytesToValidData(dataNums, stringReader.NullBitmapBytes())...)
|
|
for i := 0; i < dataNums; i++ {
|
|
if stringReader.IsNull(i) {
|
|
data = append(data, "")
|
|
continue
|
|
}
|
|
value := stringReader.Value(i)
|
|
if isVarcharField {
|
|
if err = common.CheckValidString(value, maxLength, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
}
|
|
data = append(data, value)
|
|
}
|
|
}
|
|
}
|
|
if len(data) != len(validData) {
|
|
return nil, nil, merr.WrapErrParameterInvalid(len(data), len(validData), "length of data is not equal to length of valid_data")
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil, nil
|
|
}
|
|
return data, validData, nil
|
|
}
|
|
|
|
func ReadNullableStringData(pcr *FieldReader, count int64) (any, []bool, error) {
|
|
// Delegate I/O, Arrow iteration, and VARCHAR validation to the helper function.
|
|
data, validData, err := readRawStringDataFromParquet(pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if data == nil {
|
|
return nil, nil, nil
|
|
}
|
|
if pcr.field.GetDefaultValue() != nil {
|
|
// Fill default values for standard string fields (VARCHAR, String, Geometry).
|
|
defaultValue := pcr.field.GetDefaultValue().GetStringData()
|
|
// Assuming fillWithDefaultValueImpl is available
|
|
return fillWithDefaultValueImpl(data, defaultValue, validData, pcr.field)
|
|
}
|
|
|
|
// Return raw data for convertible types or non-defaulted fields.
|
|
return data, validData, nil
|
|
}
|
|
|
|
func ReadJSONData(pcr *FieldReader, count int64) (any, error) {
|
|
// JSON field read data from string array Parquet
|
|
data, err := ReadStringData(pcr, count, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if data == nil {
|
|
return nil, nil
|
|
}
|
|
byteArr := make([][]byte, 0)
|
|
for _, str := range data.([]string) {
|
|
var dummy interface{}
|
|
err = json.Unmarshal([]byte(str), &dummy)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if pcr.field.GetIsDynamic() {
|
|
var dummy2 map[string]interface{}
|
|
err = json.Unmarshal([]byte(str), &dummy2)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
byteArr = append(byteArr, []byte(str))
|
|
}
|
|
return byteArr, nil
|
|
}
|
|
|
|
func ReadNullableJSONData(pcr *FieldReader, count int64) (any, []bool, error) {
|
|
// JSON field read data from string array Parquet
|
|
data, validData, err := readRawStringDataFromParquet(pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if data == nil {
|
|
return nil, nil, nil
|
|
}
|
|
byteArr := make([][]byte, 0)
|
|
defaultValue := []byte(nil)
|
|
for i, str := range data {
|
|
if !validData[i] {
|
|
byteArr = append(byteArr, defaultValue)
|
|
continue
|
|
}
|
|
var dummy interface{}
|
|
err = json.Unmarshal([]byte(str), &dummy)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if pcr.field.GetIsDynamic() {
|
|
var dummy2 map[string]interface{}
|
|
err = json.Unmarshal([]byte(str), &dummy2)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
}
|
|
byteArr = append(byteArr, []byte(str))
|
|
}
|
|
return byteArr, validData, nil
|
|
}
|
|
|
|
// ReadNullableTimestamptzData reads Timestamptz data from the Parquet column,
|
|
// handling nullability by parsing the time string and converting it to the internal int64 format.
|
|
func ReadNullableTimestamptzData(pcr *FieldReader, count int64) (any, []bool, error) {
|
|
// 1. Read the raw data as strings from the underlying Parquet column.
|
|
// This is because Timestamptz data is initially stored as strings in the insertion layer
|
|
// or represented as strings in the Parquet file for ease of parsing/validation.
|
|
data, validData, err := readRawStringDataFromParquet(pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
// If no data was read (e.g., end of file), return nil.
|
|
if data == nil {
|
|
return nil, nil, nil
|
|
}
|
|
|
|
// 2. Initialize the target array for internal int64 timestamps (UTC microseconds).
|
|
int64Ts := make([]int64, 0, len(data))
|
|
defaultValue := pcr.field.GetDefaultValue().GetTimestamptzData()
|
|
|
|
// 3. Iterate over the string array and convert each timestamp.
|
|
for i, strValue := range data {
|
|
// Check the validity mask: If it's null, append the zero value (0) and continue.
|
|
if !validData[i] {
|
|
int64Ts = append(int64Ts, defaultValue)
|
|
continue
|
|
}
|
|
|
|
// Convert the ISO 8601 string to int64 (UTC microseconds).
|
|
// The pcr.timezone is used as the default timezone if the string (strValue)
|
|
// does not contain an explicit UTC offset (e.g., "+08:00").
|
|
tz, err := timestamptz.ValidateAndReturnUnixMicroTz(strValue, pcr.timezone)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
int64Ts = append(int64Ts, tz)
|
|
}
|
|
return int64Ts, validData, nil
|
|
}
|
|
|
|
// ReadTimestamptzData reads non-nullable Timestamptz data from the Parquet column.
|
|
// It assumes all values are present (non-null) and converts them to the internal int64 format.
|
|
func ReadTimestamptzData(pcr *FieldReader, count int64) (any, error) {
|
|
// Read the raw data as strings. Since this is a non-nullable field, we use ReadStringData.
|
|
data, err := ReadStringData(pcr, count, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// If no data was read (e.g., end of file), return nil.
|
|
if data == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
int64Ts := make([]int64, 0, len(data.([]string)))
|
|
for _, strValue := range data.([]string) {
|
|
// Convert the ISO 8601 string to int64 (UTC microseconds).
|
|
// The pcr.timezone is used as the default if the string lacks an explicit offset.
|
|
tz, err := timestamptz.ValidateAndReturnUnixMicroTz(strValue, pcr.timezone)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
int64Ts = append(int64Ts, tz)
|
|
}
|
|
|
|
// Return the converted int64 array.
|
|
return int64Ts, nil
|
|
}
|
|
|
|
func ReadNullableGeometryData(pcr *FieldReader, count int64) (any, []bool, error) {
|
|
// Geometry field read data from string array Parquet
|
|
data, validData, err := readRawStringDataFromParquet(pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if data == nil {
|
|
return nil, nil, nil
|
|
}
|
|
wkbValues := make([][]byte, 0)
|
|
defaultValueStr := pcr.field.GetDefaultValue().GetStringData()
|
|
defaultValue := []byte(nil)
|
|
if defaultValueStr != "" {
|
|
defaultValue, _ = pkgcommon.ConvertWKTToWKB(defaultValueStr)
|
|
}
|
|
for i, wktValue := range data {
|
|
if !validData[i] {
|
|
wkbValues = append(wkbValues, defaultValue)
|
|
continue
|
|
}
|
|
wkbValue, err := pkgcommon.ConvertWKTToWKB(wktValue)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
wkbValues = append(wkbValues, wkbValue)
|
|
}
|
|
return wkbValues, validData, nil
|
|
}
|
|
|
|
func ReadGeometryData(pcr *FieldReader, count int64) (any, error) {
|
|
// Geometry field read data from string array Parquet
|
|
data, err := ReadStringData(pcr, count, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if data == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
wkbValues := make([][]byte, 0)
|
|
for _, wktValue := range data.([]string) {
|
|
wkbValue, err := pkgcommon.ConvertWKTToWKB(wktValue)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
wkbValues = append(wkbValues, wkbValue)
|
|
}
|
|
return wkbValues, nil
|
|
}
|
|
|
|
// ReadBinaryData reads non-nullable binary-typed vector data and returns it
// as a flat []byte. It accepts two parquet encodings:
//   - arrow.BINARY: each row is a binary blob, concatenated as-is
//   - arrow.LIST of uint8: each row is a list whose length is validated
//     against the field dimension via checkVectorAligned
//
// Returns (nil, nil) when no rows were read (e.g. end of file).
func ReadBinaryData(pcr *FieldReader, count int64) (any, error) {
	dataType := pcr.field.GetDataType()
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, err
	}
	data := make([]byte, 0, count)
	for _, chunk := range chunked.Chunks() {
		rows := chunk.Data().Len()
		switch chunk.DataType().ID() {
		case arrow.BINARY:
			binaryReader := chunk.(*array.Binary)
			for i := 0; i < rows; i++ {
				data = append(data, binaryReader.Value(i)...)
			}
		case arrow.LIST:
			listReader := chunk.(*array.List)
			// every row must contain exactly the dim-equivalent byte count
			if err = checkVectorAligned(listReader.Offsets(), pcr.dim, dataType); err != nil {
				return nil, merr.WrapErrImportFailed(fmt.Sprintf("length of vector is not aligned: %s, data type: %s", err.Error(), dataType.String()))
			}
			uint8Reader, ok := listReader.ListValues().(*array.Uint8)
			if !ok {
				return nil, WrapTypeErr(pcr.field, listReader.ListValues().DataType().Name())
			}
			// alignment already verified, so the flat value buffer can be
			// appended wholesale
			data = append(data, uint8Reader.Uint8Values()...)
		default:
			return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
		}
	}
	if len(data) == 0 {
		return nil, nil
	}
	return data, nil
}
|
|
|
|
// ReadNullableBinaryData reads nullable binary-typed vector data and returns
// (flat []byte, validity mask, error). Storage is sparse: bytes are appended
// only for valid rows, while validData carries one entry per input row so
// callers can restore per-row alignment. Returns (nil, nil, nil) when nothing
// was read.
func ReadNullableBinaryData(pcr *FieldReader, count int64) (any, []bool, error) {
	dataType := pcr.field.GetDataType()
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, nil, err
	}
	data := make([]byte, 0, count)
	validData := make([]bool, 0, count)

	// Sparse storage: only store valid rows' data
	for _, chunk := range chunked.Chunks() {
		rows := chunk.Data().Len()
		switch chunk.DataType().ID() {
		case arrow.NULL:
			// an all-null chunk contributes only invalid entries
			for i := 0; i < rows; i++ {
				validData = append(validData, false)
			}
		case arrow.BINARY:
			binaryReader := chunk.(*array.Binary)
			for i := 0; i < rows; i++ {
				if binaryReader.IsNull(i) {
					validData = append(validData, false)
				} else {
					data = append(data, binaryReader.Value(i)...)
					validData = append(validData, true)
				}
			}
		case arrow.LIST:
			listReader := chunk.(*array.List)
			// non-null rows must match the field dimension; null rows are exempt
			if err = checkNullableVectorAligned(listReader.Offsets(), listReader, pcr.dim, dataType); err != nil {
				return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("length of vector is not aligned: %s, data type: %s", err.Error(), dataType.String()))
			}
			uint8Reader, ok := listReader.ListValues().(*array.Uint8)
			if !ok {
				return nil, nil, WrapTypeErr(pcr.field, listReader.ListValues().DataType().Name())
			}
			for i := 0; i < rows; i++ {
				if listReader.IsNull(i) {
					validData = append(validData, false)
				} else {
					// copy only this row's slice of the flat value buffer
					start, end := listReader.ValueOffsets(i)
					data = append(data, uint8Reader.Uint8Values()[start:end]...)
					validData = append(validData, true)
				}
			}
		default:
			return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
		}
	}
	if len(data) == 0 && len(validData) == 0 {
		return nil, nil, nil
	}
	return data, validData, nil
}
|
|
|
|
func parseSparseFloatRowVector(str string) ([]byte, uint32, error) {
|
|
rowVec, err := typeutil.CreateSparseFloatRowFromJSON([]byte(str))
|
|
if err != nil {
|
|
return nil, 0, merr.WrapErrImportFailed(fmt.Sprintf("Invalid JSON string for SparseFloatVector: '%s', err = %v", str, err))
|
|
}
|
|
elemCount := len(rowVec) / 8
|
|
maxIdx := uint32(0)
|
|
|
|
if elemCount > 0 {
|
|
maxIdx = typeutil.SparseFloatRowIndexAt(rowVec, elemCount-1) + 1
|
|
}
|
|
|
|
return rowVec, maxIdx, nil
|
|
}
|
|
|
|
// This method accepts input from ReadStructData()
// For sparse vector, the map[string]arrow.Array is like {"indices": array.List, "values": array.List}
// Although "indices" and "values" is two-dim list, the array.List provides ListValues() and ValueOffsets()
// to return one-dim list. We use the start/end position of ValueOffsets() to get the correct sparse vector
// from ListValues().
// Note that arrow.Uint32.Value(int i) accepts an int32 value, the max length of indices/values is max value of int32
//
// Returns the encoded sparse rows plus the maximum dimension observed.
// NOTE(review): maxDim here records the largest index itself, whereas the
// JSON-string path (parseSparseFloatRowVector) reports largest index + 1 —
// confirm which dimension convention is intended.
func parseSparseFloatVectorStructs(structs []map[string]arrow.Array) ([][]byte, uint32, error) {
	byteArr := make([][]byte, 0)
	maxDim := uint32(0)
	for _, st := range structs {
		indices, ok1 := st[sparseVectorIndice]
		values, ok2 := st[sparseVectorValues]
		if !ok1 || !ok2 {
			return nil, 0, merr.WrapErrImportFailed("Invalid parquet struct for SparseFloatVector: 'indices' or 'values' missed")
		}

		indicesList, ok1 := indices.(*array.List)
		valuesList, ok2 := values.(*array.List)
		if !ok1 || !ok2 {
			return nil, 0, merr.WrapErrImportFailed("Invalid parquet struct for SparseFloatVector: 'indices' or 'values' is not list")
		}

		// Len() is the number of rows in this row group
		if indices.Len() != values.Len() {
			msg := fmt.Sprintf("Invalid parquet struct for SparseFloatVector: number of rows of 'indices' and 'values' mismatched, '%d' vs '%d'", indices.Len(), values.Len())
			return nil, 0, merr.WrapErrImportFailed(msg)
		}

		// technically, DataType() of array.List must be arrow.ListType, but we still check here to ensure safety
		indicesListType, ok1 := indicesList.DataType().(*arrow.ListType)
		valuesListType, ok2 := valuesList.DataType().(*arrow.ListType)
		if !ok1 || !ok2 {
			return nil, 0, merr.WrapErrImportFailed("Invalid parquet struct for SparseFloatVector: incorrect arrow type of 'indices' or 'values'")
		}

		indexDataType := indicesListType.Elem().ID()
		valueDataType := valuesListType.Elem().ID()

		// The array.Uint32/array.Int64/array.Float32/array.Float64 are derived from arrow.Array
		// The ListValues() returns arrow.Array interface, but the arrow.Array doesn't have Value(int) interface
		// To call array.Uint32.Value(int), we need to explicitly cast the ListValues() to array.Uint32
		// So, we declare two methods here to avoid type casting in the "for" loop
		type GetIndex func(position int) uint32
		type GetValue func(position int) float32

		var getIndexFunc GetIndex
		switch indexDataType {
		case arrow.INT32:
			indicesList := indicesList.ListValues().(*array.Int32)
			getIndexFunc = func(position int) uint32 {
				return (uint32)(indicesList.Value(position))
			}
		case arrow.UINT32:
			indicesList := indicesList.ListValues().(*array.Uint32)
			getIndexFunc = func(position int) uint32 {
				return indicesList.Value(position)
			}
		case arrow.INT64:
			indicesList := indicesList.ListValues().(*array.Int64)
			getIndexFunc = func(position int) uint32 {
				return (uint32)(indicesList.Value(position))
			}
		case arrow.UINT64:
			indicesList := indicesList.ListValues().(*array.Uint64)
			getIndexFunc = func(position int) uint32 {
				return (uint32)(indicesList.Value(position))
			}
		default:
			msg := fmt.Sprintf("Invalid parquet struct for SparseFloatVector: index type must be uint32/int32/uint64/int64 but actual type is '%s'", indicesListType.Elem().Name())
			return nil, 0, merr.WrapErrImportFailed(msg)
		}

		var getValueFunc GetValue
		switch valueDataType {
		case arrow.FLOAT32:
			valuesList := valuesList.ListValues().(*array.Float32)
			getValueFunc = func(position int) float32 {
				return valuesList.Value(position)
			}
		case arrow.FLOAT64:
			valuesList := valuesList.ListValues().(*array.Float64)
			getValueFunc = func(position int) float32 {
				return (float32)(valuesList.Value(position))
			}
		default:
			msg := fmt.Sprintf("Invalid parquet struct for SparseFloatVector: value type must be float32 or float64 but actual type is '%s'", valuesListType.Elem().Name())
			return nil, 0, merr.WrapErrImportFailed(msg)
		}

		for i := 0; i < indicesList.Len(); i++ {
			start, end := indicesList.ValueOffsets(i)
			start2, end2 := valuesList.ValueOffsets(i)
			rowLen := (int)(end - start)
			rowLenValues := (int)(end2 - start2)
			if rowLenValues != rowLen {
				msg := fmt.Sprintf("Invalid parquet struct for SparseFloatVector: number of elements of 'indices' and 'values' mismatched, '%d' vs '%d'", rowLen, rowLenValues)
				return nil, 0, merr.WrapErrImportFailed(msg)
			}

			rowIndices := make([]uint32, rowLen)
			rowValues := make([]float32, rowLen)
			// note: this inner i deliberately shadows the row index above; it
			// walks the flat element positions [start, end) of the current row
			for i := start; i < end; i++ {
				rowIndices[i-start] = getIndexFunc((int)(i))
				rowValues[i-start] = getValueFunc((int)(i))
			}

			// ensure the indices is sorted
			sortedIndices, sortedValues := typeutil.SortSparseFloatRow(rowIndices, rowValues)
			rowVec := typeutil.CreateSparseFloatRow(sortedIndices, sortedValues)
			if err := typeutil.ValidateSparseFloatRows(rowVec); err != nil {
				return byteArr, maxDim, err
			}

			// set the maxDim as the last value of sortedIndices since it has been sorted
			if len(sortedIndices) > 0 && sortedIndices[len(sortedIndices)-1] > maxDim {
				maxDim = sortedIndices[len(sortedIndices)-1]
			}
			byteArr = append(byteArr, rowVec) // rowVec could be an empty sparse
		}
	}
	return byteArr, maxDim, nil
}
|
|
|
|
func ReadSparseFloatVectorData(pcr *FieldReader, count int64) (any, error) {
|
|
// read sparse vector from JSON-format string
|
|
if pcr.sparseIsString {
|
|
data, err := ReadStringData(pcr, count, false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if data == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
byteArr := make([][]byte, 0, count)
|
|
maxDim := uint32(0)
|
|
|
|
for _, str := range data.([]string) {
|
|
rowVec, rowMaxIdx, err := parseSparseFloatRowVector(str)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
byteArr = append(byteArr, rowVec)
|
|
if rowMaxIdx > maxDim {
|
|
maxDim = rowMaxIdx
|
|
}
|
|
}
|
|
|
|
return &storage.SparseFloatVectorFieldData{
|
|
SparseFloatArray: schemapb.SparseFloatArray{
|
|
Dim: int64(maxDim),
|
|
Contents: byteArr,
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
// read sparse vector from parquet struct
|
|
data, err := ReadStructData(pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if data == nil {
|
|
return nil, nil
|
|
}
|
|
|
|
byteArr, maxDim, err := parseSparseFloatVectorStructs(data)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &storage.SparseFloatVectorFieldData{
|
|
SparseFloatArray: schemapb.SparseFloatArray{
|
|
Dim: int64(maxDim),
|
|
Contents: byteArr,
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
// ReadNullableSparseFloatVectorData reads a nullable SparseFloatVector column
// and returns (*storage.SparseFloatVectorFieldData, validity mask, error).
// Storage is sparse: Contents holds rows only for valid entries, while the
// returned validData (also embedded in the field data) has one entry per
// input row. Returns (nil, nil, nil) when nothing was read.
func ReadNullableSparseFloatVectorData(pcr *FieldReader, count int64) (any, []bool, error) {
	// sparse vectors may be stored as JSON strings or as parquet structs;
	// pcr.sparseIsString selects the decode path
	if pcr.sparseIsString {
		data, validData, err := ReadNullableStringData(pcr, count)
		if err != nil {
			return nil, nil, err
		}
		if data == nil {
			return nil, nil, nil
		}

		// Sparse storage: only store valid rows' data
		byteArr := make([][]byte, 0, count)
		maxDim := uint32(0)

		for i, str := range data.([]string) {
			if validData[i] {
				rowVec, rowMaxIdx, err := parseSparseFloatRowVector(str)
				if err != nil {
					return nil, nil, err
				}
				byteArr = append(byteArr, rowVec)
				if rowMaxIdx > maxDim {
					maxDim = rowMaxIdx
				}
			}
		}

		return &storage.SparseFloatVectorFieldData{
			SparseFloatArray: schemapb.SparseFloatArray{
				Dim:      int64(maxDim),
				Contents: byteArr,
			},
			ValidData: validData,
			Nullable:  true,
		}, validData, nil
	}

	data, validData, err := ReadNullableStructData(pcr, count)
	if err != nil {
		return nil, nil, err
	}
	if data == nil {
		return nil, nil, nil
	}

	// Sparse storage: only store valid rows' data
	byteArr := make([][]byte, 0, count)
	maxDim := uint32(0)

	for i, structData := range data {
		if validData[i] {
			// parse one row at a time so the row/validity alignment is preserved
			singleByteArr, singleMaxDim, err := parseSparseFloatVectorStructs([]map[string]arrow.Array{structData})
			if err != nil {
				return nil, nil, err
			}
			if len(singleByteArr) > 0 {
				byteArr = append(byteArr, singleByteArr[0])
				if singleMaxDim > maxDim {
					maxDim = singleMaxDim
				}
			}
		}
	}

	return &storage.SparseFloatVectorFieldData{
		SparseFloatArray: schemapb.SparseFloatArray{
			Dim:      int64(maxDim),
			Contents: byteArr,
		},
		ValidData: validData,
		Nullable:  true,
	}, validData, nil
}
|
|
|
|
// checkVectorAlignWithDim verifies that every consecutive pair of list
// offsets differs by exactly dim, i.e. that every row holds dim elements.
func checkVectorAlignWithDim(offsets []int32, dim int32) error {
	for i := 0; i+1 < len(offsets); i++ {
		if rowLen := offsets[i+1] - offsets[i]; rowLen != dim {
			return fmt.Errorf("expected %d but got %d", dim, rowLen)
		}
	}
	return nil
}
|
|
|
|
func checkNullableVectorAlignWithDim(offsets []int32, listReader *array.List, dim int32) error {
|
|
for i := 1; i < len(offsets); i++ {
|
|
length := offsets[i] - offsets[i-1]
|
|
if !listReader.IsNull(i-1) && length != dim {
|
|
return fmt.Errorf("expected %d but got %d", dim, length)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func checkVectorAligned(offsets []int32, dim int, dataType schemapb.DataType) error {
|
|
if len(offsets) < 1 {
|
|
return errors.New("empty offsets")
|
|
}
|
|
switch dataType {
|
|
case schemapb.DataType_BinaryVector:
|
|
return checkVectorAlignWithDim(offsets, int32(dim/8))
|
|
case schemapb.DataType_FloatVector:
|
|
return checkVectorAlignWithDim(offsets, int32(dim))
|
|
case schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector:
|
|
return checkVectorAlignWithDim(offsets, int32(dim*2))
|
|
case schemapb.DataType_SparseFloatVector:
|
|
// JSON format, skip alignment check
|
|
return nil
|
|
case schemapb.DataType_Int8Vector:
|
|
return checkVectorAlignWithDim(offsets, int32(dim))
|
|
default:
|
|
return fmt.Errorf("unexpected vector data type %s", dataType.String())
|
|
}
|
|
}
|
|
|
|
func checkNullableVectorAligned(offsets []int32, listReader *array.List, dim int, dataType schemapb.DataType) error {
|
|
if len(offsets) < 1 {
|
|
return errors.New("empty offsets")
|
|
}
|
|
switch dataType {
|
|
case schemapb.DataType_BinaryVector:
|
|
return checkNullableVectorAlignWithDim(offsets, listReader, int32(dim/8))
|
|
case schemapb.DataType_FloatVector:
|
|
return checkNullableVectorAlignWithDim(offsets, listReader, int32(dim))
|
|
case schemapb.DataType_Float16Vector, schemapb.DataType_BFloat16Vector:
|
|
return checkNullableVectorAlignWithDim(offsets, listReader, int32(dim*2))
|
|
case schemapb.DataType_SparseFloatVector:
|
|
return nil
|
|
case schemapb.DataType_Int8Vector:
|
|
return checkNullableVectorAlignWithDim(offsets, listReader, int32(dim))
|
|
default:
|
|
return fmt.Errorf("unexpected vector data type %s", dataType.String())
|
|
}
|
|
}
|
|
|
|
// getArrayData walks the rows described by list offsets, materializes each row
// via getElement, and hands it to outputArray. A row is reported valid when it
// is non-empty (start offset != end offset). The first element error aborts
// the walk.
func getArrayData[T any](offsets []int32, getElement func(int) (T, error), outputArray func(arr []T, valid bool)) error {
	for i := 0; i+1 < len(offsets); i++ {
		start, end := offsets[i], offsets[i+1]
		row := make([]T, 0, end-start)
		for pos := start; pos < end; pos++ {
			v, err := getElement(int(pos))
			if err != nil {
				return err
			}
			row = append(row, v)
		}
		outputArray(row, start != end)
	}
	return nil
}
|
|
|
|
func getArrayDataNullable[T any](offsets []int32, listReader *array.List, getElement func(int) (T, error), outputArray func(arr []T, valid bool)) error {
|
|
for i := 1; i < len(offsets); i++ {
|
|
isValid := !listReader.IsNull(i - 1)
|
|
|
|
start, end := offsets[i-1], offsets[i]
|
|
arrData := make([]T, 0, end-start)
|
|
for j := start; j < end; j++ {
|
|
elementVal, err := getElement(int(j))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
arrData = append(arrData, elementVal)
|
|
}
|
|
outputArray(arrData, isValid)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ReadBoolArrayData reads a non-nullable column of bool arrays and returns
// the rows as [][]bool. Null rows and null elements are rejected because the
// field is not nullable. Returns (nil, nil) when no rows were read.
func ReadBoolArrayData(pcr *FieldReader, count int64) (any, error) {
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, err
	}
	data := make([][]bool, 0, count)
	for _, chunk := range chunked.Chunks() {
		if chunk.NullN() > 0 {
			// Array field is not nullable, but some arrays are null
			return nil, WrapNullRowErr(pcr.field)
		}
		listReader, ok := chunk.(*array.List)
		if !ok {
			return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
		}
		boolReader, ok := listReader.ListValues().(*array.Boolean)
		if !ok {
			return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
		}
		offsets := listReader.Offsets()
		err = getArrayData(offsets, func(i int) (bool, error) {
			if boolReader.IsNull(i) {
				// array contains null values is not allowed
				return false, WrapNullElementErr(pcr.field)
			}
			return boolReader.Value(i), nil
		}, func(arr []bool, valid bool) {
			// validity is irrelevant here: the field is not nullable
			data = append(data, arr)
		})
		if err != nil {
			return nil, err
		}
	}
	if len(data) == 0 {
		return nil, nil
	}
	return data, nil
}
|
|
|
|
// ReadNullableBoolArrayData reads a nullable column of bool arrays and returns
// (rows as [][]bool, validity mask, error). All-null chunks (*array.Null)
// contribute nil rows with false validity entries. Null elements inside an
// array are rejected. Returns (nil, nil, nil) when no rows were read.
//
// NOTE(review): row validity comes from getArrayData, which marks a row valid
// only when it is non-empty (start offset != end offset); an empty-but-non-null
// array row is therefore recorded as invalid — confirm this is intended
// (getArrayDataNullable derives validity from the arrow null bitmap instead).
func ReadNullableBoolArrayData(pcr *FieldReader, count int64) (any, []bool, error) {
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, nil, err
	}
	data := make([][]bool, 0, count)
	validData := make([]bool, 0, count)
	for _, chunk := range chunked.Chunks() {
		listReader, ok := chunk.(*array.List)
		if !ok {
			// the chunk type may be *array.Null if the data in chunk is all null
			_, ok := chunk.(*array.Null)
			if !ok {
				return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
			}
			// emit one nil row + false validity per null entry to keep alignment
			dataNums := chunk.Data().Len()
			validData = append(validData, make([]bool, dataNums)...)
			data = append(data, make([][]bool, dataNums)...)
		} else {
			boolReader, ok := listReader.ListValues().(*array.Boolean)
			if !ok {
				return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
			}
			offsets := listReader.Offsets()
			err = getArrayData(offsets, func(i int) (bool, error) {
				if boolReader.IsNull(i) {
					// array contains null values is not allowed
					return false, WrapNullElementErr(pcr.field)
				}
				return boolReader.Value(i), nil
			}, func(arr []bool, valid bool) {
				data = append(data, arr)
				validData = append(validData, valid)
			})
			if err != nil {
				return nil, nil, err
			}
		}
	}
	// data and validData must stay row-aligned
	if len(data) != len(validData) {
		return nil, nil, merr.WrapErrParameterInvalid(len(data), len(validData), "length of data is not equal to length of valid_data")
	}
	if len(data) == 0 {
		return nil, nil, nil
	}
	return data, validData, nil
}
|
|
|
|
func ReadIntegerOrFloatArrayData[T constraints.Integer | constraints.Float](pcr *FieldReader, count int64) (any, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
data := make([][]T, 0, count)
|
|
|
|
for _, chunk := range chunked.Chunks() {
|
|
if chunk.NullN() > 0 {
|
|
// Array field is not nullable, but some arrays are null
|
|
return nil, WrapNullRowErr(pcr.field)
|
|
}
|
|
listReader, ok := chunk.(*array.List)
|
|
if !ok {
|
|
return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
offsets := listReader.Offsets()
|
|
dataType := pcr.field.GetDataType()
|
|
if typeutil.IsVectorType(dataType) {
|
|
if err = checkVectorAligned(offsets, pcr.dim, dataType); err != nil {
|
|
return nil, merr.WrapErrImportFailed(fmt.Sprintf("length of vector is not aligned: %s, data type: %s", err.Error(), dataType.String()))
|
|
}
|
|
}
|
|
valueReader := listReader.ListValues()
|
|
switch valueReader.DataType().ID() {
|
|
case arrow.INT8:
|
|
int8Reader := valueReader.(*array.Int8)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if int8Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(int8Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case arrow.INT16:
|
|
int16Reader := valueReader.(*array.Int16)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if int16Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(int16Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case arrow.INT32:
|
|
int32Reader := valueReader.(*array.Int32)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if int32Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(int32Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case arrow.INT64:
|
|
int64Reader := valueReader.(*array.Int64)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if int64Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(int64Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case arrow.FLOAT32:
|
|
float32Reader := valueReader.(*array.Float32)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if float32Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0.0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(float32Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
case arrow.FLOAT64:
|
|
float64Reader := valueReader.(*array.Float64)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if float64Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0.0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(float64Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
default:
|
|
return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil
|
|
}
|
|
return data, nil
|
|
}
|
|
|
|
func ReadNullableIntegerOrFloatArrayData[T constraints.Integer | constraints.Float](pcr *FieldReader, count int64) (any, []bool, error) {
|
|
chunked, err := pcr.columnReader.NextBatch(count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data := make([][]T, 0, count)
|
|
validData := make([]bool, 0, count)
|
|
|
|
for _, chunk := range chunked.Chunks() {
|
|
listReader, ok := chunk.(*array.List)
|
|
if !ok {
|
|
// the chunk type may be *array.Null if the data in chunk is all null
|
|
_, ok := chunk.(*array.Null)
|
|
if !ok {
|
|
return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
dataNums := chunk.Data().Len()
|
|
validData = append(validData, make([]bool, dataNums)...)
|
|
data = append(data, make([][]T, dataNums)...)
|
|
} else {
|
|
offsets := listReader.Offsets()
|
|
dataType := pcr.field.GetDataType()
|
|
if typeutil.IsVectorType(dataType) {
|
|
if err = checkVectorAligned(offsets, pcr.dim, dataType); err != nil {
|
|
return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("length of vector is not aligned: %s, data type: %s", err.Error(), dataType.String()))
|
|
}
|
|
}
|
|
valueReader := listReader.ListValues()
|
|
switch valueReader.DataType().ID() {
|
|
case arrow.INT8:
|
|
int8Reader := valueReader.(*array.Int8)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if int8Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(int8Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
validData = append(validData, valid)
|
|
})
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
case arrow.INT16:
|
|
int16Reader := valueReader.(*array.Int16)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if int16Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(int16Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
validData = append(validData, valid)
|
|
})
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
case arrow.INT32:
|
|
int32Reader := valueReader.(*array.Int32)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if int32Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(int32Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
validData = append(validData, valid)
|
|
})
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
case arrow.INT64:
|
|
int64Reader := valueReader.(*array.Int64)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if int64Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(int64Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
validData = append(validData, valid)
|
|
})
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
case arrow.FLOAT32:
|
|
float32Reader := valueReader.(*array.Float32)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if float32Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0.0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(float32Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
validData = append(validData, valid)
|
|
})
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
case arrow.FLOAT64:
|
|
float64Reader := valueReader.(*array.Float64)
|
|
err = getArrayData(offsets, func(i int) (T, error) {
|
|
if float64Reader.IsNull(i) {
|
|
// array contains null values is not allowed
|
|
return 0.0, WrapNullElementErr(pcr.field)
|
|
}
|
|
return T(float64Reader.Value(i)), nil
|
|
}, func(arr []T, valid bool) {
|
|
data = append(data, arr)
|
|
validData = append(validData, valid)
|
|
})
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
default:
|
|
return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
|
|
}
|
|
}
|
|
}
|
|
if len(data) != len(validData) {
|
|
return nil, nil, merr.WrapErrParameterInvalid(len(data), len(validData), "length of data is not equal to length of valid_data")
|
|
}
|
|
if len(data) == 0 {
|
|
return nil, nil, nil
|
|
}
|
|
return data, validData, nil
|
|
}
|
|
|
|
// ReadNullableFloatVectorData reads a nullable FloatVector column and returns
// (flat []float32, validity mask, error). Storage is sparse: floats are
// appended only for valid rows; validData carries one entry per input row.
// Collected floats are checked for NaN/Inf before returning.
// Returns (nil, nil, nil) when nothing was read.
func ReadNullableFloatVectorData(pcr *FieldReader, count int64) (any, []bool, error) {
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, nil, err
	}
	data := make([]float32, 0, int(count)*pcr.dim)
	validData := make([]bool, 0, count)

	for _, chunk := range chunked.Chunks() {
		listReader, ok := chunk.(*array.List)
		if !ok {
			// the chunk type may be *array.Null if the data in chunk is all null
			_, ok := chunk.(*array.Null)
			if !ok {
				return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
			}
			// all-null chunk: only validity entries, no vector data
			dataNums := chunk.Data().Len()
			validData = append(validData, make([]bool, dataNums)...)
			continue
		}

		dataType := pcr.field.GetDataType()
		offsets := listReader.Offsets()
		// non-null rows must match the field dimension; null rows are exempt
		if err = checkNullableVectorAligned(offsets, listReader, pcr.dim, dataType); err != nil {
			return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("length of vector is not aligned: %s, data type: %s", err.Error(), dataType.String()))
		}

		valueReader := listReader.ListValues()
		rows := listReader.Len()

		// Sparse storage: only store valid rows' data
		float32Reader, ok := valueReader.(*array.Float32)
		if !ok {
			return nil, nil, WrapTypeErr(pcr.field, valueReader.DataType().Name())
		}
		for i := 0; i < rows; i++ {
			validData = append(validData, !listReader.IsNull(i))
			if !listReader.IsNull(i) {
				start, end := offsets[i], offsets[i+1]
				for j := start; j < end; j++ {
					data = append(data, float32Reader.Value(int(j)))
				}
			}
		}
	}
	if len(data) == 0 && len(validData) == 0 {
		return nil, nil, nil
	}
	// reject NaN/Inf values in the collected floats
	return data, validData, typeutil.VerifyFloats32(data)
}
|
|
|
|
// ReadNullableInt8VectorData reads a nullable Int8Vector column and returns
// (flat []int8, validity mask, error). Storage is sparse: values are appended
// only for valid rows; validData carries one entry per input row.
// Returns (nil, nil, nil) when nothing was read.
func ReadNullableInt8VectorData(pcr *FieldReader, count int64) (any, []bool, error) {
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, nil, err
	}
	data := make([]int8, 0, int(count)*pcr.dim)
	validData := make([]bool, 0, count)

	for _, chunk := range chunked.Chunks() {
		listReader, ok := chunk.(*array.List)
		if !ok {
			// the chunk type may be *array.Null if the data in chunk is all null
			_, ok := chunk.(*array.Null)
			if !ok {
				return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
			}
			// all-null chunk: only validity entries, no vector data
			dataNums := chunk.Data().Len()
			validData = append(validData, make([]bool, dataNums)...)
			continue
		}

		dataType := pcr.field.GetDataType()
		offsets := listReader.Offsets()
		// non-null rows must match the field dimension; null rows are exempt
		if err = checkNullableVectorAligned(offsets, listReader, pcr.dim, dataType); err != nil {
			return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("length of vector is not aligned: %s, data type: %s", err.Error(), dataType.String()))
		}

		valueReader := listReader.ListValues()
		rows := listReader.Len()

		// Sparse storage: only store valid rows' data
		int8Reader, ok := valueReader.(*array.Int8)
		if !ok {
			return nil, nil, WrapTypeErr(pcr.field, valueReader.DataType().Name())
		}
		for i := 0; i < rows; i++ {
			validData = append(validData, !listReader.IsNull(i))
			if !listReader.IsNull(i) {
				start, end := offsets[i], offsets[i+1]
				for j := start; j < end; j++ {
					data = append(data, int8Reader.Value(int(j)))
				}
			}
		}
	}
	if len(data) == 0 && len(validData) == 0 {
		return nil, nil, nil
	}
	return data, validData, nil
}
|
|
|
|
// ReadStringArrayData reads a non-nullable column of string arrays and returns
// the rows as [][]string. Each string element is validated against the field's
// max_length. Null rows and null elements are rejected because the field is
// not nullable. Returns (nil, nil) when no rows were read.
func ReadStringArrayData(pcr *FieldReader, count int64) (any, error) {
	maxLength, err := parameterutil.GetMaxLength(pcr.field)
	if err != nil {
		return nil, err
	}
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, err
	}
	data := make([][]string, 0, count)
	for _, chunk := range chunked.Chunks() {
		if chunk.NullN() > 0 {
			// Array field is not nullable, but some arrays are null
			return nil, WrapNullRowErr(pcr.field)
		}
		listReader, ok := chunk.(*array.List)
		if !ok {
			return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
		}
		stringReader, ok := listReader.ListValues().(*array.String)
		if !ok {
			return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
		}
		offsets := listReader.Offsets()
		err = getArrayData(offsets, func(i int) (string, error) {
			if stringReader.IsNull(i) {
				// array contains null values is not allowed
				return "", WrapNullElementErr(pcr.field)
			}
			val := stringReader.Value(i)
			// enforce the schema's max_length constraint per element
			if err = common.CheckValidString(val, maxLength, pcr.field); err != nil {
				return val, err
			}
			return val, nil
		}, func(arr []string, valid bool) {
			// validity is irrelevant here: the field is not nullable
			data = append(data, arr)
		})
		if err != nil {
			return nil, err
		}
	}
	if len(data) == 0 {
		return nil, nil
	}
	return data, nil
}
|
|
|
|
// ReadNullableStringArrayData reads a nullable column of string arrays and
// returns (rows as [][]string, validity mask, error). All-null chunks
// (*array.Null) contribute nil rows with false validity entries. Each string
// element is validated against the field's max_length; null elements inside
// an array are rejected. Returns (nil, nil, nil) when no rows were read.
//
// NOTE(review): row validity comes from getArrayData, which marks a row valid
// only when it is non-empty (start offset != end offset); an empty-but-non-null
// array row is therefore recorded as invalid — confirm this is intended
// (getArrayDataNullable derives validity from the arrow null bitmap instead).
func ReadNullableStringArrayData(pcr *FieldReader, count int64) (any, []bool, error) {
	maxLength, err := parameterutil.GetMaxLength(pcr.field)
	if err != nil {
		return nil, nil, err
	}
	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, nil, err
	}
	data := make([][]string, 0, count)
	validData := make([]bool, 0, count)
	for _, chunk := range chunked.Chunks() {
		listReader, ok := chunk.(*array.List)
		if !ok {
			// the chunk type may be *array.Null if the data in chunk is all null
			_, ok := chunk.(*array.Null)
			if !ok {
				return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
			}
			// emit one nil row + false validity per null entry to keep alignment
			dataNums := chunk.Data().Len()
			validData = append(validData, make([]bool, dataNums)...)
			data = append(data, make([][]string, dataNums)...)
		} else {
			stringReader, ok := listReader.ListValues().(*array.String)
			if !ok {
				return nil, nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
			}
			offsets := listReader.Offsets()
			err = getArrayData(offsets, func(i int) (string, error) {
				if stringReader.IsNull(i) {
					// array contains null values is not allowed
					return "", WrapNullElementErr(pcr.field)
				}
				val := stringReader.Value(i)
				// enforce the schema's max_length constraint per element
				if err = common.CheckValidString(val, maxLength, pcr.field); err != nil {
					return val, err
				}
				return val, nil
			}, func(arr []string, valid bool) {
				data = append(data, arr)
				validData = append(validData, valid)
			})
			if err != nil {
				return nil, nil, err
			}
		}
	}

	// data and validData must stay row-aligned
	if len(data) != len(validData) {
		return nil, nil, merr.WrapErrImportFailed(
			fmt.Sprintf("length(%d) of data is not equal to length(%d) of valid_data for field '%s'",
				len(data), len(validData), pcr.field.GetName()))
	}
	if len(data) == 0 {
		return nil, nil, nil
	}
	return data, validData, nil
}
|
|
|
|
func ReadArrayData(pcr *FieldReader, count int64) (any, error) {
|
|
data := make([]*schemapb.ScalarField, 0, count)
|
|
maxCapacity, err := parameterutil.GetMaxCapacity(pcr.field)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
elementType := pcr.field.GetElementType()
|
|
switch elementType {
|
|
case schemapb.DataType_Bool:
|
|
boolArray, err := ReadBoolArrayData(pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if boolArray == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range boolArray.([][]bool) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_BoolData{
|
|
BoolData: &schemapb.BoolArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
case schemapb.DataType_Int8:
|
|
int8Array, err := ReadIntegerOrFloatArrayData[int32](pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if int8Array == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range int8Array.([][]int32) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_IntData{
|
|
IntData: &schemapb.IntArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
case schemapb.DataType_Int16:
|
|
int16Array, err := ReadIntegerOrFloatArrayData[int32](pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if int16Array == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range int16Array.([][]int32) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_IntData{
|
|
IntData: &schemapb.IntArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
case schemapb.DataType_Int32:
|
|
int32Array, err := ReadIntegerOrFloatArrayData[int32](pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if int32Array == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range int32Array.([][]int32) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_IntData{
|
|
IntData: &schemapb.IntArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
case schemapb.DataType_Int64:
|
|
int64Array, err := ReadIntegerOrFloatArrayData[int64](pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if int64Array == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range int64Array.([][]int64) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_LongData{
|
|
LongData: &schemapb.LongArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
case schemapb.DataType_Float:
|
|
float32Array, err := ReadIntegerOrFloatArrayData[float32](pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if float32Array == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range float32Array.([][]float32) {
|
|
if err := typeutil.VerifyFloats32(elementArray); err != nil {
|
|
return nil, fmt.Errorf("float32 verification failed: %w", err)
|
|
}
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_FloatData{
|
|
FloatData: &schemapb.FloatArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
case schemapb.DataType_Double:
|
|
float64Array, err := ReadIntegerOrFloatArrayData[float64](pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if float64Array == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range float64Array.([][]float64) {
|
|
if err := typeutil.VerifyFloats64(elementArray); err != nil {
|
|
return nil, fmt.Errorf("float64 verification failed: %w", err)
|
|
}
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_DoubleData{
|
|
DoubleData: &schemapb.DoubleArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
case schemapb.DataType_Timestamptz:
|
|
int64Array, err := ReadIntegerOrFloatArrayData[int64](pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if int64Array == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range int64Array.([][]int64) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_TimestamptzData{
|
|
TimestamptzData: &schemapb.TimestamptzArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
case schemapb.DataType_VarChar, schemapb.DataType_String:
|
|
stringArray, err := ReadStringArrayData(pcr, count)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if stringArray == nil {
|
|
return nil, nil
|
|
}
|
|
for _, elementArray := range stringArray.([][]string) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_StringData{
|
|
StringData: &schemapb.StringArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
default:
|
|
return nil, merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type '%s' for array field '%s'",
|
|
elementType.String(), pcr.field.GetName()))
|
|
}
|
|
return data, nil
|
|
}
|
|
|
|
func ReadNullableArrayData(pcr *FieldReader, count int64) (any, []bool, error) {
|
|
data := make([]*schemapb.ScalarField, 0, count)
|
|
maxCapacity, err := parameterutil.GetMaxCapacity(pcr.field)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
elementType := pcr.field.GetElementType()
|
|
switch elementType {
|
|
case schemapb.DataType_Bool:
|
|
boolArray, validData, err := ReadNullableBoolArrayData(pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if boolArray == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range boolArray.([][]bool) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_BoolData{
|
|
BoolData: &schemapb.BoolArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
case schemapb.DataType_Int8:
|
|
int8Array, validData, err := ReadNullableIntegerOrFloatArrayData[int32](pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if int8Array == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range int8Array.([][]int32) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_IntData{
|
|
IntData: &schemapb.IntArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
case schemapb.DataType_Int16:
|
|
int16Array, validData, err := ReadNullableIntegerOrFloatArrayData[int32](pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if int16Array == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range int16Array.([][]int32) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_IntData{
|
|
IntData: &schemapb.IntArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
case schemapb.DataType_Int32:
|
|
int32Array, validData, err := ReadNullableIntegerOrFloatArrayData[int32](pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if int32Array == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range int32Array.([][]int32) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_IntData{
|
|
IntData: &schemapb.IntArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
case schemapb.DataType_Int64:
|
|
int64Array, validData, err := ReadNullableIntegerOrFloatArrayData[int64](pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if int64Array == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range int64Array.([][]int64) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_LongData{
|
|
LongData: &schemapb.LongArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
case schemapb.DataType_Float:
|
|
float32Array, validData, err := ReadNullableIntegerOrFloatArrayData[float32](pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if float32Array == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range float32Array.([][]float32) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_FloatData{
|
|
FloatData: &schemapb.FloatArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
case schemapb.DataType_Double:
|
|
float64Array, validData, err := ReadNullableIntegerOrFloatArrayData[float64](pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if float64Array == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range float64Array.([][]float64) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_DoubleData{
|
|
DoubleData: &schemapb.DoubleArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
case schemapb.DataType_Timestamptz:
|
|
int64Array, validData, err := ReadNullableIntegerOrFloatArrayData[int64](pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if int64Array == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range int64Array.([][]int64) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_TimestamptzData{
|
|
TimestamptzData: &schemapb.TimestamptzArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
case schemapb.DataType_VarChar, schemapb.DataType_String:
|
|
stringArray, validData, err := ReadNullableStringArrayData(pcr, count)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if stringArray == nil {
|
|
return nil, nil, nil
|
|
}
|
|
for _, elementArray := range stringArray.([][]string) {
|
|
if err = common.CheckArrayCapacity(len(elementArray), maxCapacity, pcr.field); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
data = append(data, &schemapb.ScalarField{
|
|
Data: &schemapb.ScalarField_StringData{
|
|
StringData: &schemapb.StringArray{
|
|
Data: elementArray,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
return data, validData, nil
|
|
default:
|
|
return nil, nil, merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type '%s' for array field '%s'",
|
|
elementType.String(), pcr.field.GetName()))
|
|
}
|
|
}
|
|
|
|
// ReadVectorArrayData reads one batch (up to count rows) of an ArrayOfVector
// column and converts each row into a *schemapb.VectorField. The parquet
// representation is a list of fixed-size binary values, one binary value per
// vector. Null rows are rejected (this reader is for non-nullable fields
// only). A (nil, nil) return means the column is exhausted.
func ReadVectorArrayData(pcr *FieldReader, count int64) (any, error) {
	data := make([]*schemapb.VectorField, 0, count)
	maxCapacity, err := parameterutil.GetMaxCapacity(pcr.field)
	if err != nil {
		return nil, err
	}

	dim, err := typeutil.GetDim(pcr.field)
	if err != nil {
		return nil, err
	}

	chunked, err := pcr.columnReader.NextBatch(count)
	if err != nil {
		return nil, err
	}

	if chunked == nil {
		return nil, nil
	}

	elementType := pcr.field.GetElementType()
	switch elementType {
	case schemapb.DataType_FloatVector:
		for _, chunk := range chunked.Chunks() {
			// Null rows are not allowed for a non-nullable vector-array field.
			if chunk.NullN() > 0 {
				return nil, WrapNullRowErr(pcr.field)
			}
			// ArrayOfVector is stored as list of fixed size binary
			listReader, ok := chunk.(*array.List)
			if !ok {
				return nil, WrapTypeErr(pcr.field, chunk.DataType().Name())
			}

			fixedBinaryReader, ok := listReader.ListValues().(*array.FixedSizeBinary)
			if !ok {
				return nil, WrapTypeErr(pcr.field, listReader.ListValues().DataType().Name())
			}

			// Check that each vector has the correct byte size (dim * 4 bytes for float32)
			expectedByteSize := int(dim) * 4
			actualByteSize := fixedBinaryReader.DataType().(*arrow.FixedSizeBinaryType).ByteWidth
			if actualByteSize != expectedByteSize {
				return nil, merr.WrapErrImportFailed(fmt.Sprintf("vector byte size mismatch: expected %d, got %d for field '%s'",
					expectedByteSize, actualByteSize, pcr.field.GetName()))
			}

			// Each offsets pair [i-1, i) delimits the vectors belonging to one row.
			offsets := listReader.Offsets()
			for i := 1; i < len(offsets); i++ {
				start, end := offsets[i-1], offsets[i]
				vectorCount := end - start

				// The number of vectors per row is bounded by max_capacity.
				if err = common.CheckArrayCapacity(int(vectorCount), maxCapacity, pcr.field); err != nil {
					return nil, err
				}

				// Convert binary data to float32 array using arrow's built-in conversion
				totalFloats := vectorCount * int32(dim)
				floatData := make([]float32, totalFloats)
				for j := int32(0); j < vectorCount; j++ {
					vectorIndex := start + j
					binaryData := fixedBinaryReader.Value(int(vectorIndex))
					// Reinterpret the raw bytes as float32 values, then copy them
					// into this row's flattened [vectorCount*dim] buffer.
					vectorFloats := arrow.Float32Traits.CastFromBytes(binaryData)
					copy(floatData[j*int32(dim):(j+1)*int32(dim)], vectorFloats)
				}

				data = append(data, &schemapb.VectorField{
					Dim: dim,
					Data: &schemapb.VectorField_FloatVector{
						FloatVector: &schemapb.FloatArray{
							Data: floatData,
						},
					},
				})
			}
		}
	default:
		return nil, merr.WrapErrImportFailed(fmt.Sprintf("unsupported data type '%s' for vector field '%s'",
			elementType.String(), pcr.field.GetName()))
	}

	return data, nil
}
|