// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package utils

import (
	"fmt"
	"path/filepath"
	"strconv"
	"strings"

	"github.com/apache/arrow/go/v12/arrow"
	"github.com/apache/arrow/go/v12/arrow/endian"
	"github.com/cockroachdb/errors"
	"github.com/google/uuid"

	"github.com/milvus-io/milvus/internal/storagev2/common/constant"
	"github.com/milvus-io/milvus/internal/storagev2/common/log"
	"github.com/milvus-io/milvus/pkg/proto/storagev2pb"
)

var ErrInvalidArgument = errors.New("invalid argument")

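// ToProtobufType maps an arrow.Type id to the corresponding storagev2pb.LogicType,
// returning ErrInvalidArgument when the id falls outside the known range.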
func ToProtobufType(dataType arrow.Type) (storagev2pb.LogicType, error) {
	typeId := int(dataType)
	if typeId < 0 || typeId >= int(storagev2pb.LogicType_MAX_ID) {
		return storagev2pb.LogicType_NA, fmt.Errorf("parse data type %v: %w", dataType, ErrInvalidArgument)
	}
	return storagev2pb.LogicType(typeId), nil
}

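// ToProtobufMetadata converts arrow.Metadata into a storagev2pb.KeyValueMetadata message.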
func ToProtobufMetadata(metadata *arrow.Metadata) (*storagev2pb.KeyValueMetadata, error) {
	keys := metadata.Keys()
	values := metadata.Values()
	return &storagev2pb.KeyValueMetadata{Keys: keys, Values: values}, nil
}

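// ToProtobufDataType converts an arrow.DataType, including any nested child fields,
// into a storagev2pb.DataType message.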
func ToProtobufDataType(dataType arrow.DataType) (*storagev2pb.DataType, error) {
	protoType := &storagev2pb.DataType{}
	err := SetTypeValues(protoType, dataType)
	if err != nil {
		return nil, err
	}
	logicType, err := ToProtobufType(dataType.ID())
	if err != nil {
		return nil, err
	}
	protoType.LogicType = logicType

	// Ranging over an empty slice is a no-op, so nested and flat types share one path.
	for _, field := range GetFields(dataType) {
		fieldCopy := field
		protoFieldType, err := ToProtobufField(&fieldCopy)
		if err != nil {
			return nil, err
		}
		protoType.Children = append(protoType.Children, protoFieldType)
	}

	return protoType, nil
}

// GetFields returns the child fields of nested Arrow types (list, struct, map,
// fixed-size list) and nil for all other types.
// TODO: CHECK MORE TYPES
func GetFields(dataType arrow.DataType) []arrow.Field {
	switch dataType.ID() {
	case arrow.LIST:
		listType, _ := dataType.(*arrow.ListType)
		return listType.Fields()
	case arrow.STRUCT:
		structType, _ := dataType.(*arrow.StructType)
		return structType.Fields()
	case arrow.MAP:
		mapType, _ := dataType.(*arrow.MapType)
		return mapType.Fields()
	case arrow.FIXED_SIZE_LIST:
		listType, _ := dataType.(*arrow.FixedSizeListType)
		return listType.Fields()
	default:
		return nil
	}
}

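// ToProtobufField converts a single arrow.Field, along with its metadata and data type,
// into a storagev2pb.Field message.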
func ToProtobufField(field *arrow.Field) (*storagev2pb.Field, error) {
	protoField := &storagev2pb.Field{}
	protoField.Name = field.Name
	protoField.Nullable = field.Nullable

	if field.Metadata.Len() != 0 {
		fieldMetadata, err := ToProtobufMetadata(&field.Metadata)
		if err != nil {
			return nil, fmt.Errorf("convert to protobuf field: %w", err)
		}
		protoField.Metadata = fieldMetadata
	}

	dataType, err := ToProtobufDataType(field.Type)
	if err != nil {
		return nil, fmt.Errorf("convert to protobuf field: %w", err)
	}
	protoField.DataType = dataType
	return protoField, nil
}

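// SetTypeValues fills in the type-specific parameters (byte width, list size,
// dictionary index/value types, map key ordering) on protoType for Arrow types
// that carry extra configuration; all other types are left unchanged.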
func SetTypeValues(protoType *storagev2pb.DataType, dataType arrow.DataType) error {
	switch dataType.ID() {
	case arrow.FIXED_SIZE_BINARY:
		realType, ok := dataType.(*arrow.FixedSizeBinaryType)
		if !ok {
			return fmt.Errorf("convert to fixed size binary type: %w", ErrInvalidArgument)
		}
		fixedSizeBinaryType := &storagev2pb.FixedSizeBinaryType{}
		fixedSizeBinaryType.ByteWidth = int32(realType.ByteWidth)
		protoType.TypeRelatedValues = &storagev2pb.DataType_FixedSizeBinaryType{FixedSizeBinaryType: fixedSizeBinaryType}
	case arrow.FIXED_SIZE_LIST:
		realType, ok := dataType.(*arrow.FixedSizeListType)
		if !ok {
			return fmt.Errorf("convert to fixed size list type: %w", ErrInvalidArgument)
		}
		fixedSizeListType := &storagev2pb.FixedSizeListType{}
		fixedSizeListType.ListSize = realType.Len()
		protoType.TypeRelatedValues = &storagev2pb.DataType_FixedSizeListType{FixedSizeListType: fixedSizeListType}
	case arrow.DICTIONARY:
		realType, ok := dataType.(*arrow.DictionaryType)
		if !ok {
			return fmt.Errorf("convert to dictionary type: %w", ErrInvalidArgument)
		}
		dictionaryType := &storagev2pb.DictionaryType{}
		indexType, err := ToProtobufDataType(realType.IndexType)
		if err != nil {
			return err
		}
		dictionaryType.IndexType = indexType
		valueType, err := ToProtobufDataType(realType.ValueType)
		if err != nil {
			return err
		}
		dictionaryType.ValueType = valueType
		dictionaryType.Ordered = realType.Ordered
		protoType.TypeRelatedValues = &storagev2pb.DataType_DictionaryType{DictionaryType: dictionaryType}
	case arrow.MAP:
		realType, ok := dataType.(*arrow.MapType)
		if !ok {
			return fmt.Errorf("convert to map type: %w", ErrInvalidArgument)
		}
		mapType := &storagev2pb.MapType{}
		mapType.KeysSorted = realType.KeysSorted
		protoType.TypeRelatedValues = &storagev2pb.DataType_MapType{MapType: mapType}
	default:
	}

	return nil
}

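// ToProtobufSchema converts an arrow.Schema, including its fields, endianness,
// and schema-level metadata, into a storagev2pb.ArrowSchema message.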
func ToProtobufSchema(schema *arrow.Schema) (*storagev2pb.ArrowSchema, error) {
	protoSchema := &storagev2pb.ArrowSchema{}
	for _, field := range schema.Fields() {
		fieldCopy := field
		protoField, err := ToProtobufField(&fieldCopy)
		if err != nil {
			return nil, err
		}
		protoSchema.Fields = append(protoSchema.Fields, protoField)
	}
	if schema.Endianness() == endian.LittleEndian {
		protoSchema.Endianness = storagev2pb.Endianness_Little
	} else if schema.Endianness() == endian.BigEndian {
		protoSchema.Endianness = storagev2pb.Endianness_Big
	}

	// Copy schema-level metadata by constructing the message in one shot;
	// appending through GetMetadata() would dereference an unset Metadata field.
	if schema.HasMetadata() {
		metadata := schema.Metadata()
		protoSchema.Metadata = &storagev2pb.KeyValueMetadata{
			Keys:   metadata.Keys(),
			Values: metadata.Values(),
		}
	}

	return protoSchema, nil
}

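// FromProtobufSchema reconstructs an arrow.Schema from a storagev2pb.ArrowSchema message.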
func FromProtobufSchema(schema *storagev2pb.ArrowSchema) (*arrow.Schema, error) {
	fields := make([]arrow.Field, 0, len(schema.Fields))
	for _, field := range schema.Fields {
		tmp, err := FromProtobufField(field)
		if err != nil {
			return nil, err
		}
		fields = append(fields, *tmp)
	}
	tmp, err := FromProtobufKeyValueMetadata(schema.Metadata)
	if err != nil {
		return nil, err
	}
	newSchema := arrow.NewSchema(fields, tmp)
	return newSchema, nil
}

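// FromProtobufField reconstructs an arrow.Field, including its data type and metadata,
// from a storagev2pb.Field message.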
func FromProtobufField(field *storagev2pb.Field) (*arrow.Field, error) {
	datatype, err := FromProtobufDataType(field.DataType)
	if err != nil {
		return nil, err
	}

	metadata, err := FromProtobufKeyValueMetadata(field.GetMetadata())
	if err != nil {
		return nil, err
	}

	return &arrow.Field{Name: field.Name, Type: datatype, Nullable: field.Nullable, Metadata: *metadata}, nil
}

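// FromProtobufKeyValueMetadata reconstructs arrow.Metadata from a
// storagev2pb.KeyValueMetadata message; a nil message yields empty metadata.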
func FromProtobufKeyValueMetadata(metadata *storagev2pb.KeyValueMetadata) (*arrow.Metadata, error) {
	keys := make([]string, 0)
	values := make([]string, 0)
	if metadata != nil {
		keys = metadata.Keys
		values = metadata.Values
	}
	newMetadata := arrow.NewMetadata(keys, values)
	return &newMetadata, nil
}

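// FromProtobufDataType reconstructs an arrow.DataType from a storagev2pb.DataType message,
// recursing into child fields for nested types; unsupported logic types return ErrInvalidArgument.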
func FromProtobufDataType(dataType *storagev2pb.DataType) (arrow.DataType, error) {
	switch dataType.LogicType {
	case storagev2pb.LogicType_NA:
		return &arrow.NullType{}, nil
	case storagev2pb.LogicType_BOOL:
		return &arrow.BooleanType{}, nil
	case storagev2pb.LogicType_UINT8:
		return &arrow.Uint8Type{}, nil
	case storagev2pb.LogicType_INT8:
		return &arrow.Int8Type{}, nil
	case storagev2pb.LogicType_UINT16:
		return &arrow.Uint16Type{}, nil
	case storagev2pb.LogicType_INT16:
		return &arrow.Int16Type{}, nil
	case storagev2pb.LogicType_UINT32:
		return &arrow.Uint32Type{}, nil
	case storagev2pb.LogicType_INT32:
		return &arrow.Int32Type{}, nil
	case storagev2pb.LogicType_UINT64:
		return &arrow.Uint64Type{}, nil
	case storagev2pb.LogicType_INT64:
		return &arrow.Int64Type{}, nil
	case storagev2pb.LogicType_HALF_FLOAT:
		return &arrow.Float16Type{}, nil
	case storagev2pb.LogicType_FLOAT:
		return &arrow.Float32Type{}, nil
	case storagev2pb.LogicType_DOUBLE:
		return &arrow.Float64Type{}, nil
	case storagev2pb.LogicType_STRING:
		return &arrow.StringType{}, nil
	case storagev2pb.LogicType_BINARY:
		return &arrow.BinaryType{}, nil

	case storagev2pb.LogicType_LIST:
		fieldType, err := FromProtobufField(dataType.Children[0])
		if err != nil {
			return nil, err
		}
		listType := arrow.ListOf(fieldType.Type)
		return listType, nil

	case storagev2pb.LogicType_STRUCT:
		fields := make([]arrow.Field, 0, len(dataType.Children))
		for _, child := range dataType.Children {
			field, err := FromProtobufField(child)
			if err != nil {
				return nil, err
			}
			fields = append(fields, *field)
		}
		structType := arrow.StructOf(fields...)
		return structType, nil

	case storagev2pb.LogicType_DICTIONARY:
		keyType, err := FromProtobufField(dataType.Children[0])
		if err != nil {
			return nil, err
		}
		valueType, err := FromProtobufField(dataType.Children[1])
		if err != nil {
			return nil, err
		}
		dictType := &arrow.DictionaryType{
			IndexType: keyType.Type,
			ValueType: valueType.Type,
		}
		return dictType, nil

	case storagev2pb.LogicType_MAP:
		fieldType, err := FromProtobufField(dataType.Children[0])
		if err != nil {
			return nil, err
		}
		// TODO FIX ME
		return arrow.MapOf(fieldType.Type, fieldType.Type), nil

	case storagev2pb.LogicType_FIXED_SIZE_BINARY:
		sizeBinaryType := arrow.FixedSizeBinaryType{ByteWidth: int(dataType.GetFixedSizeBinaryType().ByteWidth)}
		return &sizeBinaryType, nil

	case storagev2pb.LogicType_FIXED_SIZE_LIST:
		fieldType, err := FromProtobufField(dataType.Children[0])
		if err != nil {
			return nil, err
		}
		fixedSizeListType := arrow.FixedSizeListOf(dataType.GetFixedSizeListType().ListSize, fieldType.Type)
		return fixedSizeListType, nil

	default:
		return nil, fmt.Errorf("parse protobuf datatype: %w", ErrInvalidArgument)
	}
}

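// GetNewParquetFilePath returns a new parquet data file path under the given
// directory, using a random UUID as the file name.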
func GetNewParquetFilePath(path string) string {
	scalarFileId := uuid.New()
	path = filepath.Join(path, scalarFileId.String()+constant.ParquetDataFileSuffix)
	return path
}

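// GetManifestFilePath returns the manifest file path for the given version
// under the manifest directory of path.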
func GetManifestFilePath(path string, version int64) string {
	path = filepath.Join(path, constant.ManifestDir, strconv.FormatInt(version, 10)+constant.ManifestFileSuffix)
	return path
}

func GetManifestTmpFilePath(path string, version int64) string {
	path = filepath.Join(path, constant.ManifestDir, strconv.FormatInt(version, 10)+constant.ManifestTempFileSuffix)
	return path
}

func GetBlobFilePath(path string) string {
	blobId := uuid.New()
	return filepath.Join(GetBlobDir(path), blobId.String())
}

func GetManifestDir(path string) string {
	path = filepath.Join(path, constant.ManifestDir)
	return path
}

func GetVectorDataDir(path string) string {
	return filepath.Join(path, constant.VectorDataDir)
}

func GetScalarDataDir(path string) string {
	return filepath.Join(path, constant.ScalarDataDir)
}

func GetBlobDir(path string) string {
	return filepath.Join(path, constant.BlobDir)
}

func GetDeleteDataDir(path string) string {
	return filepath.Join(path, constant.DeleteDataDir)
}

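// ParseVersionFromFileName extracts the numeric manifest version from a manifest
// file name; it returns -1 when the suffix does not match or the version cannot be parsed.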
func ParseVersionFromFileName(path string) int64 {
	pos := strings.Index(path, constant.ManifestFileSuffix)
	if pos == -1 || !strings.HasSuffix(path, constant.ManifestFileSuffix) {
		log.Warn("manifest file suffix not match", log.String("path", path))
		return -1
	}
	version := path[0:pos]
	versionInt, err := strconv.ParseInt(version, 10, 64)
	if err != nil {
		log.Error("parse version from file name error", log.String("path", path), log.String("version", version))
		return -1
	}
	return versionInt
}

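// ProjectSchema returns a new schema containing only the fields of sc whose
// names appear in columns, preserving the field order of sc.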
func ProjectSchema(sc *arrow.Schema, columns []string) *arrow.Schema {
	var fields []arrow.Field
	for _, field := range sc.Fields() {
		for _, column := range columns {
			if field.Name == column {
				fields = append(fields, field)
				break
			}
		}
	}

	return arrow.NewSchema(fields, nil)
}