sthuang c4ae9f4ece
feat: introduce third-party milvus-storage (#39418)
related: https://github.com/milvus-io/milvus/issues/39173

Signed-off-by: shaoting-huang <shaoting.huang@zilliz.com>
2025-01-24 17:21:13 +08:00

405 lines
12 KiB
Go

// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package utils
import (
"fmt"
"path/filepath"
"strconv"
"strings"
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/endian"
"github.com/cockroachdb/errors"
"github.com/google/uuid"
"github.com/milvus-io/milvus/internal/storagev2/common/constant"
"github.com/milvus-io/milvus/internal/storagev2/common/log"
"github.com/milvus-io/milvus/pkg/proto/storagev2pb"
)
// ErrInvalidArgument is the sentinel error wrapped (via %w) by the conversion
// helpers in this file when they are handed a type they cannot convert.
// Callers can detect it with errors.Is.
var ErrInvalidArgument = errors.New("invalid argument")
// ToProtobufType converts an arrow type ID into the protobuf LogicType that
// carries the same numeric value.
//
// IDs outside the half-open range [0, LogicType_MAX_ID) are rejected: the
// function then returns LogicType_NA together with an error that wraps
// ErrInvalidArgument.
func ToProtobufType(dataType arrow.Type) (storagev2pb.LogicType, error) {
	id := int(dataType)
	if id >= 0 && id < int(storagev2pb.LogicType_MAX_ID) {
		return storagev2pb.LogicType(id), nil
	}
	return storagev2pb.LogicType_NA, fmt.Errorf("parse data type %v: %w", dataType, ErrInvalidArgument)
}
// ToProtobufMetadata packs the keys and values of an arrow metadata object
// into a protobuf KeyValueMetadata message.
//
// metadata must be non-nil. The returned error is always nil; it is part of
// the signature only.
func ToProtobufMetadata(metadata *arrow.Metadata) (*storagev2pb.KeyValueMetadata, error) {
	return &storagev2pb.KeyValueMetadata{
		Keys:   metadata.Keys(),
		Values: metadata.Values(),
	}, nil
}
// ToProtobufDataType converts an arrow data type, including any nested child
// fields reported by GetFields, into its protobuf representation.
//
// The conversion runs in three steps: type-specific parameters are filled in by
// SetTypeValues, the logic type is derived from the arrow type ID, and finally
// every child field is converted recursively through ToProtobufField. The first
// error encountered aborts the conversion and is returned as-is.
func ToProtobufDataType(dataType arrow.DataType) (*storagev2pb.DataType, error) {
	result := &storagev2pb.DataType{}
	if err := SetTypeValues(result, dataType); err != nil {
		return nil, err
	}

	logicType, err := ToProtobufType(dataType.ID())
	if err != nil {
		return nil, err
	}
	result.LogicType = logicType

	// Ranging over an empty (or nil) slice is a no-op, so no length check is needed.
	children := GetFields(dataType)
	for i := range children {
		child, err := ToProtobufField(&children[i])
		if err != nil {
			return nil, err
		}
		result.Children = append(result.Children, child)
	}
	return result, nil
}
// GetFields returns the child fields of a nested arrow data type.
//
// Only list, struct, map and fixed-size-list types are handled; every other
// type (and a nil dataType) yields nil.
//
// The concrete type is matched with a type switch rather than an unchecked
// assertion keyed on the type ID, so an unexpected implementation results in
// nil instead of a nil-pointer dereference.
//
// TODO: check more nested types.
func GetFields(dataType arrow.DataType) []arrow.Field {
	switch t := dataType.(type) {
	case *arrow.ListType:
		return t.Fields()
	case *arrow.StructType:
		return t.Fields()
	case *arrow.MapType:
		return t.Fields()
	case *arrow.FixedSizeListType:
		return t.Fields()
	default:
		return nil
	}
}
// ToProtobufField converts an arrow field (name, nullability, metadata and data
// type) into a protobuf Field message.
//
// Metadata is only attached when the arrow field carries at least one entry.
// Errors from the metadata or data type conversion are wrapped with the
// "convert to protobuf field" context.
func ToProtobufField(field *arrow.Field) (*storagev2pb.Field, error) {
	var protoMetadata *storagev2pb.KeyValueMetadata
	if field.Metadata.Len() != 0 {
		converted, err := ToProtobufMetadata(&field.Metadata)
		if err != nil {
			return nil, fmt.Errorf("convert to protobuf field: %w", err)
		}
		protoMetadata = converted
	}

	protoDataType, err := ToProtobufDataType(field.Type)
	if err != nil {
		return nil, fmt.Errorf("convert to protobuf field: %w", err)
	}

	return &storagev2pb.Field{
		Name:     field.Name,
		Nullable: field.Nullable,
		Metadata: protoMetadata,
		DataType: protoDataType,
	}, nil
}
// SetTypeValues stores in protoType.TypeRelatedValues the parameters of
// dataType that the logic type alone cannot express:
//
//   - FIXED_SIZE_BINARY: the byte width
//   - FIXED_SIZE_LIST:   the list size
//   - DICTIONARY:        the index type, value type and ordered flag
//   - MAP:               the keys-sorted flag
//
// All other types leave protoType untouched. An error wrapping
// ErrInvalidArgument is returned when the type ID and the concrete Go type of
// dataType disagree; errors from converting dictionary index/value types are
// returned unchanged.
func SetTypeValues(protoType *storagev2pb.DataType, dataType arrow.DataType) error {
	switch dataType.ID() {
	case arrow.FIXED_SIZE_BINARY:
		binaryType, ok := dataType.(*arrow.FixedSizeBinaryType)
		if !ok {
			return fmt.Errorf("convert to fixed size binary type: %w", ErrInvalidArgument)
		}
		protoType.TypeRelatedValues = &storagev2pb.DataType_FixedSizeBinaryType{
			FixedSizeBinaryType: &storagev2pb.FixedSizeBinaryType{
				ByteWidth: int32(binaryType.ByteWidth),
			},
		}

	case arrow.FIXED_SIZE_LIST:
		listType, ok := dataType.(*arrow.FixedSizeListType)
		if !ok {
			return fmt.Errorf("convert to fixed size list type: %w", ErrInvalidArgument)
		}
		protoType.TypeRelatedValues = &storagev2pb.DataType_FixedSizeListType{
			FixedSizeListType: &storagev2pb.FixedSizeListType{
				ListSize: listType.Len(),
			},
		}

	case arrow.DICTIONARY:
		dictType, ok := dataType.(*arrow.DictionaryType)
		if !ok {
			return fmt.Errorf("convert to dictionary type: %w", ErrInvalidArgument)
		}
		// Convert both nested types before touching protoType so that a
		// failure leaves it unmodified.
		indexType, err := ToProtobufDataType(dictType.IndexType)
		if err != nil {
			return err
		}
		valueType, err := ToProtobufDataType(dictType.ValueType)
		if err != nil {
			return err
		}
		protoType.TypeRelatedValues = &storagev2pb.DataType_DictionaryType{
			DictionaryType: &storagev2pb.DictionaryType{
				IndexType: indexType,
				ValueType: valueType,
				Ordered:   dictType.Ordered,
			},
		}

	case arrow.MAP:
		mapType, ok := dataType.(*arrow.MapType)
		if !ok {
			return fmt.Errorf("convert to map type: %w", ErrInvalidArgument)
		}
		protoType.TypeRelatedValues = &storagev2pb.DataType_MapType{
			MapType: &storagev2pb.MapType{
				KeysSorted: mapType.KeysSorted,
			},
		}
	}
	return nil
}
// ToProtobufSchema converts an arrow schema into its protobuf representation.
//
// Every field is converted through ToProtobufField, the schema's endianness is
// mapped to the matching protobuf enum value, and schema-level metadata (when
// present) is copied into the message's Metadata field so that
// FromProtobufSchema can restore it.
//
// The first field conversion error aborts the conversion and is returned.
func ToProtobufSchema(schema *arrow.Schema) (*storagev2pb.ArrowSchema, error) {
	protoSchema := &storagev2pb.ArrowSchema{}
	for _, field := range schema.Fields() {
		fieldCopy := field
		protoField, err := ToProtobufField(&fieldCopy)
		if err != nil {
			return nil, err
		}
		protoSchema.Fields = append(protoSchema.Fields, protoField)
	}

	switch schema.Endianness() {
	case endian.LittleEndian:
		protoSchema.Endianness = storagev2pb.Endianness_Little
	case endian.BigEndian:
		protoSchema.Endianness = storagev2pb.Endianness_Big
	}

	// The generated Go getter returns nil for an unset message field, so the
	// metadata message has to be allocated and assigned explicitly rather than
	// appended to through GetMetadata().
	if schema.HasMetadata() {
		metadata := schema.Metadata()
		protoSchema.Metadata = &storagev2pb.KeyValueMetadata{
			Keys:   metadata.Keys(),
			Values: metadata.Values(),
		}
	}
	return protoSchema, nil
}
// FromProtobufSchema rebuilds an arrow schema from its protobuf representation.
//
// Fields are converted in order through FromProtobufField and the schema-level
// metadata through FromProtobufKeyValueMetadata; the first error encountered is
// returned unchanged. schema must be non-nil.
func FromProtobufSchema(schema *storagev2pb.ArrowSchema) (*arrow.Schema, error) {
	arrowFields := make([]arrow.Field, len(schema.Fields))
	for i, protoField := range schema.Fields {
		converted, err := FromProtobufField(protoField)
		if err != nil {
			return nil, err
		}
		arrowFields[i] = *converted
	}

	metadata, err := FromProtobufKeyValueMetadata(schema.Metadata)
	if err != nil {
		return nil, err
	}
	return arrow.NewSchema(arrowFields, metadata), nil
}
// FromProtobufField rebuilds an arrow field from a protobuf Field message.
//
// The data type is converted first, then the metadata; either failure is
// returned unchanged. field must be non-nil.
func FromProtobufField(field *storagev2pb.Field) (*arrow.Field, error) {
	arrowType, err := FromProtobufDataType(field.DataType)
	if err != nil {
		return nil, err
	}

	arrowMetadata, err := FromProtobufKeyValueMetadata(field.GetMetadata())
	if err != nil {
		return nil, err
	}

	result := arrow.Field{
		Name:     field.Name,
		Type:     arrowType,
		Nullable: field.Nullable,
		Metadata: *arrowMetadata,
	}
	return &result, nil
}
// FromProtobufKeyValueMetadata converts a protobuf KeyValueMetadata message
// into arrow metadata.
//
// A nil message produces empty metadata. A message whose key and value counts
// differ is rejected with an error wrapping ErrInvalidArgument; the lengths are
// checked here because arrow.NewMetadata panics on such a mismatch.
func FromProtobufKeyValueMetadata(metadata *storagev2pb.KeyValueMetadata) (*arrow.Metadata, error) {
	var keys, values []string
	if metadata != nil {
		keys, values = metadata.Keys, metadata.Values
	}
	if len(keys) != len(values) {
		return nil, fmt.Errorf("parse protobuf metadata: %d keys but %d values: %w",
			len(keys), len(values), ErrInvalidArgument)
	}

	newMetadata := arrow.NewMetadata(keys, values)
	return &newMetadata, nil
}
// FromProtobufDataType rebuilds an arrow data type from its protobuf
// description.
//
// Primitive logic types map directly onto the corresponding arrow type. Nested
// and parameterised types are reconstructed as follows:
//
//   - LIST:              element type taken from the first child.
//   - STRUCT:            one arrow field per child, in order.
//   - DICTIONARY:        see dictionaryFromProtobuf.
//   - MAP:               key type from the first child, item type from the
//     second; the keys-sorted flag comes from the MapType payload when set.
//     With a single child its type is used for both key and item, which is
//     what this function did historically.
//   - FIXED_SIZE_BINARY: byte width from the FixedSizeBinaryType payload.
//   - FIXED_SIZE_LIST:   list size from the FixedSizeListType payload, element
//     type from the first child.
//
// Malformed input (nil message, missing children, missing or non-positive
// type payload, unknown logic type) is reported as an error wrapping
// ErrInvalidArgument rather than causing a panic.
func FromProtobufDataType(dataType *storagev2pb.DataType) (arrow.DataType, error) {
	if dataType == nil {
		return nil, fmt.Errorf("parse protobuf datatype: nil data type: %w", ErrInvalidArgument)
	}

	switch dataType.LogicType {
	case storagev2pb.LogicType_NA:
		return &arrow.NullType{}, nil
	case storagev2pb.LogicType_BOOL:
		return &arrow.BooleanType{}, nil
	case storagev2pb.LogicType_UINT8:
		return &arrow.Uint8Type{}, nil
	case storagev2pb.LogicType_INT8:
		return &arrow.Int8Type{}, nil
	case storagev2pb.LogicType_UINT16:
		return &arrow.Uint16Type{}, nil
	case storagev2pb.LogicType_INT16:
		return &arrow.Int16Type{}, nil
	case storagev2pb.LogicType_UINT32:
		return &arrow.Uint32Type{}, nil
	case storagev2pb.LogicType_INT32:
		return &arrow.Int32Type{}, nil
	case storagev2pb.LogicType_UINT64:
		return &arrow.Uint64Type{}, nil
	case storagev2pb.LogicType_INT64:
		return &arrow.Int64Type{}, nil
	case storagev2pb.LogicType_HALF_FLOAT:
		return &arrow.Float16Type{}, nil
	case storagev2pb.LogicType_FLOAT:
		return &arrow.Float32Type{}, nil
	case storagev2pb.LogicType_DOUBLE:
		return &arrow.Float64Type{}, nil
	case storagev2pb.LogicType_STRING:
		return &arrow.StringType{}, nil
	case storagev2pb.LogicType_BINARY:
		return &arrow.BinaryType{}, nil

	case storagev2pb.LogicType_LIST:
		elem, err := childFieldAt(dataType, 0)
		if err != nil {
			return nil, err
		}
		return arrow.ListOf(elem.Type), nil

	case storagev2pb.LogicType_STRUCT:
		fields := make([]arrow.Field, 0, len(dataType.Children))
		for i := range dataType.Children {
			field, err := childFieldAt(dataType, i)
			if err != nil {
				return nil, err
			}
			fields = append(fields, *field)
		}
		return arrow.StructOf(fields...), nil

	case storagev2pb.LogicType_DICTIONARY:
		return dictionaryFromProtobuf(dataType)

	case storagev2pb.LogicType_MAP:
		key, err := childFieldAt(dataType, 0)
		if err != nil {
			return nil, err
		}
		// Legacy single-child layout: reuse the key type for the item.
		item := key
		if len(dataType.Children) > 1 {
			item, err = childFieldAt(dataType, 1)
			if err != nil {
				return nil, err
			}
		}
		mapType := arrow.MapOf(key.Type, item.Type)
		// Generated getters are nil-safe, so an unset payload yields false.
		mapType.KeysSorted = dataType.GetMapType().GetKeysSorted()
		return mapType, nil

	case storagev2pb.LogicType_FIXED_SIZE_BINARY:
		binaryValues := dataType.GetFixedSizeBinaryType()
		if binaryValues == nil {
			return nil, fmt.Errorf("parse protobuf datatype: missing fixed size binary type values: %w", ErrInvalidArgument)
		}
		return &arrow.FixedSizeBinaryType{ByteWidth: int(binaryValues.ByteWidth)}, nil

	case storagev2pb.LogicType_FIXED_SIZE_LIST:
		listValues := dataType.GetFixedSizeListType()
		if listValues == nil {
			return nil, fmt.Errorf("parse protobuf datatype: missing fixed size list type values: %w", ErrInvalidArgument)
		}
		// arrow.FixedSizeListOf panics on a non-positive size, so reject it here.
		if listValues.ListSize <= 0 {
			return nil, fmt.Errorf("parse protobuf datatype: invalid fixed size list size %d: %w", listValues.ListSize, ErrInvalidArgument)
		}
		elem, err := childFieldAt(dataType, 0)
		if err != nil {
			return nil, err
		}
		return arrow.FixedSizeListOf(listValues.ListSize, elem.Type), nil

	default:
		return nil, fmt.Errorf("parse protobuf datatype: %w", ErrInvalidArgument)
	}
}

// childFieldAt converts the idx-th child of dataType into an arrow field,
// reporting an out-of-range or nil child as ErrInvalidArgument instead of
// panicking.
func childFieldAt(dataType *storagev2pb.DataType, idx int) (*arrow.Field, error) {
	if idx < 0 || idx >= len(dataType.Children) || dataType.Children[idx] == nil {
		return nil, fmt.Errorf("parse protobuf datatype %v: missing child %d: %w", dataType.LogicType, idx, ErrInvalidArgument)
	}
	return FromProtobufField(dataType.Children[idx])
}

// dictionaryFromProtobuf rebuilds an arrow dictionary type. It prefers the
// DictionaryType payload (the layout SetTypeValues writes), which also carries
// the ordered flag, and falls back to reading index/value types from the first
// two children, the layout this package read previously.
func dictionaryFromProtobuf(dataType *storagev2pb.DataType) (arrow.DataType, error) {
	if dict := dataType.GetDictionaryType(); dict != nil {
		indexType, err := FromProtobufDataType(dict.GetIndexType())
		if err != nil {
			return nil, err
		}
		valueType, err := FromProtobufDataType(dict.GetValueType())
		if err != nil {
			return nil, err
		}
		return &arrow.DictionaryType{
			IndexType: indexType,
			ValueType: valueType,
			Ordered:   dict.GetOrdered(),
		}, nil
	}

	index, err := childFieldAt(dataType, 0)
	if err != nil {
		return nil, err
	}
	value, err := childFieldAt(dataType, 1)
	if err != nil {
		return nil, err
	}
	return &arrow.DictionaryType{
		IndexType: index.Type,
		ValueType: value.Type,
	}, nil
}
// GetNewParquetFilePath returns a fresh parquet file path inside path. The file
// name is a newly generated UUID followed by constant.ParquetDataFileSuffix, so
// each call yields a different path.
func GetNewParquetFilePath(path string) string {
	fileName := uuid.New().String() + constant.ParquetDataFileSuffix
	return filepath.Join(path, fileName)
}
// GetManifestFilePath returns the path of the manifest file for the given
// version: <path>/<constant.ManifestDir>/<version><constant.ManifestFileSuffix>,
// with the version rendered in base 10.
func GetManifestFilePath(path string, version int64) string {
	fileName := strconv.FormatInt(version, 10) + constant.ManifestFileSuffix
	return filepath.Join(path, constant.ManifestDir, fileName)
}
// GetManifestTmpFilePath returns the path of the temporary manifest file for
// the given version:
// <path>/<constant.ManifestDir>/<version><constant.ManifestTempFileSuffix>,
// with the version rendered in base 10.
func GetManifestTmpFilePath(path string, version int64) string {
	fileName := strconv.FormatInt(version, 10) + constant.ManifestTempFileSuffix
	return filepath.Join(path, constant.ManifestDir, fileName)
}
func GetBlobFilePath(path string) string {
blobId := uuid.New()
return filepath.Join(GetBlobDir(path), blobId.String())
}
// GetManifestDir returns the manifest directory located under path.
func GetManifestDir(path string) string {
	return filepath.Join(path, constant.ManifestDir)
}
// GetVectorDataDir returns the vector data directory located under path.
func GetVectorDataDir(path string) string {
	vectorDir := filepath.Join(path, constant.VectorDataDir)
	return vectorDir
}
// GetScalarDataDir returns the scalar data directory located under path.
func GetScalarDataDir(path string) string {
	scalarDir := filepath.Join(path, constant.ScalarDataDir)
	return scalarDir
}
// GetBlobDir returns the blob directory located under path.
func GetBlobDir(path string) string {
	blobDir := filepath.Join(path, constant.BlobDir)
	return blobDir
}
// GetDeleteDataDir returns the delete data directory located under path.
func GetDeleteDataDir(path string) string {
	deleteDir := filepath.Join(path, constant.DeleteDataDir)
	return deleteDir
}
// ParseVersionFromFileName extracts the version number from a manifest file
// name of the form <version><constant.ManifestFileSuffix>.
//
// It returns -1 (after logging) when path does not end with the manifest
// suffix or when the part before the suffix is not a base-10 int64.
//
// Only the trailing suffix is stripped, so a name that contains the suffix
// more than once is rejected instead of being parsed up to its first
// occurrence.
func ParseVersionFromFileName(path string) int64 {
	if !strings.HasSuffix(path, constant.ManifestFileSuffix) {
		log.Warn("manifest file suffix not match", log.String("path", path))
		return -1
	}

	version := strings.TrimSuffix(path, constant.ManifestFileSuffix)
	versionInt, err := strconv.ParseInt(version, 10, 64)
	if err != nil {
		log.Error("parse version from file name error", log.String("path", path), log.String("version", version))
		return -1
	}
	return versionInt
}
// ProjectSchema returns a new schema containing only the fields of sc whose
// name appears in columns.
//
// Fields keep the order they have in sc (not the order of columns), names in
// columns that match no field are ignored, and the resulting schema carries no
// metadata.
func ProjectSchema(sc *arrow.Schema, columns []string) *arrow.Schema {
	wanted := make(map[string]struct{}, len(columns))
	for _, name := range columns {
		wanted[name] = struct{}{}
	}

	var projected []arrow.Field
	for _, field := range sc.Fields() {
		if _, ok := wanted[field.Name]; ok {
			projected = append(projected, field)
		}
	}
	return arrow.NewSchema(projected, nil)
}