sthuang c4ae9f4ece
feat: introduce third-party milvus-storage (#39418)
related: https://github.com/milvus-io/milvus/issues/39173

Signed-off-by: shaoting-huang <shaoting.huang@zilliz.com>
2025-01-24 17:21:13 +08:00

81 lines
2.5 KiB
Go

// Copyright 2023 Zilliz
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package arrowutil
import (
	"context"
	"fmt"

	"github.com/apache/arrow/go/v12/arrow/array"
	"github.com/apache/arrow/go/v12/arrow/memory"
	"github.com/apache/arrow/go/v12/parquet/file"
	"github.com/apache/arrow/go/v12/parquet/pqarrow"
	"github.com/milvus-io/milvus/internal/storagev2/common/constant"
	"github.com/milvus-io/milvus/internal/storagev2/io/fs"
	"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
// MakeArrowFileReader opens the Parquet file at filePath through fs and wraps
// it in a pqarrow.FileReader configured with constant.ReadBatchSize as the
// batch size and memory.DefaultAllocator as the allocator.
//
// On success the returned reader keeps using the opened file. If building
// either the Parquet reader or the Arrow reader fails, the file opened here is
// closed before the error is returned so the handle is not leaked.
func MakeArrowFileReader(fs fs.Fs, filePath string) (*pqarrow.FileReader, error) {
	f, err := fs.OpenFile(filePath)
	if err != nil {
		return nil, err
	}

	parquetReader, err := file.NewParquetReader(f)
	if err != nil {
		// Best-effort cleanup: the construction error is the one worth
		// reporting, so a failure to close is deliberately ignored.
		_ = f.Close()
		return nil, err
	}

	fileReader, err := pqarrow.NewFileReader(
		parquetReader,
		pqarrow.ArrowReadProperties{BatchSize: constant.ReadBatchSize},
		memory.DefaultAllocator,
	)
	if err != nil {
		// Same best-effort cleanup as above.
		_ = f.Close()
		return nil, err
	}
	return fileReader, nil
}
// MakeArrowRecordReader builds a record reader over reader, restricted to the
// columns and row groups selected by opts.
//
// The projected columns are opts.Columns followed by the column referenced by
// each entry of opts.Filters. A row group is skipped as soon as one filter's
// CheckStatistics returns true for that row group's column statistics; row
// groups whose statistics are absent or carry no min/max are always kept.
//
// An error is returned when a requested or filtered column name cannot be
// resolved in the Parquet schema, when column-chunk metadata or statistics
// cannot be loaded, or when the underlying GetRecordReader call fails.
func MakeArrowRecordReader(reader *pqarrow.FileReader, opts *options.ReadOptions) (array.RecordReader, error) {
	metadata := reader.ParquetReader().MetaData()

	var columnIndices []int
	for _, c := range opts.Columns {
		idx := metadata.Schema.ColumnIndexByName(c)
		if idx < 0 {
			return nil, fmt.Errorf("column %q not found in parquet schema", c)
		}
		columnIndices = append(columnIndices, idx)
	}
	for _, f := range opts.Filters {
		name := f.GetColumnName()
		idx := metadata.Schema.ColumnIndexByName(name)
		if idx < 0 {
			return nil, fmt.Errorf("filter column %q not found in parquet schema", name)
		}
		columnIndices = append(columnIndices, idx)
	}

	// Keep this slice non-nil even when it ends up empty: GetRecordReader
	// interprets a nil row-group list as "read every row group", which would
	// turn "all row groups pruned" into a full scan.
	rowGroupsIndices := make([]int, 0, len(metadata.RowGroups))
	for i := 0; i < len(metadata.RowGroups); i++ {
		rg := metadata.RowGroup(i)
		var canIgnored bool
		for _, filter := range opts.Filters {
			name := filter.GetColumnName()
			columnIndex := rg.Schema.ColumnIndexByName(name)
			if columnIndex < 0 {
				// A negative index would be used to index the row group's
				// column list; report it instead of letting that panic.
				return nil, fmt.Errorf("filter column %q not found in row group %d", name, i)
			}
			columnChunk, err := rg.ColumnChunk(columnIndex)
			if err != nil {
				return nil, err
			}
			columnStats, err := columnChunk.Statistics()
			if err != nil {
				return nil, err
			}
			// Without min/max statistics nothing can be decided for this
			// filter, so it cannot rule the row group out.
			if columnStats == nil || !columnStats.HasMinMax() {
				continue
			}
			if filter.CheckStatistics(columnStats) {
				canIgnored = true
				break
			}
		}
		if !canIgnored {
			rowGroupsIndices = append(rowGroupsIndices, i)
		}
	}

	return reader.GetRecordReader(context.TODO(), columnIndices, rowGroupsIndices)
}