mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-06 17:18:35 +08:00
related: https://github.com/milvus-io/milvus/issues/39173 Signed-off-by: shaoting-huang <shaoting.huang@zilliz.com>
81 lines
2.5 KiB
Go
81 lines
2.5 KiB
Go
// Copyright 2023 Zilliz
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package arrowutil
|
|
|
|
import (
	"context"
	"fmt"

	"github.com/apache/arrow/go/v12/arrow/array"
	"github.com/apache/arrow/go/v12/arrow/memory"
	"github.com/apache/arrow/go/v12/parquet/file"
	"github.com/apache/arrow/go/v12/parquet/pqarrow"

	"github.com/milvus-io/milvus/internal/storagev2/common/constant"
	"github.com/milvus-io/milvus/internal/storagev2/io/fs"
	"github.com/milvus-io/milvus/internal/storagev2/storage/options"
)
|
|
|
|
func MakeArrowFileReader(fs fs.Fs, filePath string) (*pqarrow.FileReader, error) {
|
|
f, err := fs.OpenFile(filePath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
parquetReader, err := file.NewParquetReader(f)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return pqarrow.NewFileReader(parquetReader, pqarrow.ArrowReadProperties{BatchSize: constant.ReadBatchSize}, memory.DefaultAllocator)
|
|
}
|
|
|
|
func MakeArrowRecordReader(reader *pqarrow.FileReader, opts *options.ReadOptions) (array.RecordReader, error) {
|
|
var rowGroupsIndices []int
|
|
var columnIndices []int
|
|
metadata := reader.ParquetReader().MetaData()
|
|
for _, c := range opts.Columns {
|
|
columnIndices = append(columnIndices, metadata.Schema.ColumnIndexByName(c))
|
|
}
|
|
for _, f := range opts.Filters {
|
|
columnIndices = append(columnIndices, metadata.Schema.ColumnIndexByName(f.GetColumnName()))
|
|
}
|
|
|
|
for i := 0; i < len(metadata.RowGroups); i++ {
|
|
rg := metadata.RowGroup(i)
|
|
var canIgnored bool
|
|
for _, filter := range opts.Filters {
|
|
columnIndex := rg.Schema.ColumnIndexByName(filter.GetColumnName())
|
|
columnChunk, err := rg.ColumnChunk(columnIndex)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
columnStats, err := columnChunk.Statistics()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
if columnStats == nil || !columnStats.HasMinMax() {
|
|
continue
|
|
}
|
|
if filter.CheckStatistics(columnStats) {
|
|
canIgnored = true
|
|
break
|
|
}
|
|
}
|
|
if !canIgnored {
|
|
rowGroupsIndices = append(rowGroupsIndices, i)
|
|
}
|
|
}
|
|
|
|
return reader.GetRecordReader(context.TODO(), columnIndices, rowGroupsIndices)
|
|
}
|