From ad950368fef2630565dbe7f6da533b4f2699a3d0 Mon Sep 17 00:00:00 2001 From: "yihao.dai" Date: Fri, 8 Aug 2025 18:57:40 +0800 Subject: [PATCH] enhance: Fix parquet import OOM (#43756) Each ColumnReader consumes ReaderProperties.BufferSize memory independently. Therefore, the bufferSize should be divided by the number of columns to ensure total memory usage stays within the intended limit. issue: https://github.com/milvus-io/milvus/issues/43755 Signed-off-by: bigsheeper --- internal/util/importutilv2/parquet/reader.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/internal/util/importutilv2/parquet/reader.go b/internal/util/importutilv2/parquet/reader.go index 91859f7e31..ed07656e89 100644 --- a/internal/util/importutilv2/parquet/reader.go +++ b/internal/util/importutilv2/parquet/reader.go @@ -35,7 +35,7 @@ import ( "github.com/milvus-io/milvus/pkg/v2/util/merr" ) -const fileReaderBufferSize = int64(32 * 1024 * 1024) +const totalReadBufferSize = int64(64 * 1024 * 1024) type reader struct { ctx context.Context @@ -58,8 +58,14 @@ func NewReader(ctx context.Context, cm storage.ChunkManager, schema *schemapb.Co if err != nil { return nil, err } + + // Each ColumnReader consumes ReaderProperties.BufferSize memory independently. + // Therefore, the bufferSize should be divided by the number of columns + // to ensure total memory usage stays within the intended limit. + columnReaderBufferSize := totalReadBufferSize / int64(len(schema.GetFields())) + r, err := file.NewParquetReader(cmReader, file.WithReadProps(&parquet.ReaderProperties{ - BufferSize: fileReaderBufferSize, + BufferSize: columnReaderBufferSize, BufferedStreamEnabled: true, })) if err != nil {