// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importutil

import (
	"context"
	"errors"
	"path"
	"sort"
	"strconv"

	"github.com/milvus-io/milvus/api/schemapb"
	"github.com/milvus-io/milvus/internal/log"
	"github.com/milvus-io/milvus/internal/storage"

	"go.uber.org/zap"
)

type BinlogParser struct {
	collectionSchema *schemapb.CollectionSchema // collection schema
	shardNum         int32                      // sharding number of the collection
	segmentSize      int64                      // maximum size of a segment (unit: byte)
	chunkManager     storage.ChunkManager       // storage interfaces to browse/read the files
	callFlushFunc    ImportFlushFunc            // callback function to flush a segment

	// a timestamp that defines the end point of the restore; data after this point will be ignored
	// set this value to 0 to ignore all the data
	// set this value to math.MaxUint64 to import all the data
	tsEndPoint uint64
}

func NewBinlogParser(collectionSchema *schemapb.CollectionSchema,
	shardNum int32,
	segmentSize int64,
	chunkManager storage.ChunkManager,
	flushFunc ImportFlushFunc,
	tsEndPoint uint64) (*BinlogParser, error) {
	if collectionSchema == nil {
		log.Error("Binlog parser: collection schema is nil")
		return nil, errors.New("collection schema is nil")
	}

	if chunkManager == nil {
		log.Error("Binlog parser: chunk manager pointer is nil")
		return nil, errors.New("chunk manager pointer is nil")
	}

	if flushFunc == nil {
		log.Error("Binlog parser: flush function is nil")
		return nil, errors.New("flush function is nil")
	}

	v := &BinlogParser{
		collectionSchema: collectionSchema,
		shardNum:         shardNum,
		segmentSize:      segmentSize,
		chunkManager:     chunkManager,
		callFlushFunc:    flushFunc,
		tsEndPoint:       tsEndPoint,
	}

	return v, nil
}
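
// A minimal construction sketch (not part of the original file). The schema,
// chunk manager and flush callback are assumed to already exist in the caller;
// shardNum=2 and the 512 MB segment size are illustrative values, not package
// defaults:
//
//	parser, err := NewBinlogParser(schema, 2, 512*1024*1024, cm, flushFunc, math.MaxUint64)
//
// Passing math.MaxUint64 as tsEndPoint imports all the data; passing 0 would
// ignore all of it.
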
// For instance, the insertlogRoot is "backup/bak1/data/insert_log/435978159196147009/435978159196147010".
// 435978159196147009 is a collection id, 435978159196147010 is a partition id,
// and there is a segment (id is 435978159261483009) under this partition.
// ListWithPrefix() will return all the insert logs under this partition:
//
//	"backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/0/435978159903735811"
//	"backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/1/435978159903735812"
//	"backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/100/435978159903735809"
//	"backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/101/435978159903735810"
//
// The deltalogRoot is "backup/bak1/data/delta_log/435978159196147009/435978159196147010".
// Then we get all the delta logs under this partition:
//
//	"backup/bak1/data/delta_log/435978159196147009/435978159196147010/435978159261483009/434574382554415105"
//
// In this function, we construct a list of SegmentFilesHolder objects; each
// SegmentFilesHolder holds the insert logs and delta logs of one segment.
func (p *BinlogParser) constructSegmentHolders(insertlogRoot string, deltalogRoot string) ([]*SegmentFilesHolder, error) {
	holders := make(map[int64]*SegmentFilesHolder)

	// TODO: add context
	insertlogs, _, err := p.chunkManager.ListWithPrefix(context.TODO(), insertlogRoot, true)
	if err != nil {
		log.Error("Binlog parser: list insert logs error", zap.Error(err))
		return nil, err
	}

	// collect insert log paths
	log.Info("Binlog parser: list insert logs", zap.Int("logsCount", len(insertlogs)))
	for _, insertlog := range insertlogs {
		log.Info("Binlog parser: mapping insert log to segment", zap.String("insertlog", insertlog))
		fieldPath := path.Dir(insertlog)
		fieldStrID := path.Base(fieldPath)
		fieldID, err := strconv.ParseInt(fieldStrID, 10, 64)
		if err != nil {
			log.Error("Binlog parser: parse field id error", zap.String("fieldPath", fieldPath), zap.Error(err))
			return nil, err
		}

		segmentPath := path.Dir(fieldPath)
		segmentStrID := path.Base(segmentPath)
		segmentID, err := strconv.ParseInt(segmentStrID, 10, 64)
		if err != nil {
			log.Error("Binlog parser: parse segment id error", zap.String("segmentPath", segmentPath), zap.Error(err))
			return nil, err
		}

		holder, ok := holders[segmentID]
		if ok {
			holder.fieldFiles[fieldID] = append(holder.fieldFiles[fieldID], insertlog)
		} else {
			holder = &SegmentFilesHolder{
				segmentID:  segmentID,
				fieldFiles: make(map[int64][]string),
				deltaFiles: make([]string, 0),
			}
			holder.fieldFiles[fieldID] = make([]string, 0)
			holder.fieldFiles[fieldID] = append(holder.fieldFiles[fieldID], insertlog)
			holders[segmentID] = holder
		}
	}

	// sort the insert log paths of each field in ascending order
	// there might be several insert logs under a field, for example:
	// 2 insert logs under field a: a_1, a_2
	// 2 insert logs under field b: b_1, b_2
	// the row count of a_1 is equal to b_1, the row count of a_2 is equal to b_2
	// when we read these logs, we firstly read a_1 and b_1, then read a_2 and b_2
	// so, here we must ensure the paths are arranged correctly
	for _, holder := range holders {
		for _, v := range holder.fieldFiles {
			sort.Strings(v)
		}
	}

	// collect delta log paths
	if len(deltalogRoot) > 0 {
		// TODO: add context
		deltalogs, _, err := p.chunkManager.ListWithPrefix(context.TODO(), deltalogRoot, true)
		if err != nil {
			log.Error("Binlog parser: list delta logs error", zap.Error(err))
			return nil, err
		}

		log.Info("Binlog parser: list delta logs", zap.Int("logsCount", len(deltalogs)))
		for _, deltalog := range deltalogs {
			log.Info("Binlog parser: mapping delta log to segment", zap.String("deltalog", deltalog))
			segmentPath := path.Dir(deltalog)
			segmentStrID := path.Base(segmentPath)
			segmentID, err := strconv.ParseInt(segmentStrID, 10, 64)
			if err != nil {
				log.Error("Binlog parser: parse segment id error", zap.String("segmentPath", segmentPath), zap.Error(err))
				return nil, err
			}

			// if the segment id doesn't exist, no need to process this delta log
			holder, ok := holders[segmentID]
			if ok {
				holder.deltaFiles = append(holder.deltaFiles, deltalog)
			}
		}
	}

	holdersList := make([]*SegmentFilesHolder, 0)
	for _, holder := range holders {
		holdersList = append(holdersList, holder)
	}

	return holdersList, nil
}
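
// To illustrate the id extraction above (a sketch, using one of the paths from
// the doc comment of constructSegmentHolders): the field id is the base name of
// the insert log's parent directory, and the segment id is the base name of the
// directory above that.
//
//	insertlog := "backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/1/435978159903735812"
//	fieldPath := path.Dir(insertlog)    // ends with ".../435978159261483009/1"
//	path.Base(fieldPath)                // "1" -> fieldID
//	segmentPath := path.Dir(fieldPath)  // ends with ".../435978159196147010/435978159261483009"
//	path.Base(segmentPath)              // "435978159261483009" -> segmentID
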
func (p *BinlogParser) parseSegmentFiles(segmentHolder *SegmentFilesHolder) error {
	if segmentHolder == nil {
		log.Error("Binlog parser: segment files holder is nil")
		return errors.New("segment files holder is nil")
	}

	adapter, err := NewBinlogAdapter(p.collectionSchema, p.shardNum, p.segmentSize, MaxTotalSizeInMemory, p.chunkManager, p.callFlushFunc, p.tsEndPoint)
	if err != nil {
		log.Error("Binlog parser: failed to create binlog adapter", zap.Error(err))
		return err
	}

	return adapter.Read(segmentHolder)
}

// This function requires two paths:
// 1. the insert log path of a partition
// 2. the delta log path of a partition (optional)
func (p *BinlogParser) Parse(filePaths []string) error {
	if len(filePaths) != 1 && len(filePaths) != 2 {
		log.Error("Binlog parser: illegal paths for binlog import")
		return errors.New("illegal paths for binlog import, partition binlog path and partition delta path are required")
	}

	insertlogPath := filePaths[0]
	deltalogPath := ""
	if len(filePaths) == 2 {
		deltalogPath = filePaths[1]
	}
	log.Info("Binlog parser: target paths", zap.String("insertlogPath", insertlogPath), zap.String("deltalogPath", deltalogPath))

	segmentHolders, err := p.constructSegmentHolders(insertlogPath, deltalogPath)
	if err != nil {
		return err
	}

	for _, segmentHolder := range segmentHolders {
		err = p.parseSegmentFiles(segmentHolder)
		if err != nil {
			return err
		}

		// trigger gc after each segment is finished
		triggerGC()
	}

	return nil
}
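
// parsePartitionBinlogs is a hypothetical helper (not part of the original
// file) sketching an end-to-end call of Parse. The schema, chunk manager and
// flush callback are assumed to be provided by the caller; shardNum=2 and the
// 512 MB segment size are illustrative values, and the paths reuse the example
// partition from the constructSegmentHolders doc comment. The second path (the
// delta log root) is optional and may be omitted from the slice.
func parsePartitionBinlogs(schema *schemapb.CollectionSchema, cm storage.ChunkManager, flushFunc ImportFlushFunc) error {
	// ^uint64(0) == math.MaxUint64: no timestamp cut-off, import all the data
	parser, err := NewBinlogParser(schema, 2, 512*1024*1024, cm, flushFunc, ^uint64(0))
	if err != nil {
		return err
	}

	return parser.Parse([]string{
		"backup/bak1/data/insert_log/435978159196147009/435978159196147010",
		"backup/bak1/data/delta_log/435978159196147009/435978159196147010",
	})
}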