milvus/internal/util/importutil/binlog_parser.go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importutil

import (
	"context"
	"errors"
	"path"
	"sort"
	"strconv"

	"github.com/milvus-io/milvus/api/schemapb"
	"github.com/milvus-io/milvus/internal/log"
	"github.com/milvus-io/milvus/internal/storage"
	"go.uber.org/zap"
)
type BinlogParser struct {
	collectionSchema *schemapb.CollectionSchema // collection schema
	shardNum         int32                      // sharding number of the collection
	segmentSize      int64                      // maximum size of a segment (unit: byte)
	chunkManager     storage.ChunkManager       // storage interfaces to browse/read the files
	callFlushFunc    ImportFlushFunc            // callback function to flush a segment

	// a timestamp that defines the end point of the restore; data after this point will be ignored
	// if set to 0, all data will be ignored
	// if set to math.MaxUint64, all data will be imported
	tsEndPoint uint64
}

func NewBinlogParser(collectionSchema *schemapb.CollectionSchema,
	shardNum int32,
	segmentSize int64,
	chunkManager storage.ChunkManager,
	flushFunc ImportFlushFunc,
	tsEndPoint uint64) (*BinlogParser, error) {
	if collectionSchema == nil {
		log.Error("Binlog parser: collection schema is nil")
		return nil, errors.New("collection schema is nil")
	}

	if chunkManager == nil {
		log.Error("Binlog parser: chunk manager pointer is nil")
		return nil, errors.New("chunk manager pointer is nil")
	}

	if flushFunc == nil {
		log.Error("Binlog parser: flush function is nil")
		return nil, errors.New("flush function is nil")
	}

	v := &BinlogParser{
		collectionSchema: collectionSchema,
		shardNum:         shardNum,
		segmentSize:      segmentSize,
		chunkManager:     chunkManager,
		callFlushFunc:    flushFunc,
		tsEndPoint:       tsEndPoint,
	}

	return v, nil
}

// For instance, the insertlogRoot is "backup/bak1/data/insert_log/435978159196147009/435978159196147010",
// where 435978159196147009 is a collection id and 435978159196147010 is a partition id;
// there is a segment (id: 435978159261483009) under this partition.
// ListWithPrefix() will return all the insert logs under this partition:
//
//	"backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/0/435978159903735811"
//	"backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/1/435978159903735812"
//	"backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/100/435978159903735809"
//	"backup/bak1/data/insert_log/435978159196147009/435978159196147010/435978159261483009/101/435978159903735810"
//
// The deltalogRoot is "backup/bak1/data/delta_log/435978159196147009/435978159196147010".
// Then we get all the delta logs under this partition:
//
//	"backup/bak1/data/delta_log/435978159196147009/435978159196147010/435978159261483009/434574382554415105"
//
// In this function, we construct a list of SegmentFilesHolder objects; each SegmentFilesHolder holds
// the insert logs and delta logs of one segment.
func (p *BinlogParser) constructSegmentHolders(insertlogRoot string, deltalogRoot string) ([]*SegmentFilesHolder, error) {
	holders := make(map[int64]*SegmentFilesHolder)
	// TODO add context
	insertlogs, _, err := p.chunkManager.ListWithPrefix(context.TODO(), insertlogRoot, true)
	if err != nil {
		log.Error("Binlog parser: list insert logs error", zap.Error(err))
		return nil, err
	}

	// collect insert log paths
	log.Info("Binlog parser: list insert logs", zap.Int("logsCount", len(insertlogs)))
	for _, insertlog := range insertlogs {
		log.Info("Binlog parser: mapping insert log to segment", zap.String("insertlog", insertlog))
		fieldPath := path.Dir(insertlog)
		fieldStrID := path.Base(fieldPath)
		fieldID, err := strconv.ParseInt(fieldStrID, 10, 64)
		if err != nil {
			log.Error("Binlog parser: parse field id error", zap.String("fieldPath", fieldPath), zap.Error(err))
			return nil, err
		}

		segmentPath := path.Dir(fieldPath)
		segmentStrID := path.Base(segmentPath)
		segmentID, err := strconv.ParseInt(segmentStrID, 10, 64)
		if err != nil {
			log.Error("Binlog parser: parse segment id error", zap.String("segmentPath", segmentPath), zap.Error(err))
			return nil, err
		}

		holder, ok := holders[segmentID]
		if ok {
			holder.fieldFiles[fieldID] = append(holder.fieldFiles[fieldID], insertlog)
		} else {
			holder = &SegmentFilesHolder{
				segmentID:  segmentID,
				fieldFiles: make(map[int64][]string),
				deltaFiles: make([]string, 0),
			}
			holder.fieldFiles[fieldID] = make([]string, 0)
			holder.fieldFiles[fieldID] = append(holder.fieldFiles[fieldID], insertlog)
			holders[segmentID] = holder
		}
	}
	// sort the insert log paths of each field in ascending order
	// there might be several insert logs under a field, for example:
	// 2 insert logs under field a: a_1, a_2
	// 2 insert logs under field b: b_1, b_2
	// the row count of a_1 is equal to that of b_1, and the row count of a_2 is equal to that of b_2
	// when we read these logs, we first read a_1 and b_1, then read a_2 and b_2,
	// so here we must ensure the paths are arranged correctly
	segmentIDs := make([]int64, 0)
	for id, holder := range holders {
		segmentIDs = append(segmentIDs, id)
		for _, v := range holder.fieldFiles {
			sort.Strings(v)
		}
	}
	// collect delta log paths
	if len(deltalogRoot) > 0 {
		// TODO add context
		deltalogs, _, err := p.chunkManager.ListWithPrefix(context.TODO(), deltalogRoot, true)
		if err != nil {
			log.Error("Binlog parser: list delta logs error", zap.Error(err))
			return nil, err
		}

		log.Info("Binlog parser: list delta logs", zap.Int("logsCount", len(deltalogs)))
		for _, deltalog := range deltalogs {
			log.Info("Binlog parser: mapping delta log to segment", zap.String("deltalog", deltalog))
			segmentPath := path.Dir(deltalog)
			segmentStrID := path.Base(segmentPath)
			segmentID, err := strconv.ParseInt(segmentStrID, 10, 64)
			if err != nil {
				log.Error("Binlog parser: parse segment id error", zap.String("segmentPath", segmentPath), zap.Error(err))
				return nil, err
			}

			// if the segment id doesn't exist, no need to process this deltalog
			holder, ok := holders[segmentID]
			if ok {
				holder.deltaFiles = append(holder.deltaFiles, deltalog)
			}
		}
	}
	// a Go map is unordered, so we sort the segment id array to return the holder list in ascending order
	sort.Slice(segmentIDs, func(i, j int) bool { return segmentIDs[i] < segmentIDs[j] })
	holdersList := make([]*SegmentFilesHolder, 0)
	for _, id := range segmentIDs {
		holdersList = append(holdersList, holders[id])
	}

	return holdersList, nil
}

func (p *BinlogParser) parseSegmentFiles(segmentHolder *SegmentFilesHolder) error {
	if segmentHolder == nil {
		log.Error("Binlog parser: segment files holder is nil")
		return errors.New("segment files holder is nil")
	}

	adapter, err := NewBinlogAdapter(p.collectionSchema, p.shardNum, p.segmentSize,
		MaxTotalSizeInMemory, p.chunkManager, p.callFlushFunc, p.tsEndPoint)
	if err != nil {
		log.Error("Binlog parser: failed to create binlog adapter", zap.Error(err))
		return err
	}

	return adapter.Read(segmentHolder)
}

// This function requires two paths:
// 1. the insert log path of a partition
// 2. the delta log path of a partition (optional)
func (p *BinlogParser) Parse(filePaths []string) error {
	if len(filePaths) != 1 && len(filePaths) != 2 {
		log.Error("Binlog parser: illegal paths for binlog import")
		return errors.New("illegal paths for binlog import, partition binlog path and partition delta path are required")
	}

	insertlogPath := filePaths[0]
	deltalogPath := ""
	if len(filePaths) == 2 {
		deltalogPath = filePaths[1]
	}
	log.Info("Binlog parser: target paths",
		zap.String("insertlogPath", insertlogPath),
		zap.String("deltalogPath", deltalogPath))

	segmentHolders, err := p.constructSegmentHolders(insertlogPath, deltalogPath)
	if err != nil {
		return err
	}

	for _, segmentHolder := range segmentHolders {
		err = p.parseSegmentFiles(segmentHolder)
		if err != nil {
			return err
		}

		// trigger GC after each segment is finished
		triggerGC()
	}

	return nil
}
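
// A minimal usage sketch (illustrative only; schema, cm and flushFunc are hypothetical
// placeholders supplied by the caller, not defined in this file):
//
//	parser, err := NewBinlogParser(schema, shardNum, segmentSize, cm, flushFunc, math.MaxUint64)
//	if err != nil {
//		// handle error
//	}
//	// the first path is the partition's insert_log root; the second (optional) is its delta_log root
//	err = parser.Parse([]string{
//		"backup/bak1/data/insert_log/435978159196147009/435978159196147010",
//		"backup/bak1/data/delta_log/435978159196147009/435978159196147010",
//	})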