package importutil

import (
	"bufio"
	"context"
	"errors"
	"path"
	"runtime/debug"
	"strconv"
	"strings"

	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"

	"github.com/milvus-io/milvus/internal/allocator"
	"github.com/milvus-io/milvus/internal/common"
	"github.com/milvus-io/milvus/internal/log"
	"github.com/milvus-io/milvus/internal/proto/commonpb"
	"github.com/milvus-io/milvus/internal/proto/rootcoordpb"
	"github.com/milvus-io/milvus/internal/proto/schemapb"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/internal/util/timerecord"
	"github.com/milvus-io/milvus/internal/util/typeutil"
)

const (
	JSONFileExt  = ".json"
	NumpyFileExt = ".npy"

	MaxFileSize = 1 * 1024 * 1024 * 1024 // maximum size of each file
)

type ImportWrapper struct {
	ctx              context.Context            // for canceling the parse process
	cancel           context.CancelFunc         // for canceling the parse process
	collectionSchema *schemapb.CollectionSchema // collection schema
	shardNum         int32                      // sharding number of the collection
	segmentSize      int64                      // maximum size of a segment (unit: byte)
	rowIDAllocator   *allocator.IDAllocator     // auto-id allocator
	chunkManager     storage.ChunkManager

	callFlushFunc ImportFlushFunc // callback function to flush a segment

	importResult *rootcoordpb.ImportResult                 // import result
	reportFunc   func(res *rootcoordpb.ImportResult) error // report import state to rootcoord
}

func NewImportWrapper(ctx context.Context, collectionSchema *schemapb.CollectionSchema, shardNum int32, segmentSize int64,
	idAlloc *allocator.IDAllocator, cm storage.ChunkManager, flushFunc ImportFlushFunc,
	importResult *rootcoordpb.ImportResult, reportFunc func(res *rootcoordpb.ImportResult) error) *ImportWrapper {
	if collectionSchema == nil {
		log.Error("import error: collection schema is nil")
		return nil
	}

	// ignore the RowID field and Timestamp field
	realSchema := &schemapb.CollectionSchema{
		Name:        collectionSchema.GetName(),
		Description: collectionSchema.GetDescription(),
		AutoID:      collectionSchema.GetAutoID(),
		Fields:      make([]*schemapb.FieldSchema, 0),
	}
	for i := 0; i < len(collectionSchema.Fields); i++ {
		schema := collectionSchema.Fields[i]
		if schema.GetName() == common.RowIDFieldName || schema.GetName() == common.TimeStampFieldName {
			continue
		}
		realSchema.Fields = append(realSchema.Fields, schema)
	}

	ctx, cancel := context.WithCancel(ctx)

	wrapper := &ImportWrapper{
		ctx:              ctx,
		cancel:           cancel,
		collectionSchema: realSchema,
		shardNum:         shardNum,
		segmentSize:      segmentSize,
		rowIDAllocator:   idAlloc,
		callFlushFunc:    flushFunc,
		chunkManager:     cm,
		importResult:     importResult,
		reportFunc:       reportFunc,
	}

	return wrapper
}
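
// A minimal usage sketch (illustrative only). It assumes the caller has already
// prepared an allocator.IDAllocator, a storage.ChunkManager, an ImportResult and
// a report callback; the flush callback signature mirrors how callFlushFunc is
// invoked in this file:
//
//	flushFunc := func(fields map[storage.FieldID]storage.FieldData, shardID int) error {
//		// persist one shard of field data, e.g. write it into a segment
//		return nil
//	}
//	wrapper := NewImportWrapper(ctx, schema, 2, 512*1024*1024, idAlloc, chunkMgr,
//		flushFunc, importResult, reportFunc)
//	defer wrapper.Cancel()
//	if err := wrapper.Import([]string{"rows.json"}, true, false); err != nil {
//		log.Error("import failed", zap.Error(err))
//	}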

// Cancel can be used to cancel the parse process.
func (p *ImportWrapper) Cancel() error {
	p.cancel()
	return nil
}

func (p *ImportWrapper) printFieldsDataInfo(fieldsData map[storage.FieldID]storage.FieldData, msg string, files []string) {
	stats := make([]zapcore.Field, 0)
	for k, v := range fieldsData {
		stats = append(stats, zap.Int(strconv.FormatInt(k, 10), v.RowNum()))
	}

	if len(files) > 0 {
		stats = append(stats, zap.Any("files", files))
	}
	log.Info(msg, stats...)
}
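
// getFileNameAndExt splits a path into the base file name (without extension) and
// the extension, e.g. "/tmp/vectors.npy" yields ("vectors", ".npy").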
func getFileNameAndExt(filePath string) (string, string) {
	fileName := path.Base(filePath)
	fileType := path.Ext(fileName)
	fileNameWithoutExt := strings.TrimSuffix(fileName, fileType)
	return fileNameWithoutExt, fileType
}

func (p *ImportWrapper) fileValidation(filePaths []string, rowBased bool) error {
	// use this map to check duplicate file names (only for numpy files)
	fileNames := make(map[string]struct{})

	for i := 0; i < len(filePaths); i++ {
		filePath := filePaths[i]
		name, fileType := getFileNameAndExt(filePath)
		_, ok := fileNames[name]
		if ok {
			// only check duplicate numpy files
			if fileType == NumpyFileExt {
				// fileType already contains the leading dot
				return errors.New("duplicate file: " + name + fileType)
			}
		} else {
			fileNames[name] = struct{}{}
		}

		// check file type
		if rowBased {
			if fileType != JSONFileExt {
				return errors.New("unsupported file type for row-based mode: " + filePath)
			}
		} else {
			if fileType != JSONFileExt && fileType != NumpyFileExt {
				return errors.New("unsupported file type for column-based mode: " + filePath)
			}
		}

		// check file size
		size, _ := p.chunkManager.Size(filePath)
		if size == 0 {
			return errors.New("the file " + filePath + " is empty")
		}
		if size > MaxFileSize {
			return errors.New("the file " + filePath + " size exceeds the maximum file size: " + strconv.FormatInt(MaxFileSize, 10) + " bytes")
		}
	}

	return nil
}

// Import is the entry point of the import process.
// filePaths and rowBased are passed in from the ImportTask.
// If onlyValidate is true, this process only does validation: no data is generated and callFlushFunc will not be called.
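//
// The input files are expected to look roughly like the sketch below (illustrative only;
// the JSON and numpy parsers are authoritative, and the field names are hypothetical):
//
//	row-based JSON:     {"rows": [{"book_id": 1, "vector": [...]}, ...]}
//	column-based JSON:  {"book_id": [1, 2, 3], "vector": [[...], [...], [...]]}
//	column-based numpy: one .npy file per field, named after the field (e.g. "vector.npy")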
func (p *ImportWrapper) Import(filePaths []string, rowBased bool, onlyValidate bool) error {
	err := p.fileValidation(filePaths, rowBased)
	if err != nil {
		log.Error("import error: " + err.Error())
		return err
	}

	if rowBased {
		// parse and consume row-based files
		// for row-based files, the JSONRowConsumer will generate auto-id for the primary key, and split rows into segments
		// according to shard number, so the callFlushFunc will be called in the JSONRowConsumer
		for i := 0; i < len(filePaths); i++ {
			filePath := filePaths[i]
			_, fileType := getFileNameAndExt(filePath)
			log.Info("import wrapper: row-based file", zap.Any("filePath", filePath), zap.Any("fileType", fileType))

			if fileType == JSONFileExt {
				err := func() error {
					tr := timerecord.NewTimeRecorder("json row-based parser: " + filePath)

					// for minio storage, chunkManager will download the file into local memory
					// for local storage, chunkManager opens the file directly
					file, err := p.chunkManager.Reader(filePath)
					if err != nil {
						return err
					}
					defer file.Close()
					tr.Record("open reader")

					// report file process state
					p.importResult.State = commonpb.ImportState_ImportDownloaded
					p.reportFunc(p.importResult)

					// parse file
					reader := bufio.NewReader(file)
					parser := NewJSONParser(p.ctx, p.collectionSchema)
					var consumer *JSONRowConsumer
					if !onlyValidate {
						flushFunc := func(fields map[storage.FieldID]storage.FieldData, shardNum int) error {
							p.printFieldsDataInfo(fields, "import wrapper: prepare to flush segment", filePaths)
							return p.callFlushFunc(fields, shardNum)
						}
						consumer = NewJSONRowConsumer(p.collectionSchema, p.rowIDAllocator, p.shardNum, p.segmentSize, flushFunc)
					}
					validator := NewJSONRowValidator(p.collectionSchema, consumer)
					err = parser.ParseRows(reader, validator)
					if err != nil {
						return err
					}

					// for row-based files, auto-id is generated within JSONRowConsumer
					if consumer != nil {
						p.importResult.AutoIds = append(p.importResult.AutoIds, consumer.IDRange()...)
					}

					// report file process state
					p.importResult.State = commonpb.ImportState_ImportParsed
					p.reportFunc(p.importResult)

					tr.Record("parsed")
					return nil
				}()

				if err != nil {
					log.Error("import error: "+err.Error(), zap.String("filePath", filePath))
					return err
				}
			}
		}
	} else {
		// parse and consume column-based files
		// for column-based files, the XXXColumnConsumer only outputs map[storage.FieldID]storage.FieldData
		// after all columns are parsed/consumed, we need to combine them into one map
		// and use splitFieldsData() to split fields data into segments according to shard number
		fieldsData := initSegmentData(p.collectionSchema)
		rowCount := 0

		// function to combine column data into fieldsData
		combineFunc := func(fields map[storage.FieldID]storage.FieldData) error {
			if len(fields) == 0 {
				return nil
			}

			p.printFieldsDataInfo(fields, "import wrapper: combine field data", nil)
			tr := timerecord.NewTimeRecorder("combine field data")
			defer tr.Elapse("finished")

			for k, v := range fields {
				// ignore 0-row fields
				if v.RowNum() == 0 {
					continue
				}

				// each column should be combined only once
				data, ok := fieldsData[k]
				if ok && data.RowNum() > 0 {
					return errors.New("the field " + strconv.FormatInt(k, 10) + " is duplicated")
				}

				// check the row count. only count non-zero row fields
				if rowCount > 0 && rowCount != v.RowNum() {
					return errors.New("the field " + strconv.FormatInt(k, 10) + " row count " + strconv.Itoa(v.RowNum()) + " doesn't equal " + strconv.Itoa(rowCount))
				}
				rowCount = v.RowNum()

				// assign column data to fieldsData
				fieldsData[k] = v
			}

			return nil
		}

		// parse/validate/consume data
		for i := 0; i < len(filePaths); i++ {
			filePath := filePaths[i]
			fileName, fileType := getFileNameAndExt(filePath)
			log.Info("import wrapper: column-based file", zap.Any("filePath", filePath), zap.Any("fileType", fileType))

			if fileType == JSONFileExt {
				err := func() error {
					tr := timerecord.NewTimeRecorder("json column-based parser: " + filePath)

					// for minio storage, chunkManager will download the file into local memory
					// for local storage, chunkManager opens the file directly
					file, err := p.chunkManager.Reader(filePath)
					if err != nil {
						return err
					}
					defer file.Close()
					tr.Record("open reader")

					// report file process state
					p.importResult.State = commonpb.ImportState_ImportDownloaded
					p.reportFunc(p.importResult)

					// parse file
					reader := bufio.NewReader(file)
					parser := NewJSONParser(p.ctx, p.collectionSchema)
					var consumer *JSONColumnConsumer
					if !onlyValidate {
						consumer = NewJSONColumnConsumer(p.collectionSchema, combineFunc)
					}
					validator := NewJSONColumnValidator(p.collectionSchema, consumer)

					err = parser.ParseColumns(reader, validator)
					if err != nil {
						return err
					}

					// report file process state
					p.importResult.State = commonpb.ImportState_ImportParsed
					p.reportFunc(p.importResult)

					tr.Record("parsed")
					return nil
				}()

				if err != nil {
					log.Error("import error: "+err.Error(), zap.String("filePath", filePath))
					return err
				}
			} else if fileType == NumpyFileExt {
				err := func() error {
					tr := timerecord.NewTimeRecorder("numpy parser: " + filePath)

					// for minio storage, chunkManager will download the file into local memory
					// for local storage, chunkManager opens the file directly
					file, err := p.chunkManager.Reader(filePath)
					if err != nil {
						return err
					}
					defer file.Close()
					tr.Record("open reader")

					// report file process state
					p.importResult.State = commonpb.ImportState_ImportDownloaded
					p.reportFunc(p.importResult)

					var id storage.FieldID
					for _, field := range p.collectionSchema.Fields {
						if field.GetName() == fileName {
							id = field.GetFieldID()
						}
					}

					// the numpy parser returns a storage.FieldData, here construct a map[storage.FieldID]storage.FieldData to combine
					flushFunc := func(field storage.FieldData) error {
						fields := make(map[storage.FieldID]storage.FieldData)
						fields[id] = field
						return combineFunc(fields)
					}

					// for numpy files, the file name (without extension) is taken as the field name
					parser := NewNumpyParser(p.ctx, p.collectionSchema, flushFunc)
					err = parser.Parse(file, fileName, onlyValidate)
					if err != nil {
						return err
					}

					// report file process state
					p.importResult.State = commonpb.ImportState_ImportParsed
					p.reportFunc(p.importResult)

					tr.Record("parsed")
					return nil
				}()

				if err != nil {
					log.Error("import error: "+err.Error(), zap.String("filePath", filePath))
					return err
				}
			}
		}

		// split fields data into segments
		err := p.splitFieldsData(fieldsData, filePaths)
		if err != nil {
			log.Error("import error: " + err.Error())
			return err
		}
	}

	debug.FreeOSMemory()
	// report file process state
	p.importResult.State = commonpb.ImportState_ImportPersisted
	return p.reportFunc(p.importResult)
}

func (p *ImportWrapper) appendFunc(schema *schemapb.FieldSchema) func(src storage.FieldData, n int, target storage.FieldData) error {
	switch schema.DataType {
	case schemapb.DataType_Bool:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.BoolFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(bool))
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_Float:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.FloatFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(float32))
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_Double:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.DoubleFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(float64))
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_Int8:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int8FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int8))
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_Int16:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int16FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int16))
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_Int32:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int32FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int32))
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_Int64:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.Int64FieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(int64))
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_BinaryVector:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.BinaryVectorFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).([]byte)...)
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_FloatVector:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.FloatVectorFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).([]float32)...)
			arr.NumRows[0]++
			return nil
		}
	case schemapb.DataType_String, schemapb.DataType_VarChar:
		return func(src storage.FieldData, n int, target storage.FieldData) error {
			arr := target.(*storage.StringFieldData)
			arr.Data = append(arr.Data, src.GetRow(n).(string))
			return nil
		}
	default:
		return nil
	}
}

func (p *ImportWrapper) splitFieldsData(fieldsData map[storage.FieldID]storage.FieldData, files []string) error {
	if len(fieldsData) == 0 {
		return errors.New("import error: fields data is empty")
	}

	tr := timerecord.NewTimeRecorder("split field data")
	defer tr.Elapse("finished")

	// check existence of each field
	// check row count, all fields row count must be equal
	// firstly get the max row count
	rowCount := 0
	rowCounter := make(map[string]int)
	var primaryKey *schemapb.FieldSchema
	for i := 0; i < len(p.collectionSchema.Fields); i++ {
		schema := p.collectionSchema.Fields[i]
		if schema.GetIsPrimaryKey() {
			primaryKey = schema
		}

		if !schema.GetAutoID() {
			v, ok := fieldsData[schema.GetFieldID()]
			if !ok {
				return errors.New("import error: field " + schema.GetName() + " not provided")
			}
			rowCounter[schema.GetName()] = v.RowNum()
			if v.RowNum() > rowCount {
				rowCount = v.RowNum()
			}
		}
	}
	if primaryKey == nil {
		return errors.New("import error: primary key field is not found")
	}

	for name, count := range rowCounter {
		if count != rowCount {
			return errors.New("import error: field " + name + " row count " + strconv.Itoa(count) + " is not equal to other fields row count " + strconv.Itoa(rowCount))
		}
	}

	primaryData, ok := fieldsData[primaryKey.GetFieldID()]
	if !ok {
		return errors.New("import error: primary key field is not provided")
	}

	// generate auto-id for the primary key and the rowid field
	var rowIDBegin typeutil.UniqueID
	var rowIDEnd typeutil.UniqueID
	rowIDBegin, rowIDEnd, _ = p.rowIDAllocator.Alloc(uint32(rowCount))

	rowIDField := fieldsData[common.RowIDField]
	rowIDFieldArr := rowIDField.(*storage.Int64FieldData)
	// i iterates over the allocated ID range [rowIDBegin, rowIDEnd)
	for i := rowIDBegin; i < rowIDEnd; i++ {
		rowIDFieldArr.Data = append(rowIDFieldArr.Data, i)
	}

	if primaryKey.GetAutoID() {
		log.Info("import wrapper: generating auto-id", zap.Any("rowCount", rowCount))

		primaryDataArr := primaryData.(*storage.Int64FieldData)
		for i := rowIDBegin; i < rowIDEnd; i++ {
			primaryDataArr.Data = append(primaryDataArr.Data, i)
		}

		p.importResult.AutoIds = append(p.importResult.AutoIds, rowIDBegin, rowIDEnd)
	}

	if primaryData.RowNum() <= 0 {
		return errors.New("import error: primary key " + primaryKey.GetName() + " not provided")
	}

	// prepare segments
	segmentsData := make([]map[storage.FieldID]storage.FieldData, 0, p.shardNum)
	for i := 0; i < int(p.shardNum); i++ {
		segmentData := initSegmentData(p.collectionSchema)
		if segmentData == nil {
			return nil
		}
		segmentsData = append(segmentsData, segmentData)
	}

	// prepare append functions
	appendFunctions := make(map[string]func(src storage.FieldData, n int, target storage.FieldData) error)
	for i := 0; i < len(p.collectionSchema.Fields); i++ {
		schema := p.collectionSchema.Fields[i]
		appendFunc := p.appendFunc(schema)
		if appendFunc == nil {
			return errors.New("import error: unsupported field data type")
		}
		appendFunctions[schema.GetName()] = appendFunc
	}

	// split data into segments
	for i := 0; i < rowCount; i++ {
		// hash to a shard number
		var shard uint32
		pk := primaryData.GetRow(i)
		strPK, ok := interface{}(pk).(string)
		if ok {
			hash := typeutil.HashString2Uint32(strPK)
			shard = hash % uint32(p.shardNum)
		} else {
			intPK, ok := interface{}(pk).(int64)
			if !ok {
				return errors.New("import error: primary key field must be int64 or varchar")
			}
			hash, _ := typeutil.Hash32Int64(intPK)
			shard = hash % uint32(p.shardNum)
		}

		// set rowID field
		rowIDField := segmentsData[shard][common.RowIDField].(*storage.Int64FieldData)
		rowIDField.Data = append(rowIDField.Data, rowIDFieldArr.GetRow(i).(int64))

		// append row to shard
		for k := 0; k < len(p.collectionSchema.Fields); k++ {
			schema := p.collectionSchema.Fields[k]
			srcData := fieldsData[schema.GetFieldID()]
			targetData := segmentsData[shard][schema.GetFieldID()]
			appendFunc := appendFunctions[schema.GetName()]
			err := appendFunc(srcData, i, targetData)
			if err != nil {
				return err
			}
		}
	}

	// call flush function
	for i := 0; i < int(p.shardNum); i++ {
		segmentData := segmentsData[i]
		p.printFieldsDataInfo(segmentData, "import wrapper: prepare to flush segment", files)
		err := p.callFlushFunc(segmentData, i)
		if err != nil {
			return err
		}
	}

	return nil
}