mirror of https://gitee.com/milvus-io/milvus.git, synced 2024-12-05 05:18:52 +08:00
4e5f1d5f75
The import depends on syncTask, which in turn relies on the allocator. This PR pre-allocates the necessary IDs for the import syncTask. issue: https://github.com/milvus-io/milvus/issues/33957 --------- Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
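In outline, the change described above amounts to reserving a block of IDs once, up front, so that building an import syncTask never has to call back into the allocator. Below is a minimal sketch of that pattern only; the Allocator interface and the PreAllocate/IDRange names are assumptions for illustration, not the PR's actual API.

package sketch

import "fmt"

// Allocator stands in for the ID allocator that syncTask normally depends on
// (a hypothetical interface, for illustration only).
type Allocator interface {
	// Alloc reserves count IDs and returns the half-open range [start, end).
	Alloc(count uint32) (start, end int64, err error)
}

// IDRange holds IDs reserved ahead of time so that consumers (e.g. an import
// syncTask) can draw from them without touching the allocator again.
type IDRange struct {
	next, end int64
}

// PreAllocate reserves count IDs in a single allocator round trip.
func PreAllocate(alloc Allocator, count uint32) (*IDRange, error) {
	start, end, err := alloc.Alloc(count)
	if err != nil {
		return nil, err
	}
	return &IDRange{next: start, end: end}, nil
}

// Next hands out one pre-allocated ID, failing once the range is exhausted.
func (r *IDRange) Next() (int64, error) {
	if r.next >= r.end {
		return 0, fmt.Errorf("pre-allocated ID range exhausted")
	}
	id := r.next
	r.next++
	return id, nil
}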
213 lines · 6.3 KiB · Go
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importv2

import (
	"context"
	"fmt"
	"io"
	"time"

	"github.com/cockroachdb/errors"
	"github.com/samber/lo"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
	"github.com/milvus-io/milvus/internal/proto/datapb"
	"github.com/milvus-io/milvus/internal/proto/internalpb"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/internal/util/importutilv2"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/util/conc"
	"github.com/milvus-io/milvus/pkg/util/paramtable"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)

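// PreImportTask wraps a datapb.PreImportTask with the runtime state needed to
// analyze import files ahead of the actual import: each file is read once to
// collect row counts, memory sizes, and hashed partition statistics, which are
// reported through the task manager.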
type PreImportTask struct {
	*datapb.PreImportTask
	ctx          context.Context
	cancel       context.CancelFunc
	partitionIDs []int64
	vchannels    []string
	schema       *schemapb.CollectionSchema
	options      []*commonpb.KeyValuePair

	manager TaskManager
	cm      storage.ChunkManager
}

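// NewPreImportTask creates a pending PreImportTask from a PreImportRequest,
// pre-creating one empty ImportFileStats entry per import file to be filled
// in during execution.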
func NewPreImportTask(req *datapb.PreImportRequest,
	manager TaskManager,
	cm storage.ChunkManager,
) Task {
	fileStats := lo.Map(req.GetImportFiles(), func(file *internalpb.ImportFile, _ int) *datapb.ImportFileStats {
		return &datapb.ImportFileStats{
			ImportFile: file,
		}
	})
	ctx, cancel := context.WithCancel(context.Background())
	// During binlog import, even if the primary key's autoID is set to true,
	// the primary key from the binlog should be used instead of being reassigned.
	if importutilv2.IsBackup(req.GetOptions()) {
		UnsetAutoID(req.GetSchema())
	}
	return &PreImportTask{
		PreImportTask: &datapb.PreImportTask{
			JobID:        req.GetJobID(),
			TaskID:       req.GetTaskID(),
			CollectionID: req.GetCollectionID(),
			State:        datapb.ImportTaskStateV2_Pending,
			FileStats:    fileStats,
		},
		ctx:          ctx,
		cancel:       cancel,
		partitionIDs: req.GetPartitionIDs(),
		vchannels:    req.GetVchannels(),
		schema:       req.GetSchema(),
		options:      req.GetOptions(),
		manager:      manager,
		cm:           cm,
	}
}

func (t *PreImportTask) GetPartitionIDs() []int64 {
	return t.partitionIDs
}

func (t *PreImportTask) GetVchannels() []string {
	return t.vchannels
}

func (t *PreImportTask) GetType() TaskType {
	return PreImportTaskType
}

func (t *PreImportTask) GetSchema() *schemapb.CollectionSchema {
	return t.schema
}

func (t *PreImportTask) Cancel() {
	t.cancel()
}

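// Clone returns a copy of the task with a deep copy of the protobuf state and
// a child context derived from the original's, so cancelling the original
// task also cancels the clone.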
func (t *PreImportTask) Clone() Task {
	ctx, cancel := context.WithCancel(t.ctx)
	return &PreImportTask{
		PreImportTask: typeutil.Clone(t.PreImportTask),
		ctx:           ctx,
		cancel:        cancel,
		partitionIDs:  t.GetPartitionIDs(),
		vchannels:     t.GetVchannels(),
		schema:        t.GetSchema(),
		options:       t.options,
	}
}

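// Execute transitions the task to InProgress and submits one job per import
// file to the shared exec pool. Each job opens a reader for its file and
// collects statistics via readFileStat; on failure it marks the task Failed
// through the task manager without aborting the other files' jobs.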
func (t *PreImportTask) Execute() []*conc.Future[any] {
	bufferSize := paramtable.Get().DataNodeCfg.ReadBufferSizeInMB.GetAsInt() * 1024 * 1024
	log.Info("start to preimport", WrapLogFields(t,
		zap.Int("bufferSize", bufferSize),
		zap.Any("schema", t.GetSchema()))...)
	t.manager.Update(t.GetTaskID(), UpdateState(datapb.ImportTaskStateV2_InProgress))
	files := lo.Map(t.GetFileStats(),
		func(fileStat *datapb.ImportFileStats, _ int) *internalpb.ImportFile {
			return fileStat.GetImportFile()
		})

	fn := func(i int, file *internalpb.ImportFile) error {
		reader, err := importutilv2.NewReader(t.ctx, t.cm, t.GetSchema(), file, t.options, bufferSize)
		if err != nil {
			log.Warn("new reader failed", WrapLogFields(t, zap.String("file", file.String()), zap.Error(err))...)
			t.manager.Update(t.GetTaskID(), UpdateState(datapb.ImportTaskStateV2_Failed), UpdateReason(err.Error()))
			return err
		}
		defer reader.Close()
		start := time.Now()
		err = t.readFileStat(reader, i)
		if err != nil {
			log.Warn("preimport failed", WrapLogFields(t, zap.String("file", file.String()), zap.Error(err))...)
			t.manager.Update(t.GetTaskID(), UpdateState(datapb.ImportTaskStateV2_Failed), UpdateReason(err.Error()))
			return err
		}
		log.Info("read file stat done", WrapLogFields(t, zap.Strings("files", file.GetPaths()),
			zap.Duration("dur", time.Since(start)))...)
		return nil
	}

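	// Rebind the loop variables before capturing them in the closure (required
	// for correctness before Go 1.22); each submitted job returns its error as
	// both the future's value and its error.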
	futures := make([]*conc.Future[any], 0, len(files))
	for i, file := range files {
		i := i
		file := file
		f := GetExecPool().Submit(func() (any, error) {
			err := fn(i, file)
			return err, err
		})
		futures = append(futures, f)
	}
	return futures
}

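// readFileStat validates the file's size against the configured limit, then
// reads it end to end, accumulating total rows, total memory size, and hashed
// partition statistics, and finally records the result with the task manager
// under the file's index.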
func (t *PreImportTask) readFileStat(reader importutilv2.Reader, fileIdx int) error {
	fileSize, err := reader.Size()
	if err != nil {
		return err
	}
	maxSize := paramtable.Get().DataNodeCfg.MaxImportFileSizeInGB.GetAsFloat() * 1024 * 1024 * 1024
	if fileSize > int64(maxSize) {
		return errors.New(fmt.Sprintf(
			"The import file size has reached the maximum limit allowed for importing, "+
				"fileSize=%d, maxSize=%d", fileSize, int64(maxSize)))
	}

	totalRows := 0
	totalSize := 0
	hashedStats := make(map[string]*datapb.PartitionImportStats)
	for {
		data, err := reader.Read()
		if err != nil {
			if errors.Is(err, io.EOF) {
				break
			}
			return err
		}
		err = CheckRowsEqual(t.GetSchema(), data)
		if err != nil {
			return err
		}
		rowsCount, err := GetRowsStats(t, data)
		if err != nil {
			return err
		}
		MergeHashedStats(rowsCount, hashedStats)
		rows := data.GetRowNum()
		size := data.GetMemorySize()
		totalRows += rows
		totalSize += size
		log.Info("reading file stat...", WrapLogFields(t, zap.Int("readRows", rows), zap.Int("readSize", size))...)
	}

	stat := &datapb.ImportFileStats{
		FileSize:        fileSize,
		TotalRows:       int64(totalRows),
		TotalMemorySize: int64(totalSize),
		HashedStats:     hashedStats,
	}
	t.manager.Update(t.GetTaskID(), UpdateFileStat(fileIdx, stat))
	return nil
}