a61668c77e
This PR introduces a stats task for import:
1. Define new `Stats` and `IndexBuilding` states for importJob
2. Add a new stats step to the import process: trigger the stats task and wait for its completion
3. Abort the stats task if the import job fails

issue: https://github.com/milvus-io/milvus/issues/33744

---------

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
288 lines
8.7 KiB
Go
package datacoord

import (
	"context"
	"sync"
	"time"

	"github.com/cockroachdb/errors"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus/internal/datacoord/allocator"
	"github.com/milvus-io/milvus/internal/proto/datapb"
	"github.com/milvus-io/milvus/internal/proto/indexpb"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/util/merr"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)

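// StatsJobManager manages segment stats tasks (sort, text index, BM25):
// submission, state queries, and removal.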
type StatsJobManager interface {
	Start()
	Stop()
	SubmitStatsTask(originSegmentID, targetSegmentID int64, subJobType indexpb.StatsSubJob, canRecycle bool) error
	GetStatsTaskState(originSegmentID int64, subJobType indexpb.StatsSubJob) indexpb.JobState
	DropStatsTask(originSegmentID int64, subJobType indexpb.StatsSubJob) error
}

var _ StatsJobManager = (*statsJobManager)(nil)

type statsJobManager struct {
	ctx    context.Context
	cancel context.CancelFunc

	loopWg sync.WaitGroup

	mt *meta

	scheduler *taskScheduler
	allocator allocator.Allocator
}

func newJobManager(ctx context.Context,
	mt *meta,
	scheduler *taskScheduler,
	allocator allocator.Allocator,
) *statsJobManager {
	ctx, cancel := context.WithCancel(ctx)
	return &statsJobManager{
		ctx:       ctx,
		cancel:    cancel,
		loopWg:    sync.WaitGroup{},
		mt:        mt,
		scheduler: scheduler,
		allocator: allocator,
	}
}

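// Start launches the trigger and cleanup loops when the stats task feature
// is enabled in the DataCoord config.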
func (jm *statsJobManager) Start() {
	if Params.DataCoordCfg.EnableStatsTask.GetAsBool() {
		jm.loopWg.Add(2)
		go jm.triggerStatsTaskLoop()
		go jm.cleanupStatsTasksLoop()
	}
}

func (jm *statsJobManager) Stop() {
	jm.cancel()
	jm.loopWg.Wait()
}

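// triggerStatsTaskLoop periodically triggers sort/text/BM25 stats tasks and
// also reacts to individual segments pushed through the stats task channel.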
func (jm *statsJobManager) triggerStatsTaskLoop() {
	log.Info("start triggerStatsTaskLoop...")
	defer jm.loopWg.Done()

	ticker := time.NewTicker(Params.DataCoordCfg.TaskCheckInterval.GetAsDuration(time.Second))
	defer ticker.Stop()
	for {
		select {
		case <-jm.ctx.Done():
			log.Warn("DataCoord context done, exit triggerStatsTaskLoop...")
			return
		case <-ticker.C:
			jm.triggerSortStatsTask()
			jm.triggerTextStatsTask()
			jm.triggerBM25StatsTask()

		case segID := <-getStatsTaskChSingleton():
			log.Info("receive new segment to trigger stats task", zap.Int64("segmentID", segID))
			segment := jm.mt.GetSegment(segID)
			if segment == nil {
				log.Warn("segment does not exist, skip stats task", zap.Int64("segmentID", segID))
				continue
			}
			jm.createSortStatsTaskForSegment(segment)
		}
	}
}

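// triggerSortStatsTask submits sort stats tasks for flushed, non-L0 segments
// that are not yet sorted and not importing.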
func (jm *statsJobManager) triggerSortStatsTask() {
	segments := jm.mt.SelectSegments(SegmentFilterFunc(func(seg *SegmentInfo) bool {
		return isFlush(seg) && seg.GetLevel() != datapb.SegmentLevel_L0 && !seg.GetIsSorted() && !seg.GetIsImporting()
	}))
	for _, segment := range segments {
		jm.createSortStatsTaskForSegment(segment)
	}
}

func (jm *statsJobManager) createSortStatsTaskForSegment(segment *SegmentInfo) {
	targetSegmentID, err := jm.allocator.AllocID(jm.ctx)
	if err != nil {
		log.Warn("allocID for segment stats task failed",
			zap.Int64("segmentID", segment.GetID()), zap.Error(err))
		return
	}
	if err := jm.SubmitStatsTask(segment.GetID(), targetSegmentID, indexpb.StatsSubJob_Sort, true); err != nil {
		log.Warn("create stats task with sort for segment failed, wait for retry",
			zap.Int64("segmentID", segment.GetID()), zap.Error(err))
		return
	}
}

// enableBM25 is a placeholder until BM25 stats tasks are supported.
func (jm *statsJobManager) enableBM25() bool {
	return false
}

// needDoTextIndex reports whether a flushed, sorted, non-L0 segment is
// missing text stats logs for any of the given fields.
func needDoTextIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
	if !(isFlush(segment) && segment.GetLevel() != datapb.SegmentLevel_L0 &&
		segment.GetIsSorted()) {
		return false
	}

	for _, fieldID := range fieldIDs {
		if segment.GetTextStatsLogs() == nil {
			return true
		}
		if segment.GetTextStatsLogs()[fieldID] == nil {
			return true
		}
	}
	return false
}

func needDoBM25(segment *SegmentInfo, fieldIDs []UniqueID) bool {
	// TODO: docking bm25 stats task
	return false
}

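// triggerTextStatsTask submits text-index stats tasks for segments whose
// match-enabled fields are missing text stats logs.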
func (jm *statsJobManager) triggerTextStatsTask() {
	collections := jm.mt.GetCollections()
	for _, collection := range collections {
		needTriggerFieldIDs := make([]UniqueID, 0)
		for _, field := range collection.Schema.GetFields() {
			// TODO @longjiquan: please replace it to fieldSchemaHelper.EnableMath
			h := typeutil.CreateFieldSchemaHelper(field)
			if !h.EnableMatch() {
				continue
			}
			needTriggerFieldIDs = append(needTriggerFieldIDs, field.GetFieldID())
		}
		segments := jm.mt.SelectSegments(WithCollection(collection.ID), SegmentFilterFunc(func(seg *SegmentInfo) bool {
			return needDoTextIndex(seg, needTriggerFieldIDs)
		}))

		for _, segment := range segments {
			if err := jm.SubmitStatsTask(segment.GetID(), segment.GetID(), indexpb.StatsSubJob_TextIndexJob, true); err != nil {
				log.Warn("create stats task with text index for segment failed, wait for retry",
					zap.Int64("segmentID", segment.GetID()), zap.Error(err))
				continue
			}
		}
	}
}

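// triggerBM25StatsTask submits BM25 stats tasks; it is currently a no-op
// because enableBM25 always returns false and needDoBM25 selects nothing.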
func (jm *statsJobManager) triggerBM25StatsTask() {
	collections := jm.mt.GetCollections()
	for _, collection := range collections {
		needTriggerFieldIDs := make([]UniqueID, 0)
		for _, field := range collection.Schema.GetFields() {
			// TODO: docking bm25 stats task
			if jm.enableBM25() {
				needTriggerFieldIDs = append(needTriggerFieldIDs, field.GetFieldID())
			}
		}
		segments := jm.mt.SelectSegments(WithCollection(collection.ID), SegmentFilterFunc(func(seg *SegmentInfo) bool {
			return needDoBM25(seg, needTriggerFieldIDs)
		}))

		for _, segment := range segments {
			if err := jm.SubmitStatsTask(segment.GetID(), segment.GetID(), indexpb.StatsSubJob_BM25Job, true); err != nil {
				log.Warn("create stats task with bm25 for segment failed, wait for retry",
					zap.Int64("segmentID", segment.GetID()), zap.Error(err))
				continue
			}
		}
	}
}

// cleanupStatsTasksLoop periodically cleans up finished/failed stats tasks.
func (jm *statsJobManager) cleanupStatsTasksLoop() {
	log.Info("start cleanupStatsTasksLoop...")
	defer jm.loopWg.Done()

	ticker := time.NewTicker(Params.DataCoordCfg.GCInterval.GetAsDuration(time.Second))
	defer ticker.Stop()

	for {
		select {
		case <-jm.ctx.Done():
			log.Warn("DataCoord context done, exit cleanupStatsTasksLoop...")
			return
		case <-ticker.C:
			start := time.Now()
			log.Info("start cleanupUnusedStatsTasks...", zap.Time("startAt", start))

			taskIDs := jm.mt.statsTaskMeta.CanCleanedTasks()
			for _, taskID := range taskIDs {
				// wait for in-queue tasks to finish processing before dropping their meta
				if jm.scheduler.getTask(taskID) == nil {
					if err := jm.mt.statsTaskMeta.DropStatsTask(taskID); err != nil {
						// ignore the error; if removal fails, retry on the next GC cycle
						log.Warn("clean up stats task failed", zap.Int64("taskID", taskID), zap.Error(err))
					}
				}
			}
			log.Info("cleanupUnusedStatsTasks done", zap.Duration("timeCost", time.Since(start)))
		}
	}
}

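// SubmitStatsTask allocates a task ID, records the stats task in meta, and
// enqueues it on the scheduler. A duplicate submission is treated as success.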
func (jm *statsJobManager) SubmitStatsTask(originSegmentID, targetSegmentID int64,
	subJobType indexpb.StatsSubJob, canRecycle bool,
) error {
	originSegment := jm.mt.GetHealthySegment(originSegmentID)
	if originSegment == nil {
		return merr.WrapErrSegmentNotFound(originSegmentID)
	}
	taskID, err := jm.allocator.AllocID(context.Background())
	if err != nil {
		return err
	}
	t := &indexpb.StatsTask{
		CollectionID:    originSegment.GetCollectionID(),
		PartitionID:     originSegment.GetPartitionID(),
		SegmentID:       originSegmentID,
		InsertChannel:   originSegment.GetInsertChannel(),
		TaskID:          taskID,
		Version:         0,
		NodeID:          0,
		State:           indexpb.JobState_JobStateInit,
		FailReason:      "",
		TargetSegmentID: targetSegmentID,
		SubJobType:      subJobType,
		CanRecycle:      canRecycle,
	}
	if err = jm.mt.statsTaskMeta.AddStatsTask(t); err != nil {
		if errors.Is(err, merr.ErrTaskDuplicate) {
			log.Info("stats task already exists", zap.Int64("taskID", taskID),
				zap.Int64("collectionID", originSegment.GetCollectionID()),
				zap.Int64("segmentID", originSegment.GetID()))
			return nil
		}
		return err
	}
	jm.scheduler.enqueue(newStatsTask(t.GetTaskID(), t.GetSegmentID(), t.GetTargetSegmentID(), subJobType))
	return nil
}

// GetStatsTaskState returns the job state of the stats task for the given
// origin segment and sub-job type.
func (jm *statsJobManager) GetStatsTaskState(originSegmentID int64, subJobType indexpb.StatsSubJob) indexpb.JobState {
	state := jm.mt.statsTaskMeta.GetStatsTaskStateBySegmentID(originSegmentID, subJobType)
	log.Info("statsJobManager get stats task state", zap.Int64("segmentID", originSegmentID),
		zap.String("subJobType", subJobType.String()), zap.String("state", state.String()))
	return state
}

// DropStatsTask aborts the running stats task (if any) and marks it as
// recyclable so the cleanup loop can remove it later.
func (jm *statsJobManager) DropStatsTask(originSegmentID int64, subJobType indexpb.StatsSubJob) error {
	task := jm.mt.statsTaskMeta.GetStatsTaskBySegmentID(originSegmentID, subJobType)
	if task == nil {
		return nil
	}
	jm.scheduler.AbortTask(task.GetTaskID())
	if err := jm.mt.statsTaskMeta.MarkTaskCanRecycle(task.GetTaskID()); err != nil {
		return err
	}

	log.Info("statsJobManager drop stats task success", zap.Int64("segmentID", originSegmentID),
		zap.Int64("taskID", task.GetTaskID()), zap.String("subJobType", subJobType.String()))
	return nil
}
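
The commit message above notes that the import flow triggers a stats task and then waits for its completion. As a rough illustration of how a caller could drive this interface, here is a minimal sketch; it assumes in-package access and the standard indexpb.JobState values, and the helper name waitForSortStats and the one-second poll interval are illustrative, not part of this file:

// waitForSortStats is a hypothetical in-package helper: submit a sort stats
// task for a segment, then poll its state until it reaches a terminal
// JobState or the context is cancelled.
func waitForSortStats(ctx context.Context, jm StatsJobManager, originSegID, targetSegID int64) error {
	if err := jm.SubmitStatsTask(originSegID, targetSegID, indexpb.StatsSubJob_Sort, true); err != nil {
		return err
	}
	ticker := time.NewTicker(time.Second) // illustrative poll interval
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
			switch jm.GetStatsTaskState(originSegID, indexpb.StatsSubJob_Sort) {
			case indexpb.JobState_JobStateFinished:
				return nil
			case indexpb.JobState_JobStateFailed:
				return errors.Newf("stats task failed for segment %d", originSegID)
			default:
				// still init/in-progress/retrying: keep polling
			}
		}
	}
}

The actual import checker integrates this wait into its job state machine (the new Stats and IndexBuilding states) rather than blocking in a helper, but the submit-then-poll shape is the same.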