milvus/internal/datacoord/job_manager.go
yihao.dai a61668c77e
feat: Introduce stats task for import (#35868)
This PR introduce stats task for import:
1. Define new `Stats` and `IndexBuilding` states for importJob
2. Add new stats step to the import process: trigger the stats task and
wait for its completion
3. Abort stats task if import job failed

issue: https://github.com/milvus-io/milvus/issues/33744

---------

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
2024-09-15 15:17:08 +08:00

288 lines
8.7 KiB
Go

package datacoord
import (
"context"
"sync"
"time"
"github.com/cockroachdb/errors"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/datacoord/allocator"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/indexpb"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
type StatsJobManager interface {
Start()
Stop()
SubmitStatsTask(originSegmentID, targetSegmentID int64, subJobType indexpb.StatsSubJob, canRecycle bool) error
GetStatsTaskState(originSegmentID int64, subJobType indexpb.StatsSubJob) indexpb.JobState
DropStatsTask(originSegmentID int64, subJobType indexpb.StatsSubJob) error
}
var _ StatsJobManager = (*statsJobManager)(nil)
type statsJobManager struct {
ctx context.Context
cancel context.CancelFunc
loopWg sync.WaitGroup
mt *meta
scheduler *taskScheduler
allocator allocator.Allocator
}
func newJobManager(ctx context.Context,
mt *meta,
scheduler *taskScheduler,
allocator allocator.Allocator,
) *statsJobManager {
ctx, cancel := context.WithCancel(ctx)
return &statsJobManager{
ctx: ctx,
cancel: cancel,
loopWg: sync.WaitGroup{},
mt: mt,
scheduler: scheduler,
allocator: allocator,
}
}
func (jm *statsJobManager) Start() {
if Params.DataCoordCfg.EnableStatsTask.GetAsBool() {
jm.loopWg.Add(2)
go jm.triggerStatsTaskLoop()
go jm.cleanupStatsTasksLoop()
}
}
func (jm *statsJobManager) Stop() {
jm.cancel()
jm.loopWg.Wait()
}
func (jm *statsJobManager) triggerStatsTaskLoop() {
log.Info("start checkStatsTaskLoop...")
defer jm.loopWg.Done()
ticker := time.NewTicker(Params.DataCoordCfg.TaskCheckInterval.GetAsDuration(time.Second))
defer ticker.Stop()
for {
select {
case <-jm.ctx.Done():
log.Warn("DataCoord context done, exit checkStatsTaskLoop...")
return
case <-ticker.C:
jm.triggerSortStatsTask()
jm.triggerTextStatsTask()
jm.triggerBM25StatsTask()
case segID := <-getStatsTaskChSingleton():
log.Info("receive new segment to trigger stats task", zap.Int64("segmentID", segID))
segment := jm.mt.GetSegment(segID)
if segment == nil {
log.Warn("segment is not exist, no need to do stats task", zap.Int64("segmentID", segID))
continue
}
jm.createSortStatsTaskForSegment(segment)
}
}
}
func (jm *statsJobManager) triggerSortStatsTask() {
segments := jm.mt.SelectSegments(SegmentFilterFunc(func(seg *SegmentInfo) bool {
return isFlush(seg) && seg.GetLevel() != datapb.SegmentLevel_L0 && !seg.GetIsSorted() && !seg.GetIsImporting()
}))
for _, segment := range segments {
jm.createSortStatsTaskForSegment(segment)
}
}
func (jm *statsJobManager) createSortStatsTaskForSegment(segment *SegmentInfo) {
targetSegmentID, err := jm.allocator.AllocID(jm.ctx)
if err != nil {
log.Warn("allocID for segment stats task failed",
zap.Int64("segmentID", segment.GetID()), zap.Error(err))
return
}
if err := jm.SubmitStatsTask(segment.GetID(), targetSegmentID, indexpb.StatsSubJob_Sort, true); err != nil {
log.Warn("create stats task with sort for segment failed, wait for retry",
zap.Int64("segmentID", segment.GetID()), zap.Error(err))
return
}
}
func (jm *statsJobManager) enableBM25() bool {
return false
}
func needDoTextIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool {
if !(isFlush(segment) && segment.GetLevel() != datapb.SegmentLevel_L0 &&
segment.GetIsSorted()) {
return false
}
for _, fieldID := range fieldIDs {
if segment.GetTextStatsLogs() == nil {
return true
}
if segment.GetTextStatsLogs()[fieldID] == nil {
return true
}
}
return false
}
func needDoBM25(segment *SegmentInfo, fieldIDs []UniqueID) bool {
// TODO: docking bm25 stats task
return false
}
func (jm *statsJobManager) triggerTextStatsTask() {
collections := jm.mt.GetCollections()
for _, collection := range collections {
needTriggerFieldIDs := make([]UniqueID, 0)
for _, field := range collection.Schema.GetFields() {
// TODO @longjiquan: please replace it to fieldSchemaHelper.EnableMath
h := typeutil.CreateFieldSchemaHelper(field)
if !h.EnableMatch() {
continue
}
needTriggerFieldIDs = append(needTriggerFieldIDs, field.GetFieldID())
}
segments := jm.mt.SelectSegments(WithCollection(collection.ID), SegmentFilterFunc(func(seg *SegmentInfo) bool {
return needDoTextIndex(seg, needTriggerFieldIDs)
}))
for _, segment := range segments {
if err := jm.SubmitStatsTask(segment.GetID(), segment.GetID(), indexpb.StatsSubJob_TextIndexJob, true); err != nil {
log.Warn("create stats task with text index for segment failed, wait for retry",
zap.Int64("segmentID", segment.GetID()), zap.Error(err))
continue
}
}
}
}
func (jm *statsJobManager) triggerBM25StatsTask() {
collections := jm.mt.GetCollections()
for _, collection := range collections {
needTriggerFieldIDs := make([]UniqueID, 0)
for _, field := range collection.Schema.GetFields() {
// TODO: docking bm25 stats task
if jm.enableBM25() {
needTriggerFieldIDs = append(needTriggerFieldIDs, field.GetFieldID())
}
}
segments := jm.mt.SelectSegments(WithCollection(collection.ID), SegmentFilterFunc(func(seg *SegmentInfo) bool {
return needDoBM25(seg, needTriggerFieldIDs)
}))
for _, segment := range segments {
if err := jm.SubmitStatsTask(segment.GetID(), segment.GetID(), indexpb.StatsSubJob_BM25Job, true); err != nil {
log.Warn("create stats task with bm25 for segment failed, wait for retry",
zap.Int64("segmentID", segment.GetID()), zap.Error(err))
continue
}
}
}
}
// cleanupStatsTasks clean up the finished/failed stats tasks
func (jm *statsJobManager) cleanupStatsTasksLoop() {
log.Info("start cleanupStatsTasksLoop...")
defer jm.loopWg.Done()
ticker := time.NewTicker(Params.DataCoordCfg.GCInterval.GetAsDuration(time.Second))
defer ticker.Stop()
for {
select {
case <-jm.ctx.Done():
log.Warn("DataCoord context done, exit cleanupStatsTasksLoop...")
return
case <-ticker.C:
start := time.Now()
log.Info("start cleanupUnusedStatsTasks...", zap.Time("startAt", start))
taskIDs := jm.mt.statsTaskMeta.CanCleanedTasks()
for _, taskID := range taskIDs {
// waiting for queue processing tasks to complete
if jm.scheduler.getTask(taskID) == nil {
if err := jm.mt.statsTaskMeta.DropStatsTask(taskID); err != nil {
// ignore err, if remove failed, wait next GC
log.Warn("clean up stats task failed", zap.Int64("taskID", taskID), zap.Error(err))
}
}
}
log.Info("cleanupUnusedStatsTasks done", zap.Duration("timeCost", time.Since(start)))
}
}
}
func (jm *statsJobManager) SubmitStatsTask(originSegmentID, targetSegmentID int64,
subJobType indexpb.StatsSubJob, canRecycle bool,
) error {
originSegment := jm.mt.GetHealthySegment(originSegmentID)
if originSegment == nil {
return merr.WrapErrSegmentNotFound(originSegmentID)
}
taskID, err := jm.allocator.AllocID(context.Background())
if err != nil {
return err
}
t := &indexpb.StatsTask{
CollectionID: originSegment.GetCollectionID(),
PartitionID: originSegment.GetPartitionID(),
SegmentID: originSegmentID,
InsertChannel: originSegment.GetInsertChannel(),
TaskID: taskID,
Version: 0,
NodeID: 0,
State: indexpb.JobState_JobStateInit,
FailReason: "",
TargetSegmentID: targetSegmentID,
SubJobType: subJobType,
CanRecycle: canRecycle,
}
if err = jm.mt.statsTaskMeta.AddStatsTask(t); err != nil {
if errors.Is(err, merr.ErrTaskDuplicate) {
log.Info("stats task already exists", zap.Int64("taskID", taskID),
zap.Int64("collectionID", originSegment.GetCollectionID()),
zap.Int64("segmentID", originSegment.GetID()))
return nil
}
return err
}
jm.scheduler.enqueue(newStatsTask(t.GetTaskID(), t.GetSegmentID(), t.GetTargetSegmentID(), subJobType))
return nil
}
func (jm *statsJobManager) GetStatsTaskState(originSegmentID int64, subJobType indexpb.StatsSubJob) indexpb.JobState {
state := jm.mt.statsTaskMeta.GetStatsTaskStateBySegmentID(originSegmentID, subJobType)
log.Info("statsJobManager get stats task state", zap.Int64("segmentID", originSegmentID),
zap.String("subJobType", subJobType.String()), zap.String("state", state.String()))
return state
}
func (jm *statsJobManager) DropStatsTask(originSegmentID int64, subJobType indexpb.StatsSubJob) error {
task := jm.mt.statsTaskMeta.GetStatsTaskBySegmentID(originSegmentID, subJobType)
if task == nil {
return nil
}
jm.scheduler.AbortTask(task.GetTaskID())
if err := jm.mt.statsTaskMeta.MarkTaskCanRecycle(task.GetTaskID()); err != nil {
return err
}
log.Info("statsJobManager drop stats task success", zap.Int64("segmentID", originSegmentID),
zap.Int64("taskID", task.GetTaskID()), zap.String("subJobType", subJobType.String()))
return nil
}