// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datacoord

import (
	"context"
	"sync"
	"time"

	"go.uber.org/zap"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	"github.com/milvus-io/milvus/internal/proto/indexpb"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/metrics"
	"github.com/milvus-io/milvus/pkg/util/lock"
)

const (
	reqTimeoutInterval = time.Second * 10
)

type taskScheduler struct {
	lock.RWMutex

	ctx    context.Context
	cancel context.CancelFunc
	wg     sync.WaitGroup

	scheduleDuration       time.Duration
	collectMetricsDuration time.Duration

	// TODO @xiaocai2333: use priority queue
	tasks      map[int64]Task
	notifyChan chan struct{}
	taskLock   *lock.KeyLock[int64]

	meta *meta

	policy                    buildIndexPolicy
	nodeManager               WorkerManager
	chunkManager              storage.ChunkManager
	indexEngineVersionManager IndexEngineVersionManager
	handler                   Handler
}

func newTaskScheduler(
	ctx context.Context,
	metaTable *meta,
	nodeManager WorkerManager,
	chunkManager storage.ChunkManager,
	indexEngineVersionManager IndexEngineVersionManager,
	handler Handler,
) *taskScheduler {
	ctx, cancel := context.WithCancel(ctx)

	ts := &taskScheduler{
		ctx:                       ctx,
		cancel:                    cancel,
		meta:                      metaTable,
		tasks:                     make(map[int64]Task),
		notifyChan:                make(chan struct{}, 1),
		taskLock:                  lock.NewKeyLock[int64](),
		scheduleDuration:          Params.DataCoordCfg.IndexTaskSchedulerInterval.GetAsDuration(time.Millisecond),
		collectMetricsDuration:    time.Minute,
		policy:                    defaultBuildIndexPolicy,
		nodeManager:               nodeManager,
		chunkManager:              chunkManager,
		handler:                   handler,
		indexEngineVersionManager: indexEngineVersionManager,
	}
	ts.reloadFromKV()
	return ts
}

func (s *taskScheduler) Start() {
	s.wg.Add(2)
	go s.schedule()
	go s.collectTaskMetrics()
}

func (s *taskScheduler) Stop() {
	s.cancel()
	s.wg.Wait()
}

func (s *taskScheduler) reloadFromKV() {
	segments := s.meta.GetAllSegmentsUnsafe()
	for _, segment := range segments {
		for _, segIndex := range s.meta.indexMeta.getSegmentIndexes(segment.ID) {
			if segIndex.IsDeleted {
				continue
			}
			if segIndex.IndexState != commonpb.IndexState_Finished && segIndex.IndexState != commonpb.IndexState_Failed {
				s.tasks[segIndex.BuildID] = &indexBuildTask{
					taskID: segIndex.BuildID,
					nodeID: segIndex.NodeID,
					taskInfo: &indexpb.IndexTaskInfo{
						BuildID:    segIndex.BuildID,
						State:      segIndex.IndexState,
						FailReason: segIndex.FailReason,
					},
					queueTime: time.Now(),
					startTime: time.Now(),
					endTime:   time.Now(),
				}
			}
		}
	}

	allAnalyzeTasks := s.meta.analyzeMeta.GetAllTasks()
	for taskID, t := range allAnalyzeTasks {
		if t.State != indexpb.JobState_JobStateFinished && t.State != indexpb.JobState_JobStateFailed {
			s.tasks[taskID] = &analyzeTask{
				taskID: taskID,
				nodeID: t.NodeID,
				taskInfo: &indexpb.AnalyzeResult{
					TaskID:     taskID,
					State:      t.State,
					FailReason: t.FailReason,
				},
				queueTime: time.Now(),
				startTime: time.Now(),
				endTime:   time.Now(),
			}
		}
	}
}
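
// Typical lifecycle of the scheduler, as an illustrative sketch only (the
// concrete meta table, WorkerManager, ChunkManager, and Handler values are
// assumed to be wired up elsewhere in the datacoord package; someTask stands
// for any Task implementation such as an index build or analyze task):
//
//	scheduler := newTaskScheduler(ctx, mt, nodeManager, cm, versionMgr, handler)
//	scheduler.Start()           // spawns schedule() and collectTaskMetrics()
//	scheduler.enqueue(someTask) // queues a task and wakes the schedule loop
//	...
//	scheduler.Stop()            // cancels the context and waits for both loops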

// notify is a non-blocking notify: if a signal is already pending in
// notifyChan, the new one is dropped.
func (s *taskScheduler) notify() {
	select {
	case s.notifyChan <- struct{}{}:
	default:
	}
}

func (s *taskScheduler) enqueue(task Task) {
	defer s.notify()

	s.Lock()
	defer s.Unlock()
	taskID := task.GetTaskID()
	if _, ok := s.tasks[taskID]; !ok {
		s.tasks[taskID] = task
	}
	task.SetQueueTime(time.Now())
	log.Info("taskScheduler enqueue task", zap.Int64("taskID", taskID))
}

// schedule loops until the context is cancelled, triggering a scheduling pass
// either on an explicit notify() or on every scheduleDuration tick.
func (s *taskScheduler) schedule() {
	log.Ctx(s.ctx).Info("task scheduler loop start")
	defer s.wg.Done()
	ticker := time.NewTicker(s.scheduleDuration)
	defer ticker.Stop()
	for {
		select {
		case <-s.ctx.Done():
			log.Ctx(s.ctx).Warn("task scheduler ctx done")
			return
		case _, ok := <-s.notifyChan:
			// !ok means notifyChan is closed.
			if ok {
				s.run()
			}
		case <-ticker.C:
			s.run()
		}
	}
}

func (s *taskScheduler) getTask(taskID UniqueID) Task {
	s.RLock()
	defer s.RUnlock()
	return s.tasks[taskID]
}

func (s *taskScheduler) run() {
	// schedule policy
	s.RLock()
	taskIDs := make([]UniqueID, 0, len(s.tasks))
	for tID := range s.tasks {
		taskIDs = append(taskIDs, tID)
	}
	s.RUnlock()
	if len(taskIDs) > 0 {
		log.Ctx(s.ctx).Info("task scheduler", zap.Int("task num", len(taskIDs)))
	}

	s.policy(taskIDs)

	for _, taskID := range taskIDs {
		s.taskLock.Lock(taskID)
		ok := s.process(taskID)
		if !ok {
			s.taskLock.Unlock(taskID)
			log.Ctx(s.ctx).Info("there is no idle indexing node, wait a minute...")
			break
		}
		s.taskLock.Unlock(taskID)
	}
}

func (s *taskScheduler) removeTask(taskID UniqueID) {
	s.Lock()
	defer s.Unlock()
	delete(s.tasks, taskID)
}
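
// process drives one task a single step through its state machine; run()
// calls it once per task per scheduling pass. A sketch of the transitions
// implemented below:
//
//	Init            -> pre-check, pick an IndexNode, update version, assign, mark building
//	InProgress      -> poll the assigned node via QueryResult (default branch)
//	Retry           -> drop the task on the worker, reset the node binding, back to Init
//	Finished/Failed -> persist job info, record latency, drop the task on the worker, remove
//	None            -> remove the task from the queue
//
// It returns false when scheduling should pause for this pass, e.g. no idle
// worker is available or a meta update failed; run() then stops early and the
// remaining tasks wait for the next notify or tick.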
func (s *taskScheduler) process(taskID UniqueID) bool {
	task := s.getTask(taskID)

	if !task.CheckTaskHealthy(s.meta) {
		s.removeTask(taskID)
		return true
	}
	state := task.GetState()
	log.Ctx(s.ctx).Info("task is processing", zap.Int64("taskID", taskID),
		zap.String("state", state.String()))

	switch state {
	case indexpb.JobState_JobStateNone:
		s.removeTask(taskID)

	case indexpb.JobState_JobStateInit:
		// 0. pre check task
		skip := task.PreCheck(s.ctx, s)
		if skip {
			return true
		}

		// 1. pick an indexNode client
		nodeID, client := s.nodeManager.PickClient()
		if client == nil {
			log.Ctx(s.ctx).Debug("pick client failed")
			return false
		}
		log.Ctx(s.ctx).Info("pick client success", zap.Int64("taskID", taskID), zap.Int64("nodeID", nodeID))

		// 2. update version
		if err := task.UpdateVersion(s.ctx, s.meta); err != nil {
			log.Ctx(s.ctx).Warn("update task version failed", zap.Int64("taskID", taskID), zap.Error(err))
			return false
		}
		log.Ctx(s.ctx).Info("update task version success", zap.Int64("taskID", taskID))

		// 3. assign task to indexNode
		success := task.AssignTask(s.ctx, client)
		if !success {
			log.Ctx(s.ctx).Warn("assign task to client failed", zap.Int64("taskID", taskID),
				zap.String("new state", task.GetState().String()), zap.String("fail reason", task.GetFailReason()))
			// If the failure is caused by the task itself, subsequent tasks will not be skipped.
			// If etcd fails, or sending the task to the node fails, subsequent tasks will be skipped.
			return false
		}
		log.Ctx(s.ctx).Info("assign task to client success", zap.Int64("taskID", taskID), zap.Int64("nodeID", nodeID))

		// 4. update meta state
		if err := task.UpdateMetaBuildingState(nodeID, s.meta); err != nil {
			log.Ctx(s.ctx).Warn("update meta building state failed", zap.Int64("taskID", taskID), zap.Error(err))
			task.SetState(indexpb.JobState_JobStateRetry, "update meta building state failed")
			return false
		}
		task.SetStartTime(time.Now())
		queueingTime := task.GetStartTime().Sub(task.GetQueueTime())
		if queueingTime > Params.DataCoordCfg.TaskSlowThreshold.GetAsDuration(time.Second) {
			log.Warn("task queueing time is too long", zap.Int64("taskID", taskID),
				zap.Int64("queueing time(ms)", queueingTime.Milliseconds()))
		}
		metrics.DataCoordTaskExecuteLatency.
			WithLabelValues(task.GetTaskType(), metrics.Pending).Observe(float64(queueingTime.Milliseconds()))
		log.Ctx(s.ctx).Info("update task meta state to InProgress success", zap.Int64("taskID", taskID),
			zap.Int64("nodeID", nodeID))

	case indexpb.JobState_JobStateFinished, indexpb.JobState_JobStateFailed:
		if err := task.SetJobInfo(s.meta); err != nil {
			log.Ctx(s.ctx).Warn("update task info failed", zap.Error(err))
			return true
		}
		task.SetEndTime(time.Now())
		runningTime := task.GetEndTime().Sub(task.GetStartTime())
		if runningTime > Params.DataCoordCfg.TaskSlowThreshold.GetAsDuration(time.Second) {
			log.Warn("task running time is too long", zap.Int64("taskID", taskID),
				zap.Int64("running time(ms)", runningTime.Milliseconds()))
		}
		metrics.DataCoordTaskExecuteLatency.
			WithLabelValues(task.GetTaskType(), metrics.Executing).Observe(float64(runningTime.Milliseconds()))
		client, exist := s.nodeManager.GetClientByID(task.GetNodeID())
		if exist {
			if !task.DropTaskOnWorker(s.ctx, client) {
				return true
			}
		}
		s.removeTask(taskID)

	case indexpb.JobState_JobStateRetry:
		client, exist := s.nodeManager.GetClientByID(task.GetNodeID())
		if exist {
			if !task.DropTaskOnWorker(s.ctx, client) {
				return true
			}
		}
		task.SetState(indexpb.JobState_JobStateInit, "")
		task.ResetNodeID()

	default:
		// state: in_progress
		client, exist := s.nodeManager.GetClientByID(task.GetNodeID())
		if exist {
			task.QueryResult(s.ctx, client)
			return true
		}
		task.SetState(indexpb.JobState_JobStateRetry, "")
	}
	return true
}
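
// collectTaskMetrics runs on its own ticker (collectMetricsDuration, one
// minute) and reports, per task type, the maximum queueing latency among
// Init tasks and the maximum execution latency among InProgress tasks to
// metrics.DataCoordTaskExecuteLatency.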
time(ms)", runningTime.Milliseconds())) } maxRunningTime, ok := maxTaskRunningTime[task.GetTaskType()] if !ok || maxRunningTime < runningTime.Milliseconds() { maxTaskRunningTime[task.GetTaskType()] = runningTime.Milliseconds() } } } for _, taskID := range taskIDs { collectMetricsFunc(taskID) } for taskType, queueingTime := range maxTaskQueueingTime { metrics.DataCoordTaskExecuteLatency. WithLabelValues(taskType, metrics.Pending).Observe(float64(queueingTime)) } for taskType, runningTime := range maxTaskRunningTime { metrics.DataCoordTaskExecuteLatency. WithLabelValues(taskType, metrics.Executing).Observe(float64(runningTime)) } } } }