enhance: Skip picking a worker when the task doesn't actually need to execute (#34348)

issue: #34347

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
This commit is contained in:
cai.zhang 2024-07-03 15:52:09 +08:00 committed by GitHub
parent 8165961223
commit feb13cdf07
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 104 additions and 96 deletions

View File

@ -72,7 +72,7 @@ func (s *Server) createIndexForSegment(segment *SegmentInfo, indexID UniqueID) e
return err return err
} }
s.taskScheduler.enqueue(&indexBuildTask{ s.taskScheduler.enqueue(&indexBuildTask{
buildID: buildID, taskID: buildID,
taskInfo: &indexpb.IndexTaskInfo{ taskInfo: &indexpb.IndexTaskInfo{
BuildID: buildID, BuildID: buildID,
State: commonpb.IndexState_Unissued, State: commonpb.IndexState_Unissued,

View File

@ -34,10 +34,14 @@ import (
"github.com/milvus-io/milvus/pkg/util/typeutil" "github.com/milvus-io/milvus/pkg/util/typeutil"
) )
var _ Task = (*analyzeTask)(nil)
type analyzeTask struct { type analyzeTask struct {
taskID int64 taskID int64
nodeID int64 nodeID int64
taskInfo *indexpb.AnalyzeResult taskInfo *indexpb.AnalyzeResult
req *indexpb.AnalyzeRequest
} }
func (at *analyzeTask) GetTaskID() int64 { func (at *analyzeTask) GetTaskID() int64 {
@ -82,12 +86,12 @@ func (at *analyzeTask) UpdateMetaBuildingState(nodeID int64, meta *meta) error {
return nil return nil
} }
func (at *analyzeTask) AssignTask(ctx context.Context, client types.IndexNodeClient, dependency *taskScheduler) (bool, bool) { func (at *analyzeTask) PreCheck(ctx context.Context, dependency *taskScheduler) bool {
t := dependency.meta.analyzeMeta.GetTask(at.GetTaskID()) t := dependency.meta.analyzeMeta.GetTask(at.GetTaskID())
if t == nil { if t == nil {
log.Ctx(ctx).Info("task is nil, delete it", zap.Int64("taskID", at.GetTaskID())) log.Ctx(ctx).Info("task is nil, delete it", zap.Int64("taskID", at.GetTaskID()))
at.SetState(indexpb.JobState_JobStateNone, "analyze task is nil") at.SetState(indexpb.JobState_JobStateNone, "analyze task is nil")
return false, false return true
} }
var storageConfig *indexpb.StorageConfig var storageConfig *indexpb.StorageConfig
@ -113,7 +117,7 @@ func (at *analyzeTask) AssignTask(ctx context.Context, client types.IndexNodeCli
RequestTimeoutMs: Params.MinioCfg.RequestTimeoutMs.GetAsInt64(), RequestTimeoutMs: Params.MinioCfg.RequestTimeoutMs.GetAsInt64(),
} }
} }
req := &indexpb.AnalyzeRequest{ at.req = &indexpb.AnalyzeRequest{
ClusterID: Params.CommonCfg.ClusterPrefix.GetValue(), ClusterID: Params.CommonCfg.ClusterPrefix.GetValue(),
TaskID: at.GetTaskID(), TaskID: at.GetTaskID(),
CollectionID: t.CollectionID, CollectionID: t.CollectionID,
@ -123,7 +127,7 @@ func (at *analyzeTask) AssignTask(ctx context.Context, client types.IndexNodeCli
FieldType: t.FieldType, FieldType: t.FieldType,
Dim: t.Dim, Dim: t.Dim,
SegmentStats: make(map[int64]*indexpb.SegmentStats), SegmentStats: make(map[int64]*indexpb.SegmentStats),
Version: t.Version, Version: t.Version + 1,
StorageConfig: storageConfig, StorageConfig: storageConfig,
} }
@ -142,13 +146,13 @@ func (at *analyzeTask) AssignTask(ctx context.Context, client types.IndexNodeCli
log.Ctx(ctx).Warn("analyze stats task is processing, but segment is nil, delete the task", log.Ctx(ctx).Warn("analyze stats task is processing, but segment is nil, delete the task",
zap.Int64("taskID", at.GetTaskID()), zap.Int64("segmentID", segID)) zap.Int64("taskID", at.GetTaskID()), zap.Int64("segmentID", segID))
at.SetState(indexpb.JobState_JobStateFailed, fmt.Sprintf("segmentInfo with ID: %d is nil", segID)) at.SetState(indexpb.JobState_JobStateFailed, fmt.Sprintf("segmentInfo with ID: %d is nil", segID))
return false, false return true
} }
totalSegmentsRows += info.GetNumOfRows() totalSegmentsRows += info.GetNumOfRows()
// get binlogIDs // get binlogIDs
binlogIDs := getBinLogIDs(info, t.FieldID) binlogIDs := getBinLogIDs(info, t.FieldID)
req.SegmentStats[segID] = &indexpb.SegmentStats{ at.req.SegmentStats[segID] = &indexpb.SegmentStats{
ID: segID, ID: segID,
NumRows: info.GetNumOfRows(), NumRows: info.GetNumOfRows(),
LogIDs: binlogIDs, LogIDs: binlogIDs,
@ -160,7 +164,7 @@ func (at *analyzeTask) AssignTask(ctx context.Context, client types.IndexNodeCli
log.Ctx(ctx).Info("analyze task get collection info failed", zap.Int64("collectionID", log.Ctx(ctx).Info("analyze task get collection info failed", zap.Int64("collectionID",
segments[0].GetCollectionID()), zap.Error(err)) segments[0].GetCollectionID()), zap.Error(err))
at.SetState(indexpb.JobState_JobStateInit, err.Error()) at.SetState(indexpb.JobState_JobStateInit, err.Error())
return false, false return true
} }
schema := collInfo.Schema schema := collInfo.Schema
@ -175,35 +179,39 @@ func (at *analyzeTask) AssignTask(ctx context.Context, client types.IndexNodeCli
dim, err := storage.GetDimFromParams(field.TypeParams) dim, err := storage.GetDimFromParams(field.TypeParams)
if err != nil { if err != nil {
at.SetState(indexpb.JobState_JobStateInit, err.Error()) at.SetState(indexpb.JobState_JobStateInit, err.Error())
return false, false return true
} }
req.Dim = int64(dim) at.req.Dim = int64(dim)
totalSegmentsRawDataSize := float64(totalSegmentsRows) * float64(dim) * typeutil.VectorTypeSize(t.FieldType) // Byte totalSegmentsRawDataSize := float64(totalSegmentsRows) * float64(dim) * typeutil.VectorTypeSize(t.FieldType) // Byte
numClusters := int64(math.Ceil(totalSegmentsRawDataSize / float64(Params.DataCoordCfg.ClusteringCompactionPreferSegmentSize.GetAsSize()))) numClusters := int64(math.Ceil(totalSegmentsRawDataSize / float64(Params.DataCoordCfg.ClusteringCompactionPreferSegmentSize.GetAsSize())))
if numClusters < Params.DataCoordCfg.ClusteringCompactionMinCentroidsNum.GetAsInt64() { if numClusters < Params.DataCoordCfg.ClusteringCompactionMinCentroidsNum.GetAsInt64() {
log.Ctx(ctx).Info("data size is too small, skip analyze task", zap.Float64("raw data size", totalSegmentsRawDataSize), zap.Int64("num clusters", numClusters), zap.Int64("minimum num clusters required", Params.DataCoordCfg.ClusteringCompactionMinCentroidsNum.GetAsInt64())) log.Ctx(ctx).Info("data size is too small, skip analyze task", zap.Float64("raw data size", totalSegmentsRawDataSize), zap.Int64("num clusters", numClusters), zap.Int64("minimum num clusters required", Params.DataCoordCfg.ClusteringCompactionMinCentroidsNum.GetAsInt64()))
at.SetState(indexpb.JobState_JobStateFinished, "") at.SetState(indexpb.JobState_JobStateFinished, "")
return true, true return true
} }
if numClusters > Params.DataCoordCfg.ClusteringCompactionMaxCentroidsNum.GetAsInt64() { if numClusters > Params.DataCoordCfg.ClusteringCompactionMaxCentroidsNum.GetAsInt64() {
numClusters = Params.DataCoordCfg.ClusteringCompactionMaxCentroidsNum.GetAsInt64() numClusters = Params.DataCoordCfg.ClusteringCompactionMaxCentroidsNum.GetAsInt64()
} }
req.NumClusters = numClusters at.req.NumClusters = numClusters
req.MaxTrainSizeRatio = Params.DataCoordCfg.ClusteringCompactionMaxTrainSizeRatio.GetAsFloat() // control clustering train data size at.req.MaxTrainSizeRatio = Params.DataCoordCfg.ClusteringCompactionMaxTrainSizeRatio.GetAsFloat() // control clustering train data size
// config to detect data skewness // config to detect data skewness
req.MinClusterSizeRatio = Params.DataCoordCfg.ClusteringCompactionMinClusterSizeRatio.GetAsFloat() at.req.MinClusterSizeRatio = Params.DataCoordCfg.ClusteringCompactionMinClusterSizeRatio.GetAsFloat()
req.MaxClusterSizeRatio = Params.DataCoordCfg.ClusteringCompactionMaxClusterSizeRatio.GetAsFloat() at.req.MaxClusterSizeRatio = Params.DataCoordCfg.ClusteringCompactionMaxClusterSizeRatio.GetAsFloat()
req.MaxClusterSize = Params.DataCoordCfg.ClusteringCompactionMaxClusterSize.GetAsSize() at.req.MaxClusterSize = Params.DataCoordCfg.ClusteringCompactionMaxClusterSize.GetAsSize()
return false
}
func (at *analyzeTask) AssignTask(ctx context.Context, client types.IndexNodeClient) bool {
ctx, cancel := context.WithTimeout(context.Background(), reqTimeoutInterval) ctx, cancel := context.WithTimeout(context.Background(), reqTimeoutInterval)
defer cancel() defer cancel()
resp, err := client.CreateJobV2(ctx, &indexpb.CreateJobV2Request{ resp, err := client.CreateJobV2(ctx, &indexpb.CreateJobV2Request{
ClusterID: req.GetClusterID(), ClusterID: at.req.GetClusterID(),
TaskID: req.GetTaskID(), TaskID: at.req.GetTaskID(),
JobType: indexpb.JobType_JobTypeAnalyzeJob, JobType: indexpb.JobType_JobTypeAnalyzeJob,
Request: &indexpb.CreateJobV2Request_AnalyzeRequest{ Request: &indexpb.CreateJobV2Request_AnalyzeRequest{
AnalyzeRequest: req, AnalyzeRequest: at.req,
}, },
}) })
if err == nil { if err == nil {
@ -212,12 +220,12 @@ func (at *analyzeTask) AssignTask(ctx context.Context, client types.IndexNodeCli
if err != nil { if err != nil {
log.Ctx(ctx).Warn("assign analyze task to indexNode failed", zap.Int64("taskID", at.GetTaskID()), zap.Error(err)) log.Ctx(ctx).Warn("assign analyze task to indexNode failed", zap.Int64("taskID", at.GetTaskID()), zap.Error(err))
at.SetState(indexpb.JobState_JobStateRetry, err.Error()) at.SetState(indexpb.JobState_JobStateRetry, err.Error())
return false, true return false
} }
log.Ctx(ctx).Info("analyze task assigned successfully", zap.Int64("taskID", at.GetTaskID())) log.Ctx(ctx).Info("analyze task assigned successfully", zap.Int64("taskID", at.GetTaskID()))
at.SetState(indexpb.JobState_JobStateInProgress, "") at.SetState(indexpb.JobState_JobStateInProgress, "")
return true, false return true
} }
func (at *analyzeTask) setResult(result *indexpb.AnalyzeResult) { func (at *analyzeTask) setResult(result *indexpb.AnalyzeResult) {

View File

@ -37,13 +37,17 @@ import (
) )
type indexBuildTask struct { type indexBuildTask struct {
buildID int64 taskID int64
nodeID int64 nodeID int64
taskInfo *indexpb.IndexTaskInfo taskInfo *indexpb.IndexTaskInfo
req *indexpb.CreateJobRequest
} }
var _ Task = (*indexBuildTask)(nil)
func (it *indexBuildTask) GetTaskID() int64 { func (it *indexBuildTask) GetTaskID() int64 {
return it.buildID return it.taskID
} }
func (it *indexBuildTask) GetNodeID() int64 { func (it *indexBuildTask) GetNodeID() int64 {
@ -73,35 +77,35 @@ func (it *indexBuildTask) GetFailReason() string {
} }
func (it *indexBuildTask) UpdateVersion(ctx context.Context, meta *meta) error { func (it *indexBuildTask) UpdateVersion(ctx context.Context, meta *meta) error {
return meta.indexMeta.UpdateVersion(it.buildID) return meta.indexMeta.UpdateVersion(it.taskID)
} }
func (it *indexBuildTask) UpdateMetaBuildingState(nodeID int64, meta *meta) error { func (it *indexBuildTask) UpdateMetaBuildingState(nodeID int64, meta *meta) error {
it.nodeID = nodeID it.nodeID = nodeID
return meta.indexMeta.BuildIndex(it.buildID, nodeID) return meta.indexMeta.BuildIndex(it.taskID, nodeID)
} }
func (it *indexBuildTask) AssignTask(ctx context.Context, client types.IndexNodeClient, dependency *taskScheduler) (bool, bool) { func (it *indexBuildTask) PreCheck(ctx context.Context, dependency *taskScheduler) bool {
segIndex, exist := dependency.meta.indexMeta.GetIndexJob(it.buildID) segIndex, exist := dependency.meta.indexMeta.GetIndexJob(it.taskID)
if !exist || segIndex == nil { if !exist || segIndex == nil {
log.Ctx(ctx).Info("index task has not exist in meta table, remove task", zap.Int64("buildID", it.buildID)) log.Ctx(ctx).Info("index task has not exist in meta table, remove task", zap.Int64("taskID", it.taskID))
it.SetState(indexpb.JobState_JobStateNone, "index task has not exist in meta table") it.SetState(indexpb.JobState_JobStateNone, "index task has not exist in meta table")
return false, false return true
} }
segment := dependency.meta.GetSegment(segIndex.SegmentID) segment := dependency.meta.GetSegment(segIndex.SegmentID)
if !isSegmentHealthy(segment) || !dependency.meta.indexMeta.IsIndexExist(segIndex.CollectionID, segIndex.IndexID) { if !isSegmentHealthy(segment) || !dependency.meta.indexMeta.IsIndexExist(segIndex.CollectionID, segIndex.IndexID) {
log.Ctx(ctx).Info("task is no need to build index, remove it", zap.Int64("buildID", it.buildID)) log.Ctx(ctx).Info("task is no need to build index, remove it", zap.Int64("taskID", it.taskID))
it.SetState(indexpb.JobState_JobStateNone, "task is no need to build index") it.SetState(indexpb.JobState_JobStateNone, "task is no need to build index")
return false, false return true
} }
indexParams := dependency.meta.indexMeta.GetIndexParams(segIndex.CollectionID, segIndex.IndexID) indexParams := dependency.meta.indexMeta.GetIndexParams(segIndex.CollectionID, segIndex.IndexID)
indexType := GetIndexType(indexParams) indexType := GetIndexType(indexParams)
if isFlatIndex(indexType) || segIndex.NumRows < Params.DataCoordCfg.MinSegmentNumRowsToEnableIndex.GetAsInt64() { if isFlatIndex(indexType) || segIndex.NumRows < Params.DataCoordCfg.MinSegmentNumRowsToEnableIndex.GetAsInt64() {
log.Ctx(ctx).Info("segment does not need index really", zap.Int64("buildID", it.buildID), log.Ctx(ctx).Info("segment does not need index really", zap.Int64("taskID", it.taskID),
zap.Int64("segmentID", segIndex.SegmentID), zap.Int64("num rows", segIndex.NumRows)) zap.Int64("segmentID", segIndex.SegmentID), zap.Int64("num rows", segIndex.NumRows))
it.SetState(indexpb.JobState_JobStateFinished, "fake finished index success") it.SetState(indexpb.JobState_JobStateFinished, "fake finished index success")
return true, true return true
} }
// vector index build needs information of optional scalar fields data // vector index build needs information of optional scalar fields data
optionalFields := make([]*indexpb.OptionalFieldInfo, 0) optionalFields := make([]*indexpb.OptionalFieldInfo, 0)
@ -110,12 +114,12 @@ func (it *indexBuildTask) AssignTask(ctx context.Context, client types.IndexNode
if err != nil || collInfo == nil { if err != nil || collInfo == nil {
log.Ctx(ctx).Warn("get collection failed", zap.Int64("collID", segIndex.CollectionID), zap.Error(err)) log.Ctx(ctx).Warn("get collection failed", zap.Int64("collID", segIndex.CollectionID), zap.Error(err))
it.SetState(indexpb.JobState_JobStateInit, err.Error()) it.SetState(indexpb.JobState_JobStateInit, err.Error())
return false, false return true
} }
colSchema := collInfo.Schema colSchema := collInfo.Schema
partitionKeyField, err := typeutil.GetPartitionKeyFieldSchema(colSchema) partitionKeyField, err := typeutil.GetPartitionKeyFieldSchema(colSchema)
if partitionKeyField == nil || err != nil { if partitionKeyField == nil || err != nil {
log.Ctx(ctx).Warn("index builder get partition key field failed", zap.Int64("buildID", it.buildID), zap.Error(err)) log.Ctx(ctx).Warn("index builder get partition key field failed", zap.Int64("taskID", it.taskID), zap.Error(err))
} else { } else {
if typeutil.IsFieldDataTypeSupportMaterializedView(partitionKeyField) { if typeutil.IsFieldDataTypeSupportMaterializedView(partitionKeyField) {
optionalFields = append(optionalFields, &indexpb.OptionalFieldInfo{ optionalFields = append(optionalFields, &indexpb.OptionalFieldInfo{
@ -161,16 +165,16 @@ func (it *indexBuildTask) AssignTask(ctx context.Context, client types.IndexNode
var err error var err error
indexParams, err = indexparams.UpdateDiskIndexBuildParams(Params, indexParams) indexParams, err = indexparams.UpdateDiskIndexBuildParams(Params, indexParams)
if err != nil { if err != nil {
log.Ctx(ctx).Warn("failed to append index build params", zap.Int64("buildID", it.buildID), zap.Error(err)) log.Ctx(ctx).Warn("failed to append index build params", zap.Int64("taskID", it.taskID), zap.Error(err))
it.SetState(indexpb.JobState_JobStateInit, err.Error()) it.SetState(indexpb.JobState_JobStateInit, err.Error())
return false, false return true
} }
} }
var req *indexpb.CreateJobRequest
collectionInfo, err := dependency.handler.GetCollection(ctx, segment.GetCollectionID()) collectionInfo, err := dependency.handler.GetCollection(ctx, segment.GetCollectionID())
if err != nil { if err != nil {
log.Ctx(ctx).Info("index builder get collection info failed", zap.Int64("collectionID", segment.GetCollectionID()), zap.Error(err)) log.Ctx(ctx).Info("index builder get collection info failed", zap.Int64("collectionID", segment.GetCollectionID()), zap.Error(err))
return false, false return true
} }
schema := collectionInfo.Schema schema := collectionInfo.Schema
@ -183,7 +187,7 @@ func (it *indexBuildTask) AssignTask(ctx context.Context, client types.IndexNode
} }
} }
dim, err := storage.GetDimFromParams(field.TypeParams) dim, err := storage.GetDimFromParams(field.GetTypeParams())
if err != nil { if err != nil {
log.Ctx(ctx).Warn("failed to get dim from field type params", log.Ctx(ctx).Warn("failed to get dim from field type params",
zap.String("field type", field.GetDataType().String()), zap.Error(err)) zap.String("field type", field.GetDataType().String()), zap.Error(err))
@ -195,84 +199,90 @@ func (it *indexBuildTask) AssignTask(ctx context.Context, client types.IndexNode
if err != nil { if err != nil {
log.Ctx(ctx).Warn("failed to get storage uri", zap.Error(err)) log.Ctx(ctx).Warn("failed to get storage uri", zap.Error(err))
it.SetState(indexpb.JobState_JobStateInit, err.Error()) it.SetState(indexpb.JobState_JobStateInit, err.Error())
return false, false return true
} }
indexStorePath, err := itypeutil.GetStorageURI(params.Params.CommonCfg.StorageScheme.GetValue(), params.Params.CommonCfg.StoragePathPrefix.GetValue()+"/index", segment.GetID()) indexStorePath, err := itypeutil.GetStorageURI(params.Params.CommonCfg.StorageScheme.GetValue(), params.Params.CommonCfg.StoragePathPrefix.GetValue()+"/index", segment.GetID())
if err != nil { if err != nil {
log.Ctx(ctx).Warn("failed to get storage uri", zap.Error(err)) log.Ctx(ctx).Warn("failed to get storage uri", zap.Error(err))
it.SetState(indexpb.JobState_JobStateInit, err.Error()) it.SetState(indexpb.JobState_JobStateInit, err.Error())
return false, false return true
} }
req = &indexpb.CreateJobRequest{ it.req = &indexpb.CreateJobRequest{
ClusterID: Params.CommonCfg.ClusterPrefix.GetValue(), ClusterID: Params.CommonCfg.ClusterPrefix.GetValue(),
IndexFilePrefix: path.Join(dependency.chunkManager.RootPath(), common.SegmentIndexPath), IndexFilePrefix: path.Join(dependency.chunkManager.RootPath(), common.SegmentIndexPath),
BuildID: it.buildID, BuildID: it.taskID,
IndexVersion: segIndex.IndexVersion, IndexVersion: segIndex.IndexVersion + 1,
StorageConfig: storageConfig, StorageConfig: storageConfig,
IndexParams: indexParams, IndexParams: indexParams,
TypeParams: typeParams, TypeParams: typeParams,
NumRows: segIndex.NumRows, NumRows: segIndex.NumRows,
CurrentIndexVersion: dependency.indexEngineVersionManager.GetCurrentIndexEngineVersion(),
CollectionID: segment.GetCollectionID(), CollectionID: segment.GetCollectionID(),
PartitionID: segment.GetPartitionID(), PartitionID: segment.GetPartitionID(),
SegmentID: segment.GetID(), SegmentID: segment.GetID(),
FieldID: fieldID, FieldID: fieldID,
FieldName: field.Name, FieldName: field.GetName(),
FieldType: field.DataType, FieldType: field.GetDataType(),
StorePath: storePath, StorePath: storePath,
StoreVersion: segment.GetStorageVersion(), StoreVersion: segment.GetStorageVersion(),
IndexStorePath: indexStorePath, IndexStorePath: indexStorePath,
Dim: int64(dim), Dim: int64(dim),
CurrentIndexVersion: dependency.indexEngineVersionManager.GetCurrentIndexEngineVersion(),
DataIds: binlogIDs, DataIds: binlogIDs,
OptionalScalarFields: optionalFields, OptionalScalarFields: optionalFields,
Field: field, Field: field,
} }
} else { } else {
req = &indexpb.CreateJobRequest{ it.req = &indexpb.CreateJobRequest{
ClusterID: Params.CommonCfg.ClusterPrefix.GetValue(), ClusterID: Params.CommonCfg.ClusterPrefix.GetValue(),
IndexFilePrefix: path.Join(dependency.chunkManager.RootPath(), common.SegmentIndexPath), IndexFilePrefix: path.Join(dependency.chunkManager.RootPath(), common.SegmentIndexPath),
BuildID: it.buildID, BuildID: it.taskID,
IndexVersion: segIndex.IndexVersion, IndexVersion: segIndex.IndexVersion + 1,
StorageConfig: storageConfig, StorageConfig: storageConfig,
IndexParams: indexParams, IndexParams: indexParams,
TypeParams: typeParams, TypeParams: typeParams,
NumRows: segIndex.NumRows, NumRows: segIndex.NumRows,
CurrentIndexVersion: dependency.indexEngineVersionManager.GetCurrentIndexEngineVersion(), CurrentIndexVersion: dependency.indexEngineVersionManager.GetCurrentIndexEngineVersion(),
DataIds: binlogIDs,
CollectionID: segment.GetCollectionID(), CollectionID: segment.GetCollectionID(),
PartitionID: segment.GetPartitionID(), PartitionID: segment.GetPartitionID(),
SegmentID: segment.GetID(), SegmentID: segment.GetID(),
FieldID: fieldID, FieldID: fieldID,
OptionalScalarFields: optionalFields, FieldName: field.GetName(),
FieldType: field.GetDataType(),
Dim: int64(dim), Dim: int64(dim),
DataIds: binlogIDs,
OptionalScalarFields: optionalFields,
Field: field, Field: field,
} }
} }
log.Ctx(ctx).Info("index task pre check successfully", zap.Int64("taskID", it.GetTaskID()))
return false
}
func (it *indexBuildTask) AssignTask(ctx context.Context, client types.IndexNodeClient) bool {
ctx, cancel := context.WithTimeout(context.Background(), reqTimeoutInterval) ctx, cancel := context.WithTimeout(context.Background(), reqTimeoutInterval)
defer cancel() defer cancel()
resp, err := client.CreateJobV2(ctx, &indexpb.CreateJobV2Request{ resp, err := client.CreateJobV2(ctx, &indexpb.CreateJobV2Request{
ClusterID: req.GetClusterID(), ClusterID: it.req.GetClusterID(),
TaskID: req.GetBuildID(), TaskID: it.req.GetBuildID(),
JobType: indexpb.JobType_JobTypeIndexJob, JobType: indexpb.JobType_JobTypeIndexJob,
Request: &indexpb.CreateJobV2Request_IndexRequest{ Request: &indexpb.CreateJobV2Request_IndexRequest{
IndexRequest: req, IndexRequest: it.req,
}, },
}) })
if err == nil { if err == nil {
err = merr.Error(resp) err = merr.Error(resp)
} }
if err != nil { if err != nil {
log.Ctx(ctx).Warn("assign index task to indexNode failed", zap.Int64("buildID", it.buildID), zap.Error(err)) log.Ctx(ctx).Warn("assign index task to indexNode failed", zap.Int64("taskID", it.taskID), zap.Error(err))
it.SetState(indexpb.JobState_JobStateRetry, err.Error()) it.SetState(indexpb.JobState_JobStateRetry, err.Error())
return false, true return false
} }
log.Ctx(ctx).Info("index task assigned successfully", zap.Int64("buildID", it.buildID), log.Ctx(ctx).Info("index task assigned successfully", zap.Int64("taskID", it.taskID))
zap.Int64("segmentID", segIndex.SegmentID))
it.SetState(indexpb.JobState_JobStateInProgress, "") it.SetState(indexpb.JobState_JobStateInProgress, "")
return true, false return true
} }
func (it *indexBuildTask) setResult(info *indexpb.IndexTaskInfo) { func (it *indexBuildTask) setResult(info *indexpb.IndexTaskInfo) {
@ -289,7 +299,7 @@ func (it *indexBuildTask) QueryResult(ctx context.Context, node types.IndexNodeC
err = merr.Error(resp.GetStatus()) err = merr.Error(resp.GetStatus())
} }
if err != nil { if err != nil {
log.Ctx(ctx).Warn("get jobs info from IndexNode failed", zap.Int64("buildID", it.GetTaskID()), log.Ctx(ctx).Warn("get jobs info from IndexNode failed", zap.Int64("taskID", it.GetTaskID()),
zap.Int64("nodeID", it.GetNodeID()), zap.Error(err)) zap.Int64("nodeID", it.GetNodeID()), zap.Error(err))
it.SetState(indexpb.JobState_JobStateRetry, err.Error()) it.SetState(indexpb.JobState_JobStateRetry, err.Error())
return return

View File

@ -100,8 +100,8 @@ func (s *taskScheduler) reloadFromKV() {
} }
if segIndex.IndexState != commonpb.IndexState_Finished && segIndex.IndexState != commonpb.IndexState_Failed { if segIndex.IndexState != commonpb.IndexState_Finished && segIndex.IndexState != commonpb.IndexState_Failed {
s.tasks[segIndex.BuildID] = &indexBuildTask{ s.tasks[segIndex.BuildID] = &indexBuildTask{
buildID: segIndex.BuildID, taskID: segIndex.BuildID,
nodeID: segIndex.NodeID, nodeID: segIndex.NodeID,
taskInfo: &indexpb.IndexTaskInfo{ taskInfo: &indexpb.IndexTaskInfo{
BuildID: segIndex.BuildID, BuildID: segIndex.BuildID,
State: segIndex.IndexState, State: segIndex.IndexState,
@ -223,6 +223,12 @@ func (s *taskScheduler) process(taskID UniqueID) bool {
s.removeTask(taskID) s.removeTask(taskID)
case indexpb.JobState_JobStateInit: case indexpb.JobState_JobStateInit:
// 0. pre check task
skip := task.PreCheck(s.ctx, s)
if skip {
return true
}
// 1. pick an indexNode client // 1. pick an indexNode client
nodeID, client := s.nodeManager.PickClient() nodeID, client := s.nodeManager.PickClient()
if client == nil { if client == nil {
@ -239,17 +245,13 @@ func (s *taskScheduler) process(taskID UniqueID) bool {
log.Ctx(s.ctx).Info("update task version success", zap.Int64("taskID", taskID)) log.Ctx(s.ctx).Info("update task version success", zap.Int64("taskID", taskID))
// 3. assign task to indexNode // 3. assign task to indexNode
success, skip := task.AssignTask(s.ctx, client, s) success := task.AssignTask(s.ctx, client)
if !success { if !success {
log.Ctx(s.ctx).Warn("assign task to client failed", zap.Int64("taskID", taskID), log.Ctx(s.ctx).Warn("assign task to client failed", zap.Int64("taskID", taskID),
zap.String("new state", task.GetState().String()), zap.String("fail reason", task.GetFailReason())) zap.String("new state", task.GetState().String()), zap.String("fail reason", task.GetFailReason()))
// If the problem is caused by the task itself, subsequent tasks will not be skipped. // If the problem is caused by the task itself, subsequent tasks will not be skipped.
// If etcd fails or fails to send tasks to the node, the subsequent tasks will be skipped. // If etcd fails or fails to send tasks to the node, the subsequent tasks will be skipped.
return !skip return false
}
if skip {
// create index for small segment(<1024), skip next steps.
return true
} }
log.Ctx(s.ctx).Info("assign task to client success", zap.Int64("taskID", taskID), zap.Int64("nodeID", nodeID)) log.Ctx(s.ctx).Info("assign task to client success", zap.Int64("taskID", taskID), zap.Int64("nodeID", nodeID))

View File

@ -927,7 +927,6 @@ func (s *taskSchedulerSuite) Test_analyzeTaskFailCase() {
ctx := context.Background() ctx := context.Background()
catalog := catalogmocks.NewDataCoordCatalog(s.T()) catalog := catalogmocks.NewDataCoordCatalog(s.T())
in := mocks.NewMockIndexNodeClient(s.T())
workerManager := NewMockWorkerManager(s.T()) workerManager := NewMockWorkerManager(s.T())
mt := createMeta(catalog, mt := createMeta(catalog,
@ -958,9 +957,7 @@ func (s *taskSchedulerSuite) Test_analyzeTaskFailCase() {
scheduler.scheduleDuration = s.duration scheduler.scheduleDuration = s.duration
scheduler.Start() scheduler.Start()
// taskID 1 peek client success, update version success. AssignTask failed --> state: Failed --> save // taskID 1 PreCheck failed --> state: Failed --> save
workerManager.EXPECT().PickClient().Return(s.nodeID, in).Once()
catalog.EXPECT().SaveAnalyzeTask(mock.Anything, mock.Anything).Return(nil).Once()
catalog.EXPECT().SaveAnalyzeTask(mock.Anything, mock.Anything).Return(nil).Once() catalog.EXPECT().SaveAnalyzeTask(mock.Anything, mock.Anything).Return(nil).Once()
workerManager.EXPECT().GetClientByID(mock.Anything).Return(nil, false).Once() workerManager.EXPECT().GetClientByID(mock.Anything).Return(nil, false).Once()
@ -1298,14 +1295,10 @@ func (s *taskSchedulerSuite) Test_indexTaskFailCase() {
defer Params.CommonCfg.EnableStorageV2.SwapTempValue("False") defer Params.CommonCfg.EnableStorageV2.SwapTempValue("False")
scheduler.Start() scheduler.Start()
// peek client success, update version success, get collection info failed --> init // get collection info failed --> init
workerManager.EXPECT().PickClient().Return(s.nodeID, in).Once()
catalog.EXPECT().AlterSegmentIndexes(mock.Anything, mock.Anything).Return(nil).Once()
handler.EXPECT().GetCollection(mock.Anything, mock.Anything).Return(nil, errors.New("mock error")).Once() handler.EXPECT().GetCollection(mock.Anything, mock.Anything).Return(nil, errors.New("mock error")).Once()
// peek client success, update version success, partition key field is nil, get collection info failed --> init // partition key field is nil, get collection info failed --> init
workerManager.EXPECT().PickClient().Return(s.nodeID, in).Once()
catalog.EXPECT().AlterSegmentIndexes(mock.Anything, mock.Anything).Return(nil).Once()
handler.EXPECT().GetCollection(mock.Anything, mock.Anything).Return(&collectionInfo{ handler.EXPECT().GetCollection(mock.Anything, mock.Anything).Return(&collectionInfo{
ID: collID, ID: collID,
Schema: &schemapb.CollectionSchema{ Schema: &schemapb.CollectionSchema{
@ -1316,9 +1309,7 @@ func (s *taskSchedulerSuite) Test_indexTaskFailCase() {
}, nil).Once() }, nil).Once()
handler.EXPECT().GetCollection(mock.Anything, mock.Anything).Return(nil, errors.New("mock error")).Once() handler.EXPECT().GetCollection(mock.Anything, mock.Anything).Return(nil, errors.New("mock error")).Once()
// peek client success, update version success, get collection info success, get dim failed --> init // get collection info success, get dim failed --> init
workerManager.EXPECT().PickClient().Return(s.nodeID, in).Once()
catalog.EXPECT().AlterSegmentIndexes(mock.Anything, mock.Anything).Return(nil).Once()
handler.EXPECT().GetCollection(mock.Anything, mock.Anything).Return(&collectionInfo{ handler.EXPECT().GetCollection(mock.Anything, mock.Anything).Return(&collectionInfo{
ID: collID, ID: collID,
Schema: &schemapb.CollectionSchema{ Schema: &schemapb.CollectionSchema{
@ -1331,8 +1322,6 @@ func (s *taskSchedulerSuite) Test_indexTaskFailCase() {
// peek client success, update version success, get collection info success, get dim success, get storage uri failed --> init // peek client success, update version success, get collection info success, get dim success, get storage uri failed --> init
s.NoError(err) s.NoError(err)
workerManager.EXPECT().PickClient().Return(s.nodeID, in).Once()
catalog.EXPECT().AlterSegmentIndexes(mock.Anything, mock.Anything).Return(nil).Once()
handler.EXPECT().GetCollection(mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, i int64) (*collectionInfo, error) { handler.EXPECT().GetCollection(mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, i int64) (*collectionInfo, error) {
return &collectionInfo{ return &collectionInfo{
ID: collID, ID: collID,
@ -1676,14 +1665,12 @@ func (s *taskSchedulerSuite) Test_indexTaskWithMvOptionalScalarField() {
return merr.Success(), nil return merr.Success(), nil
}).Once() }).Once()
t := &indexBuildTask{ t := &indexBuildTask{
buildID: buildID, taskID: buildID,
nodeID: nodeID, nodeID: nodeID,
taskInfo: &indexpb.IndexTaskInfo{ taskInfo: &indexpb.IndexTaskInfo{
BuildID: buildID, BuildID: buildID,
State: commonpb.IndexState_Unissued, State: commonpb.IndexState_Unissued,
FailReason: "", FailReason: "",
// CurrentIndexVersion: 0,
// IndexStoreVersion: 0,
}, },
} }
scheduler.enqueue(t) scheduler.enqueue(t)
@ -1701,8 +1688,8 @@ func (s *taskSchedulerSuite) Test_indexTaskWithMvOptionalScalarField() {
return merr.Success(), nil return merr.Success(), nil
}).Once() }).Once()
t := &indexBuildTask{ t := &indexBuildTask{
buildID: buildID, taskID: buildID,
nodeID: nodeID, nodeID: nodeID,
taskInfo: &indexpb.IndexTaskInfo{ taskInfo: &indexpb.IndexTaskInfo{
BuildID: buildID, BuildID: buildID,
State: commonpb.IndexState_Unissued, State: commonpb.IndexState_Unissued,
@ -1730,8 +1717,8 @@ func (s *taskSchedulerSuite) Test_indexTaskWithMvOptionalScalarField() {
return merr.Success(), nil return merr.Success(), nil
}).Once() }).Once()
t := &indexBuildTask{ t := &indexBuildTask{
buildID: buildID, taskID: buildID,
nodeID: nodeID, nodeID: nodeID,
taskInfo: &indexpb.IndexTaskInfo{ taskInfo: &indexpb.IndexTaskInfo{
BuildID: buildID, BuildID: buildID,
State: commonpb.IndexState_Unissued, State: commonpb.IndexState_Unissued,
@ -1753,8 +1740,8 @@ func (s *taskSchedulerSuite) Test_indexTaskWithMvOptionalScalarField() {
return merr.Success(), nil return merr.Success(), nil
}).Once() }).Once()
t := &indexBuildTask{ t := &indexBuildTask{
buildID: buildID, taskID: buildID,
nodeID: nodeID, nodeID: nodeID,
taskInfo: &indexpb.IndexTaskInfo{ taskInfo: &indexpb.IndexTaskInfo{
BuildID: buildID, BuildID: buildID,
State: commonpb.IndexState_Unissued, State: commonpb.IndexState_Unissued,

View File

@ -27,13 +27,14 @@ type Task interface {
GetTaskID() int64 GetTaskID() int64
GetNodeID() int64 GetNodeID() int64
ResetNodeID() ResetNodeID()
PreCheck(ctx context.Context, dependency *taskScheduler) bool
CheckTaskHealthy(mt *meta) bool CheckTaskHealthy(mt *meta) bool
SetState(state indexpb.JobState, failReason string) SetState(state indexpb.JobState, failReason string)
GetState() indexpb.JobState GetState() indexpb.JobState
GetFailReason() string GetFailReason() string
UpdateVersion(ctx context.Context, meta *meta) error UpdateVersion(ctx context.Context, meta *meta) error
UpdateMetaBuildingState(nodeID int64, meta *meta) error UpdateMetaBuildingState(nodeID int64, meta *meta) error
AssignTask(ctx context.Context, client types.IndexNodeClient, dependency *taskScheduler) (bool, bool) AssignTask(ctx context.Context, client types.IndexNodeClient) bool
QueryResult(ctx context.Context, client types.IndexNodeClient) QueryResult(ctx context.Context, client types.IndexNodeClient)
DropTaskOnWorker(ctx context.Context, client types.IndexNodeClient) bool DropTaskOnWorker(ctx context.Context, client types.IndexNodeClient) bool
SetJobInfo(meta *meta) error SetJobInfo(meta *meta) error