// milvus/internal/datacoord/task_index.go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datacoord

import (
    "context"
    "path"
    "time"

    "go.uber.org/zap"

    "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
    "github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
    "github.com/milvus-io/milvus/internal/proto/indexpb"
    "github.com/milvus-io/milvus/internal/proto/workerpb"
    "github.com/milvus-io/milvus/internal/storage"
    "github.com/milvus-io/milvus/internal/types"
    "github.com/milvus-io/milvus/internal/util/vecindexmgr"
    "github.com/milvus-io/milvus/pkg/common"
    "github.com/milvus-io/milvus/pkg/log"
    "github.com/milvus-io/milvus/pkg/util/indexparams"
    "github.com/milvus-io/milvus/pkg/util/merr"
    "github.com/milvus-io/milvus/pkg/util/paramtable"
    "github.com/milvus-io/milvus/pkg/util/typeutil"
)
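
// indexBuildTask tracks a single segment index build job: the request that is
// sent to the index node worker and the latest task state reported back for it.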
type indexBuildTask struct {
    taskID   int64
    nodeID   int64
    taskInfo *workerpb.IndexTaskInfo

    queueTime time.Time
    startTime time.Time
    endTime   time.Time

    req *workerpb.CreateJobRequest
}

var _ Task = (*indexBuildTask)(nil)
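
// newIndexBuildTask creates a task in the Unissued state whose build ID equals taskID.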
func newIndexBuildTask(taskID int64) *indexBuildTask {
    return &indexBuildTask{
        taskID: taskID,
        taskInfo: &workerpb.IndexTaskInfo{
            BuildID: taskID,
            State:   commonpb.IndexState_Unissued,
        },
    }
}

func (it *indexBuildTask) GetTaskID() int64 {
    return it.taskID
}

func (it *indexBuildTask) GetNodeID() int64 {
    return it.nodeID
}

func (it *indexBuildTask) ResetTask(mt *meta) {
    it.nodeID = 0
}

func (it *indexBuildTask) SetQueueTime(t time.Time) {
    it.queueTime = t
}

func (it *indexBuildTask) GetQueueTime() time.Time {
    return it.queueTime
}

func (it *indexBuildTask) SetStartTime(t time.Time) {
    it.startTime = t
}

func (it *indexBuildTask) GetStartTime() time.Time {
    return it.startTime
}

func (it *indexBuildTask) SetEndTime(t time.Time) {
    it.endTime = t
}

func (it *indexBuildTask) GetEndTime() time.Time {
    return it.endTime
}

func (it *indexBuildTask) GetTaskType() string {
    return indexpb.JobType_JobTypeIndexJob.String()
}
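
// CheckTaskHealthy reports whether the segment index job still exists in the meta table.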
func (it *indexBuildTask) CheckTaskHealthy(mt *meta) bool {
    _, exist := mt.indexMeta.GetIndexJob(it.GetTaskID())
    return exist
}

func (it *indexBuildTask) SetState(state indexpb.JobState, failReason string) {
    it.taskInfo.State = commonpb.IndexState(state)
    it.taskInfo.FailReason = failReason
}

func (it *indexBuildTask) GetState() indexpb.JobState {
    return indexpb.JobState(it.taskInfo.GetState())
}

func (it *indexBuildTask) GetFailReason() string {
    return it.taskInfo.FailReason
}
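
// UpdateVersion persists the new node assignment for the task in the meta table
// and records the node ID on the task itself.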
func (it *indexBuildTask) UpdateVersion(ctx context.Context, nodeID int64, meta *meta) error {
    if err := meta.indexMeta.UpdateVersion(it.taskID, nodeID); err != nil {
        return err
    }
    it.nodeID = nodeID
    return nil
}

func (it *indexBuildTask) UpdateMetaBuildingState(meta *meta) error {
    return meta.indexMeta.BuildIndex(it.taskID)
}
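
// PreCheck validates the job against the meta table, patches the index build
// parameters (Knowhere overrides, DiskANN parameters, materialized-view optional
// scalar fields) and assembles the CreateJobRequest that will be sent to the
// index node. It returns false when the task should not, or cannot yet, be dispatched.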
func (it *indexBuildTask) PreCheck(ctx context.Context, dependency *taskScheduler) bool {
    segIndex, exist := dependency.meta.indexMeta.GetIndexJob(it.taskID)
    if !exist || segIndex == nil {
        log.Ctx(ctx).Info("index task does not exist in meta table, remove task", zap.Int64("taskID", it.taskID))
        it.SetState(indexpb.JobState_JobStateNone, "index task does not exist in meta table")
        return false
    }

    segment := dependency.meta.GetSegment(segIndex.SegmentID)
    if !isSegmentHealthy(segment) || !dependency.meta.indexMeta.IsIndexExist(segIndex.CollectionID, segIndex.IndexID) {
        log.Ctx(ctx).Info("task no longer needs to build index, remove it", zap.Int64("taskID", it.taskID))
        it.SetState(indexpb.JobState_JobStateNone, "task no longer needs to build index")
        return false
    }

    indexParams := dependency.meta.indexMeta.GetIndexParams(segIndex.CollectionID, segIndex.IndexID)
    indexType := GetIndexType(indexParams)
    if isNoTrainIndex(indexType) || segIndex.NumRows < Params.DataCoordCfg.MinSegmentNumRowsToEnableIndex.GetAsInt64() {
        log.Ctx(ctx).Info("segment does not really need an index", zap.Int64("taskID", it.taskID),
            zap.Int64("segmentID", segIndex.SegmentID), zap.Int64("num rows", segIndex.NumRows))
        it.SetStartTime(time.Now())
        it.SetEndTime(time.Now())
        it.SetState(indexpb.JobState_JobStateFinished, "fake finished index success")
        return false
    }

    typeParams := dependency.meta.indexMeta.GetTypeParams(segIndex.CollectionID, segIndex.IndexID)
    fieldID := dependency.meta.indexMeta.GetFieldIDByIndexID(segIndex.CollectionID, segIndex.IndexID)
    binlogIDs := getBinLogIDs(segment, fieldID)

    // When new index parameters are added, they need to be merged in here so that
    // they take effect during the index-building process.
    if vecindexmgr.GetVecIndexMgrInstance().IsVecIndex(indexType) && Params.KnowhereConfig.Enable.GetAsBool() {
        var ret error
        indexParams, ret = Params.KnowhereConfig.UpdateIndexParams(GetIndexType(indexParams), paramtable.BuildStage, indexParams)
        if ret != nil {
            log.Ctx(ctx).Warn("failed to update index build params defined in yaml", zap.Int64("taskID", it.taskID), zap.Error(ret))
            it.SetState(indexpb.JobState_JobStateInit, ret.Error())
            return false
        }
    }

    if isDiskANNIndex(GetIndexType(indexParams)) {
        var err error
        indexParams, err = indexparams.UpdateDiskIndexBuildParams(Params, indexParams)
        if err != nil {
            log.Ctx(ctx).Warn("failed to append index build params", zap.Int64("taskID", it.taskID), zap.Error(err))
            it.SetState(indexpb.JobState_JobStateInit, err.Error())
            return false
        }
    }

    collectionInfo, err := dependency.handler.GetCollection(ctx, segment.GetCollectionID())
    if err != nil {
        log.Ctx(ctx).Info("index builder failed to get collection info", zap.Int64("collectionID", segment.GetCollectionID()), zap.Error(err))
        return false
    }
    schema := collectionInfo.Schema
    var field *schemapb.FieldSchema
    for _, f := range schema.Fields {
        if f.FieldID == fieldID {
            field = f
            break
        }
    }

    dim, err := storage.GetDimFromParams(field.GetTypeParams())
    if err != nil {
        log.Ctx(ctx).Warn("failed to get dim from field type params",
            zap.String("field type", field.GetDataType().String()), zap.Error(err))
        // don't return here: the field may be a scalar field or a sparse float vector, which has no dim
    }

    // vector index build needs information about optional scalar field data
    optionalFields := make([]*indexpb.OptionalFieldInfo, 0)
    partitionKeyIsolation := false
    isVectorTypeSupported := typeutil.IsDenseFloatVectorType(field.DataType) || typeutil.IsBinaryVectorType(field.DataType)
    if Params.CommonCfg.EnableMaterializedView.GetAsBool() && isOptionalScalarFieldSupported(indexType) && isVectorTypeSupported {
        if collectionInfo == nil {
            log.Ctx(ctx).Warn("get collection failed", zap.Int64("collID", segIndex.CollectionID), zap.Error(err))
            it.SetState(indexpb.JobState_JobStateInit, err.Error())
            return true
        }
        partitionKeyField, _ := typeutil.GetPartitionKeyFieldSchema(schema)
        if partitionKeyField != nil && typeutil.IsFieldDataTypeSupportMaterializedView(partitionKeyField) {
            optionalFields = append(optionalFields, &indexpb.OptionalFieldInfo{
                FieldID:   partitionKeyField.FieldID,
                FieldName: partitionKeyField.Name,
                FieldType: int32(partitionKeyField.DataType),
                DataIds:   getBinLogIDs(segment, partitionKeyField.FieldID),
            })
            iso, isoErr := common.IsPartitionKeyIsolationPropEnabled(collectionInfo.Properties)
            if isoErr != nil {
                log.Ctx(ctx).Warn("failed to parse partition key isolation", zap.Error(isoErr))
            }
            if iso {
                partitionKeyIsolation = true
            }
        }
    }

    it.req = &workerpb.CreateJobRequest{
        ClusterID:             Params.CommonCfg.ClusterPrefix.GetValue(),
        IndexFilePrefix:       path.Join(dependency.chunkManager.RootPath(), common.SegmentIndexPath),
        BuildID:               it.taskID,
        IndexVersion:          segIndex.IndexVersion + 1,
        StorageConfig:         createStorageConfig(),
        IndexParams:           indexParams,
        TypeParams:            typeParams,
        NumRows:               segIndex.NumRows,
        CurrentIndexVersion:   dependency.indexEngineVersionManager.GetCurrentIndexEngineVersion(),
        CollectionID:          segment.GetCollectionID(),
        PartitionID:           segment.GetPartitionID(),
        SegmentID:             segment.GetID(),
        FieldID:               fieldID,
        FieldName:             field.GetName(),
        FieldType:             field.GetDataType(),
        Dim:                   int64(dim),
        DataIds:               binlogIDs,
        OptionalScalarFields:  optionalFields,
        Field:                 field,
        PartitionKeyIsolation: partitionKeyIsolation,
    }

    log.Ctx(ctx).Info("index task pre-check succeeded", zap.Int64("taskID", it.GetTaskID()),
        zap.Int64("segID", segment.GetID()))
    return true
}
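
// AssignTask sends the prepared CreateJobRequest to the index node. On success the
// task moves to InProgress; on failure it is marked for retry.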
func (it *indexBuildTask) AssignTask(ctx context.Context, client types.IndexNodeClient) bool {
    ctx, cancel := context.WithTimeout(context.Background(), reqTimeoutInterval)
    defer cancel()
    resp, err := client.CreateJobV2(ctx, &workerpb.CreateJobV2Request{
        ClusterID: it.req.GetClusterID(),
        TaskID:    it.req.GetBuildID(),
        JobType:   indexpb.JobType_JobTypeIndexJob,
        Request: &workerpb.CreateJobV2Request_IndexRequest{
            IndexRequest: it.req,
        },
    })
    if err == nil {
        err = merr.Error(resp)
    }
    if err != nil {
        log.Ctx(ctx).Warn("assign index task to indexNode failed", zap.Int64("taskID", it.taskID), zap.Error(err))
        it.SetState(indexpb.JobState_JobStateRetry, err.Error())
        return false
    }

    log.Ctx(ctx).Info("index task assigned successfully", zap.Int64("taskID", it.taskID))
    it.SetState(indexpb.JobState_JobStateInProgress, "")
    return true
}

func (it *indexBuildTask) setResult(info *workerpb.IndexTaskInfo) {
    it.taskInfo = info
}
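
// QueryResult polls the index node for the job's state. Terminal results (Finished,
// Failed, Retry) are recorded on the task; a missing or None result moves the task to
// Retry, and an in-progress job leaves the task state unchanged.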
func (it *indexBuildTask) QueryResult(ctx context.Context, node types.IndexNodeClient) {
    ctx, cancel := context.WithTimeout(context.Background(), reqTimeoutInterval)
    defer cancel()
    resp, err := node.QueryJobsV2(ctx, &workerpb.QueryJobsV2Request{
        ClusterID: Params.CommonCfg.ClusterPrefix.GetValue(),
        TaskIDs:   []UniqueID{it.GetTaskID()},
        JobType:   indexpb.JobType_JobTypeIndexJob,
    })
    if err == nil {
        err = merr.Error(resp.GetStatus())
    }
    if err != nil {
        log.Ctx(ctx).Warn("get jobs info from IndexNode failed", zap.Int64("taskID", it.GetTaskID()),
            zap.Int64("nodeID", it.GetNodeID()), zap.Error(err))
        it.SetState(indexpb.JobState_JobStateRetry, err.Error())
        return
    }

    // the result list always contains exactly one entry for the queried task ID
    for _, info := range resp.GetIndexJobResults().GetResults() {
        if info.GetBuildID() == it.GetTaskID() {
            log.Ctx(ctx).Info("query task index info successfully",
                zap.Int64("taskID", it.GetTaskID()), zap.String("result state", info.GetState().String()),
                zap.String("failReason", info.GetFailReason()))
            if info.GetState() == commonpb.IndexState_Finished || info.GetState() == commonpb.IndexState_Failed ||
                info.GetState() == commonpb.IndexState_Retry {
                // state is retry, finished or failed
                it.setResult(info)
            } else if info.GetState() == commonpb.IndexState_IndexStateNone {
                it.SetState(indexpb.JobState_JobStateRetry, "index state is none in info response")
            }
            // inProgress or unissued: keep the InProgress state
            return
        }
    }
    it.SetState(indexpb.JobState_JobStateRetry, "index is not in info response")
}
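
// DropTaskOnWorker asks the index node to drop the job and reports whether the
// notification succeeded.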
func (it *indexBuildTask) DropTaskOnWorker(ctx context.Context, client types.IndexNodeClient) bool {
    ctx, cancel := context.WithTimeout(context.Background(), reqTimeoutInterval)
    defer cancel()
    resp, err := client.DropJobsV2(ctx, &workerpb.DropJobsV2Request{
        ClusterID: Params.CommonCfg.ClusterPrefix.GetValue(),
        TaskIDs:   []UniqueID{it.GetTaskID()},
        JobType:   indexpb.JobType_JobTypeIndexJob,
    })
    if err == nil {
        err = merr.Error(resp)
    }
    if err != nil {
        log.Ctx(ctx).Warn("failed to notify worker to drop the index task", zap.Int64("taskID", it.GetTaskID()),
            zap.Int64("nodeID", it.GetNodeID()), zap.Error(err))
        return false
    }

    log.Ctx(ctx).Info("drop index task on worker success", zap.Int64("taskID", it.GetTaskID()),
        zap.Int64("nodeID", it.GetNodeID()))
    return true
}
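
// SetJobInfo persists the final task result into the index meta.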
func (it *indexBuildTask) SetJobInfo(meta *meta) error {
    return meta.indexMeta.FinishTask(it.taskInfo)
}