// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package datacoord

import (
	"context"
	"fmt"
	"time"

	"go.uber.org/zap"

	"github.com/milvus-io/milvus/internal/proto/indexpb"
	"github.com/milvus-io/milvus/internal/proto/workerpb"
	"github.com/milvus-io/milvus/internal/types"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/util/merr"
	"github.com/milvus-io/milvus/pkg/util/tsoutil"
)

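// statsTask is the datacoord-side bookkeeping object for a single stats job
// (segment sort, text index, or BM25) that is dispatched to an index node.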
type statsTask struct {
	taskID          int64
	segmentID       int64
	targetSegmentID int64
	nodeID          int64
	taskInfo        *workerpb.StatsResult

	queueTime time.Time
	startTime time.Time
	endTime   time.Time

	req *workerpb.CreateStatsRequest

	subJobType indexpb.StatsSubJob
}

var _ Task = (*statsTask)(nil)

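// newStatsTask creates a stats task in the JobStateInit state for the given
// source segment, target segment, and sub-job type.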
func newStatsTask(taskID int64, segmentID, targetSegmentID int64, subJobType indexpb.StatsSubJob) *statsTask {
	return &statsTask{
		taskID:          taskID,
		segmentID:       segmentID,
		targetSegmentID: targetSegmentID,
		taskInfo: &workerpb.StatsResult{
			TaskID: taskID,
			State:  indexpb.JobState_JobStateInit,
		},
		subJobType: subJobType,
	}
}

func (st *statsTask) setResult(result *workerpb.StatsResult) {
	st.taskInfo = result
}

func (st *statsTask) GetTaskID() int64 {
	return st.taskID
}

func (st *statsTask) GetNodeID() int64 {
	return st.nodeID
}

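// ResetTask clears the node assignment and releases the source segment's
// compacting flag so the segment can be scheduled again.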
func (st *statsTask) ResetTask(mt *meta) {
	st.nodeID = 0
	// reset isCompacting
	mt.SetSegmentsCompacting([]UniqueID{st.segmentID}, false)
}

func (st *statsTask) SetQueueTime(t time.Time) {
	st.queueTime = t
}

func (st *statsTask) GetQueueTime() time.Time {
	return st.queueTime
}

func (st *statsTask) SetStartTime(t time.Time) {
	st.startTime = t
}

func (st *statsTask) GetStartTime() time.Time {
	return st.startTime
}

func (st *statsTask) SetEndTime(t time.Time) {
	st.endTime = t
}

func (st *statsTask) GetEndTime() time.Time {
	return st.endTime
}

func (st *statsTask) GetTaskType() string {
	return indexpb.JobType_JobTypeStatsJob.String()
}

func (st *statsTask) CheckTaskHealthy(mt *meta) bool {
	seg := mt.GetHealthySegment(st.segmentID)
	return seg != nil
}

func (st *statsTask) SetState(state indexpb.JobState, failReason string) {
	st.taskInfo.State = state
	st.taskInfo.FailReason = failReason
}

func (st *statsTask) GetState() indexpb.JobState {
	return st.taskInfo.GetState()
}

func (st *statsTask) GetFailReason() string {
	return st.taskInfo.GetFailReason()
}

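// UpdateVersion marks the source segment as compacting, bumps the task
// version in the stats task meta, and records the node the task is bound to.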
func (st *statsTask) UpdateVersion(ctx context.Context, nodeID int64, meta *meta) error {
	// mark compacting
	if exist, canDo := meta.CheckAndSetSegmentsCompacting([]UniqueID{st.segmentID}); !exist || !canDo {
		log.Warn("segment does not exist or is compacting, skip stats",
			zap.Bool("exist", exist), zap.Bool("canDo", canDo))
		st.SetState(indexpb.JobState_JobStateNone, "segment is not healthy")
		return fmt.Errorf("mark segment compacting failed, isCompacting: %v", !canDo)
	}

	if err := meta.statsTaskMeta.UpdateVersion(st.taskID, nodeID); err != nil {
		return err
	}
	st.nodeID = nodeID
	return nil
}

func (st *statsTask) UpdateMetaBuildingState(meta *meta) error {
	return meta.statsTaskMeta.UpdateBuildingTask(st.taskID)
}

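// PreCheck validates that the segment is healthy and not already sorted,
// fetches the collection info and TTL, pre-allocates log IDs for the expected
// output binlogs, and assembles the CreateStatsRequest sent to the worker.
// It returns false when the task should be skipped or retried later.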
func (st *statsTask) PreCheck(ctx context.Context, dependency *taskScheduler) bool {
	// set segment compacting
	log := log.Ctx(ctx).With(zap.Int64("taskID", st.taskID), zap.Int64("segmentID", st.segmentID))
	segment := dependency.meta.GetHealthySegment(st.segmentID)
	if segment == nil {
		log.Warn("segment is not healthy, skip stats")
		st.SetState(indexpb.JobState_JobStateNone, "segment is not healthy")
		return false
	}

	if segment.GetIsSorted() && st.subJobType == indexpb.StatsSubJob_Sort {
		log.Info("segment is already marked as sorted, skip stats")
		st.SetState(indexpb.JobState_JobStateNone, "segment is marked as sorted")
		return false
	}

	collInfo, err := dependency.handler.GetCollection(ctx, segment.GetCollectionID())
	if err != nil {
		log.Warn("stats task get collection info failed", zap.Int64("collectionID",
			segment.GetCollectionID()), zap.Error(err))
		st.SetState(indexpb.JobState_JobStateInit, err.Error())
		return false
	}

	collTtl, err := getCollectionTTL(collInfo.Properties)
	if err != nil {
		log.Warn("stats task get collection ttl failed", zap.Int64("collectionID", segment.GetCollectionID()), zap.Error(err))
		st.SetState(indexpb.JobState_JobStateInit, err.Error())
		return false
	}

	// pre-allocate enough log IDs for the binlogs the worker is expected to write
	start, end, err := dependency.allocator.AllocN(segment.getSegmentSize() / Params.DataNodeCfg.BinLogMaxSize.GetAsInt64() * int64(len(collInfo.Schema.GetFields())) * 2)
	if err != nil {
		log.Warn("stats task alloc logID failed", zap.Int64("collectionID", segment.GetCollectionID()), zap.Error(err))
		st.SetState(indexpb.JobState_JobStateInit, err.Error())
		return false
	}

	st.req = &workerpb.CreateStatsRequest{
		ClusterID:       Params.CommonCfg.ClusterPrefix.GetValue(),
		TaskID:          st.GetTaskID(),
		CollectionID:    segment.GetCollectionID(),
		PartitionID:     segment.GetPartitionID(),
		InsertChannel:   segment.GetInsertChannel(),
		SegmentID:       segment.GetID(),
		InsertLogs:      segment.GetBinlogs(),
		DeltaLogs:       segment.GetDeltalogs(),
		StorageConfig:   createStorageConfig(),
		Schema:          collInfo.Schema,
		SubJobType:      st.subJobType,
		TargetSegmentID: st.targetSegmentID,
		StartLogID:      start,
		EndLogID:        end,
		NumRows:         segment.GetNumOfRows(),
		CollectionTtl:   collTtl.Nanoseconds(),
		CurrentTs:       tsoutil.GetCurrentTime(),
		BinlogMaxSize:   Params.DataNodeCfg.BinLogMaxSize.GetAsUint64(),
	}

	return true
}

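// AssignTask submits the prepared CreateStatsRequest to the selected index
// node via CreateJobV2 and moves the task to JobStateInProgress on success,
// or JobStateRetry on failure.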
func (st *statsTask) AssignTask(ctx context.Context, client types.IndexNodeClient) bool {
	ctx, cancel := context.WithTimeout(ctx, reqTimeoutInterval)
	defer cancel()
	resp, err := client.CreateJobV2(ctx, &workerpb.CreateJobV2Request{
		ClusterID: st.req.GetClusterID(),
		TaskID:    st.req.GetTaskID(),
		JobType:   indexpb.JobType_JobTypeStatsJob,
		Request: &workerpb.CreateJobV2Request_StatsRequest{
			StatsRequest: st.req,
		},
	})
	if err := merr.CheckRPCCall(resp, err); err != nil {
		log.Ctx(ctx).Warn("assign stats task failed", zap.Int64("taskID", st.taskID),
			zap.Int64("segmentID", st.segmentID), zap.Error(err))
		st.SetState(indexpb.JobState_JobStateRetry, err.Error())
		return false
	}

	log.Ctx(ctx).Info("assign stats task success", zap.Int64("taskID", st.taskID), zap.Int64("segmentID", st.segmentID))
	st.SetState(indexpb.JobState_JobStateInProgress, "")
	return true
}

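// QueryResult polls the index node for this task's state via QueryJobsV2.
// Terminal states (finished, failed, retry) are copied into taskInfo; a
// missing or none state triggers a retry, and in-progress results are left
// untouched.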
func (st *statsTask) QueryResult(ctx context.Context, client types.IndexNodeClient) {
	ctx, cancel := context.WithTimeout(ctx, reqTimeoutInterval)
	defer cancel()
	resp, err := client.QueryJobsV2(ctx, &workerpb.QueryJobsV2Request{
		ClusterID: st.req.GetClusterID(),
		TaskIDs:   []int64{st.GetTaskID()},
		JobType:   indexpb.JobType_JobTypeStatsJob,
	})

	if err := merr.CheckRPCCall(resp, err); err != nil {
		log.Ctx(ctx).Warn("query stats task result failed", zap.Int64("taskID", st.GetTaskID()),
			zap.Int64("segmentID", st.segmentID), zap.Error(err))
		st.SetState(indexpb.JobState_JobStateRetry, err.Error())
		return
	}

	for _, result := range resp.GetStatsJobResults().GetResults() {
		if result.GetTaskID() == st.GetTaskID() {
			log.Ctx(ctx).Info("query stats task result success", zap.Int64("taskID", st.GetTaskID()),
				zap.Int64("segmentID", st.segmentID), zap.String("result state", result.GetState().String()),
				zap.String("failReason", result.GetFailReason()))
			if result.GetState() == indexpb.JobState_JobStateFinished || result.GetState() == indexpb.JobState_JobStateRetry ||
				result.GetState() == indexpb.JobState_JobStateFailed {
				st.setResult(result)
			} else if result.GetState() == indexpb.JobState_JobStateNone {
				st.SetState(indexpb.JobState_JobStateRetry, "stats task state is none in info response")
			}
			// inProgress or unissued/init, keep InProgress state
			return
		}
	}
	log.Ctx(ctx).Warn("query stats task result failed, indexNode does not have task info",
		zap.Int64("taskID", st.GetTaskID()), zap.Int64("segmentID", st.segmentID))
	st.SetState(indexpb.JobState_JobStateRetry, "stats task is not in info response")
}

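// DropTaskOnWorker asks the index node to drop the job via DropJobsV2 so its
// resources can be released; it returns false if the worker could not be
// notified.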
func (st *statsTask) DropTaskOnWorker(ctx context.Context, client types.IndexNodeClient) bool {
	ctx, cancel := context.WithTimeout(ctx, reqTimeoutInterval)
	defer cancel()
	resp, err := client.DropJobsV2(ctx, &workerpb.DropJobsV2Request{
		ClusterID: st.req.GetClusterID(),
		TaskIDs:   []int64{st.GetTaskID()},
		JobType:   indexpb.JobType_JobTypeStatsJob,
	})

	if err := merr.CheckRPCCall(resp, err); err != nil {
		log.Ctx(ctx).Warn("notify worker to drop the stats task failed", zap.Int64("taskID", st.GetTaskID()),
			zap.Int64("segmentID", st.segmentID), zap.Error(err))
		return false
	}
	log.Ctx(ctx).Info("drop stats task success", zap.Int64("taskID", st.GetTaskID()),
		zap.Int64("segmentID", st.segmentID))
	return true
}

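// SetJobInfo persists the result of a finished task: for sort jobs it saves
// the sorted result segment and nudges the index builder, for text index jobs
// it attaches the text index logs to the segment, and in all cases it marks
// the task as finished in the stats task meta.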
func (st *statsTask) SetJobInfo(meta *meta) error {
	if st.GetState() == indexpb.JobState_JobStateFinished {
		switch st.subJobType {
		case indexpb.StatsSubJob_Sort:
			// first update the segment; a failed state cannot generate a new segment
			metricMutation, err := meta.SaveStatsResultSegment(st.segmentID, st.taskInfo)
			if err != nil {
				log.Warn("save sort stats result failed", zap.Int64("taskID", st.taskID),
					zap.Int64("segmentID", st.segmentID), zap.Error(err))
				return err
			}
			metricMutation.commit()

			// notify the index builder about the new sorted segment, non-blocking
			select {
			case getBuildIndexChSingleton() <- st.taskInfo.GetSegmentID():
			default:
			}
		case indexpb.StatsSubJob_TextIndexJob:
			err := meta.UpdateSegment(st.taskInfo.GetSegmentID(), SetTextIndexLogs(st.taskInfo.GetTextStatsLogs()))
			if err != nil {
				log.Warn("save text index stats result failed", zap.Int64("taskID", st.taskID),
					zap.Int64("segmentID", st.segmentID), zap.Error(err))
				return err
			}
		case indexpb.StatsSubJob_BM25Job:
			// TODO: support bm25 job
		}
	}

	// second, update the task meta
	if err := meta.statsTaskMeta.FinishTask(st.taskID, st.taskInfo); err != nil {
		log.Warn("save stats result failed", zap.Int64("taskID", st.taskID), zap.Error(err))
		return err
	}

	log.Info("SetJobInfo for stats task success", zap.Int64("taskID", st.taskID),
		zap.Int64("oldSegmentID", st.segmentID), zap.Int64("targetSegmentID", st.taskInfo.GetSegmentID()),
		zap.String("subJobType", st.subJobType.String()), zap.String("state", st.taskInfo.GetState().String()))
	return nil
}

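// Typical lifecycle (illustrative sketch only; the exact driving sequence is
// owned by the task scheduler, and the scheduler, mt, nodeID, and
// indexNodeClient variables below are assumptions, not part of this file):
//
//	task := newStatsTask(taskID, segmentID, targetSegmentID, indexpb.StatsSubJob_Sort)
//	if task.PreCheck(ctx, scheduler) {
//		if err := task.UpdateVersion(ctx, nodeID, mt); err == nil &&
//			task.AssignTask(ctx, indexNodeClient) {
//			task.QueryResult(ctx, indexNodeClient) // polled until a terminal state
//		}
//	}
//	_ = task.SetJobInfo(mt) // persist the result once the task has finished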