2022-11-07 14:23:02 +08:00
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2022-08-25 15:48:54 +08:00
package indexnode
import (
"context"
"fmt"
"strconv"
"github.com/golang/protobuf/proto"
"go.uber.org/zap"
2022-10-16 20:49:27 +08:00
"github.com/milvus-io/milvus-proto/go-api/commonpb"
"github.com/milvus-io/milvus-proto/go-api/milvuspb"
2022-10-19 16:55:27 +08:00
"github.com/milvus-io/milvus/internal/common"
2022-08-25 15:48:54 +08:00
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/metrics"
"github.com/milvus-io/milvus/internal/proto/indexpb"
"github.com/milvus-io/milvus/internal/util/metricsinfo"
2022-11-04 14:25:38 +08:00
"github.com/milvus-io/milvus/internal/util/paramtable"
2022-08-25 15:48:54 +08:00
"github.com/milvus-io/milvus/internal/util/timerecord"
"github.com/milvus-io/milvus/internal/util/trace"
)
func ( i * IndexNode ) CreateJob ( ctx context . Context , req * indexpb . CreateJobRequest ) ( * commonpb . Status , error ) {
2022-10-10 15:55:22 +08:00
stateCode := i . stateCode . Load ( ) . ( commonpb . StateCode )
if stateCode != commonpb . StateCode_Healthy {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "index node not ready" , zap . Int32 ( "state" , int32 ( stateCode ) ) , zap . String ( "ClusterID" , req . ClusterID ) , zap . Int64 ( "IndexBuildID" , req . BuildID ) )
2022-08-25 15:48:54 +08:00
return & commonpb . Status {
ErrorCode : commonpb . ErrorCode_UnexpectedError ,
Reason : "state code is not healthy" ,
} , nil
}
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Info ( "IndexNode building index ..." ,
2022-09-06 17:19:11 +08:00
zap . String ( "ClusterID" , req . ClusterID ) ,
2022-08-25 15:48:54 +08:00
zap . Int64 ( "IndexBuildID" , req . BuildID ) ,
zap . Int64 ( "IndexID" , req . IndexID ) ,
zap . String ( "IndexName" , req . IndexName ) ,
zap . String ( "IndexFilePrefix" , req . IndexFilePrefix ) ,
zap . Int64 ( "IndexVersion" , req . IndexVersion ) ,
zap . Strings ( "DataPaths" , req . DataPaths ) ,
zap . Any ( "TypeParams" , req . TypeParams ) ,
2022-11-03 18:19:35 +08:00
zap . Any ( "IndexParams" , req . IndexParams ) ,
zap . Int64 ( "num_rows" , req . GetNumRows ( ) ) )
2022-08-25 15:48:54 +08:00
sp , _ := trace . StartSpanFromContextWithOperationName ( ctx , "IndexNode-CreateIndex" )
defer sp . Finish ( )
sp . SetTag ( "IndexBuildID" , strconv . FormatInt ( req . BuildID , 10 ) )
2022-09-06 17:19:11 +08:00
sp . SetTag ( "ClusterID" , req . ClusterID )
2022-11-04 14:25:38 +08:00
metrics . IndexNodeBuildIndexTaskCounter . WithLabelValues ( strconv . FormatInt ( paramtable . GetNodeID ( ) , 10 ) , metrics . TotalLabel ) . Inc ( )
2022-08-25 15:48:54 +08:00
2022-09-24 13:40:52 +08:00
taskCtx , taskCancel := context . WithCancel ( i . loopCtx )
2022-08-25 15:48:54 +08:00
if oldInfo := i . loadOrStoreTask ( req . ClusterID , req . BuildID , & taskInfo {
cancel : taskCancel ,
state : commonpb . IndexState_InProgress } ) ; oldInfo != nil {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "duplicated index build task" , zap . String ( "ClusterID" , req . ClusterID ) , zap . Int64 ( "BuildID" , req . BuildID ) )
2022-08-25 15:48:54 +08:00
return & commonpb . Status {
ErrorCode : commonpb . ErrorCode_BuildIndexError ,
Reason : "duplicated index build task" ,
} , nil
}
cm , err := i . storageFactory . NewChunkManager ( i . loopCtx , req . StorageConfig )
if err != nil {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Error ( "create chunk manager failed" , zap . String ( "Bucket" , req . StorageConfig . BucketName ) ,
2022-08-25 15:48:54 +08:00
zap . String ( "AccessKey" , req . StorageConfig . AccessKeyID ) ,
2022-09-06 17:19:11 +08:00
zap . String ( "ClusterID" , req . ClusterID ) , zap . Int64 ( "IndexBuildID" , req . BuildID ) )
2022-08-25 15:48:54 +08:00
return & commonpb . Status {
ErrorCode : commonpb . ErrorCode_BuildIndexError ,
Reason : "create chunk manager failed" ,
} , nil
}
task := & indexBuildTask {
2022-09-06 17:19:11 +08:00
ident : fmt . Sprintf ( "%s/%d" , req . ClusterID , req . BuildID ) ,
2022-08-25 15:48:54 +08:00
ctx : taskCtx ,
cancel : taskCancel ,
BuildID : req . BuildID ,
ClusterID : req . ClusterID ,
node : i ,
req : req ,
cm : cm ,
nodeID : i . GetNodeID ( ) ,
2022-09-06 17:19:11 +08:00
tr : timerecord . NewTimeRecorder ( fmt . Sprintf ( "IndexBuildID: %d, ClusterID: %s" , req . BuildID , req . ClusterID ) ) ,
2022-08-25 15:48:54 +08:00
serializedSize : 0 ,
}
ret := & commonpb . Status {
ErrorCode : commonpb . ErrorCode_Success ,
Reason : "" ,
}
if err := i . sched . IndexBuildQueue . Enqueue ( task ) ; err != nil {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "IndexNode failed to schedule" , zap . Int64 ( "IndexBuildID" , req . BuildID ) , zap . String ( "ClusterID" , req . ClusterID ) , zap . Error ( err ) )
2022-08-25 15:48:54 +08:00
ret . ErrorCode = commonpb . ErrorCode_UnexpectedError
ret . Reason = err . Error ( )
2022-11-04 14:25:38 +08:00
metrics . IndexNodeBuildIndexTaskCounter . WithLabelValues ( strconv . FormatInt ( paramtable . GetNodeID ( ) , 10 ) , metrics . FailLabel ) . Inc ( )
2022-08-25 15:48:54 +08:00
return ret , nil
}
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Info ( "IndexNode successfully scheduled" , zap . Int64 ( "IndexBuildID" , req . BuildID ) , zap . String ( "ClusterID" , req . ClusterID ) , zap . String ( "indexName" , req . IndexName ) )
2022-08-25 15:48:54 +08:00
return ret , nil
}
func ( i * IndexNode ) QueryJobs ( ctx context . Context , req * indexpb . QueryJobsRequest ) ( * indexpb . QueryJobsResponse , error ) {
2022-10-10 15:55:22 +08:00
stateCode := i . stateCode . Load ( ) . ( commonpb . StateCode )
if stateCode != commonpb . StateCode_Healthy {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "index node not ready" , zap . Int32 ( "state" , int32 ( stateCode ) ) , zap . String ( "ClusterID" , req . ClusterID ) )
2022-08-25 15:48:54 +08:00
return & indexpb . QueryJobsResponse {
Status : & commonpb . Status {
ErrorCode : commonpb . ErrorCode_UnexpectedError ,
Reason : "state code is not healthy" ,
} ,
} , nil
}
infos := make ( map [ UniqueID ] * taskInfo )
2022-09-06 17:19:11 +08:00
i . foreachTaskInfo ( func ( ClusterID string , buildID UniqueID , info * taskInfo ) {
if ClusterID == req . ClusterID {
2022-08-25 15:48:54 +08:00
infos [ buildID ] = & taskInfo {
state : info . state ,
2022-10-19 16:55:27 +08:00
fileKeys : common . CloneStringList ( info . fileKeys ) ,
2022-08-25 15:48:54 +08:00
serializedSize : info . serializedSize ,
2022-09-09 15:52:35 +08:00
failReason : info . failReason ,
2022-08-25 15:48:54 +08:00
}
}
} )
ret := & indexpb . QueryJobsResponse {
Status : & commonpb . Status {
ErrorCode : commonpb . ErrorCode_Success ,
Reason : "" ,
} ,
ClusterID : req . ClusterID ,
IndexInfos : make ( [ ] * indexpb . IndexTaskInfo , 0 , len ( req . BuildIDs ) ) ,
}
for i , buildID := range req . BuildIDs {
ret . IndexInfos = append ( ret . IndexInfos , & indexpb . IndexTaskInfo {
BuildID : buildID ,
State : commonpb . IndexState_IndexStateNone ,
2022-10-19 16:55:27 +08:00
IndexFileKeys : nil ,
2022-08-25 15:48:54 +08:00
SerializedSize : 0 ,
} )
if info , ok := infos [ buildID ] ; ok {
ret . IndexInfos [ i ] . State = info . state
2022-10-19 16:55:27 +08:00
ret . IndexInfos [ i ] . IndexFileKeys = info . fileKeys
2022-08-25 15:48:54 +08:00
ret . IndexInfos [ i ] . SerializedSize = info . serializedSize
2022-09-06 17:19:11 +08:00
ret . IndexInfos [ i ] . FailReason = info . failReason
2022-11-07 14:23:02 +08:00
log . RatedDebug ( 5 , "querying index build task" , zap . String ( "ClusterID" , req . ClusterID ) ,
2022-09-24 13:40:52 +08:00
zap . Int64 ( "IndexBuildID" , buildID ) , zap . String ( "state" , info . state . String ( ) ) ,
zap . String ( "fail reason" , info . failReason ) )
2022-08-25 15:48:54 +08:00
}
}
return ret , nil
}
func ( i * IndexNode ) DropJobs ( ctx context . Context , req * indexpb . DropJobsRequest ) ( * commonpb . Status , error ) {
2022-11-07 14:23:02 +08:00
log . Ctx ( ctx ) . Info ( "drop index build jobs" , zap . String ( "ClusterID" , req . ClusterID ) , zap . Int64s ( "IndexBuildIDs" , req . BuildIDs ) )
2022-10-10 15:55:22 +08:00
stateCode := i . stateCode . Load ( ) . ( commonpb . StateCode )
if stateCode != commonpb . StateCode_Healthy {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "index node not ready" , zap . Int32 ( "state" , int32 ( stateCode ) ) , zap . String ( "ClusterID" , req . ClusterID ) )
2022-08-25 15:48:54 +08:00
return & commonpb . Status {
ErrorCode : commonpb . ErrorCode_UnexpectedError ,
Reason : "state code is not healthy" ,
} , nil
}
keys := make ( [ ] taskKey , 0 , len ( req . BuildIDs ) )
for _ , buildID := range req . BuildIDs {
keys = append ( keys , taskKey { ClusterID : req . ClusterID , BuildID : buildID } )
}
infos := i . deleteTaskInfos ( keys )
for _ , info := range infos {
if info . cancel != nil {
info . cancel ( )
}
}
2022-11-07 14:23:02 +08:00
log . Ctx ( ctx ) . Info ( "drop index build jobs success" , zap . String ( "ClusterID" , req . ClusterID ) ,
2022-09-24 13:40:52 +08:00
zap . Int64s ( "IndexBuildIDs" , req . BuildIDs ) )
2022-08-25 15:48:54 +08:00
return & commonpb . Status {
ErrorCode : commonpb . ErrorCode_Success ,
Reason : "" ,
} , nil
}
func ( i * IndexNode ) GetJobStats ( ctx context . Context , req * indexpb . GetJobStatsRequest ) ( * indexpb . GetJobStatsResponse , error ) {
2022-10-10 15:55:22 +08:00
stateCode := i . stateCode . Load ( ) . ( commonpb . StateCode )
if stateCode != commonpb . StateCode_Healthy {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "index node not ready" , zap . Int32 ( "state" , int32 ( stateCode ) ) )
2022-08-25 15:48:54 +08:00
return & indexpb . GetJobStatsResponse {
Status : & commonpb . Status {
ErrorCode : commonpb . ErrorCode_UnexpectedError ,
Reason : "state code is not healthy" ,
} ,
} , nil
}
2022-09-23 09:40:52 +08:00
unissued , active := i . sched . IndexBuildQueue . GetTaskNum ( )
2022-08-25 15:48:54 +08:00
jobInfos := make ( [ ] * indexpb . JobInfo , 0 )
2022-09-06 17:19:11 +08:00
i . foreachTaskInfo ( func ( ClusterID string , buildID UniqueID , info * taskInfo ) {
2022-08-25 15:48:54 +08:00
if info . statistic != nil {
jobInfos = append ( jobInfos , proto . Clone ( info . statistic ) . ( * indexpb . JobInfo ) )
}
} )
slots := 0
2022-09-23 09:40:52 +08:00
if i . sched . buildParallel > unissued + active {
slots = i . sched . buildParallel - unissued - active
2022-08-25 15:48:54 +08:00
}
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Info ( "Get Index Job Stats" , zap . Int ( "Unissued" , unissued ) , zap . Int ( "Active" , active ) , zap . Int ( "Slot" , slots ) )
2022-08-25 15:48:54 +08:00
return & indexpb . GetJobStatsResponse {
Status : & commonpb . Status {
ErrorCode : commonpb . ErrorCode_Success ,
Reason : "" ,
} ,
2022-09-23 09:40:52 +08:00
TotalJobNum : int64 ( active ) + int64 ( unissued ) ,
InProgressJobNum : int64 ( active ) ,
EnqueueJobNum : int64 ( unissued ) ,
2022-08-25 15:48:54 +08:00
TaskSlots : int64 ( slots ) ,
JobInfos : jobInfos ,
2022-12-07 18:01:19 +08:00
EnableDisk : Params . IndexNodeCfg . EnableDisk . GetAsBool ( ) ,
2022-08-25 15:48:54 +08:00
} , nil
}
// GetMetrics gets the metrics info of IndexNode.
// TODO(dragondriver): cache the Metrics and set a retention to the cache
func ( i * IndexNode ) GetMetrics ( ctx context . Context , req * milvuspb . GetMetricsRequest ) ( * milvuspb . GetMetricsResponse , error ) {
if ! i . isHealthy ( ) {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "IndexNode.GetMetrics failed" ,
2022-11-04 14:25:38 +08:00
zap . Int64 ( "node_id" , paramtable . GetNodeID ( ) ) ,
2022-08-25 15:48:54 +08:00
zap . String ( "req" , req . Request ) ,
2022-11-04 14:25:38 +08:00
zap . Error ( errIndexNodeIsUnhealthy ( paramtable . GetNodeID ( ) ) ) )
2022-08-25 15:48:54 +08:00
return & milvuspb . GetMetricsResponse {
Status : & commonpb . Status {
ErrorCode : commonpb . ErrorCode_UnexpectedError ,
2022-11-04 14:25:38 +08:00
Reason : msgIndexNodeIsUnhealthy ( paramtable . GetNodeID ( ) ) ,
2022-08-25 15:48:54 +08:00
} ,
Response : "" ,
} , nil
}
metricType , err := metricsinfo . ParseMetricType ( req . Request )
if err != nil {
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "IndexNode.GetMetrics failed to parse metric type" ,
2022-11-04 14:25:38 +08:00
zap . Int64 ( "node_id" , paramtable . GetNodeID ( ) ) ,
2022-08-25 15:48:54 +08:00
zap . String ( "req" , req . Request ) ,
zap . Error ( err ) )
return & milvuspb . GetMetricsResponse {
Status : & commonpb . Status {
ErrorCode : commonpb . ErrorCode_UnexpectedError ,
Reason : err . Error ( ) ,
} ,
Response : "" ,
} , nil
}
if metricType == metricsinfo . SystemInfoMetrics {
metrics , err := getSystemInfoMetrics ( ctx , req , i )
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Debug ( "IndexNode.GetMetrics" ,
2022-11-04 14:25:38 +08:00
zap . Int64 ( "node_id" , paramtable . GetNodeID ( ) ) ,
2022-08-25 15:48:54 +08:00
zap . String ( "req" , req . Request ) ,
zap . String ( "metric_type" , metricType ) ,
zap . Error ( err ) )
return metrics , nil
}
2022-09-24 13:40:52 +08:00
log . Ctx ( ctx ) . Warn ( "IndexNode.GetMetrics failed, request metric type is not implemented yet" ,
2022-11-04 14:25:38 +08:00
zap . Int64 ( "node_id" , paramtable . GetNodeID ( ) ) ,
2022-08-25 15:48:54 +08:00
zap . String ( "req" , req . Request ) ,
zap . String ( "metric_type" , metricType ) )
return & milvuspb . GetMetricsResponse {
Status : & commonpb . Status {
ErrorCode : commonpb . ErrorCode_UnexpectedError ,
Reason : metricsinfo . MsgUnimplementedMetric ,
} ,
Response : "" ,
} , nil
}