// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package querycoord

import (
	"context"
	"errors"
	"fmt"
	"sort"
	"sync"
	"time"

	"github.com/golang/protobuf/proto"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"

	"github.com/milvus-io/milvus/internal/log"
	"github.com/milvus-io/milvus/internal/metrics"
	"github.com/milvus-io/milvus/internal/proto/commonpb"
	"github.com/milvus-io/milvus/internal/proto/datapb"
	"github.com/milvus-io/milvus/internal/proto/milvuspb"
	"github.com/milvus-io/milvus/internal/proto/querypb"
	"github.com/milvus-io/milvus/internal/util/funcutil"
	"github.com/milvus-io/milvus/internal/util/timerecord"
	"github.com/milvus-io/milvus/internal/util/typeutil"
)

const timeoutForRPC = 10 * time.Second

const (
	triggerTaskPrefix     = "queryCoord-triggerTask"
	activeTaskPrefix      = "queryCoord-activeTask"
	taskInfoPrefix        = "queryCoord-taskInfo"
	loadBalanceInfoPrefix = "queryCoord-loadBalanceInfo"
)

const (
	// MaxRetryNum is the maximum number of times that each task can be retried
	MaxRetryNum = 5
	// MaxSendSizeToEtcd is the default limit size of etcd messages that can be sent and received
	// MaxSendSizeToEtcd = 2097152
	// Limit size of every loadSegmentReq to 200k
	MaxSendSizeToEtcd = 200000
)

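// taskState describes the lifecycle state of a task in the scheduler.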
type taskState int

const (
	taskUndo    taskState = 0
	taskDoing   taskState = 1
	taskDone    taskState = 3
	taskExpired taskState = 4
	taskFailed  taskState = 5
)

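// task is the interface implemented by every QueryCoord task. It covers the
// execution hooks (preExecute/execute/postExecute), retry and rollback,
// parent/child task management, and result reporting.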
type task interface {
	traceCtx() context.Context
	getTaskID() UniqueID // return ReqID
	setTaskID(id UniqueID)
	msgBase() *commonpb.MsgBase
	msgType() commonpb.MsgType
	timestamp() Timestamp
	getTriggerCondition() querypb.TriggerCondition
	setTriggerCondition(trigger querypb.TriggerCondition)
	preExecute(ctx context.Context) error
	execute(ctx context.Context) error
	postExecute(ctx context.Context) error
	globalPostExecute(ctx context.Context) error // executes after all child tasks have completed
	reschedule(ctx context.Context) ([]task, error)
	rollBack(ctx context.Context) []task
	waitToFinish() error
	notify(err error)
	taskPriority() querypb.TriggerCondition
	setParentTask(t task)
	getParentTask() task
	getChildTask() []task
	addChildTask(t task)
	removeChildTaskByID(taskID UniqueID)
	isValid() bool
	marshal() ([]byte, error)
	getState() taskState
	setState(state taskState)
	isRetryable() bool
	setResultInfo(err error)
	getResultInfo() *commonpb.Status
	updateTaskProcess()
	elapseSpan() time.Duration
	finishContext()
}

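// baseTask provides the common fields and default implementations shared by
// all QueryCoord tasks.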
type baseTask struct {
	condition
	ctx        context.Context
	cancel     context.CancelFunc
	result     *commonpb.Status
	resultMu   sync.RWMutex
	state      taskState
	stateMu    sync.RWMutex
	retryCount int
	retryMu    sync.RWMutex
	//sync.RWMutex

	taskID           UniqueID
	triggerCondition querypb.TriggerCondition
	triggerMu        sync.RWMutex
	parentTask       task
	childTasks       []task
	childTasksMu     sync.RWMutex

	timeRecorder *timerecord.TimeRecorder
}

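// newBaseTask creates a baseTask with a child context and cancel function
// derived from ctx, an initial state of taskUndo, and the default retry count.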
func newBaseTask(ctx context.Context, triggerType querypb.TriggerCondition) *baseTask {
	childCtx, cancel := context.WithCancel(ctx)
	condition := newTaskCondition(childCtx)

	baseTask := &baseTask{
		ctx:              childCtx,
		cancel:           cancel,
		condition:        condition,
		state:            taskUndo,
		retryCount:       MaxRetryNum,
		triggerCondition: triggerType,
		childTasks:       []task{},
		timeRecorder:     timerecord.NewTimeRecorder("QueryCoordBaseTask"),
	}
	return baseTask
}

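// newBaseTaskWithRetry creates a baseTask and overrides its retry count.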
func newBaseTaskWithRetry(ctx context.Context, triggerType querypb.TriggerCondition, retryCount int) *baseTask {
	baseTask := newBaseTask(ctx, triggerType)
	baseTask.retryCount = retryCount
	return baseTask
}

// getTaskID function returns the unique taskID of the trigger task
func (bt *baseTask) getTaskID() UniqueID {
	return bt.taskID
}

// setTaskID function sets the trigger task with a unique id, which is allocated by tso
func (bt *baseTask) setTaskID(id UniqueID) {
	bt.taskID = id
}

func (bt *baseTask) traceCtx() context.Context {
	return bt.ctx
}

func (bt *baseTask) getTriggerCondition() querypb.TriggerCondition {
	bt.triggerMu.RLock()
	defer bt.triggerMu.RUnlock()

	return bt.triggerCondition
}

func (bt *baseTask) setTriggerCondition(trigger querypb.TriggerCondition) {
	bt.triggerMu.Lock()
	defer bt.triggerMu.Unlock()

	bt.triggerCondition = trigger
}

func (bt *baseTask) taskPriority() querypb.TriggerCondition {
	return bt.triggerCondition
}

func (bt *baseTask) setParentTask(t task) {
	bt.parentTask = t
}

func (bt *baseTask) getParentTask() task {
	return bt.parentTask
}

// getChildTask function returns all the child tasks of the trigger task
// A child task may be a loadSegmentTask or a watchDmChannelTask
func (bt *baseTask) getChildTask() []task {
	bt.childTasksMu.RLock()
	defer bt.childTasksMu.RUnlock()

	return bt.childTasks
}

func (bt *baseTask) addChildTask(t task) {
	bt.childTasksMu.Lock()
	defer bt.childTasksMu.Unlock()

	bt.childTasks = append(bt.childTasks, t)
}

func (bt *baseTask) removeChildTaskByID(taskID UniqueID) {
	bt.childTasksMu.Lock()
	defer bt.childTasksMu.Unlock()

	result := make([]task, 0)
	for _, t := range bt.childTasks {
		if t.getTaskID() != taskID {
			result = append(result, t)
		}
	}
	bt.childTasks = result
	metrics.QueryCoordNumChildTasks.WithLabelValues().Dec()
}

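// clearChildTasks removes all child tasks from the trigger task.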
func (bt *baseTask) clearChildTasks() {
	bt.childTasksMu.Lock()
	defer bt.childTasksMu.Unlock()

	bt.childTasks = []task{}
}

func (bt *baseTask) isValid() bool {
	return true
}

func (bt *baseTask) reschedule(ctx context.Context) ([]task, error) {
	return nil, nil
}

// getState returns the state of the task, such as taskUndo, taskDoing, taskDone, taskExpired, taskFailed
func (bt *baseTask) getState() taskState {
	bt.stateMu.RLock()
	defer bt.stateMu.RUnlock()

	return bt.state
}

func (bt *baseTask) setState(state taskState) {
	bt.stateMu.Lock()
	defer bt.stateMu.Unlock()

	bt.state = state
}

func (bt *baseTask) isRetryable() bool {
	bt.retryMu.RLock()
	defer bt.retryMu.RUnlock()

	return bt.retryCount > 0
}

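// reduceRetryCount decreases the remaining retry count of the task by one.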
func (bt *baseTask) reduceRetryCount() {
	bt.retryMu.Lock()
	defer bt.retryMu.Unlock()

	bt.retryCount--
}

func (bt *baseTask) setResultInfo(err error) {
	bt.resultMu.Lock()
	defer bt.resultMu.Unlock()

	if bt.result == nil {
		bt.result = &commonpb.Status{}
	}
	if err == nil {
		bt.result.ErrorCode = commonpb.ErrorCode_Success
		bt.result.Reason = ""
		return
	}

	bt.result.ErrorCode = commonpb.ErrorCode_UnexpectedError
	bt.result.Reason = bt.result.Reason + ", " + err.Error()
}

func (bt *baseTask) getResultInfo() *commonpb.Status {
	bt.resultMu.RLock()
	defer bt.resultMu.RUnlock()

	return proto.Clone(bt.result).(*commonpb.Status)
}

func (bt *baseTask) globalPostExecute(ctx context.Context) error {
	return nil
}

func (bt *baseTask) updateTaskProcess() {
	// TODO::
}

func (bt *baseTask) rollBack(ctx context.Context) []task {
	//TODO::
	return nil
}

func (bt *baseTask) elapseSpan() time.Duration {
	return bt.timeRecorder.ElapseSpan()
}

// finishContext calls the cancel function for the trace ctx.
func (bt *baseTask) finishContext() {
	if bt.cancel != nil {
		bt.cancel()
	}
}

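// loadCollectionTask will load all the data of this collection to the query nodes.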
type loadCollectionTask struct {
	*baseTask
	*querypb.LoadCollectionRequest
	broker  *globalMetaBroker
	cluster Cluster
	meta    Meta
	once    sync.Once
}

func (lct *loadCollectionTask) msgBase() *commonpb.MsgBase {
	return lct.Base
}

func (lct *loadCollectionTask) marshal() ([]byte, error) {
	return proto.Marshal(lct.LoadCollectionRequest)
}

func (lct *loadCollectionTask) msgType() commonpb.MsgType {
	return lct.Base.MsgType
}

func (lct *loadCollectionTask) timestamp() Timestamp {
	return lct.Base.Timestamp
}

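// updateTaskProcess checks whether every child task has finished and the delta
// channels are watched for the loaded segments; once all are done, it syncs the
// replica segments to the shard leaders and sets the collection load percentage to 100.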
func (lct *loadCollectionTask) updateTaskProcess() {
	collectionID := lct.CollectionID
	childTasks := lct.getChildTask()
	allDone := true
	for _, t := range childTasks {
		if t.getState() != taskDone {
			allDone = false
			break
		}

		// wait for the watchDeltaChannel task to be done after loading segments
		nodeID := getDstNodeIDByTask(t)
		if t.msgType() == commonpb.MsgType_LoadSegments {
			if !lct.cluster.HasWatchedDeltaChannel(lct.ctx, nodeID, collectionID) {
				allDone = false
				break
			}
		}
	}
	if allDone {
		err := syncReplicaSegments(lct.ctx, lct.cluster, childTasks)
		if err != nil {
			log.Error("loadCollectionTask: failed to sync replica segments to shard leader",
				zap.Int64("taskID", lct.getTaskID()),
				zap.Int64("collectionID", collectionID),
				zap.Error(err))
			lct.setResultInfo(err)
			return
		}

		err = lct.meta.setLoadPercentage(collectionID, 0, 100, querypb.LoadType_LoadCollection)
		if err != nil {
			log.Error("loadCollectionTask: set load percentage to meta's collectionInfo", zap.Int64("collectionID", collectionID))
			lct.setResultInfo(err)
			return
		}

		lct.once.Do(func() {
			metrics.QueryCoordLoadCount.WithLabelValues(metrics.SuccessLabel).Inc()
			metrics.QueryCoordLoadLatency.WithLabelValues().Observe(float64(lct.elapseSpan().Milliseconds()))
			metrics.QueryCoordNumChildTasks.WithLabelValues().Sub(float64(len(lct.getChildTask())))
		})
	}
}

func (lct *loadCollectionTask) preExecute(ctx context.Context) error {
	if lct.ReplicaNumber < 1 {
		log.Warn("replicaNumber is less than 1 for load collection request, will set it to 1",
			zap.Int32("replicaNumber", lct.ReplicaNumber))
		lct.ReplicaNumber = 1
	}

	collectionID := lct.CollectionID
	schema := lct.Schema
	lct.setResultInfo(nil)
	log.Info("start do loadCollectionTask",
		zap.Int64("msgID", lct.getTaskID()),
		zap.Int64("collectionID", collectionID),
		zap.Stringer("schema", schema),
		zap.Int32("replicaNumber", lct.ReplicaNumber))
	return nil
}

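// execute fetches the recovery info of every partition of the collection,
// builds loadSegment and watchDmChannel requests per replica, assigns them to
// query nodes as child tasks, and registers the collection, partitions and
// replicas in meta.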
func (lct *loadCollectionTask) execute(ctx context.Context) error {
	defer lct.reduceRetryCount()
	collectionID := lct.CollectionID

	partitionIds, err := lct.broker.showPartitionIDs(ctx, collectionID)
	if err != nil {
		log.Error("loadCollectionTask: showPartition failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
		lct.setResultInfo(err)
		return err
	}
	log.Info("loadCollectionTask: get collection's all partitionIDs", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", partitionIds), zap.Int64("msgID", lct.Base.MsgID))

	var (
		replicas          = make([]*milvuspb.ReplicaInfo, lct.ReplicaNumber)
		replicaIds        = make([]int64, lct.ReplicaNumber)
		segmentLoadInfos  = make([]*querypb.SegmentLoadInfo, 0)
		deltaChannelInfos = make([]*datapb.VchannelInfo, 0)
		dmChannelInfos    = make([]*datapb.VchannelInfo, 0)
		collectionSize    uint64
	)

	for _, partitionID := range partitionIds {
		vChannelInfos, binlogs, err := lct.broker.getRecoveryInfo(lct.ctx, collectionID, partitionID)
		if err != nil {
			log.Error("loadCollectionTask: getRecoveryInfo failed", zap.Int64("collectionID", collectionID), zap.Int64("partitionID", partitionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
			lct.setResultInfo(err)
			return err
		}

		for _, segmentBinlog := range binlogs {
			segmentLoadInfo := lct.broker.generateSegmentLoadInfo(ctx, collectionID, partitionID, segmentBinlog, true, lct.Schema)
			collectionSize += uint64(segmentLoadInfo.SegmentSize)
			segmentLoadInfos = append(segmentLoadInfos, segmentLoadInfo)
		}

		for _, info := range vChannelInfos {
			deltaChannelInfo, err := generateWatchDeltaChannelInfo(info)
			if err != nil {
				log.Error("loadCollectionTask: generateWatchDeltaChannelInfo failed", zap.Int64("collectionID", collectionID), zap.String("channelName", info.ChannelName), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
				lct.setResultInfo(err)
				return err
			}
			deltaChannelInfos = append(deltaChannelInfos, deltaChannelInfo)
			dmChannelInfos = append(dmChannelInfos, info)
		}
	}

	mergedDeltaChannels := mergeWatchDeltaChannelInfo(deltaChannelInfos)
	// If meta is not updated here, deltaChannel meta will not be available when loadSegment reschedules
	err = lct.meta.setDeltaChannel(collectionID, mergedDeltaChannels)
	if err != nil {
		log.Error("loadCollectionTask: set delta channel info failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
		lct.setResultInfo(err)
		return err
	}

	mergedDmChannel := mergeDmChannelInfo(dmChannelInfos)

	for i := range replicas {
		replica, err := lct.meta.generateReplica(lct.CollectionID, partitionIds)
		if err != nil {
			lct.setResultInfo(err)
			return err
		}

		replicas[i] = replica
		replicaIds[i] = replica.ReplicaID
	}

	err = lct.cluster.AssignNodesToReplicas(ctx, replicas, collectionSize)
	if err != nil {
		log.Error("failed to assign nodes to replicas",
			zap.Int64("collectionID", collectionID),
			zap.Int64s("partitionIDs", partitionIds),
			zap.Int64("msgID", lct.Base.MsgID),
			zap.Int32("replicaNumber", lct.ReplicaNumber),
			zap.Error(err))
		lct.setResultInfo(err)
		return err
	}

	for _, replica := range replicas {
		var (
			loadSegmentReqs    = []*querypb.LoadSegmentsRequest{}
			watchDmChannelReqs = []*querypb.WatchDmChannelsRequest{}
		)

		for _, segmentLoadInfo := range segmentLoadInfos {
			msgBase := proto.Clone(lct.Base).(*commonpb.MsgBase)
			msgBase.MsgType = commonpb.MsgType_LoadSegments
			loadSegmentReq := &querypb.LoadSegmentsRequest{
				Base:         msgBase,
				Infos:        []*querypb.SegmentLoadInfo{segmentLoadInfo},
				Schema:       lct.Schema,
				CollectionID: collectionID,
				LoadMeta: &querypb.LoadMetaInfo{
					LoadType:     querypb.LoadType_LoadCollection,
					CollectionID: collectionID,
					PartitionIDs: partitionIds,
				},
				ReplicaID: replica.ReplicaID,
			}
			loadSegmentReqs = append(loadSegmentReqs, loadSegmentReq)
		}

		//TODO:: queryNode receives dm messages according to the partitionID cache
		//TODO:: queryNode adds partitionID to the cache if it receives a create partition message from dmChannel
		for _, info := range mergedDmChannel {
			msgBase := proto.Clone(lct.Base).(*commonpb.MsgBase)
			msgBase.MsgType = commonpb.MsgType_WatchDmChannels

			segmentInfos, err := generateVChannelSegmentInfos(lct.meta, info)
			if err != nil {
				lct.setResultInfo(err)
				return err
			}

			watchRequest := &querypb.WatchDmChannelsRequest{
				Base:         msgBase,
				CollectionID: collectionID,
				//PartitionIDs: toLoadPartitionIDs,
				Infos:        []*datapb.VchannelInfo{info},
				Schema:       lct.Schema,
				SegmentInfos: segmentInfos,
				LoadMeta: &querypb.LoadMetaInfo{
					LoadType:     querypb.LoadType_LoadCollection,
					CollectionID: collectionID,
					PartitionIDs: partitionIds,
				},
				ReplicaID: replica.GetReplicaID(),
			}
			watchDmChannelReqs = append(watchDmChannelReqs, watchRequest)
		}

		internalTasks, err := assignInternalTask(ctx, lct, lct.meta, lct.cluster, loadSegmentReqs, watchDmChannelReqs, false, nil, replica.GetNodeIds(), -1, lct.broker)
		if err != nil {
			log.Error("loadCollectionTask: assign child task failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
			lct.setResultInfo(err)
			return err
		}
		for _, internalTask := range internalTasks {
			lct.addChildTask(internalTask)
			if task, ok := internalTask.(*watchDmChannelTask); ok {
				nodeInfo, err := lct.cluster.GetNodeInfoByID(task.NodeID)
				if err != nil {
					log.Error("loadCollectionTask: get shard leader node info failed",
						zap.Int64("collectionID", collectionID),
						zap.Int64("msgID", lct.Base.MsgID),
						zap.Int64("nodeID", task.NodeID),
						zap.Error(err))
					lct.setResultInfo(err)
					return err
				}
				replica.ShardReplicas = append(replica.ShardReplicas, &milvuspb.ShardReplica{
					LeaderID:      task.NodeID,
					LeaderAddr:    nodeInfo.(*queryNode).address,
					DmChannelName: task.WatchDmChannelsRequest.Infos[0].ChannelName,
				})
			}
			log.Info("loadCollectionTask: add a childTask", zap.Int64("collectionID", collectionID), zap.String("task type", internalTask.msgType().String()), zap.Int64("msgID", lct.Base.MsgID))
		}
		metrics.QueryCoordNumChildTasks.WithLabelValues().Add(float64(len(internalTasks)))
		log.Info("loadCollectionTask: assign child task done", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID))
	}

	err = lct.meta.addCollection(collectionID, querypb.LoadType_LoadCollection, lct.Schema)
	if err != nil {
		log.Error("loadCollectionTask: add collection to meta failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
		lct.setResultInfo(err)
		return err
	}

	err = lct.meta.addPartitions(collectionID, partitionIds)
	if err != nil {
		log.Error("loadCollectionTask: add partitions to meta failed", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", partitionIds), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
		lct.setResultInfo(err)
		return err
	}

	for _, replica := range replicas {
		err = lct.meta.addReplica(replica)
		if err != nil {
			log.Error("failed to add replica", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", partitionIds), zap.Int64("msgID", lct.Base.MsgID), zap.Int32("replicaNumber", lct.ReplicaNumber))
			lct.setResultInfo(err)
			return err
		}
	}

	log.Info("LoadCollection execute done",
		zap.Int64("msgID", lct.getTaskID()),
		zap.Int64("collectionID", collectionID))
	return nil
}

func (lct *loadCollectionTask) postExecute(ctx context.Context) error {
	collectionID := lct.CollectionID
	if lct.getResultInfo().ErrorCode != commonpb.ErrorCode_Success {
		lct.clearChildTasks()
		err := lct.meta.releaseCollection(collectionID)
		if err != nil {
			log.Error("loadCollectionTask: occur error when release collection info from meta", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
			panic(err)
		}
	}

	log.Info("loadCollectionTask postExecute done",
		zap.Int64("msgID", lct.getTaskID()),
		zap.Int64("collectionID", collectionID))
	return nil
}

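// rollBack generates a releaseCollectionTask for every online query node to
// undo a failed load, and removes the collection info from meta.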
func (lct *loadCollectionTask) rollBack(ctx context.Context) []task {
	onlineNodeIDs := lct.cluster.OnlineNodeIDs()
	resultTasks := make([]task, 0)
	for _, nodeID := range onlineNodeIDs {
		//brute force rollBack, should optimize
		msgBase := proto.Clone(lct.Base).(*commonpb.MsgBase)
		msgBase.MsgType = commonpb.MsgType_ReleaseCollection
		req := &querypb.ReleaseCollectionRequest{
			Base:         msgBase,
			DbID:         lct.DbID,
			CollectionID: lct.CollectionID,
			NodeID:       nodeID,
		}
		baseTask := newBaseTask(ctx, querypb.TriggerCondition_GrpcRequest)
		baseTask.setParentTask(lct)
		releaseCollectionTask := &releaseCollectionTask{
			baseTask:                 baseTask,
			ReleaseCollectionRequest: req,
			cluster:                  lct.cluster,
		}
		resultTasks = append(resultTasks, releaseCollectionTask)
	}

	err := lct.meta.releaseCollection(lct.CollectionID)
	if err != nil {
		log.Error("releaseCollectionTask: release collectionInfo from meta failed", zap.Int64("collectionID", lct.CollectionID), zap.Int64("msgID", lct.Base.MsgID), zap.Error(err))
		panic(err)
	}
	log.Info("loadCollectionTask: generate rollBack task for loadCollectionTask", zap.Int64("collectionID", lct.CollectionID), zap.Int64("msgID", lct.Base.MsgID))
	return resultTasks
}

// releaseCollectionTask will release all the data of this collection on query nodes
type releaseCollectionTask struct {
	*baseTask
	*querypb.ReleaseCollectionRequest
	cluster Cluster
	meta    Meta
	broker  *globalMetaBroker
}

func (rct *releaseCollectionTask) msgBase() *commonpb.MsgBase {
	return rct.Base
}

func (rct *releaseCollectionTask) marshal() ([]byte, error) {
	return proto.Marshal(rct.ReleaseCollectionRequest)
}

func (rct *releaseCollectionTask) msgType() commonpb.MsgType {
	return rct.Base.MsgType
}

func (rct *releaseCollectionTask) timestamp() Timestamp {
	return rct.Base.Timestamp
}

func (rct *releaseCollectionTask) updateTaskProcess() {
	collectionID := rct.CollectionID
	parentTask := rct.getParentTask()
	if parentTask == nil {
		// all queryNodes have successfully released the data, clean up collectionMeta
		err := rct.meta.releaseCollection(collectionID)
		if err != nil {
			log.Error("releaseCollectionTask: release collectionInfo from meta failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", rct.Base.MsgID), zap.Error(err))
			panic(err)
		}
	}
}

func (rct *releaseCollectionTask) preExecute(context.Context) error {
	collectionID := rct.CollectionID
	rct.setResultInfo(nil)
	log.Info("start do releaseCollectionTask",
		zap.Int64("msgID", rct.getTaskID()),
		zap.Int64("collectionID", collectionID))
	return nil
}

func (rct *releaseCollectionTask) execute(ctx context.Context) error {
	collectionID := rct.CollectionID

	// if nodeID == 0, it means that the release request has not been assigned to the specified query node
	if rct.NodeID <= 0 {
		// invalidate all the collection meta cache with the specified collectionID
		err := rct.broker.invalidateCollectionMetaCache(ctx, collectionID)
		if err != nil {
			log.Error("releaseCollectionTask: release collection end, invalidateCollectionMetaCache occur error", zap.Int64("collectionID", rct.CollectionID), zap.Int64("msgID", rct.Base.MsgID), zap.Error(err))
			rct.setResultInfo(err)
			return err
		}

		// TODO(yah01): broadcast to all nodes? Or only nodes serving the collection
		onlineNodeIDs := rct.cluster.OnlineNodeIDs()
		for _, nodeID := range onlineNodeIDs {
			req := proto.Clone(rct.ReleaseCollectionRequest).(*querypb.ReleaseCollectionRequest)
			req.NodeID = nodeID
			baseTask := newBaseTask(ctx, querypb.TriggerCondition_GrpcRequest)
			baseTask.setParentTask(rct)
			releaseCollectionTask := &releaseCollectionTask{
				baseTask:                 baseTask,
				ReleaseCollectionRequest: req,
				cluster:                  rct.cluster,
			}

			rct.addChildTask(releaseCollectionTask)
			log.Info("releaseCollectionTask: add a releaseCollectionTask to releaseCollectionTask's childTask", zap.Any("task", releaseCollectionTask))
		}
	} else {
		// If the node crashed or went offline, the loaded segments are lost
		defer rct.reduceRetryCount()
		err := rct.cluster.ReleaseCollection(ctx, rct.NodeID, rct.ReleaseCollectionRequest)
		if err != nil {
			log.Warn("releaseCollectionTask: release collection end, node occur error", zap.Int64("collectionID", collectionID), zap.Int64("nodeID", rct.NodeID))
			// after release failed, the task will always redo
			// if the query node happens to be down, the node release is judged to have succeeded
			return err
		}
	}

	log.Info("releaseCollectionTask Execute done",
		zap.Int64("msgID", rct.getTaskID()),
		zap.Int64("collectionID", collectionID),
		zap.Int64("nodeID", rct.NodeID))
	return nil
}

func (rct *releaseCollectionTask) postExecute(context.Context) error {
	collectionID := rct.CollectionID
	if rct.getResultInfo().ErrorCode != commonpb.ErrorCode_Success {
		rct.clearChildTasks()
	}

	log.Info("releaseCollectionTask postExecute done",
		zap.Int64("msgID", rct.getTaskID()),
		zap.Int64("collectionID", collectionID),
		zap.Int64("nodeID", rct.NodeID))
	return nil
}

func (rct *releaseCollectionTask) rollBack(ctx context.Context) []task {
	//TODO::
	//if taskID == 0, recovery meta
	//if taskID != 0, recovery collection on queryNode
	return nil
}

// loadPartitionTask will load all the data of this partition to query nodes
type loadPartitionTask struct {
	*baseTask
	*querypb.LoadPartitionsRequest
	broker  *globalMetaBroker
	cluster Cluster
	meta    Meta
	addCol  bool
	once    sync.Once
}

func (lpt *loadPartitionTask) msgBase() *commonpb.MsgBase {
	return lpt.Base
}

func (lpt *loadPartitionTask) marshal() ([]byte, error) {
	return proto.Marshal(lpt.LoadPartitionsRequest)
}

func (lpt *loadPartitionTask) msgType() commonpb.MsgType {
	return lpt.Base.MsgType
}

func (lpt *loadPartitionTask) timestamp() Timestamp {
	return lpt.Base.Timestamp
}

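// updateTaskProcess checks whether every child task has finished and the delta
// channels are watched for the loaded segments; once all are done, it syncs the
// replica segments to the shard leaders and sets each partition's load percentage to 100.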
func (lpt *loadPartitionTask) updateTaskProcess() {
	collectionID := lpt.CollectionID
	partitionIDs := lpt.PartitionIDs
	childTasks := lpt.getChildTask()
	allDone := true
	for _, t := range childTasks {
		if t.getState() != taskDone {
			allDone = false
		}

		// wait for the watchDeltaChannel task to be done after loading segments
		nodeID := getDstNodeIDByTask(t)
		if t.msgType() == commonpb.MsgType_LoadSegments {
			if !lpt.cluster.HasWatchedDeltaChannel(lpt.ctx, nodeID, collectionID) {
				allDone = false
				break
			}
		}
	}
	if allDone {
		err := syncReplicaSegments(lpt.ctx, lpt.cluster, childTasks)
		if err != nil {
			log.Error("loadPartitionTask: failed to sync replica segments to shard leader",
				zap.Int64("taskID", lpt.getTaskID()),
				zap.Int64("collectionID", collectionID),
				zap.Error(err))
			lpt.setResultInfo(err)
			return
		}

		for _, id := range partitionIDs {
			err := lpt.meta.setLoadPercentage(collectionID, id, 100, querypb.LoadType_LoadPartition)
			if err != nil {
				log.Error("loadPartitionTask: set load percentage to meta's collectionInfo", zap.Int64("collectionID", collectionID), zap.Int64("partitionID", id))
				lpt.setResultInfo(err)
				return
			}
		}

		lpt.once.Do(func() {
			metrics.QueryCoordLoadCount.WithLabelValues(metrics.SuccessLabel).Inc()
			metrics.QueryCoordLoadLatency.WithLabelValues().Observe(float64(lpt.elapseSpan().Milliseconds()))
			metrics.QueryCoordNumChildTasks.WithLabelValues().Sub(float64(len(lpt.getChildTask())))
		})
	}
}

func (lpt *loadPartitionTask) preExecute(context.Context) error {
	if lpt.ReplicaNumber < 1 {
		log.Warn("replicaNumber is less than 1 for load partitions request, will set it to 1",
			zap.Int32("replicaNumber", lpt.ReplicaNumber))
		lpt.ReplicaNumber = 1
	}

	collectionID := lpt.CollectionID
	lpt.setResultInfo(nil)
	log.Info("start do loadPartitionTask",
		zap.Int64("msgID", lpt.getTaskID()),
		zap.Int64("collectionID", collectionID))
	return nil
}

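// execute fetches the recovery info of every requested partition, builds
// loadSegment and watchDmChannel requests per replica, assigns them to query
// nodes as child tasks, and registers the collection, partitions and replicas in meta.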
func (lpt *loadPartitionTask) execute(ctx context.Context) error {
	defer lpt.reduceRetryCount()
	collectionID := lpt.CollectionID
	partitionIDs := lpt.PartitionIDs

	var (
		replicas          = make([]*milvuspb.ReplicaInfo, lpt.ReplicaNumber)
		replicaIds        = make([]int64, lpt.ReplicaNumber)
		segmentLoadInfos  = make([]*querypb.SegmentLoadInfo, 0)
		deltaChannelInfos = make([]*datapb.VchannelInfo, 0)
		dmChannelInfos    = make([]*datapb.VchannelInfo, 0)
		collectionSize    uint64
	)

	for _, partitionID := range partitionIDs {
		vChannelInfos, binlogs, err := lpt.broker.getRecoveryInfo(lpt.ctx, collectionID, partitionID)
		if err != nil {
			log.Error("loadPartitionTask: getRecoveryInfo failed", zap.Int64("collectionID", collectionID), zap.Int64("partitionID", partitionID), zap.Int64("msgID", lpt.Base.MsgID), zap.Error(err))
			lpt.setResultInfo(err)
			return err
		}

		for _, segmentBinlog := range binlogs {
			segmentLoadInfo := lpt.broker.generateSegmentLoadInfo(ctx, collectionID, partitionID, segmentBinlog, true, lpt.Schema)
			segmentLoadInfos = append(segmentLoadInfos, segmentLoadInfo)
			collectionSize += uint64(segmentLoadInfo.SegmentSize)
		}

		for _, info := range vChannelInfos {
			deltaChannelInfo, err := generateWatchDeltaChannelInfo(info)
			if err != nil {
				log.Error("loadPartitionTask: generateWatchDeltaChannelInfo failed", zap.Int64("collectionID", collectionID), zap.String("channelName", info.ChannelName), zap.Int64("msgID", lpt.Base.MsgID), zap.Error(err))
				lpt.setResultInfo(err)
				return err
			}
			deltaChannelInfos = append(deltaChannelInfos, deltaChannelInfo)
			dmChannelInfos = append(dmChannelInfos, info)
		}
	}

	mergedDeltaChannels := mergeWatchDeltaChannelInfo(deltaChannelInfos)
	// If meta is not updated here, deltaChannel meta will not be available when loadSegment reschedules
	err := lpt.meta.setDeltaChannel(collectionID, mergedDeltaChannels)
	if err != nil {
		log.Error("loadPartitionTask: set delta channel info failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lpt.Base.MsgID), zap.Error(err))
		lpt.setResultInfo(err)
		return err
	}

	mergedDmChannel := mergeDmChannelInfo(dmChannelInfos)

	for i := range replicas {
		replica, err := lpt.meta.generateReplica(lpt.CollectionID, partitionIDs)
		if err != nil {
			lpt.setResultInfo(err)
			return err
		}

		replicas[i] = replica
		replicaIds[i] = replica.ReplicaID
	}

	err = lpt.cluster.AssignNodesToReplicas(ctx, replicas, collectionSize)
	if err != nil {
		log.Error("failed to assign nodes to replicas",
			zap.Int64("collectionID", collectionID),
			zap.Int64s("partitionIDs", partitionIDs),
			zap.Int64("msgID", lpt.Base.MsgID),
			zap.Int32("replicaNumber", lpt.ReplicaNumber),
			zap.Error(err))
		lpt.setResultInfo(err)
		return err
	}

	for _, replica := range replicas {
		var (
			loadSegmentReqs    = []*querypb.LoadSegmentsRequest{}
			watchDmChannelReqs = []*querypb.WatchDmChannelsRequest{}
		)

		for _, segmentLoadInfo := range segmentLoadInfos {
			msgBase := proto.Clone(lpt.Base).(*commonpb.MsgBase)
			msgBase.MsgType = commonpb.MsgType_LoadSegments
			loadSegmentReq := &querypb.LoadSegmentsRequest{
				Base:         msgBase,
				Infos:        []*querypb.SegmentLoadInfo{segmentLoadInfo},
				Schema:       lpt.Schema,
				CollectionID: collectionID,
				LoadMeta: &querypb.LoadMetaInfo{
					LoadType:     querypb.LoadType_LoadPartition,
					CollectionID: collectionID,
					PartitionIDs: partitionIDs,
				},
				ReplicaID: replica.ReplicaID,
			}
			loadSegmentReqs = append(loadSegmentReqs, loadSegmentReq)
		}

		for _, info := range mergedDmChannel {
			msgBase := proto.Clone(lpt.Base).(*commonpb.MsgBase)
			msgBase.MsgType = commonpb.MsgType_WatchDmChannels

			segmentInfos, err := generateVChannelSegmentInfos(lpt.meta, info)
			if err != nil {
				lpt.setResultInfo(err)
				return err
			}

			watchRequest := &querypb.WatchDmChannelsRequest{
				Base:         msgBase,
				CollectionID: collectionID,
				PartitionIDs: partitionIDs,
				Infos:        []*datapb.VchannelInfo{info},
				Schema:       lpt.Schema,
				SegmentInfos: segmentInfos,
				LoadMeta: &querypb.LoadMetaInfo{
					LoadType:     querypb.LoadType_LoadPartition,
					CollectionID: collectionID,
					PartitionIDs: partitionIDs,
				},
				ReplicaID: replica.GetReplicaID(),
			}
			watchDmChannelReqs = append(watchDmChannelReqs, watchRequest)
		}

		internalTasks, err := assignInternalTask(ctx, lpt, lpt.meta, lpt.cluster, loadSegmentReqs, watchDmChannelReqs, false, nil, replica.GetNodeIds(), -1, lpt.broker)
		if err != nil {
			log.Error("loadPartitionTask: assign child task failed", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", partitionIDs), zap.Int64("msgID", lpt.Base.MsgID), zap.Error(err))
			lpt.setResultInfo(err)
			return err
		}
		for _, internalTask := range internalTasks {
			lpt.addChildTask(internalTask)
			if task, ok := internalTask.(*watchDmChannelTask); ok {
				nodeInfo, err := lpt.cluster.GetNodeInfoByID(task.NodeID)
				if err != nil {
					log.Error("loadPartitionTask: get shard leader node info failed",
						zap.Int64("collectionID", collectionID),
						zap.Int64("msgID", lpt.Base.MsgID),
						zap.Int64("nodeID", task.NodeID),
						zap.Error(err))
					lpt.setResultInfo(err)
					return err
				}
				replica.ShardReplicas = append(replica.ShardReplicas, &milvuspb.ShardReplica{
					LeaderID:      task.NodeID,
					LeaderAddr:    nodeInfo.(*queryNode).address,
					DmChannelName: task.WatchDmChannelsRequest.Infos[0].ChannelName,
				})
			}
			log.Info("loadPartitionTask: add a childTask", zap.Int64("collectionID", collectionID), zap.String("task type", internalTask.msgType().String()))
		}
		metrics.QueryCoordNumChildTasks.WithLabelValues().Add(float64(len(internalTasks)))
		log.Info("loadPartitionTask: assign child task done", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", partitionIDs), zap.Int64("msgID", lpt.Base.MsgID))
	}

	err = lpt.meta.addCollection(collectionID, querypb.LoadType_LoadPartition, lpt.Schema)
	if err != nil {
		log.Error("loadPartitionTask: add collection to meta failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lpt.Base.MsgID), zap.Error(err))
		lpt.setResultInfo(err)
		return err
	}

	err = lpt.meta.addPartitions(collectionID, partitionIDs)
	if err != nil {
		log.Error("loadPartitionTask: add partition to meta failed", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", partitionIDs), zap.Int64("msgID", lpt.Base.MsgID), zap.Error(err))
		lpt.setResultInfo(err)
		return err
	}

	for _, replica := range replicas {
		err = lpt.meta.addReplica(replica)
		if err != nil {
			log.Error("failed to add replica", zap.Int64("collectionID", collectionID), zap.Int64s("partitionIDs", partitionIDs), zap.Int64("msgID", lpt.Base.MsgID), zap.Int32("replicaNumber", lpt.ReplicaNumber))
			lpt.setResultInfo(err)
			return err
		}
	}

	log.Info("loadPartitionTask Execute done",
		zap.Int64("msgID", lpt.getTaskID()),
		zap.Int64("collectionID", collectionID),
		zap.Int64s("partitionIDs", partitionIDs),
		zap.Int64("msgID", lpt.Base.MsgID))
	return nil
}

func (lpt *loadPartitionTask) postExecute(ctx context.Context) error {
	collectionID := lpt.CollectionID
	partitionIDs := lpt.PartitionIDs
	if lpt.getResultInfo().ErrorCode != commonpb.ErrorCode_Success {
		lpt.clearChildTasks()
		err := lpt.meta.releaseCollection(collectionID)
		if err != nil {
			log.Error("loadPartitionTask: occur error when release collection info from meta", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lpt.Base.MsgID), zap.Error(err))
			panic(err)
		}
	}

	log.Info("loadPartitionTask postExecute done",
		zap.Int64("msgID", lpt.getTaskID()),
		zap.Int64("collectionID", collectionID),
		zap.Int64s("partitionIDs", partitionIDs))
	return nil
}

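// rollBack generates a releaseCollectionTask for every online query node to
// undo a failed partition load, and removes the collection info from meta.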
func (lpt *loadPartitionTask) rollBack(ctx context.Context) []task {
	collectionID := lpt.CollectionID
	resultTasks := make([]task, 0)
	//brute force rollBack, should optimize
	onlineNodeIDs := lpt.cluster.OnlineNodeIDs()
	for _, nodeID := range onlineNodeIDs {
		req := &querypb.ReleaseCollectionRequest{
			Base: &commonpb.MsgBase{
				MsgType:   commonpb.MsgType_ReleaseCollection,
				MsgID:     lpt.Base.MsgID,
				Timestamp: lpt.Base.Timestamp,
				SourceID:  lpt.Base.SourceID,
			},
			DbID:         lpt.DbID,
			CollectionID: collectionID,
			NodeID:       nodeID,
		}
		baseTask := newBaseTask(ctx, querypb.TriggerCondition_GrpcRequest)
		baseTask.setParentTask(lpt)
		releaseCollectionTask := &releaseCollectionTask{
			baseTask:                 baseTask,
			ReleaseCollectionRequest: req,
			cluster:                  lpt.cluster,
		}
		resultTasks = append(resultTasks, releaseCollectionTask)
	}

	err := lpt.meta.releaseCollection(collectionID)
	if err != nil {
		log.Error("loadPartitionTask: release collection info from meta failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lpt.Base.MsgID), zap.Error(err))
		panic(err)
	}
	log.Info("loadPartitionTask: generate rollBack task for loadPartitionTask", zap.Int64("collectionID", collectionID), zap.Int64("msgID", lpt.Base.MsgID))
	return resultTasks
}

// releasePartitionTask will release all the data of this partition on query nodes
type releasePartitionTask struct {
	*baseTask
	*querypb.ReleasePartitionsRequest
	cluster Cluster
	meta    Meta
}

func (rpt *releasePartitionTask) msgBase() *commonpb.MsgBase {
	return rpt.Base
}

func (rpt *releasePartitionTask) marshal() ([]byte, error) {
	return proto.Marshal(rpt.ReleasePartitionsRequest)
}

func (rpt *releasePartitionTask) msgType() commonpb.MsgType {
	return rpt.Base.MsgType
}

func (rpt *releasePartitionTask) timestamp() Timestamp {
	return rpt.Base.Timestamp
}

func (rpt *releasePartitionTask) updateTaskProcess() {
	collectionID := rpt.CollectionID
	partitionIDs := rpt.PartitionIDs
	parentTask := rpt.getParentTask()
	if parentTask == nil {
		// all queryNodes have successfully released the data, clean up collectionMeta
		err := rpt.meta.releasePartitions(collectionID, partitionIDs)
		if err != nil {
			log.Error("releasePartitionTask: release collectionInfo from meta failed", zap.Int64("collectionID", collectionID), zap.Int64("msgID", rpt.Base.MsgID), zap.Error(err))
			panic(err)
		}
	}
}

func (rpt *releasePartitionTask) preExecute(context.Context) error {
	collectionID := rpt.CollectionID
	partitionIDs := rpt.PartitionIDs
	rpt.setResultInfo(nil)
	log.Info("start do releasePartitionTask",
		zap.Int64("msgID", rpt.getTaskID()),
		zap.Int64("collectionID", collectionID),
		zap.Int64s("partitionIDs", partitionIDs))
	return nil
}

func (rpt *releasePartitionTask) execute(ctx context.Context) error {
	collectionID := rpt.CollectionID
	partitionIDs := rpt.PartitionIDs

	// if nodeID == 0, it means that the release request has not been assigned to the specified query node
	if rpt.NodeID <= 0 {
		onlineNodeIDs := rpt.cluster.OnlineNodeIDs()
		for _, nodeID := range onlineNodeIDs {
			req := proto.Clone(rpt.ReleasePartitionsRequest).(*querypb.ReleasePartitionsRequest)
			req.NodeID = nodeID
			baseTask := newBaseTask(ctx, querypb.TriggerCondition_GrpcRequest)
			baseTask.setParentTask(rpt)
			releasePartitionTask := &releasePartitionTask{
				baseTask:                 baseTask,
				ReleasePartitionsRequest: req,
				cluster:                  rpt.cluster,
				meta:                     rpt.meta,
			}

			rpt.addChildTask(releasePartitionTask)
			log.Info("releasePartitionTask: add a releasePartitionTask to releasePartitionTask's childTask", zap.Int64("collectionID", collectionID), zap.Int64("msgID", rpt.Base.MsgID))
		}
	} else {
		// If the node crashed or went offline, the loaded segments are lost
		defer rpt.reduceRetryCount()
		err := rpt.cluster.ReleasePartitions(ctx, rpt.NodeID, rpt.ReleasePartitionsRequest)
		if err != nil {
			log.Warn("ReleasePartitionsTask: release partition end, node occur error", zap.Int64("collectionID", collectionID), zap.String("nodeID", fmt.Sprintln(rpt.NodeID)))
			// after release failed, the task will always redo
			// if the query node happens to be down, the node release is judged to have succeeded
			return err
		}
	}

	log.Info("releasePartitionTask Execute done",
		zap.Int64("msgID", rpt.getTaskID()),
		zap.Int64("collectionID", collectionID),
		zap.Int64s("partitionIDs", partitionIDs),
		zap.Int64("nodeID", rpt.NodeID))
	return nil
}

func (rpt *releasePartitionTask) postExecute(context.Context) error {
	collectionID := rpt.CollectionID
	partitionIDs := rpt.PartitionIDs
	if rpt.getResultInfo().ErrorCode != commonpb.ErrorCode_Success {
		rpt.clearChildTasks()
	}

	log.Info("releasePartitionTask postExecute done",
		zap.Int64("msgID", rpt.getTaskID()),
		zap.Int64("collectionID", collectionID),
		zap.Int64s("partitionIDs", partitionIDs),
		zap.Int64("nodeID", rpt.NodeID))
	return nil
}

func (rpt *releasePartitionTask) rollBack(ctx context.Context) []task {
	//TODO::
	//if taskID == 0, recovery meta
	//if taskID != 0, recovery partition on queryNode
	return nil
}

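// loadSegmentTask loads a set of segments on one target query node. It is
// generated as a child task of a trigger task such as loadCollectionTask or
// loadPartitionTask.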
2021-10-18 21:34:47 +08:00
type loadSegmentTask struct {
* baseTask
2021-06-15 12:41:40 +08:00
* querypb . LoadSegmentsRequest
2021-10-11 09:54:37 +08:00
meta Meta
cluster Cluster
excludeNodeIDs [ ] int64
2022-05-31 16:36:03 +08:00
broker * globalMetaBroker
2021-06-15 12:41:40 +08:00
}
2021-10-18 21:34:47 +08:00
func ( lst * loadSegmentTask ) msgBase ( ) * commonpb . MsgBase {
2021-06-26 16:08:11 +08:00
return lst . Base
}
2021-10-18 21:34:47 +08:00
func ( lst * loadSegmentTask ) marshal ( ) ( [ ] byte , error ) {
2021-08-03 22:03:25 +08:00
return proto . Marshal ( lst . LoadSegmentsRequest )
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
func ( lst * loadSegmentTask ) isValid ( ) bool {
2022-06-02 13:16:05 +08:00
online , err := lst . cluster . IsOnline ( lst . DstNodeID )
2021-06-30 17:48:19 +08:00
if err != nil {
return false
}
2021-10-11 09:54:37 +08:00
return lst . ctx != nil && online
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
func ( lst * loadSegmentTask ) msgType ( ) commonpb . MsgType {
2021-06-15 12:41:40 +08:00
return lst . Base . MsgType
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( lst * loadSegmentTask ) timestamp ( ) Timestamp {
2021-06-15 12:41:40 +08:00
return lst . Base . Timestamp
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( lst * loadSegmentTask ) updateTaskProcess ( ) {
2021-10-14 20:18:33 +08:00
parentTask := lst . getParentTask ( )
2021-10-11 09:54:37 +08:00
if parentTask == nil {
2021-10-18 21:34:47 +08:00
log . Warn ( "loadSegmentTask: parentTask should not be nil" )
2021-10-11 09:54:37 +08:00
return
}
2021-10-14 20:18:33 +08:00
parentTask . updateTaskProcess ( )
2021-10-11 09:54:37 +08:00
}
2022-05-31 16:36:03 +08:00
func ( lst * loadSegmentTask ) preExecute ( ctx context . Context ) error {
2021-06-15 12:41:40 +08:00
segmentIDs := make ( [ ] UniqueID , 0 )
for _ , info := range lst . Infos {
segmentIDs = append ( segmentIDs , info . SegmentID )
}
2021-10-14 20:18:33 +08:00
lst . setResultInfo ( nil )
2022-04-26 11:29:54 +08:00
log . Info ( "start do loadSegmentTask" ,
2021-06-19 11:45:09 +08:00
zap . Int64s ( "segmentIDs" , segmentIDs ) ,
2021-10-22 19:07:15 +08:00
zap . Int64 ( "loaded nodeID" , lst . DstNodeID ) ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , lst . getTaskID ( ) ) )
2022-05-31 16:36:03 +08:00
2022-06-15 21:38:10 +08:00
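// Acquire a reference lock on the segments before loading them; presumably this keeps the
// underlying binlog files from being garbage-collected while the load is in flight.
// The matching releaseSegmentReferLock call is made in postExecute.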
if err := lst . broker . acquireSegmentsReferLock ( ctx , lst . taskID , segmentIDs ) ; err != nil {
2022-05-31 16:36:03 +08:00
log . Error ( "acquire reference lock on segments failed" , zap . Int64s ( "segmentIDs" , segmentIDs ) ,
zap . Error ( err ) )
return err
}
2021-06-30 16:18:13 +08:00
return nil
2021-06-15 12:41:40 +08:00
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( lst * loadSegmentTask ) execute ( ctx context . Context ) error {
2021-12-29 12:15:21 +08:00
defer lst . reduceRetryCount ( )
2021-10-11 09:54:37 +08:00
2022-06-02 13:16:05 +08:00
err := lst . cluster . LoadSegments ( ctx , lst . DstNodeID , lst . LoadSegmentsRequest )
2021-04-15 15:15:46 +08:00
if err != nil {
2021-10-18 21:34:47 +08:00
log . Warn ( "loadSegmentTask: loadSegment occur error" , zap . Int64 ( "taskID" , lst . getTaskID ( ) ) )
2021-10-14 20:18:33 +08:00
lst . setResultInfo ( err )
2021-04-15 15:15:46 +08:00
return err
}
2021-06-15 12:41:40 +08:00
2022-04-26 11:29:54 +08:00
log . Info ( "loadSegmentTask Execute done" ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , lst . getTaskID ( ) ) )
2021-06-15 12:41:40 +08:00
return nil
}
2021-10-11 09:54:37 +08:00
2021-10-18 21:34:47 +08:00
func ( lst * loadSegmentTask ) postExecute ( context . Context ) error {
2022-05-31 16:36:03 +08:00
segmentIDs := make ( [ ] UniqueID , 0 )
for _ , info := range lst . Infos {
segmentIDs = append ( segmentIDs , info . SegmentID )
}
2022-06-15 21:38:10 +08:00
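// Release the reference lock taken in preExecute. The panic below is intentional: if the lock
// cannot be released, queryCoord restarts and the task is expected to be redone from its
// persisted state.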
if err := lst . broker . releaseSegmentReferLock ( lst . ctx , lst . taskID , segmentIDs ) ; err != nil {
2022-05-31 16:36:03 +08:00
panic ( err )
}
2022-04-26 11:29:54 +08:00
log . Info ( "loadSegmentTask postExecute done" ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , lst . getTaskID ( ) ) )
2021-06-30 16:18:13 +08:00
return nil
2021-06-15 12:41:40 +08:00
}
2021-10-18 21:34:47 +08:00
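// reschedule splits the batched LoadSegmentsRequest into one request per segment, adds the
// current destination node to the exclude list, and re-assigns the requests via assignInternalTask.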
func ( lst * loadSegmentTask ) reschedule ( ctx context . Context ) ( [ ] task , error ) {
2021-11-11 12:56:42 +08:00
loadSegmentReqs := make ( [ ] * querypb . LoadSegmentsRequest , 0 )
2021-06-19 11:45:09 +08:00
for _ , info := range lst . Infos {
2021-11-11 12:56:42 +08:00
msgBase := proto . Clone ( lst . Base ) . ( * commonpb . MsgBase )
msgBase . MsgType = commonpb . MsgType_LoadSegments
req := & querypb . LoadSegmentsRequest {
2021-12-15 16:53:12 +08:00
Base : msgBase ,
Infos : [ ] * querypb . SegmentLoadInfo { info } ,
Schema : lst . Schema ,
SourceNodeID : lst . SourceNodeID ,
CollectionID : lst . CollectionID ,
2022-03-16 11:43:21 +08:00
LoadMeta : & querypb . LoadMetaInfo {
LoadType : lst . GetLoadMeta ( ) . GetLoadType ( ) ,
CollectionID : lst . GetCollectionID ( ) ,
PartitionIDs : lst . GetLoadMeta ( ) . GetPartitionIDs ( ) ,
} ,
2022-04-20 16:15:41 +08:00
ReplicaID : lst . ReplicaID ,
2021-11-11 12:56:42 +08:00
}
loadSegmentReqs = append ( loadSegmentReqs , req )
}
if lst . excludeNodeIDs == nil {
lst . excludeNodeIDs = [ ] int64 { }
2021-06-19 11:45:09 +08:00
}
2021-10-22 19:07:15 +08:00
lst . excludeNodeIDs = append ( lst . excludeNodeIDs , lst . DstNodeID )
2021-11-13 08:49:08 +08:00
2021-12-24 10:59:31 +08:00
wait2AssignTaskSuccess := false
if lst . getParentTask ( ) . getTriggerCondition ( ) == querypb . TriggerCondition_NodeDown {
wait2AssignTaskSuccess = true
}
2022-05-31 16:36:03 +08:00
reScheduledTasks , err := assignInternalTask ( ctx , lst . getParentTask ( ) , lst . meta , lst . cluster , loadSegmentReqs , nil , wait2AssignTaskSuccess , lst . excludeNodeIDs , nil , lst . ReplicaID , lst . broker )
2021-10-11 09:54:37 +08:00
if err != nil {
2021-12-21 11:57:39 +08:00
log . Error ( "loadSegment reschedule failed" , zap . Int64s ( "excludeNodes" , lst . excludeNodeIDs ) , zap . Int64 ( "taskID" , lst . getTaskID ( ) ) , zap . Error ( err ) )
2021-10-11 09:54:37 +08:00
return nil , err
}
2021-06-19 11:45:09 +08:00
2021-11-11 12:56:42 +08:00
return reScheduledTasks , nil
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
type releaseSegmentTask struct {
* baseTask
2021-06-15 12:41:40 +08:00
* querypb . ReleaseSegmentsRequest
2022-05-25 15:17:59 +08:00
cluster Cluster
leaderID UniqueID
2021-06-15 12:41:40 +08:00
}
2021-10-18 21:34:47 +08:00
func ( rst * releaseSegmentTask ) msgBase ( ) * commonpb . MsgBase {
2021-06-26 16:08:11 +08:00
return rst . Base
}
2021-10-18 21:34:47 +08:00
func ( rst * releaseSegmentTask ) marshal ( ) ( [ ] byte , error ) {
2021-08-03 22:03:25 +08:00
return proto . Marshal ( rst . ReleaseSegmentsRequest )
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
func ( rst * releaseSegmentTask ) isValid ( ) bool {
2022-06-02 13:16:05 +08:00
online , err := rst . cluster . IsOnline ( rst . NodeID )
2021-06-30 17:48:19 +08:00
if err != nil {
return false
}
2021-10-11 09:54:37 +08:00
return rst . ctx != nil && online
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
func ( rst * releaseSegmentTask ) msgType ( ) commonpb . MsgType {
2021-06-15 12:41:40 +08:00
return rst . Base . MsgType
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( rst * releaseSegmentTask ) timestamp ( ) Timestamp {
2021-06-15 12:41:40 +08:00
return rst . Base . Timestamp
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( rst * releaseSegmentTask ) preExecute ( context . Context ) error {
2021-06-15 12:41:40 +08:00
segmentIDs := rst . SegmentIDs
2021-10-14 20:18:33 +08:00
rst . setResultInfo ( nil )
2022-04-26 11:29:54 +08:00
log . Info ( "start do releaseSegmentTask" ,
2021-06-19 11:45:09 +08:00
zap . Int64s ( "segmentIDs" , segmentIDs ) ,
zap . Int64 ( "loaded nodeID" , rst . NodeID ) ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , rst . getTaskID ( ) ) )
2021-06-30 16:18:13 +08:00
return nil
2021-06-15 12:41:40 +08:00
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( rst * releaseSegmentTask ) execute ( ctx context . Context ) error {
2021-12-29 12:15:21 +08:00
defer rst . reduceRetryCount ( )
2021-10-11 09:54:37 +08:00
2022-06-02 13:16:05 +08:00
err := rst . cluster . ReleaseSegments ( rst . ctx , rst . leaderID , rst . ReleaseSegmentsRequest )
2021-06-15 12:41:40 +08:00
if err != nil {
2021-10-18 21:34:47 +08:00
log . Warn ( "releaseSegmentTask: releaseSegment occur error" , zap . Int64 ( "taskID" , rst . getTaskID ( ) ) )
2021-10-14 20:18:33 +08:00
rst . setResultInfo ( err )
2021-06-15 12:41:40 +08:00
return err
}
2022-04-26 11:29:54 +08:00
log . Info ( "releaseSegmentTask Execute done" ,
2021-06-19 11:45:09 +08:00
zap . Int64s ( "segmentIDs" , rst . SegmentIDs ) ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , rst . getTaskID ( ) ) )
2021-06-15 12:41:40 +08:00
return nil
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( rst * releaseSegmentTask ) postExecute ( context . Context ) error {
2021-06-15 12:41:40 +08:00
segmentIDs := rst . SegmentIDs
2022-04-26 11:29:54 +08:00
log . Info ( "releaseSegmentTask postExecute done" ,
2021-06-19 11:45:09 +08:00
zap . Int64s ( "segmentIDs" , segmentIDs ) ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , rst . getTaskID ( ) ) )
2021-06-30 16:18:13 +08:00
return nil
2021-06-15 12:41:40 +08:00
}
2021-10-18 21:34:47 +08:00
type watchDmChannelTask struct {
* baseTask
2021-06-15 12:41:40 +08:00
* querypb . WatchDmChannelsRequest
2021-10-11 09:54:37 +08:00
meta Meta
cluster Cluster
excludeNodeIDs [ ] int64
2021-06-15 12:41:40 +08:00
}
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) msgBase ( ) * commonpb . MsgBase {
2021-06-26 16:08:11 +08:00
return wdt . Base
}
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) marshal ( ) ( [ ] byte , error ) {
2021-08-03 22:03:25 +08:00
return proto . Marshal ( wdt . WatchDmChannelsRequest )
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) isValid ( ) bool {
2022-06-02 13:16:05 +08:00
online , err := wdt . cluster . IsOnline ( wdt . NodeID )
2021-06-30 17:48:19 +08:00
if err != nil {
return false
}
2021-10-11 09:54:37 +08:00
return wdt . ctx != nil && online
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) msgType ( ) commonpb . MsgType {
2021-06-15 12:41:40 +08:00
return wdt . Base . MsgType
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) timestamp ( ) Timestamp {
2021-06-15 12:41:40 +08:00
return wdt . Base . Timestamp
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) updateTaskProcess ( ) {
2021-10-14 20:18:33 +08:00
parentTask := wdt . getParentTask ( )
2021-10-11 09:54:37 +08:00
if parentTask == nil {
2021-10-18 21:34:47 +08:00
log . Warn ( "watchDmChannelTask: parentTask should not be nil" )
2021-10-11 09:54:37 +08:00
return
}
2021-10-14 20:18:33 +08:00
parentTask . updateTaskProcess ( )
2021-10-11 09:54:37 +08:00
}
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) preExecute ( context . Context ) error {
2021-06-15 12:41:40 +08:00
channelInfos := wdt . Infos
channels := make ( [ ] string , 0 )
for _ , info := range channelInfos {
channels = append ( channels , info . ChannelName )
2021-04-15 15:15:46 +08:00
}
2021-10-14 20:18:33 +08:00
wdt . setResultInfo ( nil )
2022-04-26 11:29:54 +08:00
log . Info ( "start do watchDmChannelTask" ,
2021-06-19 11:45:09 +08:00
zap . Strings ( "dmChannels" , channels ) ,
zap . Int64 ( "loaded nodeID" , wdt . NodeID ) ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) )
2021-06-30 16:18:13 +08:00
return nil
2021-06-15 12:41:40 +08:00
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) execute ( ctx context . Context ) error {
2021-12-29 12:15:21 +08:00
defer wdt . reduceRetryCount ( )
2021-10-11 09:54:37 +08:00
2022-06-02 13:16:05 +08:00
err := wdt . cluster . WatchDmChannels ( wdt . ctx , wdt . NodeID , wdt . WatchDmChannelsRequest )
2021-04-15 15:15:46 +08:00
if err != nil {
2021-10-18 21:34:47 +08:00
log . Warn ( "watchDmChannelTask: watchDmChannel occur error" , zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) )
2021-10-14 20:18:33 +08:00
wdt . setResultInfo ( err )
2021-04-15 15:15:46 +08:00
return err
}
2021-06-15 12:41:40 +08:00
2022-04-26 11:29:54 +08:00
log . Info ( "watchDmChannelsTask Execute done" ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) )
2021-06-15 12:41:40 +08:00
return nil
}
2021-06-19 11:45:09 +08:00
2021-10-18 21:34:47 +08:00
func ( wdt * watchDmChannelTask ) postExecute ( context . Context ) error {
2022-04-26 11:29:54 +08:00
log . Info ( "watchDmChannelTask postExecute done" ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) )
2021-06-30 16:18:13 +08:00
return nil
2021-06-15 12:41:40 +08:00
}
2021-10-18 21:34:47 +08:00
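// reschedule rebuilds one WatchDmChannelsRequest per channel, excludes the node that previously
// watched them, and re-assigns the requests via assignInternalTask.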
func ( wdt * watchDmChannelTask ) reschedule ( ctx context . Context ) ( [ ] task , error ) {
2021-06-19 11:45:09 +08:00
collectionID := wdt . CollectionID
2021-11-11 12:56:42 +08:00
watchDmChannelReqs := make ( [ ] * querypb . WatchDmChannelsRequest , 0 )
2021-06-19 11:45:09 +08:00
for _ , info := range wdt . Infos {
2021-11-11 12:56:42 +08:00
msgBase := proto . Clone ( wdt . Base ) . ( * commonpb . MsgBase )
msgBase . MsgType = commonpb . MsgType_WatchDmChannels
req := & querypb . WatchDmChannelsRequest {
Base : msgBase ,
CollectionID : collectionID ,
2021-12-17 20:12:42 +08:00
PartitionIDs : wdt . PartitionIDs ,
2021-11-11 12:56:42 +08:00
Infos : [ ] * datapb . VchannelInfo { info } ,
Schema : wdt . Schema ,
2022-06-16 12:00:10 +08:00
SegmentInfos : wdt . SegmentInfos ,
2022-03-16 11:43:21 +08:00
LoadMeta : & querypb . LoadMetaInfo {
LoadType : wdt . GetLoadMeta ( ) . GetLoadType ( ) ,
CollectionID : collectionID ,
PartitionIDs : wdt . GetLoadMeta ( ) . GetPartitionIDs ( ) ,
} ,
2022-04-20 16:15:41 +08:00
ReplicaID : wdt . GetReplicaID ( ) ,
2021-11-11 12:56:42 +08:00
}
watchDmChannelReqs = append ( watchDmChannelReqs , req )
2021-06-19 11:45:09 +08:00
}
2021-11-11 12:56:42 +08:00
if wdt . excludeNodeIDs == nil {
wdt . excludeNodeIDs = [ ] int64 { }
}
2021-10-11 09:54:37 +08:00
wdt . excludeNodeIDs = append ( wdt . excludeNodeIDs , wdt . NodeID )
2021-12-24 10:59:31 +08:00
wait2AssignTaskSuccess := false
if wdt . getParentTask ( ) . getTriggerCondition ( ) == querypb . TriggerCondition_NodeDown {
wait2AssignTaskSuccess = true
}
2022-05-31 16:36:03 +08:00
reScheduledTasks , err := assignInternalTask ( ctx , wdt . parentTask , wdt . meta , wdt . cluster , nil , watchDmChannelReqs , wait2AssignTaskSuccess , wdt . excludeNodeIDs , nil , wdt . ReplicaID , nil )
2021-10-11 09:54:37 +08:00
if err != nil {
2021-12-21 11:57:39 +08:00
log . Error ( "watchDmChannel reschedule failed" , zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) , zap . Int64s ( "excludeNodes" , wdt . excludeNodeIDs ) , zap . Error ( err ) )
2021-10-11 09:54:37 +08:00
return nil , err
}
2021-06-19 11:45:09 +08:00
2021-11-11 12:56:42 +08:00
return reScheduledTasks , nil
2021-06-19 11:45:09 +08:00
}
2021-11-05 14:47:19 +08:00
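// watchDeltaChannelTask is an internal task that asks a query node to subscribe to the delta
// channels of a collection, presumably so that delete operations keep being applied to the
// historical segments loaded on that node.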
type watchDeltaChannelTask struct {
* baseTask
* querypb . WatchDeltaChannelsRequest
2022-01-04 19:21:31 +08:00
cluster Cluster
2021-11-05 14:47:19 +08:00
}
func ( wdt * watchDeltaChannelTask ) msgBase ( ) * commonpb . MsgBase {
return wdt . Base
}
func ( wdt * watchDeltaChannelTask ) marshal ( ) ( [ ] byte , error ) {
return proto . Marshal ( wdt . WatchDeltaChannelsRequest )
}
2022-01-17 17:37:37 +08:00
func ( wdt * watchDeltaChannelTask ) isValid ( ) bool {
2022-06-02 13:16:05 +08:00
online , err := wdt . cluster . IsOnline ( wdt . NodeID )
2022-01-17 17:37:37 +08:00
if err != nil {
return false
}
return wdt . ctx != nil && online
}
2021-11-05 14:47:19 +08:00
func ( wdt * watchDeltaChannelTask ) msgType ( ) commonpb . MsgType {
return wdt . Base . MsgType
}
func ( wdt * watchDeltaChannelTask ) timestamp ( ) Timestamp {
return wdt . Base . Timestamp
}
func ( wdt * watchDeltaChannelTask ) updateTaskProcess ( ) {
parentTask := wdt . getParentTask ( )
if parentTask == nil {
log . Warn ( "watchDeltaChannel: parentTask should not be nil" )
return
}
parentTask . updateTaskProcess ( )
}
func ( wdt * watchDeltaChannelTask ) preExecute ( context . Context ) error {
channelInfos := wdt . Infos
channels := make ( [ ] string , 0 )
for _ , info := range channelInfos {
channels = append ( channels , info . ChannelName )
}
wdt . setResultInfo ( nil )
2022-04-26 11:29:54 +08:00
log . Info ( "start do watchDeltaChannelTask" ,
2021-11-05 14:47:19 +08:00
zap . Strings ( "deltaChannels" , channels ) ,
zap . Int64 ( "loaded nodeID" , wdt . NodeID ) ,
zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) )
return nil
}
func ( wdt * watchDeltaChannelTask ) execute ( ctx context . Context ) error {
2021-12-29 12:15:21 +08:00
defer wdt . reduceRetryCount ( )
2021-11-05 14:47:19 +08:00
2022-06-02 13:16:05 +08:00
err := wdt . cluster . WatchDeltaChannels ( wdt . ctx , wdt . NodeID , wdt . WatchDeltaChannelsRequest )
2021-11-05 14:47:19 +08:00
if err != nil {
log . Warn ( "watchDeltaChannelTask: watchDeltaChannel occur error" , zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) , zap . Error ( err ) )
wdt . setResultInfo ( err )
return err
}
2022-04-26 11:29:54 +08:00
log . Info ( "watchDeltaChannelsTask Execute done" ,
2021-11-05 14:47:19 +08:00
zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) )
return nil
}
func ( wdt * watchDeltaChannelTask ) postExecute ( context . Context ) error {
2022-04-26 11:29:54 +08:00
log . Info ( "watchDeltaChannelTask postExecute done" ,
2021-11-05 14:47:19 +08:00
zap . Int64 ( "taskID" , wdt . getTaskID ( ) ) )
return nil
}
2021-10-24 22:39:09 +08:00
//****************************handoff task********************************//
2021-10-18 21:34:47 +08:00
type handoffTask struct {
2021-10-24 22:39:09 +08:00
* baseTask
* querypb . HandoffSegmentsRequest
2022-02-08 21:57:46 +08:00
broker * globalMetaBroker
cluster Cluster
meta Meta
2021-10-24 22:39:09 +08:00
}
func ( ht * handoffTask ) msgBase ( ) * commonpb . MsgBase {
return ht . Base
}
func ( ht * handoffTask ) marshal ( ) ( [ ] byte , error ) {
return proto . Marshal ( ht . HandoffSegmentsRequest )
}
func ( ht * handoffTask ) msgType ( ) commonpb . MsgType {
return ht . Base . MsgType
}
func ( ht * handoffTask ) timestamp ( ) Timestamp {
return ht . Base . Timestamp
}
func ( ht * handoffTask ) preExecute ( context . Context ) error {
ht . setResultInfo ( nil )
segmentIDs := make ( [ ] UniqueID , 0 )
segmentInfos := ht . HandoffSegmentsRequest . SegmentInfos
for _ , info := range segmentInfos {
segmentIDs = append ( segmentIDs , info . SegmentID )
}
2022-04-26 11:29:54 +08:00
log . Info ( "start do handoff segments task" ,
2021-10-24 22:39:09 +08:00
zap . Int64s ( "segmentIDs" , segmentIDs ) )
return nil
}
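// execute loads each handed-off segment into every replica of its collection, after checking
// that the collection/partition is still loaded, that the segments it was compacted from are
// known, and that the segment itself is not already present on the query nodes.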
func ( ht * handoffTask ) execute ( ctx context . Context ) error {
segmentInfos := ht . HandoffSegmentsRequest . SegmentInfos
for _ , segmentInfo := range segmentInfos {
collectionID := segmentInfo . CollectionID
partitionID := segmentInfo . PartitionID
segmentID := segmentInfo . SegmentID
collectionInfo , err := ht . meta . getCollectionInfoByID ( collectionID )
if err != nil {
2022-04-26 11:29:54 +08:00
log . Warn ( "handoffTask: collection has not been loaded into memory" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "segmentID" , segmentID ) )
2021-10-24 22:39:09 +08:00
continue
}
2022-03-14 09:50:01 +08:00
if collectionInfo . LoadType == querypb . LoadType_LoadCollection && ht . meta . hasReleasePartition ( collectionID , partitionID ) {
2022-05-31 16:36:03 +08:00
log . Warn ( "handoffTask: partition has been released" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "partitionID" , partitionID ) )
2021-10-26 13:04:18 +08:00
continue
}
2021-10-24 22:39:09 +08:00
partitionLoaded := false
for _ , id := range collectionInfo . PartitionIDs {
if id == partitionID {
partitionLoaded = true
}
}
2022-03-14 09:50:01 +08:00
if collectionInfo . LoadType != querypb . LoadType_LoadCollection && ! partitionLoaded {
2022-04-26 11:29:54 +08:00
log . Warn ( "handoffTask: partition has not been loaded into memory" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "partitionID" , partitionID ) , zap . Int64 ( "segmentID" , segmentID ) )
2021-10-24 22:39:09 +08:00
continue
}
2021-12-10 10:13:44 +08:00
// the segments this segment was compacted from should already exist in the query node
2021-11-08 21:00:02 +08:00
for _ , compactedSegID := range segmentInfo . CompactionFrom {
_ , err = ht . meta . getSegmentInfoByID ( compactedSegID )
if err != nil {
log . Error ( "handoffTask: compacted segment has not been loaded into memory" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "partitionID" , partitionID ) , zap . Int64 ( "segmentID" , segmentID ) )
ht . setResultInfo ( err )
return err
}
}
// the segment produced by the compaction should not yet exist in the query node
2021-10-24 22:39:09 +08:00
_ , err = ht . meta . getSegmentInfoByID ( segmentID )
if err != nil {
2022-02-08 21:57:46 +08:00
dmChannelInfos , binlogs , err := ht . broker . getRecoveryInfo ( ht . ctx , collectionID , partitionID )
2021-10-24 22:39:09 +08:00
if err != nil {
2021-12-13 10:29:27 +08:00
log . Error ( "handoffTask: getRecoveryInfo failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "partitionID" , partitionID ) , zap . Error ( err ) )
2021-10-24 22:39:09 +08:00
ht . setResultInfo ( err )
return err
}
findBinlog := false
var loadSegmentReq * querypb . LoadSegmentsRequest
2021-11-13 08:49:08 +08:00
var watchDeltaChannels [ ] * datapb . VchannelInfo
2022-02-08 21:57:46 +08:00
for _ , segmentBinlog := range binlogs {
if segmentBinlog . SegmentID == segmentID {
2021-10-24 22:39:09 +08:00
findBinlog = true
2022-02-08 21:57:46 +08:00
segmentLoadInfo := ht . broker . generateSegmentLoadInfo ( ctx , collectionID , partitionID , segmentBinlog , false , nil )
segmentLoadInfo . CompactionFrom = segmentInfo . CompactionFrom
segmentLoadInfo . IndexInfos = segmentInfo . IndexInfos
2021-10-24 22:39:09 +08:00
msgBase := proto . Clone ( ht . Base ) . ( * commonpb . MsgBase )
msgBase . MsgType = commonpb . MsgType_LoadSegments
loadSegmentReq = & querypb . LoadSegmentsRequest {
2021-12-21 11:57:39 +08:00
Base : msgBase ,
Infos : [ ] * querypb . SegmentLoadInfo { segmentLoadInfo } ,
Schema : collectionInfo . Schema ,
CollectionID : collectionID ,
2022-05-24 18:24:00 +08:00
LoadMeta : & querypb . LoadMetaInfo {
CollectionID : collectionID ,
PartitionIDs : collectionInfo . PartitionIDs ,
} ,
2021-10-24 22:39:09 +08:00
}
}
}
2021-12-13 10:29:27 +08:00
for _ , info := range dmChannelInfos {
2021-11-26 22:47:17 +08:00
deltaChannel , err := generateWatchDeltaChannelInfo ( info )
if err != nil {
return err
2021-11-05 14:47:19 +08:00
}
2021-11-26 22:47:17 +08:00
watchDeltaChannels = append ( watchDeltaChannels , deltaChannel )
2021-11-05 14:47:19 +08:00
}
2021-10-24 22:39:09 +08:00
if ! findBinlog {
err = fmt . Errorf ( "segmnet has not been flushed, segmentID is %d" , segmentID )
ht . setResultInfo ( err )
return err
}
2021-11-26 22:47:17 +08:00
mergedDeltaChannels := mergeWatchDeltaChannelInfo ( watchDeltaChannels )
2021-11-13 08:49:08 +08:00
// If the meta is not updated here, the deltaChannel meta will not be available when loadSegment is rescheduled
2021-12-21 11:57:39 +08:00
err = ht . meta . setDeltaChannel ( collectionID , mergedDeltaChannels )
2021-12-01 16:39:31 +08:00
if err != nil {
2021-12-21 11:57:39 +08:00
log . Error ( "handoffTask: set delta channel info to meta failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "segmentID" , segmentID ) , zap . Error ( err ) )
ht . setResultInfo ( err )
2021-12-01 16:39:31 +08:00
return err
}
2022-04-20 16:15:41 +08:00
replicas , err := ht . meta . getReplicasByCollectionID ( collectionID )
2021-10-24 22:39:09 +08:00
if err != nil {
ht . setResultInfo ( err )
return err
}
2022-04-20 16:15:41 +08:00
var internalTasks [ ] task
for _ , replica := range replicas {
if len ( replica . NodeIds ) == 0 {
log . Warn ( "handoffTask: find empty replica" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "segmentID" , segmentID ) , zap . Int64 ( "replicaID" , replica . GetReplicaID ( ) ) )
err := fmt . Errorf ( "replica %d of collection %d is empty" , replica . GetReplicaID ( ) , collectionID )
ht . setResultInfo ( err )
return err
}
// copy the request because assignInternalTask will change the DstNodeID of the LoadSegmentsRequest
clonedReq := proto . Clone ( loadSegmentReq ) . ( * querypb . LoadSegmentsRequest )
clonedReq . ReplicaID = replica . ReplicaID
2022-05-31 16:36:03 +08:00
tasks , err := assignInternalTask ( ctx , ht , ht . meta , ht . cluster , [ ] * querypb . LoadSegmentsRequest { clonedReq } , nil , true , nil , nil , replica . GetReplicaID ( ) , ht . broker )
2022-04-20 16:15:41 +08:00
if err != nil {
log . Error ( "handoffTask: assign child task failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "segmentID" , segmentID ) , zap . Error ( err ) )
ht . setResultInfo ( err )
return err
}
internalTasks = append ( internalTasks , tasks ... )
}
2021-11-11 12:56:42 +08:00
for _ , internalTask := range internalTasks {
ht . addChildTask ( internalTask )
2022-05-25 03:08:00 +08:00
log . Info ( "handoffTask: add a childTask" , zap . String ( "task type" , internalTask . msgType ( ) . String ( ) ) , zap . Int64 ( "segmentID" , segmentID ) )
2021-11-11 12:56:42 +08:00
}
2021-10-24 22:39:09 +08:00
} else {
err = fmt . Errorf ( "sealed segment has been exist on query node, segmentID is %d" , segmentID )
2021-12-21 11:57:39 +08:00
log . Error ( "handoffTask: handoff segment failed" , zap . Int64 ( "segmentID" , segmentID ) , zap . Error ( err ) )
2021-10-24 22:39:09 +08:00
ht . setResultInfo ( err )
return err
}
}
2022-04-26 11:29:54 +08:00
log . Info ( "handoffTask: assign child task done" , zap . Any ( "segmentInfos" , segmentInfos ) , zap . Int64 ( "taskID" , ht . getTaskID ( ) ) )
2021-10-24 22:39:09 +08:00
return nil
}
func ( ht * handoffTask ) postExecute ( context . Context ) error {
2021-12-29 12:15:21 +08:00
if ht . getResultInfo ( ) . ErrorCode != commonpb . ErrorCode_Success {
ht . clearChildTasks ( )
2021-10-24 22:39:09 +08:00
}
2022-04-26 11:29:54 +08:00
log . Info ( "handoffTask postExecute done" , zap . Int64 ( "taskID" , ht . getTaskID ( ) ) )
2021-10-24 22:39:09 +08:00
return nil
}
func ( ht * handoffTask ) rollBack ( ctx context . Context ) [ ] task {
resultTasks := make ( [ ] task , 0 )
childTasks := ht . getChildTask ( )
for _ , childTask := range childTasks {
if childTask . msgType ( ) == commonpb . MsgType_LoadSegments {
// TODO: add segment release to rollBack; skipping the release does not affect query correctness
}
}
return resultTasks
2021-06-19 11:45:09 +08:00
}
2021-06-15 12:41:40 +08:00
2021-10-18 21:34:47 +08:00
type loadBalanceTask struct {
* baseTask
2021-06-19 11:45:09 +08:00
* querypb . LoadBalanceRequest
2022-04-20 16:15:41 +08:00
broker * globalMetaBroker
cluster Cluster
meta Meta
replicaID int64
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
func ( lbt * loadBalanceTask ) msgBase ( ) * commonpb . MsgBase {
2021-06-26 16:08:11 +08:00
return lbt . Base
}
2021-10-18 21:34:47 +08:00
func ( lbt * loadBalanceTask ) marshal ( ) ( [ ] byte , error ) {
2021-08-03 22:03:25 +08:00
return proto . Marshal ( lbt . LoadBalanceRequest )
2021-06-19 11:45:09 +08:00
}
2021-10-18 21:34:47 +08:00
func ( lbt * loadBalanceTask ) msgType ( ) commonpb . MsgType {
2021-06-19 11:45:09 +08:00
return lbt . Base . MsgType
}
2021-10-18 21:34:47 +08:00
func ( lbt * loadBalanceTask ) timestamp ( ) Timestamp {
2021-06-19 11:45:09 +08:00
return lbt . Base . Timestamp
}
2021-10-18 21:34:47 +08:00
func ( lbt * loadBalanceTask ) preExecute ( context . Context ) error {
2021-10-14 20:18:33 +08:00
lbt . setResultInfo ( nil )
2022-04-26 11:29:54 +08:00
log . Info ( "start do loadBalanceTask" ,
2021-11-06 15:22:56 +08:00
zap . Int32 ( "trigger type" , int32 ( lbt . triggerCondition ) ) ,
2021-06-19 11:45:09 +08:00
zap . Int64s ( "sourceNodeIDs" , lbt . SourceNodeIDs ) ,
zap . Any ( "balanceReason" , lbt . BalanceReason ) ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , lbt . getTaskID ( ) ) )
2022-06-01 20:00:03 +08:00
if lbt . triggerCondition == querypb . TriggerCondition_LoadBalance {
if err := lbt . checkForManualLoadBalance ( ) ; err != nil {
lbt . setResultInfo ( err )
return err
}
if len ( lbt . SourceNodeIDs ) == 0 {
err := errors . New ( "loadBalanceTask: empty source Node list to balance" )
log . Error ( err . Error ( ) )
lbt . setResultInfo ( err )
return err
}
}
2021-06-30 16:18:13 +08:00
return nil
2021-06-19 11:45:09 +08:00
}
2022-04-20 16:15:41 +08:00
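// checkForManualLoadBalance validates a manually triggered balance request: all sealed segments
// must belong to a single collection, and the source and destination nodes must be members of
// the same replica group. The resolved replicaID is stored on the task for later assignment.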
func ( lbt * loadBalanceTask ) checkForManualLoadBalance ( ) error {
// check segments belong to the same collection
collectionID := lbt . GetCollectionID ( )
for _ , sid := range lbt . SealedSegmentIDs {
segment , err := lbt . meta . getSegmentInfoByID ( sid )
if err != nil {
return err
}
if collectionID == 0 {
collectionID = segment . GetCollectionID ( )
} else if collectionID != segment . GetCollectionID ( ) {
err := errors . New ( "segments of a load balance task do not belong to the same collection" )
return err
}
}
if collectionID == 0 {
err := errors . New ( "a load balance task has to specify a collectionID or pass segments of a collection" )
return err
}
// check source and dst nodes belong to the same replica
var replicaID int64 = - 1
for _ , nodeID := range lbt . SourceNodeIDs {
replica , err := lbt . getReplica ( nodeID , collectionID )
if err != nil {
return err
}
if replicaID == - 1 {
replicaID = replica . GetReplicaID ( )
} else if replicaID != replica . GetReplicaID ( ) {
err := errors . New ( "source nodes and destination nodes must be in the same replica group" )
return err
}
}
if replicaID == - 1 {
return errors . New ( "source nodes is empty" )
}
for _ , nodeID := range lbt . DstNodeIDs {
replica , err := lbt . getReplica ( nodeID , collectionID )
if err != nil {
return err
}
if replicaID != replica . GetReplicaID ( ) {
err := errors . New ( "source nodes and destination nodes must be in the same replica group" )
return err
}
}
lbt . replicaID = replicaID
2022-04-26 11:29:54 +08:00
log . Info ( "start do loadBalanceTask" ,
2022-04-20 16:15:41 +08:00
zap . Int32 ( "trigger type" , int32 ( lbt . triggerCondition ) ) ,
zap . Int64s ( "sourceNodeIDs" , lbt . SourceNodeIDs ) ,
zap . Any ( "balanceReason" , lbt . BalanceReason ) ,
zap . Int64 ( "taskID" , lbt . getTaskID ( ) ) )
return nil
}
2021-10-18 21:34:47 +08:00
func ( lbt * loadBalanceTask ) execute ( ctx context . Context ) error {
2021-12-29 12:15:21 +08:00
defer lbt . reduceRetryCount ( )
2021-06-19 11:45:09 +08:00
2021-12-15 16:53:12 +08:00
if lbt . triggerCondition == querypb . TriggerCondition_NodeDown {
2022-06-01 20:00:03 +08:00
err := lbt . processNodeDownLoadBalance ( ctx )
if err != nil {
return err
}
} else if lbt . triggerCondition == querypb . TriggerCondition_LoadBalance {
err := lbt . processManualLoadBalance ( ctx )
if err != nil {
return err
}
}
2022-04-29 15:19:47 +08:00
2022-06-01 20:00:03 +08:00
log . Info ( "loadBalanceTask Execute done" ,
zap . Int32 ( "trigger type" , int32 ( lbt . triggerCondition ) ) ,
zap . Int64s ( "sourceNodeIDs" , lbt . SourceNodeIDs ) ,
zap . Any ( "balanceReason" , lbt . BalanceReason ) ,
zap . Int64 ( "taskID" , lbt . getTaskID ( ) ) )
return nil
}
2022-04-20 16:15:41 +08:00
2022-06-01 20:00:03 +08:00
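// processNodeDownLoadBalance recovers the segments and DM channels that were served by the
// offline source nodes: it rebuilds LoadSegmentsRequests and WatchDmChannelsRequests from the
// recovery info fetched through the broker and re-assigns them to healthy nodes of the same replica.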
func ( lbt * loadBalanceTask ) processNodeDownLoadBalance ( ctx context . Context ) error {
var internalTasks [ ] task
for _ , nodeID := range lbt . SourceNodeIDs {
segments := make ( map [ UniqueID ] * querypb . SegmentInfo )
dmChannels := make ( map [ string ] * querypb . DmChannelWatchInfo )
recoveredCollectionIDs := make ( typeutil . UniqueSet )
segmentInfos := lbt . meta . getSegmentInfosByNode ( nodeID )
for _ , segmentInfo := range segmentInfos {
segments [ segmentInfo . SegmentID ] = segmentInfo
recoveredCollectionIDs . Insert ( segmentInfo . CollectionID )
}
dmChannelWatchInfos := lbt . meta . getDmChannelInfosByNodeID ( nodeID )
for _ , watchInfo := range dmChannelWatchInfos {
dmChannels [ watchInfo . DmChannel ] = watchInfo
recoveredCollectionIDs . Insert ( watchInfo . CollectionID )
}
2022-04-20 16:15:41 +08:00
2022-06-01 20:00:03 +08:00
if len ( segments ) == 0 && len ( dmChannels ) == 0 {
continue
}
2022-04-29 15:19:47 +08:00
2022-06-01 20:00:03 +08:00
for collectionID := range recoveredCollectionIDs {
loadSegmentReqs := make ( [ ] * querypb . LoadSegmentsRequest , 0 )
watchDmChannelReqs := make ( [ ] * querypb . WatchDmChannelsRequest , 0 )
collectionInfo , err := lbt . meta . getCollectionInfoByID ( collectionID )
if err != nil {
log . Error ( "loadBalanceTask: get collectionInfo from meta failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Error ( err ) )
lbt . setResultInfo ( err )
return err
}
schema := collectionInfo . Schema
var deltaChannelInfos [ ] * datapb . VchannelInfo
var dmChannelInfos [ ] * datapb . VchannelInfo
2022-04-20 16:15:41 +08:00
2022-06-01 20:00:03 +08:00
var toRecoverPartitionIDs [ ] UniqueID
if collectionInfo . LoadType == querypb . LoadType_LoadCollection {
toRecoverPartitionIDs , err = lbt . broker . showPartitionIDs ( ctx , collectionID )
if err != nil {
log . Error ( "loadBalanceTask: show collection's partitionIDs failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Error ( err ) )
lbt . setResultInfo ( err )
panic ( err )
2022-04-20 16:15:41 +08:00
}
2022-06-01 20:00:03 +08:00
} else {
toRecoverPartitionIDs = collectionInfo . PartitionIDs
}
log . Info ( "loadBalanceTask: get collection's all partitionIDs" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64s ( "partitionIDs" , toRecoverPartitionIDs ) )
replica , err := lbt . getReplica ( nodeID , collectionID )
if err != nil {
2022-06-10 14:50:08 +08:00
// getReplica may fail here; returning the error would make the balanceTask retry forever, so skip this collection instead
log . Warn ( "loadBalanceTask: get replica failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "nodeID" , nodeID ) )
continue
2022-06-01 20:00:03 +08:00
}
2022-04-20 16:15:41 +08:00
2022-06-01 20:00:03 +08:00
for _ , partitionID := range toRecoverPartitionIDs {
vChannelInfos , binlogs , err := lbt . broker . getRecoveryInfo ( lbt . ctx , collectionID , partitionID )
2021-06-19 11:45:09 +08:00
if err != nil {
2022-06-01 20:00:03 +08:00
log . Error ( "loadBalanceTask: getRecoveryInfo failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "partitionID" , partitionID ) , zap . Error ( err ) )
2022-01-13 17:59:34 +08:00
lbt . setResultInfo ( err )
2022-01-12 17:43:37 +08:00
panic ( err )
2021-06-15 12:41:40 +08:00
}
2021-06-19 11:45:09 +08:00
2022-06-01 20:00:03 +08:00
for _ , segmentBinlog := range binlogs {
segmentID := segmentBinlog . SegmentID
if _ , ok := segments [ segmentID ] ; ok {
segmentLoadInfo := lbt . broker . generateSegmentLoadInfo ( ctx , collectionID , partitionID , segmentBinlog , true , schema )
2021-07-13 14:16:00 +08:00
msgBase := proto . Clone ( lbt . Base ) . ( * commonpb . MsgBase )
2022-06-01 20:00:03 +08:00
msgBase . MsgType = commonpb . MsgType_LoadSegments
loadSegmentReq := & querypb . LoadSegmentsRequest {
2021-12-15 16:53:12 +08:00
Base : msgBase ,
2022-06-01 20:00:03 +08:00
Infos : [ ] * querypb . SegmentLoadInfo { segmentLoadInfo } ,
2022-04-20 16:15:41 +08:00
Schema : schema ,
2022-06-01 20:00:03 +08:00
CollectionID : collectionID ,
2022-03-15 20:37:21 +08:00
LoadMeta : & querypb . LoadMetaInfo {
LoadType : collectionInfo . LoadType ,
CollectionID : collectionID ,
PartitionIDs : toRecoverPartitionIDs ,
} ,
2022-05-19 16:51:57 +08:00
ReplicaID : replica . ReplicaID ,
2021-06-30 17:48:19 +08:00
}
2022-06-01 20:00:03 +08:00
loadSegmentReqs = append ( loadSegmentReqs , loadSegmentReq )
}
}
2021-06-30 17:48:19 +08:00
2022-06-01 20:00:03 +08:00
for _ , info := range vChannelInfos {
deltaChannel , err := generateWatchDeltaChannelInfo ( info )
if err != nil {
log . Error ( "loadBalanceTask: generateWatchDeltaChannelInfo failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . String ( "channelName" , info . ChannelName ) , zap . Error ( err ) )
lbt . setResultInfo ( err )
panic ( err )
2021-11-05 14:47:19 +08:00
}
2022-06-01 20:00:03 +08:00
deltaChannelInfos = append ( deltaChannelInfos , deltaChannel )
dmChannelInfos = append ( dmChannelInfos , info )
2021-12-21 11:57:39 +08:00
}
2022-06-01 20:00:03 +08:00
}
2021-11-05 14:47:19 +08:00
2022-06-01 20:00:03 +08:00
mergedDeltaChannel := mergeWatchDeltaChannelInfo ( deltaChannelInfos )
// If the meta is not updated here, the deltaChannel meta will not be available when loadSegment is rescheduled
err = lbt . meta . setDeltaChannel ( collectionID , mergedDeltaChannel )
if err != nil {
log . Error ( "loadBalanceTask: set delta channel info meta failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Error ( err ) )
lbt . setResultInfo ( err )
panic ( err )
}
mergedDmChannel := mergeDmChannelInfo ( dmChannelInfos )
for channelName , vChannelInfo := range mergedDmChannel {
if _ , ok := dmChannels [ channelName ] ; ok {
msgBase := proto . Clone ( lbt . Base ) . ( * commonpb . MsgBase )
msgBase . MsgType = commonpb . MsgType_WatchDmChannels
2022-06-16 12:00:10 +08:00
channelSegmentInfos , err := generateVChannelSegmentInfos ( lbt . meta , vChannelInfo )
if err != nil {
lbt . setResultInfo ( err )
return err
}
2022-06-01 20:00:03 +08:00
watchRequest := & querypb . WatchDmChannelsRequest {
Base : msgBase ,
CollectionID : collectionID ,
Infos : [ ] * datapb . VchannelInfo { vChannelInfo } ,
Schema : schema ,
2022-06-16 12:00:10 +08:00
SegmentInfos : channelSegmentInfos ,
2022-06-01 20:00:03 +08:00
LoadMeta : & querypb . LoadMetaInfo {
LoadType : collectionInfo . LoadType ,
CollectionID : collectionID ,
PartitionIDs : toRecoverPartitionIDs ,
} ,
ReplicaID : replica . ReplicaID ,
}
if collectionInfo . LoadType == querypb . LoadType_LoadPartition {
watchRequest . PartitionIDs = toRecoverPartitionIDs
}
watchDmChannelReqs = append ( watchDmChannelReqs , watchRequest )
2021-09-29 09:56:04 +08:00
}
2021-06-15 12:41:40 +08:00
}
2022-06-01 20:00:03 +08:00
tasks , err := assignInternalTask ( ctx , lbt , lbt . meta , lbt . cluster , loadSegmentReqs , watchDmChannelReqs , true , lbt . SourceNodeIDs , lbt . DstNodeIDs , replica . GetReplicaID ( ) , lbt . broker )
if err != nil {
log . Error ( "loadBalanceTask: assign child task failed" , zap . Int64 ( "sourceNodeID" , nodeID ) )
lbt . setResultInfo ( err )
panic ( err )
}
internalTasks = append ( internalTasks , tasks ... )
2021-04-15 15:15:46 +08:00
}
2022-06-01 20:00:03 +08:00
}
for _ , internalTask := range internalTasks {
lbt . addChildTask ( internalTask )
log . Info ( "loadBalanceTask: add a childTask" , zap . String ( "task type" , internalTask . msgType ( ) . String ( ) ) , zap . Any ( "task" , internalTask ) )
}
log . Info ( "loadBalanceTask: assign child task done" , zap . Int64s ( "sourceNodeIDs" , lbt . SourceNodeIDs ) )
return nil
}
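// processManualLoadBalance handles a user-triggered balance: it collects the segments currently
// loaded on the source nodes (or only the sealed segments named in the request), regenerates
// their load requests from recovery info, and assigns them to the requested destination nodes.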
func ( lbt * loadBalanceTask ) processManualLoadBalance ( ctx context . Context ) error {
balancedSegmentInfos := make ( map [ UniqueID ] * querypb . SegmentInfo )
balancedSegmentIDs := make ( [ ] UniqueID , 0 )
for _ , nodeID := range lbt . SourceNodeIDs {
2022-06-02 13:16:05 +08:00
nodeExist := lbt . cluster . HasNode ( nodeID )
2022-06-01 20:00:03 +08:00
if ! nodeExist {
err := fmt . Errorf ( "loadBalanceTask: query node %d is not exist to balance" , nodeID )
log . Error ( err . Error ( ) )
2022-04-20 16:15:41 +08:00
lbt . setResultInfo ( err )
return err
}
2022-06-01 20:00:03 +08:00
segmentInfos := lbt . meta . getSegmentInfosByNode ( nodeID )
for _ , info := range segmentInfos {
balancedSegmentInfos [ info . SegmentID ] = info
balancedSegmentIDs = append ( balancedSegmentIDs , info . SegmentID )
}
}
// check that the sealed segments specified in the request are actually loaded on the source query nodes
for _ , segmentID := range lbt . SealedSegmentIDs {
if _ , ok := balancedSegmentInfos [ segmentID ] ; ! ok {
err := fmt . Errorf ( "loadBalanceTask: unloaded segment %d" , segmentID )
log . Warn ( err . Error ( ) )
2021-11-06 15:22:56 +08:00
lbt . setResultInfo ( err )
return err
}
2022-06-01 20:00:03 +08:00
}
2021-11-06 15:22:56 +08:00
2022-06-01 20:00:03 +08:00
if len ( lbt . SealedSegmentIDs ) != 0 {
balancedSegmentIDs = lbt . SealedSegmentIDs
}
2021-11-06 15:22:56 +08:00
2022-06-01 20:00:03 +08:00
// TODO(yah01): release balanced segments in source nodes
// balancedSegmentSet := make(typeutil.UniqueSet)
// balancedSegmentSet.Insert(balancedSegmentIDs...)
2021-11-06 15:22:56 +08:00
2022-06-01 20:00:03 +08:00
// for _, nodeID := range lbt.SourceNodeIDs {
// segments := lbt.meta.getSegmentInfosByNode(nodeID)
// shardSegments := make(map[string][]UniqueID)
// for _, segment := range segments {
// if !balancedSegmentSet.Contain(segment.SegmentID) {
// continue
// }
// shardSegments[segment.DmChannel] = append(shardSegments[segment.DmChannel], segment.SegmentID)
// }
// for dmc, segmentIDs := range shardSegments {
// shardLeader, err := getShardLeaderByNodeID(lbt.meta, lbt.replicaID, dmc)
// if err != nil {
// log.Error("failed to get shardLeader",
// zap.Int64("replicaID", lbt.replicaID),
// zap.Int64("nodeID", nodeID),
// zap.String("dmChannel", dmc),
// zap.Error(err))
// lbt.setResultInfo(err)
// return err
// }
// releaseSegmentReq := &querypb.ReleaseSegmentsRequest{
// Base: &commonpb.MsgBase{
// MsgType: commonpb.MsgType_ReleaseSegments,
// },
// NodeID: nodeID,
// SegmentIDs: segmentIDs,
// }
// baseTask := newBaseTask(ctx, querypb.TriggerCondition_LoadBalance)
// lbt.addChildTask(&releaseSegmentTask{
// baseTask: baseTask,
// ReleaseSegmentsRequest: releaseSegmentReq,
// cluster: lbt.cluster,
// leaderID: shardLeader,
// })
// }
// }
col2PartitionIDs := make ( map [ UniqueID ] [ ] UniqueID )
par2Segments := make ( map [ UniqueID ] [ ] * querypb . SegmentInfo )
for _ , segmentID := range balancedSegmentIDs {
info := balancedSegmentInfos [ segmentID ]
collectionID := info . CollectionID
partitionID := info . PartitionID
if _ , ok := col2PartitionIDs [ collectionID ] ; ! ok {
col2PartitionIDs [ collectionID ] = make ( [ ] UniqueID , 0 )
}
if _ , ok := par2Segments [ partitionID ] ; ! ok {
col2PartitionIDs [ collectionID ] = append ( col2PartitionIDs [ collectionID ] , partitionID )
par2Segments [ partitionID ] = make ( [ ] * querypb . SegmentInfo , 0 )
2021-11-06 15:22:56 +08:00
}
2022-06-01 20:00:03 +08:00
par2Segments [ partitionID ] = append ( par2Segments [ partitionID ] , info )
}
2021-11-06 15:22:56 +08:00
2022-06-01 20:00:03 +08:00
loadSegmentReqs := make ( [ ] * querypb . LoadSegmentsRequest , 0 )
for collectionID , partitionIDs := range col2PartitionIDs {
var watchDeltaChannels [ ] * datapb . VchannelInfo
collectionInfo , err := lbt . meta . getCollectionInfoByID ( collectionID )
if err != nil {
log . Error ( "loadBalanceTask: can't find collectionID in meta" , zap . Int64 ( "collectionID" , collectionID ) , zap . Error ( err ) )
lbt . setResultInfo ( err )
return err
}
for _ , partitionID := range partitionIDs {
dmChannelInfos , binlogs , err := lbt . broker . getRecoveryInfo ( lbt . ctx , collectionID , partitionID )
2021-11-06 15:22:56 +08:00
if err != nil {
2022-06-01 20:00:03 +08:00
log . Error ( "loadBalanceTask: getRecoveryInfo failed" , zap . Int64 ( "collectionID" , collectionID ) , zap . Int64 ( "partitionID" , partitionID ) , zap . Error ( err ) )
2021-11-06 15:22:56 +08:00
lbt . setResultInfo ( err )
return err
}
2022-06-01 20:00:03 +08:00
segmentID2Binlog := make ( map [ UniqueID ] * datapb . SegmentBinlogs )
for _ , binlog := range binlogs {
segmentID2Binlog [ binlog . SegmentID ] = binlog
}
2021-11-06 15:22:56 +08:00
2022-06-01 20:00:03 +08:00
for _ , segmentInfo := range par2Segments [ partitionID ] {
segmentID := segmentInfo . SegmentID
if _ , ok := segmentID2Binlog [ segmentID ] ; ! ok {
log . Warn ( "loadBalanceTask: can't find binlog of segment to balance, may be has been compacted" , zap . Int64 ( "segmentID" , segmentID ) )
continue
}
2022-04-20 16:15:41 +08:00
2022-06-01 20:00:03 +08:00
segmentBinlog := segmentID2Binlog [ segmentID ]
segmentLoadInfo := lbt . broker . generateSegmentLoadInfo ( ctx , collectionID , partitionID , segmentBinlog , true , collectionInfo . Schema )
msgBase := proto . Clone ( lbt . Base ) . ( * commonpb . MsgBase )
msgBase . MsgType = commonpb . MsgType_LoadSegments
loadSegmentReq := & querypb . LoadSegmentsRequest {
Base : msgBase ,
Infos : [ ] * querypb . SegmentLoadInfo { segmentLoadInfo } ,
Schema : collectionInfo . Schema ,
CollectionID : collectionID ,
ReplicaID : lbt . replicaID ,
LoadMeta : & querypb . LoadMetaInfo {
2022-05-16 15:41:56 +08:00
CollectionID : collectionID ,
2022-06-01 20:00:03 +08:00
PartitionIDs : collectionInfo . PartitionIDs ,
} ,
2021-11-06 15:22:56 +08:00
}
2022-06-01 20:00:03 +08:00
loadSegmentReqs = append ( loadSegmentReqs , loadSegmentReq )
}
2021-11-06 15:22:56 +08:00
2022-06-01 20:00:03 +08:00
for _ , info := range dmChannelInfos {
deltaChannel , err := generateWatchDeltaChannelInfo ( info )
if err != nil {
return err
2021-11-06 15:22:56 +08:00
}
2022-06-01 20:00:03 +08:00
watchDeltaChannels = append ( watchDeltaChannels , deltaChannel )
2021-11-06 15:22:56 +08:00
}
2021-12-21 11:57:39 +08:00
}
2022-06-01 20:00:03 +08:00
mergedDeltaChannels := mergeWatchDeltaChannelInfo ( watchDeltaChannels )
// If the meta is not updated here, the deltaChannel meta will not be available when loadSegment is rescheduled
err = lbt . meta . setDeltaChannel ( collectionID , mergedDeltaChannels )
2021-12-21 11:57:39 +08:00
if err != nil {
2022-06-01 20:00:03 +08:00
log . Error ( "loadBalanceTask: set delta channel info to meta failed" , zap . Error ( err ) )
2021-12-21 11:57:39 +08:00
lbt . setResultInfo ( err )
return err
}
2021-11-06 15:22:56 +08:00
}
2022-06-01 20:00:03 +08:00
internalTasks , err := assignInternalTask ( ctx , lbt , lbt . meta , lbt . cluster , loadSegmentReqs , nil , false , lbt . SourceNodeIDs , lbt . DstNodeIDs , lbt . replicaID , lbt . broker )
if err != nil {
log . Error ( "loadBalanceTask: assign child task failed" , zap . Any ( "balance request" , lbt . LoadBalanceRequest ) )
lbt . setResultInfo ( err )
return err
}
for _ , internalTask := range internalTasks {
lbt . addChildTask ( internalTask )
log . Info ( "loadBalanceTask: add a childTask" , zap . String ( "task type" , internalTask . msgType ( ) . String ( ) ) , zap . Any ( "balance request" , lbt . LoadBalanceRequest ) )
}
log . Info ( "loadBalanceTask: assign child task done" , zap . Any ( "balance request" , lbt . LoadBalanceRequest ) )
2021-06-19 11:45:09 +08:00
return nil
}
2022-04-20 16:15:41 +08:00
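// getReplica returns the replica of the given collection that the given node belongs to,
// or an error if the node is not a member of any replica of that collection.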
func ( lbt * loadBalanceTask ) getReplica ( nodeID , collectionID int64 ) ( * milvuspb . ReplicaInfo , error ) {
replicas , err := lbt . meta . getReplicasByNodeID ( nodeID )
if err != nil {
return nil , err
}
for _ , replica := range replicas {
if replica . GetCollectionID ( ) == collectionID {
return replica , nil
}
}
return nil , fmt . Errorf ( "unable to find replicas of collection %d and node %d" , collectionID , nodeID )
}
2021-10-18 21:34:47 +08:00
func ( lbt * loadBalanceTask ) postExecute ( context . Context ) error {
2021-12-29 12:15:21 +08:00
if lbt . getResultInfo ( ) . ErrorCode != commonpb . ErrorCode_Success {
lbt . clearChildTasks ( )
2021-11-06 15:22:56 +08:00
}
2022-01-13 17:59:34 +08:00
2022-04-26 11:29:54 +08:00
log . Info ( "loadBalanceTask postExecute done" ,
2021-11-06 15:22:56 +08:00
zap . Int32 ( "trigger type" , int32 ( lbt . triggerCondition ) ) ,
2021-06-19 11:45:09 +08:00
zap . Int64s ( "sourceNodeIDs" , lbt . SourceNodeIDs ) ,
zap . Any ( "balanceReason" , lbt . BalanceReason ) ,
2021-10-14 20:18:33 +08:00
zap . Int64 ( "taskID" , lbt . getTaskID ( ) ) )
2021-06-30 16:18:13 +08:00
return nil
2021-04-15 15:15:46 +08:00
}
2022-05-05 16:25:50 +08:00
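// globalPostExecute runs once after all child tasks have finished. For a node-down balance it
// removes the offline nodes from replica and segment metadata, updates the shard leader of each
// re-watched DM channel, and finally syncs the new segment distribution to the shard leaders.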
func ( lbt * loadBalanceTask ) globalPostExecute ( ctx context . Context ) error {
if len ( lbt . getChildTask ( ) ) > 0 {
2022-05-18 11:55:56 +08:00
replicas := make ( map [ UniqueID ] * milvuspb . ReplicaInfo )
segments := make ( map [ UniqueID ] * querypb . SegmentInfo )
for _ , id := range lbt . SourceNodeIDs {
for _ , segment := range lbt . meta . getSegmentInfosByNode ( id ) {
segments [ segment . SegmentID ] = segment
}
nodeReplicas , err := lbt . meta . getReplicasByNodeID ( id )
if err != nil {
log . Warn ( "failed to get replicas for removing offline querynode from it" ,
zap . Int64 ( "querynodeID" , id ) ,
zap . Error ( err ) )
continue
}
for _ , replica := range nodeReplicas {
replicas [ replica . ReplicaID ] = replica
}
}
log . Debug ( "removing offline nodes from replicas and segments..." ,
2022-06-01 20:00:03 +08:00
zap . Int ( "replicaNum" , len ( replicas ) ) ,
zap . Int ( "segmentNum" , len ( segments ) ) ,
zap . Int64 ( "triggerTaskID" , lbt . getTaskID ( ) ) ,
2022-05-18 11:55:56 +08:00
)
2022-05-19 16:51:57 +08:00
2022-05-18 11:55:56 +08:00
// Remove offline nodes from replica
2022-05-19 16:51:57 +08:00
wg := errgroup . Group { }
2022-05-05 21:15:50 +08:00
if lbt . triggerCondition == querypb . TriggerCondition_NodeDown {
offlineNodes := make ( typeutil . UniqueSet , len ( lbt . SourceNodeIDs ) )
for _ , nodeID := range lbt . SourceNodeIDs {
offlineNodes . Insert ( nodeID )
}
2022-05-19 16:51:57 +08:00
for _ , replica := range replicas {
replica := replica
wg . Go ( func ( ) error {
2022-05-05 21:15:50 +08:00
onlineNodes := make ( [ ] UniqueID , 0 , len ( replica . NodeIds ) )
for _ , nodeID := range replica . NodeIds {
if ! offlineNodes . Contain ( nodeID ) {
onlineNodes = append ( onlineNodes , nodeID )
}
}
replica . NodeIds = onlineNodes
err := lbt . meta . setReplicaInfo ( replica )
if err != nil {
2022-05-19 16:51:57 +08:00
log . Error ( "failed to remove offline nodes from replica info" ,
2022-05-05 21:15:50 +08:00
zap . Int64 ( "replicaID" , replica . ReplicaID ) ,
zap . Error ( err ) )
2022-05-19 16:51:57 +08:00
return err
2022-05-05 21:15:50 +08:00
}
2022-05-19 16:51:57 +08:00
return nil
} )
2022-05-05 21:15:50 +08:00
}
2022-06-01 20:00:03 +08:00
// if the loadBalanceTask fails after a query node goes down, lbt.getResultInfo().ErrorCode will be set to commonpb.ErrorCode_UnexpectedError,
// the queryCoord will then panic, and the nodeInfo should not be removed immediately
// after the queryCoord recovers, the balanceTask will be redone
for _ , offlineNodeID := range lbt . SourceNodeIDs {
2022-06-02 13:16:05 +08:00
err := lbt . cluster . RemoveNodeInfo ( offlineNodeID )
2022-06-01 20:00:03 +08:00
if err != nil {
log . Error ( "loadBalanceTask: occur error when removing node info from cluster" ,
zap . Int64 ( "nodeID" , offlineNodeID ) ,
zap . Error ( err ) )
lbt . setResultInfo ( err )
return err
}
}
2022-05-18 11:55:56 +08:00
}
2022-05-10 15:47:53 +08:00
2022-05-18 11:55:56 +08:00
for _ , segment := range segments {
2022-05-19 16:51:57 +08:00
segment := segment
wg . Go ( func ( ) error {
2022-05-18 11:55:56 +08:00
segment . NodeID = - 1
segment . NodeIds = removeFromSlice ( segment . NodeIds , lbt . SourceNodeIDs ... )
if len ( segment . NodeIds ) > 0 {
segment . NodeID = segment . NodeIds [ 0 ]
}
2022-05-10 15:47:53 +08:00
2022-05-18 11:55:56 +08:00
err := lbt . meta . saveSegmentInfo ( segment )
if err != nil {
2022-05-19 16:51:57 +08:00
log . Error ( "failed to remove offline nodes from segment info" ,
2022-05-18 11:55:56 +08:00
zap . Int64 ( "segmentID" , segment . SegmentID ) ,
zap . Error ( err ) )
2022-05-19 16:51:57 +08:00
return err
2022-05-18 11:55:56 +08:00
}
2022-05-05 21:15:50 +08:00
2022-05-20 18:03:58 +08:00
log . Info ( "remove offline nodes from segment" ,
zap . Int64 ( "taskID" , lbt . getTaskID ( ) ) ,
zap . Int64 ( "segmentID" , segment . GetSegmentID ( ) ) ,
zap . Int64s ( "nodeIds" , segment . GetNodeIds ( ) ) )
2022-05-19 16:51:57 +08:00
return nil
} )
2022-05-05 21:15:50 +08:00
}
for _ , childTask := range lbt . getChildTask ( ) {
if task , ok := childTask . ( * watchDmChannelTask ) ; ok {
2022-05-19 16:51:57 +08:00
wg . Go ( func ( ) error {
2022-05-26 17:14:02 +08:00
leaderID := task . NodeID
dmChannel := task . Infos [ 0 ] . ChannelName
2022-06-02 13:16:05 +08:00
nodeInfo , err := lbt . cluster . GetNodeInfoByID ( leaderID )
2022-05-19 16:51:57 +08:00
if err != nil {
log . Error ( "failed to get node info to update shard leader info" ,
zap . Int64 ( "triggerTaskID" , lbt . getTaskID ( ) ) ,
zap . Int64 ( "taskID" , task . getTaskID ( ) ) ,
2022-05-26 17:14:02 +08:00
zap . Int64 ( "nodeID" , leaderID ) ,
zap . String ( "dmChannel" , dmChannel ) ,
2022-05-19 16:51:57 +08:00
zap . Error ( err ) )
return err
}
2022-05-05 21:15:50 +08:00
2022-05-26 17:14:02 +08:00
err = lbt . meta . updateShardLeader ( task . ReplicaID , dmChannel , leaderID , nodeInfo . ( * queryNode ) . address )
2022-05-19 16:51:57 +08:00
if err != nil {
2022-05-26 17:14:02 +08:00
log . Error ( "failed to update shard leader info of replica" ,
2022-05-19 16:51:57 +08:00
zap . Int64 ( "triggerTaskID" , lbt . getTaskID ( ) ) ,
zap . Int64 ( "taskID" , task . getTaskID ( ) ) ,
zap . Int64 ( "replicaID" , task . ReplicaID ) ,
2022-05-26 17:14:02 +08:00
zap . String ( "dmChannel" , dmChannel ) ,
2022-05-19 16:51:57 +08:00
zap . Error ( err ) )
return err
}
2022-05-05 21:15:50 +08:00
2022-05-26 17:14:02 +08:00
log . Debug ( "LoadBalance: update shard leader" ,
zap . Int64 ( "triggerTaskID" , lbt . getTaskID ( ) ) ,
zap . Int64 ( "taskID" , task . getTaskID ( ) ) ,
zap . String ( "dmChannel" , dmChannel ) ,
zap . Int64 ( "leader" , leaderID ) )
2022-05-19 16:51:57 +08:00
return nil
} )
2022-05-05 21:15:50 +08:00
}
}
2022-05-19 16:51:57 +08:00
err := wg . Wait ( )
if err != nil {
return err
}
err = syncReplicaSegments ( ctx , lbt . cluster , lbt . getChildTask ( ) )
if err != nil {
return err
}
2022-05-05 16:25:50 +08:00
}
2022-05-05 21:15:50 +08:00
2022-05-05 16:25:50 +08:00
return nil
}
2021-06-30 16:18:13 +08:00
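// assignInternalTask allocates the given load-segment and watch-DM-channel requests to query
// nodes through the cluster and wraps them into child tasks of parentTask. Load requests for the
// same collection and destination node are merged into batches, each kept below
// MaxSendSizeToEtcd, presumably so that the serialized task still fits into a single etcd write.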
func assignInternalTask ( ctx context . Context ,
2021-12-21 11:57:39 +08:00
parentTask task , meta Meta , cluster Cluster ,
2021-06-26 16:08:11 +08:00
loadSegmentRequests [ ] * querypb . LoadSegmentsRequest ,
2021-10-11 09:54:37 +08:00
watchDmChannelRequests [ ] * querypb . WatchDmChannelsRequest ,
2022-05-31 16:36:03 +08:00
wait bool , excludeNodeIDs [ ] int64 , includeNodeIDs [ ] int64 , replicaID int64 ,
broker * globalMetaBroker ) ( [ ] task , error ) {
2022-04-26 11:29:54 +08:00
2021-11-11 12:56:42 +08:00
internalTasks := make ( [ ] task , 0 )
2022-06-02 13:16:05 +08:00
err := cluster . AllocateSegmentsToQueryNode ( ctx , loadSegmentRequests , wait , excludeNodeIDs , includeNodeIDs , replicaID )
2021-10-11 09:54:37 +08:00
if err != nil {
2022-04-26 11:29:54 +08:00
log . Error ( "assignInternalTask: assign segment to node failed" , zap . Error ( err ) )
2021-11-11 12:56:42 +08:00
return nil , err
2021-10-11 09:54:37 +08:00
}
2022-04-26 11:29:54 +08:00
log . Info ( "assignInternalTask: assign segment to node success" , zap . Int ( "load segments" , len ( loadSegmentRequests ) ) )
2021-11-11 12:56:42 +08:00
2022-06-02 13:16:05 +08:00
err = cluster . AllocateChannelsToQueryNode ( ctx , watchDmChannelRequests , wait , excludeNodeIDs , includeNodeIDs , replicaID )
2021-10-11 09:54:37 +08:00
if err != nil {
2022-04-26 11:29:54 +08:00
log . Error ( "assignInternalTask: assign dmChannel to node failed" , zap . Error ( err ) )
2021-11-11 12:56:42 +08:00
return nil , err
2021-10-11 09:54:37 +08:00
}
2022-04-26 11:29:54 +08:00
log . Info ( "assignInternalTask: assign dmChannel to node success" , zap . Int ( "watch dmchannels" , len ( watchDmChannelRequests ) ) )
2021-12-21 11:57:39 +08:00
2022-03-07 19:23:58 +08:00
if len ( loadSegmentRequests ) > 0 {
		sort.Slice(loadSegmentRequests, func(i, j int) bool {
			return loadSegmentRequests[i].CollectionID < loadSegmentRequests[j].CollectionID ||
				loadSegmentRequests[i].CollectionID == loadSegmentRequests[j].CollectionID && loadSegmentRequests[i].DstNodeID < loadSegmentRequests[j].DstNodeID
		})

		batchReq := loadSegmentRequests[0]
		batchSize := proto.Size(batchReq)
		for _, req := range loadSegmentRequests[1:] {
			// Pack current batch, switch to new batch
			if req.CollectionID != batchReq.CollectionID || req.DstNodeID != batchReq.DstNodeID ||
				batchSize+proto.Size(req) > MaxSendSizeToEtcd {
				baseTask := newBaseTask(ctx, parentTask.getTriggerCondition())
				baseTask.setParentTask(parentTask)
				loadSegmentTask := &loadSegmentTask{
					baseTask:            baseTask,
					LoadSegmentsRequest: batchReq,
					meta:                meta,
					cluster:             cluster,
					excludeNodeIDs:      excludeNodeIDs,
					broker:              broker,
				}
				internalTasks = append(internalTasks, loadSegmentTask)

				batchReq = req
				batchSize = proto.Size(batchReq)
			} else {
				batchReq.Infos = append(batchReq.Infos, req.Infos...)
				batchSize += proto.Size(req)
			}
		}

		// Pack the last batch
		baseTask := newBaseTask(ctx, parentTask.getTriggerCondition())
		baseTask.setParentTask(parentTask)
		loadSegmentTask := &loadSegmentTask{
			baseTask:            baseTask,
			LoadSegmentsRequest: batchReq,
			meta:                meta,
			cluster:             cluster,
			excludeNodeIDs:      excludeNodeIDs,
			broker:              broker,
		}
		internalTasks = append(internalTasks, loadSegmentTask)
	}
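
	// watchDmChannel requests are not batched: one watchDmChannelTask is created per request.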
	for _, req := range watchDmChannelRequests {
		baseTask := newBaseTask(ctx, parentTask.getTriggerCondition())
		baseTask.setParentTask(parentTask)
		watchDmChannelTask := &watchDmChannelTask{
			baseTask:               baseTask,
			WatchDmChannelsRequest: req,
			meta:                   meta,
			cluster:                cluster,
			excludeNodeIDs:         excludeNodeIDs,
		}
		internalTasks = append(internalTasks, watchDmChannelTask)
	}
	return internalTasks, nil
}
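
// generateWatchDeltaChannelInfo derives the delta channel info for a dm channel:
// the channel name is converted from the dml prefix to the delta prefix and the
// segment id lists are cleared.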
func generateWatchDeltaChannelInfo(info *datapb.VchannelInfo) (*datapb.VchannelInfo, error) {
	deltaChannelName, err := funcutil.ConvertChannelName(info.ChannelName, Params.CommonCfg.RootCoordDml, Params.CommonCfg.RootCoordDelta)
	if err != nil {
		return nil, err
	}
	deltaChannel := proto.Clone(info).(*datapb.VchannelInfo)
	deltaChannel.ChannelName = deltaChannelName
	deltaChannel.UnflushedSegmentIds = nil
	deltaChannel.FlushedSegmentIds = nil
	deltaChannel.DroppedSegmentIds = nil
	return deltaChannel, nil
}
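
// mergeWatchDeltaChannelInfo deduplicates delta channel infos by channel name, keeping
// for each channel the info whose seek position has the smallest timestamp.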
func mergeWatchDeltaChannelInfo(infos []*datapb.VchannelInfo) []*datapb.VchannelInfo {
	minPositions := make(map[string]int)
	for index, info := range infos {
		_, ok := minPositions[info.ChannelName]
		if !ok {
			minPositions[info.ChannelName] = index
		}
		minTimeStampIndex := minPositions[info.ChannelName]
		if info.SeekPosition.GetTimestamp() < infos[minTimeStampIndex].SeekPosition.GetTimestamp() {
			minPositions[info.ChannelName] = index
		}
	}
	var result []*datapb.VchannelInfo
	for _, index := range minPositions {
		result = append(result, infos[index])
	}
	log.Info("merge delta channels finished",
		zap.Any("origin info length", len(infos)),
		zap.Any("merged info length", len(result)),
	)
	return result
}
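
// mergeDmChannelInfo merges VchannelInfos that share a channel name into one info per
// channel: the earliest seek position wins and the dropped, unflushed and flushed
// segment id lists are concatenated.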
func mergeDmChannelInfo(infos []*datapb.VchannelInfo) map[string]*datapb.VchannelInfo {
	minPositions := make(map[string]*datapb.VchannelInfo)
	for _, info := range infos {
		if _, ok := minPositions[info.ChannelName]; !ok {
			minPositions[info.ChannelName] = info
			continue
		}
		minPositionInfo := minPositions[info.ChannelName]
		if info.SeekPosition.GetTimestamp() < minPositionInfo.SeekPosition.GetTimestamp() {
			minPositionInfo.SeekPosition = info.SeekPosition
		}
		minPositionInfo.DroppedSegmentIds = append(minPositionInfo.DroppedSegmentIds, info.DroppedSegmentIds...)
		minPositionInfo.UnflushedSegmentIds = append(minPositionInfo.UnflushedSegmentIds, info.UnflushedSegmentIds...)
		minPositionInfo.FlushedSegmentIds = append(minPositionInfo.FlushedSegmentIds, info.FlushedSegmentIds...)
	}
	return minPositions
}

// generateVChannelSegmentInfos returns a map<segmentID, SegmentInfo> that contains
// all the segment infos (flushed, unflushed and dropped) in a vChannel.
func generateVChannelSegmentInfos(m Meta, vChannel *datapb.VchannelInfo) (map[int64]*datapb.SegmentInfo, error) {
	segmentIds := make([]int64, 0)
	segmentIds = append(append(append(segmentIds, vChannel.FlushedSegmentIds...), vChannel.UnflushedSegmentIds...), vChannel.DroppedSegmentIds...)
	segmentInfos, err := m.getDataSegmentInfosByIDs(segmentIds)
	if err != nil {
		log.Error("Get Vchannel SegmentInfos failed", zap.String("vChannel", vChannel.String()), zap.Error(err))
		return nil, err
	}
	segmentDict := make(map[int64]*datapb.SegmentInfo)
	for _, info := range segmentInfos {
		segmentDict[info.ID] = info
	}
	return segmentDict, nil
}