mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-04 04:49:08 +08:00
Modify the replicas' shard info after load balance (#16785)
Signed-off-by: yah01 <yang.cen@zilliz.com>
This commit is contained in:
parent
5922f147e5
commit
c82e2453eb
@ -675,8 +675,6 @@ func (rct *releaseCollectionTask) preExecute(context.Context) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (rct *releaseCollectionTask) execute(ctx context.Context) error {
|
func (rct *releaseCollectionTask) execute(ctx context.Context) error {
|
||||||
// cancel the maximum number of retries for queryNode cleaning data until the data is completely freed
|
|
||||||
// defer rct.reduceRetryCount()
|
|
||||||
collectionID := rct.CollectionID
|
collectionID := rct.CollectionID
|
||||||
|
|
||||||
// if nodeID ==0, it means that the release request has not been assigned to the specified query node
|
// if nodeID ==0, it means that the release request has not been assigned to the specified query node
|
||||||
@ -707,6 +705,8 @@ func (rct *releaseCollectionTask) execute(ctx context.Context) error {
|
|||||||
log.Info("releaseCollectionTask: add a releaseCollectionTask to releaseCollectionTask's childTask", zap.Any("task", releaseCollectionTask))
|
log.Info("releaseCollectionTask: add a releaseCollectionTask to releaseCollectionTask's childTask", zap.Any("task", releaseCollectionTask))
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// If the node crashed or went offline, the loaded segments are lost
|
||||||
|
defer rct.reduceRetryCount()
|
||||||
err := rct.cluster.releaseCollection(ctx, rct.NodeID, rct.ReleaseCollectionRequest)
|
err := rct.cluster.releaseCollection(ctx, rct.NodeID, rct.ReleaseCollectionRequest)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("releaseCollectionTask: release collection end, node occur error", zap.Int64("collectionID", collectionID), zap.Int64("nodeID", rct.NodeID))
|
log.Warn("releaseCollectionTask: release collection end, node occur error", zap.Int64("collectionID", collectionID), zap.Int64("nodeID", rct.NodeID))
|
||||||
@ -1126,8 +1126,6 @@ func (rpt *releasePartitionTask) preExecute(context.Context) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (rpt *releasePartitionTask) execute(ctx context.Context) error {
|
func (rpt *releasePartitionTask) execute(ctx context.Context) error {
|
||||||
// cancel the maximum number of retries for queryNode cleaning data until the data is completely freed
|
|
||||||
// defer rpt.reduceRetryCount()
|
|
||||||
collectionID := rpt.CollectionID
|
collectionID := rpt.CollectionID
|
||||||
partitionIDs := rpt.PartitionIDs
|
partitionIDs := rpt.PartitionIDs
|
||||||
|
|
||||||
@ -1149,6 +1147,8 @@ func (rpt *releasePartitionTask) execute(ctx context.Context) error {
|
|||||||
log.Info("releasePartitionTask: add a releasePartitionTask to releasePartitionTask's childTask", zap.Int64("collectionID", collectionID), zap.Int64("msgID", rpt.Base.MsgID))
|
log.Info("releasePartitionTask: add a releasePartitionTask to releasePartitionTask's childTask", zap.Int64("collectionID", collectionID), zap.Int64("msgID", rpt.Base.MsgID))
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// If the node crashed or went offline, the loaded segments are lost
|
||||||
|
defer rpt.reduceRetryCount()
|
||||||
err := rpt.cluster.releasePartitions(ctx, rpt.NodeID, rpt.ReleasePartitionsRequest)
|
err := rpt.cluster.releasePartitions(ctx, rpt.NodeID, rpt.ReleasePartitionsRequest)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("ReleasePartitionsTask: release partition end, node occur error", zap.Int64("collectionID", collectionID), zap.String("nodeID", fmt.Sprintln(rpt.NodeID)))
|
log.Warn("ReleasePartitionsTask: release partition end, node occur error", zap.Int64("collectionID", collectionID), zap.String("nodeID", fmt.Sprintln(rpt.NodeID)))
|
||||||
@ -2235,12 +2235,6 @@ func (lbt *loadBalanceTask) postExecute(context.Context) error {
|
|||||||
// then the queryCoord will panic, and the nodeInfo should not be removed immediately
|
// then the queryCoord will panic, and the nodeInfo should not be removed immediately
|
||||||
// after queryCoord recovery, the balanceTask will redo
|
// after queryCoord recovery, the balanceTask will redo
|
||||||
if lbt.triggerCondition == querypb.TriggerCondition_NodeDown && lbt.getResultInfo().ErrorCode == commonpb.ErrorCode_Success {
|
if lbt.triggerCondition == querypb.TriggerCondition_NodeDown && lbt.getResultInfo().ErrorCode == commonpb.ErrorCode_Success {
|
||||||
offlineNodes := make(map[UniqueID]struct{}, len(lbt.SourceNodeIDs))
|
|
||||||
for _, nodeID := range lbt.SourceNodeIDs {
|
|
||||||
offlineNodes[nodeID] = struct{}{}
|
|
||||||
}
|
|
||||||
replicas := make(map[UniqueID]*milvuspb.ReplicaInfo)
|
|
||||||
|
|
||||||
for _, id := range lbt.SourceNodeIDs {
|
for _, id := range lbt.SourceNodeIDs {
|
||||||
err := lbt.cluster.removeNodeInfo(id)
|
err := lbt.cluster.removeNodeInfo(id)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -2250,7 +2244,27 @@ func (lbt *loadBalanceTask) postExecute(context.Context) error {
|
|||||||
zap.Error(err))
|
zap.Error(err))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Info("loadBalanceTask postExecute done",
|
||||||
|
zap.Int32("trigger type", int32(lbt.triggerCondition)),
|
||||||
|
zap.Int64s("sourceNodeIDs", lbt.SourceNodeIDs),
|
||||||
|
zap.Any("balanceReason", lbt.BalanceReason),
|
||||||
|
zap.Int64("taskID", lbt.getTaskID()))
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (lbt *loadBalanceTask) globalPostExecute(ctx context.Context) error {
|
||||||
|
if len(lbt.getChildTask()) > 0 {
|
||||||
|
if lbt.triggerCondition == querypb.TriggerCondition_NodeDown {
|
||||||
|
offlineNodes := make(typeutil.UniqueSet, len(lbt.SourceNodeIDs))
|
||||||
|
for _, nodeID := range lbt.SourceNodeIDs {
|
||||||
|
offlineNodes.Insert(nodeID)
|
||||||
|
}
|
||||||
|
replicas := make(map[UniqueID]*milvuspb.ReplicaInfo)
|
||||||
|
|
||||||
|
for _, id := range lbt.SourceNodeIDs {
|
||||||
replica, err := lbt.getReplica(id, lbt.CollectionID)
|
replica, err := lbt.getReplica(id, lbt.CollectionID)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Warn("failed to get replica for removing offline querynode from it",
|
log.Warn("failed to get replica for removing offline querynode from it",
|
||||||
@ -2272,7 +2286,7 @@ func (lbt *loadBalanceTask) postExecute(context.Context) error {
|
|||||||
|
|
||||||
onlineNodes := make([]UniqueID, 0, len(replica.NodeIds))
|
onlineNodes := make([]UniqueID, 0, len(replica.NodeIds))
|
||||||
for _, nodeID := range replica.NodeIds {
|
for _, nodeID := range replica.NodeIds {
|
||||||
if _, ok := offlineNodes[nodeID]; !ok {
|
if !offlineNodes.Contain(nodeID) {
|
||||||
onlineNodes = append(onlineNodes, nodeID)
|
onlineNodes = append(onlineNodes, nodeID)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2289,18 +2303,33 @@ func (lbt *loadBalanceTask) postExecute(context.Context) error {
|
|||||||
wg.Wait()
|
wg.Wait()
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info("loadBalanceTask postExecute done",
|
err := syncReplicaSegments(ctx, lbt.cluster, lbt.getChildTask())
|
||||||
zap.Int32("trigger type", int32(lbt.triggerCondition)),
|
if err != nil {
|
||||||
zap.Int64s("sourceNodeIDs", lbt.SourceNodeIDs),
|
return err
|
||||||
zap.Any("balanceReason", lbt.BalanceReason),
|
|
||||||
zap.Int64("taskID", lbt.getTaskID()))
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (lbt *loadBalanceTask) globalPostExecute(ctx context.Context) error {
|
for _, childTask := range lbt.getChildTask() {
|
||||||
if len(lbt.getChildTask()) > 0 {
|
if task, ok := childTask.(*watchDmChannelTask); ok {
|
||||||
return syncReplicaSegments(ctx, lbt.cluster, lbt.getChildTask())
|
nodeInfo, err := lbt.cluster.getNodeInfoByID(task.NodeID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
replica, err := lbt.meta.getReplicaByID(task.ReplicaID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, shard := range replica.ShardReplicas {
|
||||||
|
if shard.DmChannelName == task.Infos[0].ChannelName {
|
||||||
|
shard.LeaderID = task.NodeID
|
||||||
|
shard.LeaderAddr = nodeInfo.(*queryNode).address
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user