// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
|
|
|
|
package checkers
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"go.uber.org/zap"
|
|
|
|
|
2024-03-19 09:59:05 +08:00
|
|
|
"github.com/milvus-io/milvus/internal/proto/datapb"
|
2024-01-05 15:54:55 +08:00
|
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
|
|
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/session"
|
|
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/task"
|
|
|
|
"github.com/milvus-io/milvus/internal/querycoordv2/utils"
|
2024-07-01 17:40:07 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/common"
|
2024-01-05 15:54:55 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/log"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Compile-time assertion that *LeaderChecker satisfies the Checker interface.
var _ Checker = (*LeaderChecker)(nil)
|
|
|
|
|
|
|
|
// LeaderChecker perform segment index check.
|
|
|
|
type LeaderChecker struct {
|
|
|
|
*checkerActivation
|
2024-06-11 14:21:56 +08:00
|
|
|
meta *meta.Meta
|
|
|
|
dist *meta.DistributionManager
|
2024-07-01 10:26:06 +08:00
|
|
|
target meta.TargetManagerInterface
|
2024-06-11 14:21:56 +08:00
|
|
|
nodeMgr *session.NodeManager
|
2024-01-05 15:54:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
func NewLeaderChecker(
|
|
|
|
meta *meta.Meta,
|
|
|
|
dist *meta.DistributionManager,
|
2024-07-01 10:26:06 +08:00
|
|
|
target meta.TargetManagerInterface,
|
2024-01-05 15:54:55 +08:00
|
|
|
nodeMgr *session.NodeManager,
|
|
|
|
) *LeaderChecker {
|
|
|
|
return &LeaderChecker{
|
2024-06-11 14:21:56 +08:00
|
|
|
checkerActivation: newCheckerActivation(),
|
|
|
|
meta: meta,
|
|
|
|
dist: dist,
|
|
|
|
target: target,
|
|
|
|
nodeMgr: nodeMgr,
|
2024-01-05 15:54:55 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *LeaderChecker) ID() utils.CheckerType {
|
|
|
|
return utils.LeaderChecker
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *LeaderChecker) Description() string {
|
|
|
|
return "LeaderChecker checks the difference of leader view between dist, and try to correct it"
|
|
|
|
}
|
|
|
|
|
2024-01-20 18:58:58 +08:00
|
|
|
func (c *LeaderChecker) readyToCheck(collectionID int64) bool {
|
|
|
|
metaExist := (c.meta.GetCollection(collectionID) != nil)
|
2024-07-01 17:40:07 +08:00
|
|
|
targetExist := c.target.IsNextTargetExist(collectionID) || c.target.IsCurrentTargetExist(collectionID, common.AllPartitionsID)
|
2024-01-20 18:58:58 +08:00
|
|
|
|
|
|
|
return metaExist && targetExist
|
|
|
|
}
|
|
|
|
|
2024-01-05 15:54:55 +08:00
|
|
|
func (c *LeaderChecker) Check(ctx context.Context) []task.Task {
|
|
|
|
if !c.IsActive() {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
collectionIDs := c.meta.CollectionManager.GetAll()
|
|
|
|
tasks := make([]task.Task, 0)
|
|
|
|
|
|
|
|
for _, collectionID := range collectionIDs {
|
2024-01-20 18:58:58 +08:00
|
|
|
if !c.readyToCheck(collectionID) {
|
|
|
|
continue
|
|
|
|
}
|
2024-01-05 15:54:55 +08:00
|
|
|
collection := c.meta.CollectionManager.GetCollection(collectionID)
|
|
|
|
if collection == nil {
|
|
|
|
log.Warn("collection released during check leader", zap.Int64("collection", collectionID))
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
replicas := c.meta.ReplicaManager.GetByCollection(collectionID)
|
|
|
|
for _, replica := range replicas {
|
2024-05-20 10:21:38 +08:00
|
|
|
for _, node := range replica.GetRWNodes() {
|
2024-04-10 15:13:36 +08:00
|
|
|
leaderViews := c.dist.LeaderViewManager.GetByFilter(meta.WithCollectionID2LeaderView(replica.GetCollectionID()), meta.WithNodeID2LeaderView(node))
|
|
|
|
for _, leaderView := range leaderViews {
|
|
|
|
dist := c.dist.SegmentDistManager.GetByFilter(meta.WithChannel(leaderView.Channel), meta.WithReplica(replica))
|
2024-03-21 11:59:12 +08:00
|
|
|
tasks = append(tasks, c.findNeedLoadedSegments(ctx, replica, leaderView, dist)...)
|
|
|
|
tasks = append(tasks, c.findNeedRemovedSegments(ctx, replica, leaderView, dist)...)
|
2024-06-11 14:21:56 +08:00
|
|
|
tasks = append(tasks, c.findNeedSyncPartitionStats(ctx, replica, leaderView, node)...)
|
2024-01-05 15:54:55 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return tasks
|
|
|
|
}
|
|
|
|
|
2024-06-10 21:34:08 +08:00
|
|
|
func (c *LeaderChecker) findNeedSyncPartitionStats(ctx context.Context, replica *meta.Replica, leaderView *meta.LeaderView, nodeID int64) []task.Task {
|
|
|
|
ret := make([]task.Task, 0)
|
|
|
|
curDmlChannel := c.target.GetDmChannel(leaderView.CollectionID, leaderView.Channel, meta.CurrentTarget)
|
|
|
|
if curDmlChannel == nil {
|
|
|
|
return ret
|
|
|
|
}
|
|
|
|
partStatsInTarget := curDmlChannel.GetPartitionStatsVersions()
|
|
|
|
partStatsInLView := leaderView.PartitionStatsVersions
|
|
|
|
partStatsToUpdate := make(map[int64]int64)
|
|
|
|
|
|
|
|
for partID, psVersionInTarget := range partStatsInTarget {
|
|
|
|
psVersionInLView := partStatsInLView[partID]
|
|
|
|
if psVersionInLView < psVersionInTarget {
|
|
|
|
partStatsToUpdate[partID] = psVersionInTarget
|
|
|
|
}
|
|
|
|
}
|
2024-06-11 14:21:56 +08:00
|
|
|
if len(partStatsToUpdate) > 0 {
|
|
|
|
action := task.NewLeaderUpdatePartStatsAction(leaderView.ID, nodeID, task.ActionTypeUpdate, leaderView.Channel, partStatsToUpdate)
|
|
|
|
|
|
|
|
t := task.NewLeaderPartStatsTask(
|
|
|
|
ctx,
|
|
|
|
c.ID(),
|
|
|
|
leaderView.CollectionID,
|
|
|
|
replica,
|
|
|
|
leaderView.ID,
|
|
|
|
action,
|
|
|
|
)
|
|
|
|
|
|
|
|
// leader task shouldn't replace executing segment task
|
|
|
|
t.SetPriority(task.TaskPriorityLow)
|
|
|
|
t.SetReason("sync partition stats versions")
|
|
|
|
ret = append(ret, t)
|
|
|
|
}
|
2024-06-10 21:34:08 +08:00
|
|
|
|
|
|
|
return ret
|
|
|
|
}
|
|
|
|
|
2024-03-21 11:59:12 +08:00
|
|
|
func (c *LeaderChecker) findNeedLoadedSegments(ctx context.Context, replica *meta.Replica, leaderView *meta.LeaderView, dist []*meta.Segment) []task.Task {
|
2024-01-05 15:54:55 +08:00
|
|
|
log := log.Ctx(ctx).With(
|
|
|
|
zap.Int64("collectionID", leaderView.CollectionID),
|
2024-03-21 11:59:12 +08:00
|
|
|
zap.Int64("replica", replica.GetID()),
|
2024-01-05 15:54:55 +08:00
|
|
|
zap.String("channel", leaderView.Channel),
|
|
|
|
zap.Int64("leaderViewID", leaderView.ID),
|
|
|
|
)
|
|
|
|
ret := make([]task.Task, 0)
|
2024-04-01 10:37:21 +08:00
|
|
|
|
2024-04-09 15:33:25 +08:00
|
|
|
latestNodeDist := utils.FindMaxVersionSegments(dist)
|
2024-04-01 10:37:21 +08:00
|
|
|
for _, s := range latestNodeDist {
|
2024-03-19 09:59:05 +08:00
|
|
|
segment := c.target.GetSealedSegment(leaderView.CollectionID, s.GetID(), meta.CurrentTargetFirst)
|
|
|
|
existInTarget := segment != nil
|
|
|
|
isL0Segment := existInTarget && segment.GetLevel() == datapb.SegmentLevel_L0
|
2024-04-10 15:13:36 +08:00
|
|
|
// shouldn't set l0 segment location to delegator. l0 segment should be reload in delegator
|
2024-03-19 09:59:05 +08:00
|
|
|
if !existInTarget || isL0Segment {
|
2024-01-05 15:54:55 +08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2024-04-01 10:37:21 +08:00
|
|
|
// when segment's version in leader view doesn't match segment's version in dist
|
|
|
|
// which means leader view store wrong segment location in leader view, then we should update segment location and segment's version
|
2024-03-08 11:57:01 +08:00
|
|
|
version, ok := leaderView.Segments[s.GetID()]
|
2024-04-01 10:37:21 +08:00
|
|
|
if !ok || version.GetVersion() != s.Version {
|
2024-01-14 10:19:16 +08:00
|
|
|
log.RatedDebug(10, "leader checker append a segment to set",
|
2024-01-05 15:54:55 +08:00
|
|
|
zap.Int64("segmentID", s.GetID()),
|
|
|
|
zap.Int64("nodeID", s.Node))
|
2024-04-01 10:37:21 +08:00
|
|
|
action := task.NewLeaderAction(leaderView.ID, s.Node, task.ActionTypeGrow, s.GetInsertChannel(), s.GetID(), time.Now().UnixNano())
|
2024-06-10 21:34:08 +08:00
|
|
|
t := task.NewLeaderSegmentTask(
|
2024-01-05 15:54:55 +08:00
|
|
|
ctx,
|
|
|
|
c.ID(),
|
|
|
|
s.GetCollectionID(),
|
|
|
|
replica,
|
2024-02-21 11:08:51 +08:00
|
|
|
leaderView.ID,
|
2024-01-05 15:54:55 +08:00
|
|
|
action,
|
|
|
|
)
|
2024-04-01 10:37:21 +08:00
|
|
|
|
|
|
|
// leader task shouldn't replace executing segment task
|
|
|
|
t.SetPriority(task.TaskPriorityLow)
|
2024-01-05 15:54:55 +08:00
|
|
|
t.SetReason("add segment to leader view")
|
|
|
|
ret = append(ret, t)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ret
|
|
|
|
}
|
|
|
|
|
2024-03-21 11:59:12 +08:00
|
|
|
func (c *LeaderChecker) findNeedRemovedSegments(ctx context.Context, replica *meta.Replica, leaderView *meta.LeaderView, dists []*meta.Segment) []task.Task {
|
2024-01-05 15:54:55 +08:00
|
|
|
log := log.Ctx(ctx).With(
|
|
|
|
zap.Int64("collectionID", leaderView.CollectionID),
|
2024-03-21 11:59:12 +08:00
|
|
|
zap.Int64("replica", replica.GetID()),
|
2024-01-05 15:54:55 +08:00
|
|
|
zap.String("channel", leaderView.Channel),
|
|
|
|
zap.Int64("leaderViewID", leaderView.ID),
|
|
|
|
)
|
|
|
|
|
|
|
|
ret := make([]task.Task, 0)
|
|
|
|
distMap := make(map[int64]struct{})
|
|
|
|
for _, s := range dists {
|
|
|
|
distMap[s.GetID()] = struct{}{}
|
|
|
|
}
|
|
|
|
|
|
|
|
for sid, s := range leaderView.Segments {
|
|
|
|
_, ok := distMap[sid]
|
2024-03-19 09:59:05 +08:00
|
|
|
segment := c.target.GetSealedSegment(leaderView.CollectionID, sid, meta.CurrentTargetFirst)
|
|
|
|
existInTarget := segment != nil
|
|
|
|
isL0Segment := existInTarget && segment.GetLevel() == datapb.SegmentLevel_L0
|
|
|
|
if ok || existInTarget || isL0Segment {
|
2024-01-05 15:54:55 +08:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
log.Debug("leader checker append a segment to remove",
|
|
|
|
zap.Int64("segmentID", sid),
|
|
|
|
zap.Int64("nodeID", s.NodeID))
|
2024-04-01 10:39:12 +08:00
|
|
|
// reduce leader action won't be execute on worker, in order to remove segment from delegator success even when worker done
|
|
|
|
// set workerID to leader view's node
|
|
|
|
action := task.NewLeaderAction(leaderView.ID, leaderView.ID, task.ActionTypeReduce, leaderView.Channel, sid, 0)
|
2024-06-10 21:34:08 +08:00
|
|
|
t := task.NewLeaderSegmentTask(
|
2024-01-05 15:54:55 +08:00
|
|
|
ctx,
|
|
|
|
c.ID(),
|
|
|
|
leaderView.CollectionID,
|
|
|
|
replica,
|
2024-02-21 11:08:51 +08:00
|
|
|
leaderView.ID,
|
2024-01-05 15:54:55 +08:00
|
|
|
action,
|
|
|
|
)
|
|
|
|
|
2024-04-01 10:37:21 +08:00
|
|
|
// leader task shouldn't replace executing segment task
|
|
|
|
t.SetPriority(task.TaskPriorityLow)
|
2024-01-05 15:54:55 +08:00
|
|
|
t.SetReason("remove segment from leader view")
|
|
|
|
ret = append(ret, t)
|
|
|
|
}
|
|
|
|
return ret
|
|
|
|
}
|