Mirror of https://gitee.com/milvus-io/milvus.git, synced 2024-12-05 05:18:52 +08:00
ddd918ba04
This PR changes the frequency of the check-shard-leader log to a rate-limited (rated) level.

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
261 lines · 8.6 KiB · Go
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package checkers

import (
    "context"
    "time"

    "github.com/samber/lo"
    "go.opentelemetry.io/otel/trace"
    "go.uber.org/zap"

    "github.com/milvus-io/milvus/internal/querycoordv2/balance"
    "github.com/milvus-io/milvus/internal/querycoordv2/meta"
    . "github.com/milvus-io/milvus/internal/querycoordv2/params"
    "github.com/milvus-io/milvus/internal/querycoordv2/session"
    "github.com/milvus-io/milvus/internal/querycoordv2/task"
    "github.com/milvus-io/milvus/internal/querycoordv2/utils"
    "github.com/milvus-io/milvus/pkg/log"
    "github.com/milvus-io/milvus/pkg/util/typeutil"
)

// TODO(sunby): much of this code is duplicated from SegmentChecker
type ChannelChecker struct {
    *checkerActivation
    meta      *meta.Meta
    dist      *meta.DistributionManager
    targetMgr *meta.TargetManager
    nodeMgr   *session.NodeManager
    balancer  balance.Balance
}

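// NewChannelChecker creates a ChannelChecker that reconciles the DmChannel
// distribution of each replica against the collection's current and next targets.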
func NewChannelChecker(
    meta *meta.Meta,
    dist *meta.DistributionManager,
    targetMgr *meta.TargetManager,
    balancer balance.Balance,
    nodeMgr *session.NodeManager,
) *ChannelChecker {
    return &ChannelChecker{
        checkerActivation: newCheckerActivation(),
        meta:              meta,
        dist:              dist,
        targetMgr:         targetMgr,
        balancer:          balancer,
        nodeMgr:           nodeMgr,
    }
}

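// ID returns the checker's type, utils.ChannelChecker.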
func (c *ChannelChecker) ID() utils.CheckerType {
    return utils.ChannelChecker
}

func (c *ChannelChecker) Description() string {
    return "DmChannelChecker checks for missing DmChannels and releases redundant DmChannels"
}

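// readyToCheck reports whether the collection's meta exists and at least one
// target (current or next) has been built, so that computing a diff is meaningful.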
func (c *ChannelChecker) readyToCheck(collectionID int64) bool {
    metaExist := (c.meta.GetCollection(collectionID) != nil)
    targetExist := c.targetMgr.IsNextTargetExist(collectionID) || c.targetMgr.IsCurrentTargetExist(collectionID)

    return metaExist && targetExist
}

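// Check walks every loaded collection that is ready to check, generates
// channel tasks for each of its replicas, and finally creates reduce tasks
// (with replicaID -1) for channels whose collection has already been released.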
func (c *ChannelChecker) Check(ctx context.Context) []task.Task {
    if !c.IsActive() {
        return nil
    }

    collectionIDs := c.meta.CollectionManager.GetAll()
    tasks := make([]task.Task, 0)
    for _, cid := range collectionIDs {
        if c.readyToCheck(cid) {
            replicas := c.meta.ReplicaManager.GetByCollection(cid)
            for _, r := range replicas {
                tasks = append(tasks, c.checkReplica(ctx, r)...)
            }
        }
    }

    channels := c.dist.ChannelDistManager.GetAll()
    released := utils.FilterReleased(channels, collectionIDs)
    releaseTasks := c.createChannelReduceTasks(ctx, released, -1)
    task.SetReason("collection released", releaseTasks...)
    tasks = append(tasks, releaseTasks...)
    return tasks
}

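// checkReplica produces the tasks for a single replica: load tasks for
// channels that are in the target but missing from the distribution, reduce
// tasks for channels that are no longer in any target, and reduce tasks for
// channels served by more than one node.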
func (c *ChannelChecker) checkReplica(ctx context.Context, replica *meta.Replica) []task.Task {
    ret := make([]task.Task, 0)

    lacks, redundancies := c.getDmChannelDiff(replica.GetCollectionID(), replica.GetID())
    tasks := c.createChannelLoadTask(c.getTraceCtx(ctx, replica.CollectionID), lacks, replica)
    task.SetReason("lacks of channel", tasks...)
    ret = append(ret, tasks...)

    tasks = c.createChannelReduceTasks(c.getTraceCtx(ctx, replica.CollectionID), redundancies, replica.GetID())
    task.SetReason("collection released", tasks...)
    ret = append(ret, tasks...)

    repeated := c.findRepeatedChannels(ctx, replica.GetID())
    tasks = c.createChannelReduceTasks(c.getTraceCtx(ctx, replica.CollectionID), repeated, replica.GetID())
    task.SetReason("redundancies of channel", tasks...)
    ret = append(ret, tasks...)

    // All channel-related tasks should have high priority, so raise the
    // priority on every task collected above, not just the last batch.
    task.SetPriority(task.TaskPriorityHigh, ret...)
    return ret
}

// getDmChannelDiff computes the channel diff between the targets and the
// distribution for one replica.
func (c *ChannelChecker) getDmChannelDiff(collectionID int64,
    replicaID int64,
) (toLoad, toRelease []*meta.DmChannel) {
    replica := c.meta.Get(replicaID)
    if replica == nil {
        log.Info("replica does not exist, skip it")
        return
    }

    dist := c.getChannelDist(replica)
    distMap := typeutil.NewSet[string]()
    for _, ch := range dist {
        distMap.Insert(ch.GetChannelName())
    }

    nextTargetMap := c.targetMgr.GetDmChannelsByCollection(collectionID, meta.NextTarget)
    currentTargetMap := c.targetMgr.GetDmChannelsByCollection(collectionID, meta.CurrentTarget)

    // channels that exist in the distribution, but in neither the current
    // nor the next target
    for _, ch := range dist {
        _, existOnCurrent := currentTargetMap[ch.GetChannelName()]
        _, existOnNext := nextTargetMap[ch.GetChannelName()]
        if !existOnNext && !existOnCurrent {
            toRelease = append(toRelease, ch)
        }
    }

    // channels that exist in the next target, but not in the distribution
    for name, channel := range nextTargetMap {
        _, existOnDist := distMap[name]
        if !existOnDist {
            toLoad = append(toLoad, channel)
        }
    }

    return
}

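// getChannelDist collects the channels currently served by all nodes that
// belong to the replica.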
func (c *ChannelChecker) getChannelDist(replica *meta.Replica) []*meta.DmChannel {
    dist := make([]*meta.DmChannel, 0)
    for _, nodeID := range replica.GetNodes() {
        dist = append(dist, c.dist.ChannelDistManager.GetByCollectionAndNode(replica.GetCollectionID(), nodeID)...)
    }
    return dist
}

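// findRepeatedChannels returns channels that are served by more than one node
// within the replica. For each duplicated channel it keeps the copy with the
// highest version whose shard leader is available, and returns the rest for
// release.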
func (c *ChannelChecker) findRepeatedChannels(ctx context.Context, replicaID int64) []*meta.DmChannel {
    log := log.Ctx(ctx).WithRateGroup("ChannelChecker.findRepeatedChannels", 1, 60)
    replica := c.meta.Get(replicaID)
    ret := make([]*meta.DmChannel, 0)

    if replica == nil {
        log.Info("replica does not exist, skip it")
        return ret
    }
    dist := c.getChannelDist(replica)

    targets := c.targetMgr.GetSealedSegmentsByCollection(replica.GetCollectionID(), meta.CurrentTarget)
    versionsMap := make(map[string]*meta.DmChannel)
    for _, ch := range dist {
        leaderView := c.dist.LeaderViewManager.GetLeaderShardView(ch.Node, ch.GetChannelName())
        if leaderView == nil {
            log.Info("shard leader view is not ready, skip",
                zap.Int64("collectionID", replica.GetCollectionID()),
                zap.Int64("replicaID", replicaID),
                zap.Int64("leaderID", ch.Node),
                zap.String("channel", ch.GetChannelName()))
            continue
        }

        if err := CheckLeaderAvailable(c.nodeMgr, leaderView, targets); err != nil {
            log.RatedInfo(10, "replica has unavailable shard leader",
                zap.Int64("collectionID", replica.GetCollectionID()),
                zap.Int64("replicaID", replicaID),
                zap.Int64("leaderID", ch.Node),
                zap.String("channel", ch.GetChannelName()),
                zap.Error(err))
            continue
        }

        maxVer, ok := versionsMap[ch.GetChannelName()]
        if !ok {
            versionsMap[ch.GetChannelName()] = ch
            continue
        }
        // Keep the copy with the higher version; release the other one.
        if maxVer.Version <= ch.Version {
            ret = append(ret, maxVer)
            versionsMap[ch.GetChannelName()] = ch
        } else {
            ret = append(ret, ch)
        }
    }
    return ret
}

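// createChannelLoadTask asks the balancer to assign the given channels to the
// replica's available (non-outbound) nodes, then wraps the resulting plans
// into channel load tasks.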
func (c *ChannelChecker) createChannelLoadTask(ctx context.Context, channels []*meta.DmChannel, replica *meta.Replica) []task.Task {
    outboundNodes := c.meta.ResourceManager.CheckOutboundNodes(replica)
    availableNodes := lo.Filter(replica.Replica.GetNodes(), func(node int64, _ int) bool {
        return !outboundNodes.Contain(node)
    })
    plans := c.balancer.AssignChannel(channels, availableNodes)
    for i := range plans {
        plans[i].ReplicaID = replica.GetID()
    }

    return balance.CreateChannelTasksFromPlans(ctx, c.ID(), Params.QueryCoordCfg.ChannelTaskTimeout.GetAsDuration(time.Millisecond), plans)
}

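// createChannelReduceTasks builds one reduce task per channel, releasing the
// channel from the node that currently serves it.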
func (c *ChannelChecker) createChannelReduceTasks(ctx context.Context, channels []*meta.DmChannel, replicaID int64) []task.Task {
    ret := make([]task.Task, 0, len(channels))
    for _, ch := range channels {
        action := task.NewChannelAction(ch.Node, task.ActionTypeReduce, ch.GetChannelName())
        // Name the task variable so it does not shadow the imported task package.
        reduceTask, err := task.NewChannelTask(ctx, Params.QueryCoordCfg.ChannelTaskTimeout.GetAsDuration(time.Millisecond), c.ID(), ch.GetCollectionID(), replicaID, action)
        if err != nil {
            log.Warn("create channel reduce task failed",
                zap.Int64("collection", ch.GetCollectionID()),
                zap.Int64("replica", replicaID),
                zap.String("channel", ch.GetChannelName()),
                zap.Int64("from", ch.Node),
                zap.Error(err),
            )
            continue
        }
        ret = append(ret, reduceTask)
    }
    return ret
}

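// getTraceCtx attaches the collection's load span, if present, to the context
// so that generated tasks are traced as part of the original load request.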
func (c *ChannelChecker) getTraceCtx(ctx context.Context, collectionID int64) context.Context {
    coll := c.meta.GetCollection(collectionID)
    if coll == nil || coll.LoadSpan == nil {
        return ctx
    }

    return trace.ContextWithSpan(ctx, coll.LoadSpan)
}
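
// Usage sketch (illustrative only; everything except the ChannelChecker API
// defined above is hypothetical and depends on how QueryCoord wires its
// components):
//
//	checker := NewChannelChecker(m, distMgr, targetMgr, channelBalancer, nodeMgr)
//	// Check returns nil until the checker is activated through the embedded
//	// checkerActivation.
//	tasks := checker.Check(ctx)
//	for _, t := range tasks {
//		scheduler.Add(t) // hand the generated tasks to the task scheduler
//	}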