package balance

import (
	"math"
	"math/rand"
	"sort"
	"time"

	"github.com/samber/lo"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus/internal/proto/datapb"
	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
	"github.com/milvus-io/milvus/internal/querycoordv2/params"
	"github.com/milvus-io/milvus/internal/querycoordv2/session"
	"github.com/milvus-io/milvus/internal/querycoordv2/task"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/util/paramtable"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)

// init seeds the global math/rand source used by the plan generators to pick
// candidate nodes and segments.
func init() {
	rand.Seed(time.Now().UnixNano())
}

// rowCountCostModel scores how unevenly rows are spread over the nodes of a
// segment distribution.
type rowCountCostModel struct {
	nodeSegments map[int64][]*meta.Segment
}

// cost returns the normalized row-count imbalance of the distribution: the sum
// of every node's absolute deviation from the average row count, rescaled so
// that the most even split yields 0 and "everything on one node" yields 1.
// It returns 0 when there are no nodes or when best and worst case coincide
// (for example a single node, or no rows at all).
func (m *rowCountCostModel) cost() float64 {
	nodeCount := len(m.nodeSegments)
	if nodeCount == 0 {
		return 0
	}

	totalRowCount := 0
	nodesRowCount := make(map[int64]int, nodeCount)
	for node, segments := range m.nodeSegments {
		rowCount := 0
		for _, segment := range segments {
			rowCount += int(segment.GetNumOfRows())
		}
		totalRowCount += rowCount
		nodesRowCount[node] = rowCount
	}
	expectAvg := float64(totalRowCount) / float64(nodeCount)

	// Worst case: all rows are allocated to only one node.
	worst := float64(nodeCount-1)*expectAvg + float64(totalRowCount) - expectAvg
	// Best case: rows are spread as evenly as integer counts allow, i.e.
	// (total % nodeCount) nodes hold ceil(avg) rows and the rest floor(avg).
	nodesWithExtra := totalRowCount % nodeCount
	best := float64(nodesWithExtra)*(math.Ceil(expectAvg)-expectAvg) +
		float64(nodeCount-nodesWithExtra)*(expectAvg-math.Floor(expectAvg))
	if worst == best {
		return 0
	}

	var currCost float64
	for _, rowCount := range nodesRowCount {
		currCost += math.Abs(float64(rowCount) - expectAvg)
	}

	// Normalization.
	return (currCost - best) / (worst - best)
}

// segmentCountCostModel scores how unevenly segments (by count, regardless of
// size) are spread over the nodes of a segment distribution.
type segmentCountCostModel struct {
	nodeSegments map[int64][]*meta.Segment
}

// cost returns the normalized segment-count imbalance of the distribution,
// computed the same way as rowCountCostModel.cost but counting segments
// instead of rows. It returns 0 when there are no nodes or when best and worst
// case coincide.
func (m *segmentCountCostModel) cost() float64 {
	nodeCount := len(m.nodeSegments)
	if nodeCount == 0 {
		return 0
	}

	totalSegmentCount := 0
	nodeSegmentCount := make(map[int64]int, nodeCount)
	for node, segments := range m.nodeSegments {
		totalSegmentCount += len(segments)
		nodeSegmentCount[node] = len(segments)
	}
	expectAvg := float64(totalSegmentCount) / float64(nodeCount)

	// Worst case: all segments are allocated to only one node.
	worst := float64(nodeCount-1)*expectAvg + float64(totalSegmentCount) - expectAvg
	// Best case: (total % nodeCount) nodes hold ceil(avg) segments and the
	// rest hold floor(avg).
	nodesWithExtra := totalSegmentCount % nodeCount
	best := float64(nodesWithExtra)*(math.Ceil(expectAvg)-expectAvg) +
		float64(nodeCount-nodesWithExtra)*(expectAvg-math.Floor(expectAvg))
	if worst == best {
		return 0
	}

	var currCost float64
	for _, count := range nodeSegmentCount {
		currCost += math.Abs(float64(count) - expectAvg)
	}

	// Normalization.
	return (currCost - best) / (worst - best)
}

// cmpCost compares two cluster costs. It returns 0 when they differ by less
// than QueryCoordCfg.BalanceCostThreshold, -1 when f1 is smaller and 1
// otherwise.
func cmpCost(f1, f2 float64) int {
	if math.Abs(f1-f2) < params.Params.QueryCoordCfg.BalanceCostThreshold.GetAsFloat() {
		return 0
	}
	if f1 < f2 {
		return -1
	}
	return 1
}

// generator is one stage of the plan-generation pipeline. A stage is primed
// with the plans, distributions and cost produced so far, extends the plans in
// generatePlans, and exposes its resulting state to the next stage.
type generator interface {
	setPlans(plans []SegmentAssignPlan)
	setReplicaNodeSegments(replicaNodeSegments map[int64][]*meta.Segment)
	setGlobalNodeSegments(globalNodeSegments map[int64][]*meta.Segment)
	setCost(cost float64)
	getReplicaNodeSegments() map[int64][]*meta.Segment
	getGlobalNodeSegments() map[int64][]*meta.Segment
	getCost() float64
	generatePlans() []SegmentAssignPlan
}

// basePlanGenerator holds the state shared by all generators: the accumulated
// plans, the current cluster cost, the replica-level and global-level segment
// distributions, and the weights of the four cost components.
type basePlanGenerator struct {
	plans                        []SegmentAssignPlan
	currClusterCost              float64
	replicaNodeSegments          map[int64][]*meta.Segment
	globalNodeSegments           map[int64][]*meta.Segment
	rowCountCostWeight           float64
	globalRowCountCostWeight     float64
	segmentCountCostWeight       float64
	globalSegmentCountCostWeight float64
}

// newBasePlanGenerator creates a basePlanGenerator whose cost weights are read
// from the QueryCoord configuration.
func newBasePlanGenerator() *basePlanGenerator {
	return &basePlanGenerator{
		rowCountCostWeight:           params.Params.QueryCoordCfg.RowCountFactor.GetAsFloat(),
		globalRowCountCostWeight:     params.Params.QueryCoordCfg.GlobalRowCountFactor.GetAsFloat(),
		segmentCountCostWeight:       params.Params.QueryCoordCfg.SegmentCountFactor.GetAsFloat(),
		globalSegmentCountCostWeight: params.Params.QueryCoordCfg.GlobalSegmentCountFactor.GetAsFloat(),
	}
}

func (g *basePlanGenerator) setPlans(plans []SegmentAssignPlan) {
	g.plans = plans
}

func (g *basePlanGenerator) setReplicaNodeSegments(replicaNodeSegments map[int64][]*meta.Segment) {
	g.replicaNodeSegments = replicaNodeSegments
}

func (g *basePlanGenerator) setGlobalNodeSegments(globalNodeSegments map[int64][]*meta.Segment) {
	g.globalNodeSegments = globalNodeSegments
}

func (g *basePlanGenerator) setCost(cost float64) {
	g.currClusterCost = cost
}

func (g *basePlanGenerator) getReplicaNodeSegments() map[int64][]*meta.Segment {
	return g.replicaNodeSegments
}

func (g *basePlanGenerator) getGlobalNodeSegments() map[int64][]*meta.Segment {
	return g.globalNodeSegments
}

func (g *basePlanGenerator) getCost() float64 {
	return g.currClusterCost
}

// applyPlans returns a copy of nodeSegments with plans applied in order: each
// plan's segment is removed from its From node (matched by segment ID, if
// present) and appended to its To node. The input map and its slices are not
// modified.
func (g *basePlanGenerator) applyPlans(nodeSegments map[int64][]*meta.Segment, plans []SegmentAssignPlan) map[int64][]*meta.Segment {
	newCluster := make(map[int64][]*meta.Segment, len(nodeSegments))
	for node, segments := range nodeSegments {
		// Copy so the in-place removal below never touches the caller's slice.
		newCluster[node] = append([]*meta.Segment(nil), segments...)
	}
	for _, p := range plans {
		for i, s := range newCluster[p.From] {
			if s.GetID() == p.Segment.ID {
				newCluster[p.From] = append(newCluster[p.From][:i], newCluster[p.From][i+1:]...)
				break
			}
		}
		newCluster[p.To] = append(newCluster[p.To], p.Segment)
	}
	return newCluster
}

// calClusterCost returns the weighted sum of the four cost components: row
// count and segment count imbalance, each at replica level and global level.
func (g *basePlanGenerator) calClusterCost(replicaNodeSegments, globalNodeSegments map[int64][]*meta.Segment) float64 {
	replicaRowCost := (&rowCountCostModel{replicaNodeSegments}).cost()
	replicaSegmentCost := (&segmentCountCostModel{replicaNodeSegments}).cost()
	globalRowCost := (&rowCountCostModel{globalNodeSegments}).cost()
	globalSegmentCost := (&segmentCountCostModel{globalNodeSegments}).cost()

	return replicaRowCost*g.rowCountCostWeight + replicaSegmentCost*g.segmentCountCostWeight +
		globalRowCost*g.globalRowCountCostWeight + globalSegmentCost*g.globalSegmentCountCostWeight
}

// mergePlans folds the incremental plans inc into the existing plans curr.
//
// Plans for the same segment are chained: if plan1 moves segment1 from node1
// to node2 and plan2 moves segment1 from node2 to node3, they are merged into
// a single plan that moves segment1 from node1 to node3. A chain that ends up
// where it started (From == To) is dropped. Incremental plans that were not
// chained onto an existing plan are appended unchanged.
func (g *basePlanGenerator) mergePlans(curr []SegmentAssignPlan, inc []SegmentAssignPlan) []SegmentAssignPlan {
	result := make([]SegmentAssignPlan, 0, len(curr)+len(inc))
	processed := typeutil.NewSet[int]()
	for _, p := range curr {
		next, idx, found := lo.FindIndexOf(inc, func(candidate SegmentAssignPlan) bool {
			return candidate.Segment.GetID() == p.Segment.GetID() && candidate.From == p.To
		})
		if found {
			processed.Insert(idx)
			p.To = next.To
		}
		// One generator may move a segment from node 1 to node 2 and a later
		// one may move it back; such a round trip is not a plan at all.
		if p.From != p.To {
			result = append(result, p)
		}
	}

	// Add the incremental plans that were not merged.
	result = append(result, lo.Filter(inc, func(_ SegmentAssignPlan, idx int) bool {
		return !processed.Contain(idx)
	})...)
	return result
}

// rowCountBasedPlanGenerator greedily moves a random segment from the node
// with the most rows to the node with the fewest rows. With isGlobal set, the
// row counts used for ranking come from the global distribution; the segments
// that may be moved always come from the replica distribution.
type rowCountBasedPlanGenerator struct {
	*basePlanGenerator
	maxSteps int
	isGlobal bool
}

func newRowCountBasedPlanGenerator(maxSteps int, isGlobal bool) *rowCountBasedPlanGenerator {
	return &rowCountBasedPlanGenerator{
		basePlanGenerator: newBasePlanGenerator(),
		maxSteps:          maxSteps,
		isGlobal:          isGlobal,
	}
}

// generatePlans runs up to maxSteps greedy steps. Each step proposes moving a
// random segment from the heaviest node to the lightest one and keeps the move
// only if it lowers the cluster cost (per cmpCost). It returns the accumulated
// plans and leaves the resulting distributions and cost on the generator.
func (g *rowCountBasedPlanGenerator) generatePlans() []SegmentAssignPlan {
	type nodeWithRowCount struct {
		id       int64
		count    int
		segments []*meta.Segment
	}

	// A zero cost is treated as "not computed yet".
	if g.currClusterCost == 0 {
		g.currClusterCost = g.calClusterCost(g.replicaNodeSegments, g.globalNodeSegments)
	}
	nodeSegments := g.replicaNodeSegments
	if g.isGlobal {
		nodeSegments = g.globalNodeSegments
	}

	nodesWithRowCount := make([]*nodeWithRowCount, 0, len(g.replicaNodeSegments))
	for node, segments := range g.replicaNodeSegments {
		rowCount := 0
		for _, segment := range nodeSegments[node] {
			rowCount += int(segment.GetNumOfRows())
		}
		nodesWithRowCount = append(nodesWithRowCount, &nodeWithRowCount{
			id:    node,
			count: rowCount,
			// Copy: the bookkeeping below removes elements in place and must
			// not overwrite the caller's backing array.
			segments: append([]*meta.Segment(nil), segments...),
		})
	}
	// With fewer than two nodes there is nothing to move between.
	if len(nodesWithRowCount) < 2 {
		return g.plans
	}

	modified := true
	for i := 0; i < g.maxSteps; i++ {
		// Counts only change when a move was accepted, so re-sort only then.
		if modified {
			sort.Slice(nodesWithRowCount, func(a, b int) bool {
				return nodesWithRowCount[a].count < nodesWithRowCount[b].count
			})
		}
		maxNode, minNode := nodesWithRowCount[len(nodesWithRowCount)-1], nodesWithRowCount[0]
		// The heaviest node may own no movable segment (e.g. its weight comes
		// from the global distribution only). rand.Intn(0) would panic, and
		// since nothing changes the same node would be picked again.
		if len(maxNode.segments) == 0 {
			break
		}
		segment := maxNode.segments[rand.Intn(len(maxNode.segments))]
		plan := SegmentAssignPlan{
			Segment: segment,
			From:    maxNode.id,
			To:      minNode.id,
		}
		newCluster := g.applyPlans(g.replicaNodeSegments, []SegmentAssignPlan{plan})
		newGlobalCluster := g.applyPlans(g.globalNodeSegments, []SegmentAssignPlan{plan})
		newCost := g.calClusterCost(newCluster, newGlobalCluster)
		if cmpCost(newCost, g.currClusterCost) >= 0 {
			modified = false
			continue
		}

		g.currClusterCost = newCost
		g.replicaNodeSegments = newCluster
		g.globalNodeSegments = newGlobalCluster
		maxNode.count -= int(segment.GetNumOfRows())
		minNode.count += int(segment.GetNumOfRows())
		for n, s := range maxNode.segments {
			if s.GetID() == plan.Segment.ID {
				maxNode.segments = append(maxNode.segments[:n], maxNode.segments[n+1:]...)
				break
			}
		}
		minNode.segments = append(minNode.segments, segment)
		g.plans = g.mergePlans(g.plans, []SegmentAssignPlan{plan})
		modified = true
	}
	return g.plans
}

// segmentCountBasedPlanGenerator greedily moves a random segment from the node
// with the most segments to the node with the fewest segments. With isGlobal
// set, the segment counts used for ranking come from the global distribution;
// the segments that may be moved always come from the replica distribution.
type segmentCountBasedPlanGenerator struct {
	*basePlanGenerator
	maxSteps int
	isGlobal bool
}

func newSegmentCountBasedPlanGenerator(maxSteps int, isGlobal bool) *segmentCountBasedPlanGenerator {
	return &segmentCountBasedPlanGenerator{
		basePlanGenerator: newBasePlanGenerator(),
		maxSteps:          maxSteps,
		isGlobal:          isGlobal,
	}
}

// generatePlans runs up to maxSteps greedy steps. Each step proposes moving a
// random segment from the node with the most segments to the node with the
// fewest and keeps the move only if it lowers the cluster cost (per cmpCost).
// It returns the accumulated plans and leaves the resulting distributions and
// cost on the generator.
func (g *segmentCountBasedPlanGenerator) generatePlans() []SegmentAssignPlan {
	type nodeWithSegmentCount struct {
		id       int64
		count    int
		segments []*meta.Segment
	}

	// A zero cost is treated as "not computed yet".
	if g.currClusterCost == 0 {
		g.currClusterCost = g.calClusterCost(g.replicaNodeSegments, g.globalNodeSegments)
	}
	nodeSegments := g.replicaNodeSegments
	if g.isGlobal {
		nodeSegments = g.globalNodeSegments
	}

	nodesWithSegmentCount := make([]*nodeWithSegmentCount, 0, len(g.replicaNodeSegments))
	for node, segments := range g.replicaNodeSegments {
		nodesWithSegmentCount = append(nodesWithSegmentCount, &nodeWithSegmentCount{
			id:    node,
			count: len(nodeSegments[node]),
			// Copy: the bookkeeping below removes elements in place and must
			// not overwrite the caller's backing array.
			segments: append([]*meta.Segment(nil), segments...),
		})
	}
	// With fewer than two nodes there is nothing to move between.
	if len(nodesWithSegmentCount) < 2 {
		return g.plans
	}

	modified := true
	for i := 0; i < g.maxSteps; i++ {
		// Counts only change when a move was accepted, so re-sort only then.
		if modified {
			sort.Slice(nodesWithSegmentCount, func(a, b int) bool {
				return nodesWithSegmentCount[a].count < nodesWithSegmentCount[b].count
			})
		}
		maxNode, minNode := nodesWithSegmentCount[len(nodesWithSegmentCount)-1], nodesWithSegmentCount[0]
		// The fullest node may own no movable segment (e.g. its count comes
		// from the global distribution only). rand.Intn(0) would panic, and
		// since nothing changes the same node would be picked again.
		if len(maxNode.segments) == 0 {
			break
		}
		segment := maxNode.segments[rand.Intn(len(maxNode.segments))]
		plan := SegmentAssignPlan{
			Segment: segment,
			From:    maxNode.id,
			To:      minNode.id,
		}
		newCluster := g.applyPlans(g.replicaNodeSegments, []SegmentAssignPlan{plan})
		newGlobalCluster := g.applyPlans(g.globalNodeSegments, []SegmentAssignPlan{plan})
		newCost := g.calClusterCost(newCluster, newGlobalCluster)
		if cmpCost(newCost, g.currClusterCost) >= 0 {
			modified = false
			continue
		}

		g.currClusterCost = newCost
		g.replicaNodeSegments = newCluster
		g.globalNodeSegments = newGlobalCluster
		maxNode.count--
		minNode.count++
		for n, s := range maxNode.segments {
			if s.GetID() == plan.Segment.ID {
				maxNode.segments = append(maxNode.segments[:n], maxNode.segments[n+1:]...)
				break
			}
		}
		minNode.segments = append(minNode.segments, segment)
		g.plans = g.mergePlans(g.plans, []SegmentAssignPlan{plan})
		modified = true
	}
	return g.plans
}

// planType is the kind of change the random generator proposes in one step.
type planType int

const (
	// movePlan moves one segment from the first node to the second.
	movePlan planType = iota + 1
	// swapPlan exchanges one segment of each of the two nodes.
	swapPlan
)

// randomPlanGenerator proposes random moves and swaps between random pairs of
// nodes and keeps those that lower the cluster cost.
type randomPlanGenerator struct {
	*basePlanGenerator
	maxSteps int
}

func newRandomPlanGenerator(maxSteps int) *randomPlanGenerator {
	return &randomPlanGenerator{
		basePlanGenerator: newBasePlanGenerator(),
		maxSteps:          maxSteps,
	}
}

// generatePlans recomputes the cluster cost and then runs up to maxSteps
// random steps. Each step picks two nodes and either moves a random segment
// from the first to the second or swaps a random segment of each; the change
// is kept only if it lowers the cluster cost (per cmpCost). Steps that pick
// the same node twice, or a source node without segments, are skipped.
func (g *randomPlanGenerator) generatePlans() []SegmentAssignPlan {
	g.currClusterCost = g.calClusterCost(g.replicaNodeSegments, g.globalNodeSegments)
	nodes := lo.Keys(g.replicaNodeSegments)
	// With fewer than two nodes no move or swap is possible, and
	// rand.Intn(0) would panic on an empty node list.
	if len(nodes) < 2 {
		return g.plans
	}

	for i := 0; i < g.maxSteps; i++ {
		// Randomly select two distinct nodes.
		node1 := nodes[rand.Intn(len(nodes))]
		node2 := nodes[rand.Intn(len(nodes))]
		if node1 == node2 {
			continue
		}
		segments1 := g.replicaNodeSegments[node1]
		segments2 := g.replicaNodeSegments[node2]
		// Nothing can leave an empty node (rand.Intn(0) would panic).
		if len(segments1) == 0 {
			continue
		}
		segment1 := segments1[rand.Intn(len(segments1))]

		// Randomly select the plan type: a move sends segment1 to node2, a
		// swap additionally sends a segment of node2 back to node1. A swap
		// needs a segment on both sides, so it degrades to a move when node2
		// is empty.
		kind := planType(rand.Intn(2) + 1)
		if len(segments2) == 0 {
			kind = movePlan
		}
		plans := []SegmentAssignPlan{{From: node1, To: node2, Segment: segment1}}
		if kind == swapPlan {
			segment2 := segments2[rand.Intn(len(segments2))]
			plans = append(plans, SegmentAssignPlan{From: node2, To: node1, Segment: segment2})
		}

		// Keep the change only if it lowers the cluster cost.
		newCluster := g.applyPlans(g.replicaNodeSegments, plans)
		newGlobalCluster := g.applyPlans(g.globalNodeSegments, plans)
		newCost := g.calClusterCost(newCluster, newGlobalCluster)
		if cmpCost(newCost, g.currClusterCost) < 0 {
			g.currClusterCost = newCost
			g.replicaNodeSegments = newCluster
			g.globalNodeSegments = newGlobalCluster
			g.plans = g.mergePlans(g.plans, plans)
		}
	}
	return g.plans
}

// MultiTargetBalancer is a balancer that generates segment plans by minimizing
// a weighted cost made of row-count and segment-count imbalance at both the
// replica level and the global level. Channel balancing and stopping-node
// handling are delegated to the embedded ScoreBasedBalancer.
type MultiTargetBalancer struct {
	*ScoreBasedBalancer
	dist      *meta.DistributionManager
	targetMgr *meta.TargetManager
}

// BalanceReplica returns the segment and channel plans for replica.
//
// If the replica has read-only (stopping) nodes, their channels and segments
// are moved to the read-write nodes, provided EnableStoppingBalance is on;
// otherwise nothing is returned. Without stopping nodes, channel plans are
// generated when AutoBalanceChannel is on. In both cases segment plans are
// generated only when no channel plan was produced.
func (b *MultiTargetBalancer) BalanceReplica(replica *meta.Replica) ([]SegmentAssignPlan, []ChannelAssignPlan) {
	log := log.With(
		zap.Int64("collection", replica.GetCollectionID()),
		zap.Int64("replica id", replica.GetID()),
		zap.String("replica group", replica.GetResourceGroup()),
	)
	if replica.NodesCount() == 0 {
		return nil, nil
	}

	rwNodes := replica.GetRWNodes()
	roNodes := replica.GetRONodes()

	if len(rwNodes) == 0 {
		// No available nodes to balance.
		return nil, nil
	}

	segmentPlans, channelPlans := make([]SegmentAssignPlan, 0), make([]ChannelAssignPlan, 0)
	if len(roNodes) != 0 {
		if !paramtable.Get().QueryCoordCfg.EnableStoppingBalance.GetAsBool() {
			log.RatedInfo(10, "stopping balance is disabled!", zap.Int64s("stoppingNode", roNodes))
			return nil, nil
		}

		log.Info("Handle stopping nodes",
			zap.Any("stopping nodes", roNodes),
			zap.Any("available nodes", rwNodes),
		)
		// Handle stopping nodes here: channels first, and segments only once
		// there is no channel left to move.
		channelPlans = append(channelPlans, b.genStoppingChannelPlan(replica, rwNodes, roNodes)...)
		if len(channelPlans) == 0 {
			segmentPlans = append(segmentPlans, b.genStoppingSegmentPlan(replica, rwNodes, roNodes)...)
		}
	} else {
		if paramtable.Get().QueryCoordCfg.AutoBalanceChannel.GetAsBool() {
			channelPlans = append(channelPlans, b.genChannelPlan(replica, rwNodes)...)
		}

		if len(channelPlans) == 0 {
			segmentPlans = b.genSegmentPlan(replica, rwNodes)
		}
	}

	return segmentPlans, channelPlans
}

// genSegmentPlan collects, for every read-write node, the replica-level and
// global-level segment distributions and generates segment plans from them.
// At replica level only segments of the replica's collection that are present
// in both the current and the next target and are not L0 segments are
// considered; at global level every segment on the node is counted.
func (b *MultiTargetBalancer) genSegmentPlan(replica *meta.Replica, rwNodes []int64) []SegmentAssignPlan {
	nodeSegments := make(map[int64][]*meta.Segment, len(rwNodes))
	globalNodeSegments := make(map[int64][]*meta.Segment, len(rwNodes))
	for _, node := range rwNodes {
		dist := b.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithNodeID(node))
		segments := lo.Filter(dist, func(segment *meta.Segment, _ int) bool {
			return b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.CurrentTarget) != nil &&
				b.targetMgr.GetSealedSegment(segment.GetCollectionID(), segment.GetID(), meta.NextTarget) != nil &&
				segment.GetLevel() != datapb.SegmentLevel_L0
		})
		nodeSegments[node] = segments
		globalNodeSegments[node] = b.dist.SegmentDistManager.GetByFilter(meta.WithNodeID(node))
	}

	return b.genPlanByDistributions(nodeSegments, globalNodeSegments)
}

// genPlanByDistributions runs the generator pipeline over the given replica
// and global distributions and returns the resulting plans.
//
// There are 3 types of generators: row count, segment count and random. The
// row count and segment count generators each run twice, once at replica level
// and once at global level. Generators run sequentially; each one starts from
// the plans, distributions and cost left by the previous one.
func (b *MultiTargetBalancer) genPlanByDistributions(nodeSegments, globalNodeSegments map[int64][]*meta.Segment) []SegmentAssignPlan {
	generators := []generator{
		newRowCountBasedPlanGenerator(params.Params.QueryCoordCfg.RowCountMaxSteps.GetAsInt(), false),
		newRowCountBasedPlanGenerator(params.Params.QueryCoordCfg.RowCountMaxSteps.GetAsInt(), true),
		newSegmentCountBasedPlanGenerator(params.Params.QueryCoordCfg.SegmentCountMaxSteps.GetAsInt(), false),
		newSegmentCountBasedPlanGenerator(params.Params.QueryCoordCfg.SegmentCountMaxSteps.GetAsInt(), true),
		newRandomPlanGenerator(params.Params.QueryCoordCfg.RandomMaxSteps.GetAsInt()),
	}

	var cost float64
	var plans []SegmentAssignPlan
	for _, gen := range generators {
		gen.setCost(cost)
		gen.setPlans(plans)
		gen.setReplicaNodeSegments(nodeSegments)
		gen.setGlobalNodeSegments(globalNodeSegments)
		plans = gen.generatePlans()
		cost = gen.getCost()
		nodeSegments = gen.getReplicaNodeSegments()
		globalNodeSegments = gen.getGlobalNodeSegments()
	}
	return plans
}

// NewMultiTargetBalancer creates a MultiTargetBalancer on top of a
// ScoreBasedBalancer built from the same dependencies.
func NewMultiTargetBalancer(scheduler task.Scheduler, nodeManager *session.NodeManager, dist *meta.DistributionManager, meta *meta.Meta, targetMgr *meta.TargetManager) *MultiTargetBalancer {
	return &MultiTargetBalancer{
		ScoreBasedBalancer: NewScoreBasedBalancer(scheduler, nodeManager, dist, meta, targetMgr),
		dist:               dist,
		targetMgr:          targetMgr,
	}
}