// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package querycoord

import (
	"context"
	"errors"
	"fmt"
	"math"
	"math/rand"
	"os"
	"sort"
	"sync"
	"sync/atomic"
	"syscall"
	"time"

	v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"

	"github.com/golang/protobuf/proto"
	"go.etcd.io/etcd/api/v3/mvccpb"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.uber.org/zap"

	"github.com/milvus-io/milvus/api/commonpb"
	"github.com/milvus-io/milvus/api/milvuspb"
	"github.com/milvus-io/milvus/internal/allocator"
	etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
	"github.com/milvus-io/milvus/internal/log"
	"github.com/milvus-io/milvus/internal/proto/internalpb"
	"github.com/milvus-io/milvus/internal/proto/querypb"
	"github.com/milvus-io/milvus/internal/storage"
	"github.com/milvus-io/milvus/internal/types"
	"github.com/milvus-io/milvus/internal/util"
	"github.com/milvus-io/milvus/internal/util/dependency"
	"github.com/milvus-io/milvus/internal/util/metricsinfo"
	"github.com/milvus-io/milvus/internal/util/paramtable"
	"github.com/milvus-io/milvus/internal/util/sessionutil"
	"github.com/milvus-io/milvus/internal/util/tsoutil"
	"github.com/milvus-io/milvus/internal/util/typeutil"
)

// UniqueID is an alias of typeutil.UniqueID
type UniqueID = typeutil.UniqueID

// Timestamp is an alias of typeutil.Timestamp
type Timestamp = typeutil.Timestamp

// Params is the parameter table of the query coordinator
var Params paramtable.ComponentParam

// QueryCoord is the coordinator of queryNodes
type QueryCoord struct {
	loopCtx    context.Context
	loopCancel context.CancelFunc
	loopWg     sync.WaitGroup
	kvClient   *etcdkv.EtcdKV

	initOnce sync.Once

	queryCoordID   uint64
	meta           Meta
	cluster        Cluster
	channelCleaner *ChannelCleaner
	newNodeFn      newQueryNodeFn
	scheduler      *TaskScheduler
	idAllocator    func() (UniqueID, error)
	handoffHandler *HandoffHandler

	metricsCacheManager *metricsinfo.MetricsCacheManager

	etcdCli          *clientv3.Client
	dataCoordClient  types.DataCoord
	rootCoordClient  types.RootCoord
	indexCoordClient types.IndexCoord
	broker           *globalMetaBroker

	session          *sessionutil.Session
	eventChan        <-chan *sessionutil.SessionEvent
	offlineNodesChan chan UniqueID
	offlineNodes     map[UniqueID]struct{}

	stateCode atomic.Value

	factory       dependency.Factory
	chunkManager  storage.ChunkManager
	groupBalancer Balancer
}

// Register registers the query service at etcd
func (qc *QueryCoord) Register() error {
	qc.session.Register()
	go qc.session.LivenessCheck(qc.loopCtx, func() {
		log.Error("Query Coord disconnected from etcd, process will exit", zap.Int64("Server Id", qc.session.ServerID))
		if err := qc.Stop(); err != nil {
			log.Fatal("failed to stop server", zap.Error(err))
		}
		// manually send signal to starter goroutine
		if qc.session.TriggerKill {
			if p, err := os.FindProcess(os.Getpid()); err == nil {
				p.Signal(syscall.SIGINT)
			}
		}
	})
	return nil
}

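// initSession creates the QueryCoord session on etcd and registers the server's role, address and node ID with the parameter table.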
func (qc *QueryCoord) initSession() error {
	qc.session = sessionutil.NewSession(qc.loopCtx, Params.EtcdCfg.MetaRootPath, qc.etcdCli)
	if qc.session == nil {
		return fmt.Errorf("session is nil, the etcd client connection may have failed")
	}
	qc.session.Init(typeutil.QueryCoordRole, Params.QueryCoordCfg.Address, true, true)
	Params.QueryCoordCfg.SetNodeID(qc.session.ServerID)
	Params.SetLogger(qc.session.ServerID)
	return nil
}

// Init function initializes the queryCoord's meta, cluster, etcdKV and task scheduler
func (qc *QueryCoord) Init() error {
	log.Info("query coordinator start init, session info", zap.String("metaPath", Params.EtcdCfg.MetaRootPath), zap.String("address", Params.QueryCoordCfg.Address))
	var initError error
	qc.initOnce.Do(func() {
		err := qc.initSession()
		if err != nil {
			log.Error("queryCoord init session failed", zap.Error(err))
			initError = err
			return
		}
		etcdKV := etcdkv.NewEtcdKV(qc.etcdCli, Params.EtcdCfg.MetaRootPath)
		qc.kvClient = etcdKV
		log.Debug("query coordinator try to connect etcd success")

		// init id allocator
		idAllocatorKV := tsoutil.NewTSOKVBase(qc.etcdCli, Params.EtcdCfg.KvRootPath, "queryCoordTaskID")
		idAllocator := allocator.NewGlobalIDAllocator("idTimestamp", idAllocatorKV)
		initError = idAllocator.Initialize()
		if initError != nil {
			log.Error("query coordinator idAllocator initialize failed", zap.Error(initError))
			return
		}
		qc.idAllocator = func() (UniqueID, error) {
			return idAllocator.AllocOne()
		}

		qc.factory.Init(&Params)

		// init meta
		qc.meta, initError = newMeta(qc.loopCtx, qc.kvClient, qc.factory, qc.idAllocator)
		if initError != nil {
			log.Error("query coordinator init meta failed", zap.Error(initError))
			return
		}
		meta, ok := qc.meta.(*MetaReplica)
		if !ok {
			panic("QueryCoord qc.meta assertion of MetaReplica error")
		}

		meta.dataCoord = qc.dataCoordClient
		fixErr := meta.fixSegmentInfoDMChannel()
		if fixErr != nil {
			log.Error("QueryCoord newMeta fixSegmentInfoDMChannel failed", zap.Error(fixErr))
		}

		// init channel cleaner
		qc.channelCleaner, initError = NewChannelCleaner(qc.loopCtx, qc.kvClient, qc.factory)
		if initError != nil {
			log.Error("query coordinator init channelUnsubscribeHandler failed", zap.Error(initError))
			return
		}

		// init cluster
		qc.cluster, initError = newQueryNodeCluster(qc.loopCtx, qc.meta, qc.kvClient, qc.newNodeFn, qc.session, qc.channelCleaner)
		if initError != nil {
			log.Error("query coordinator init cluster failed", zap.Error(initError))
			return
		}

		qc.groupBalancer = newReplicaBalancer(qc.meta, qc.cluster)

		// NOTE: ignore the returned error,
		// we only try our best to reload the shard leader addresses
		reloadShardLeaderAddress(qc.meta, qc.cluster)

		qc.chunkManager, initError = qc.factory.NewVectorStorageChunkManager(qc.loopCtx)
		if initError != nil {
			log.Error("query coordinator init chunk manager failed", zap.Error(initError))
			return
		}

		// init globalMetaBroker
		qc.broker, initError = newGlobalMetaBroker(qc.loopCtx, qc.rootCoordClient, qc.dataCoordClient, qc.indexCoordClient, qc.chunkManager)
		if initError != nil {
			log.Error("query coordinator init globalMetaBroker failed", zap.Error(initError))
			return
		}

		// init task scheduler
		qc.scheduler, initError = newTaskScheduler(qc.loopCtx, qc.meta, qc.cluster, qc.kvClient, qc.broker, qc.idAllocator)
		if initError != nil {
			log.Error("query coordinator init task scheduler failed", zap.Error(initError))
			return
		}

		// init handoff handler
		qc.handoffHandler, initError = newHandoffHandler(qc.loopCtx, qc.kvClient, qc.meta, qc.cluster, qc.scheduler, qc.broker)
		if initError != nil {
			log.Error("query coordinator init index checker failed", zap.Error(initError))
			return
		}

		qc.metricsCacheManager = metricsinfo.NewMetricsCacheManager()
	})
	log.Info("QueryCoord init success")
	return initError
}

// Start function starts the goroutines to watch the meta and node updates
func (qc *QueryCoord) Start() error {
	qc.scheduler.Start()
	log.Info("start scheduler ...")

	qc.handoffHandler.Start()
	log.Info("start index checker ...")

	qc.channelCleaner.start()
	log.Info("start channel cleaner loop ...")

	Params.QueryCoordCfg.CreatedTime = time.Now()
	Params.QueryCoordCfg.UpdatedTime = time.Now()

	qc.loopWg.Add(1)
	go qc.offlineNodeLoop()

	qc.loopWg.Add(1)
	go qc.watchNodeLoop()

	qc.loopWg.Add(1)
	go qc.handoffNotificationLoop()

	if Params.QueryCoordCfg.AutoBalance {
		qc.loopWg.Add(1)
		go qc.loadBalanceSegmentLoop()
	}

	qc.UpdateStateCode(internalpb.StateCode_Healthy)

	return nil
}

// Stop function stops watching the meta and node updates
func (qc *QueryCoord) Stop() error {
	qc.UpdateStateCode(internalpb.StateCode_Abnormal)

	if qc.scheduler != nil {
		log.Info("close scheduler...")
		qc.scheduler.Close()
	}

	if qc.handoffHandler != nil {
		log.Info("close index checker...")
		qc.handoffHandler.Stop()
	}

	if qc.channelCleaner != nil {
		log.Info("close channel cleaner loop...")
		qc.channelCleaner.close()
	}

	if qc.loopCancel != nil {
		log.Info("cancel the loop of QueryCoord...")
		qc.loopCancel()
	}

	log.Info("Query Coord stopped successfully...")
	qc.loopWg.Wait()
	qc.session.Revoke(time.Second)
	return nil
}

// UpdateStateCode updates the state code of the coordinator, e.g. healthy or abnormal
func (qc *QueryCoord) UpdateStateCode(code internalpb.StateCode) {
	qc.stateCode.Store(code)
}

// NewQueryCoord creates a QueryCoord object.
func NewQueryCoord(ctx context.Context, factory dependency.Factory) (*QueryCoord, error) {
	rand.Seed(time.Now().UnixNano())
	ctx1, cancel := context.WithCancel(ctx)
	service := &QueryCoord{
		loopCtx:          ctx1,
		loopCancel:       cancel,
		factory:          factory,
		newNodeFn:        newQueryNode,
		offlineNodesChan: make(chan UniqueID, 256),
		offlineNodes:     make(map[UniqueID]struct{}, 256),
	}

	service.UpdateStateCode(internalpb.StateCode_Abnormal)
	return service, nil
}

// SetEtcdClient sets etcd's client
func (qc *QueryCoord) SetEtcdClient(etcdClient *clientv3.Client) {
	qc.etcdCli = etcdClient
}

// SetRootCoord sets root coordinator's client
func (qc *QueryCoord) SetRootCoord(rootCoord types.RootCoord) error {
	if rootCoord == nil {
		return errors.New("null RootCoord interface")
	}

	qc.rootCoordClient = rootCoord
	return nil
}

// SetDataCoord sets data coordinator's client
func (qc *QueryCoord) SetDataCoord(dataCoord types.DataCoord) error {
	if dataCoord == nil {
		return errors.New("null DataCoord interface")
	}

	qc.dataCoordClient = dataCoord
	return nil
}

// SetIndexCoord sets index coordinator's client
func (qc *QueryCoord) SetIndexCoord(indexCoord types.IndexCoord) error {
	if indexCoord == nil {
		return errors.New("null IndexCoord interface")
	}

	qc.indexCoordClient = indexCoord
	return nil
}

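// watchNodeLoop makes sure every online QueryNode is assigned to a replica, queues nodes that went
// offline while QueryCoord was down, and then watches QueryNode session events from etcd.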
func (qc *QueryCoord) watchNodeLoop() {
	ctx, cancel := context.WithCancel(qc.loopCtx)
	defer cancel()
	defer qc.loopWg.Done()
	log.Info("QueryCoord start watch node loop")

	// first check that every online node has been assigned to a replica
	onlineNodes := qc.cluster.OnlineNodeIDs()
	for _, node := range onlineNodes {
		if err := qc.allocateNode(node); err != nil {
			log.Error("unable to allocate node", zap.Int64("nodeID", node), zap.Error(err))
			panic(err)
		}
	}

	// a node is processed as offline only when 1) its etcd queryNodeInfoPrefix key exists and 2) its querynode session does not exist
	offlineNodes := qc.cluster.OfflineNodeIDs()
	if len(offlineNodes) != 0 {
		log.Warn("find querynode down while coord not alive", zap.Any("nodeIDs", offlineNodes))
		for _, node := range offlineNodes {
			qc.offlineNodesChan <- node
		}
	}

	// TODO silverxia add Rewatch logic
	qc.eventChan = qc.session.WatchServices(typeutil.QueryNodeRole, qc.cluster.GetSessionVersion()+1, nil)
	qc.handleNodeEvent(ctx)
}

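// allocateNode asks the group balancer for replica balance plans for the new node and applies them to the meta.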
func (qc *QueryCoord) allocateNode(nodeID int64) error {
	plans, err := qc.groupBalancer.AddNode(nodeID)
	if err != nil {
		return err
	}
	for _, p := range plans {
		if err := qc.meta.applyReplicaBalancePlan(p); err != nil {
			return err
		}
	}
	return nil
}

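// handleNodeEvent consumes QueryNode session events: an add event registers the node and assigns it to
// replicas asynchronously, a del event stops the node and queues it for offline processing.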
func (qc *QueryCoord) handleNodeEvent(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return

		case event, ok := <-qc.eventChan:
			if !ok {
				// ErrCompacted is handled inside SessionWatcher
				log.Error("Session Watcher channel closed", zap.Int64("server id", qc.session.ServerID))
				go qc.Stop()
				if qc.session.TriggerKill {
					if p, err := os.FindProcess(os.Getpid()); err == nil {
						p.Signal(syscall.SIGINT)
					}
				}
				return
			}

			switch event.EventType {
			case sessionutil.SessionAddEvent:
				serverID := event.Session.ServerID
				log.Info("start add a QueryNode to cluster", zap.Any("nodeID", serverID))
				err := qc.cluster.RegisterNode(ctx, event.Session, serverID, disConnect)
				if err != nil {
					log.Error("QueryCoord failed to register a QueryNode", zap.Int64("nodeID", serverID), zap.String("error info", err.Error()))
					continue
				}
				go func(serverID int64) {
					for {
						// retry forever, or crash.
						// the replica assignment is applied asynchronously
						err := qc.allocateNode(serverID)
						if err != nil {
							log.Error("unable to allocate node", zap.Int64("nodeID", serverID), zap.Error(err))
							continue
						}
						break
					}
				}(serverID)
				qc.metricsCacheManager.InvalidateSystemInfoMetrics()

			case sessionutil.SessionDelEvent:
				serverID := event.Session.ServerID
				log.Info("get a del event after QueryNode down", zap.Int64("nodeID", serverID))
				nodeExist := qc.cluster.HasNode(serverID)
				if !nodeExist {
					log.Error("QueryNode not exist", zap.Int64("nodeID", serverID))
					continue
				}

				qc.cluster.StopNode(serverID)
				qc.offlineNodesChan <- serverID
			}
		}
	}
}

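// offlineNodeLoop collects offline node IDs from offlineNodesChan and periodically retries processing them.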
func (qc *QueryCoord) offlineNodeLoop() {
	ctx, cancel := context.WithCancel(qc.loopCtx)
	defer cancel()
	defer qc.loopWg.Done()

	ticker := time.NewTicker(time.Millisecond * 100)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			log.Info("offline node loop exit")
			return
		case node := <-qc.offlineNodesChan:
			qc.offlineNodes[node] = struct{}{}
			qc.processOfflineNodes()
		case <-ticker.C:
			qc.processOfflineNodes()
		}
	}
}

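// processOfflineNodes schedules a NodeDown loadBalanceTask for every offline node whose channels have
// already been cleaned, and removes the node from the offline set once the task finishes.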
func (qc *QueryCoord) processOfflineNodes() {
	for node := range qc.offlineNodes {
		// check whether all channel unsubscriptions have been handled; if not, wait for the next cycle
		if !qc.channelCleaner.isNodeChannelCleanHandled(node) {
			log.Info("node channel is not cleaned, skip offline processing", zap.Int64("node", node))
			continue
		}

		loadBalanceSegment := &querypb.LoadBalanceRequest{
			Base: &commonpb.MsgBase{
				MsgType:  commonpb.MsgType_LoadBalanceSegments,
				SourceID: qc.session.ServerID,
			},
			SourceNodeIDs: []int64{node},
			BalanceReason: querypb.TriggerCondition_NodeDown,
		}

		baseTask := newBaseTaskWithRetry(qc.loopCtx, querypb.TriggerCondition_NodeDown, 0)
		loadBalanceTask := &loadBalanceTask{
			baseTask:           baseTask,
			LoadBalanceRequest: loadBalanceSegment,
			broker:             qc.broker,
			cluster:            qc.cluster,
			meta:               qc.meta,
		}
		qc.metricsCacheManager.InvalidateSystemInfoMetrics()

		err := qc.scheduler.Enqueue(loadBalanceTask)
		if err != nil {
			log.Warn("failed to enqueue LoadBalance task into the scheduler",
				zap.Int64("nodeID", node),
				zap.Error(err))
			continue
		}

		log.Info("start a loadBalance task",
			zap.Int64("nodeID", node),
			zap.Int64("taskID", loadBalanceTask.getTaskID()))

		err = loadBalanceTask.waitToFinish()
		if err != nil {
			log.Warn("failed to process LoadBalance task",
				zap.Int64("nodeID", node),
				zap.Error(err))
			continue
		}

		delete(qc.offlineNodes, node)
		log.Info("LoadBalance task done, offline node is removed", zap.Int64("nodeID", node))
	}
}

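// handoffNotificationLoop watches the handoff segment prefix on etcd and enqueues PUT events to the
// handoff handler; on an etcd compaction error the handler is rebuilt and the loop is restarted.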
func (qc *QueryCoord) handoffNotificationLoop() {
	ctx, cancel := context.WithCancel(qc.loopCtx)

	defer cancel()
	defer qc.loopWg.Done()
	log.Info("QueryCoord start watch segment loop")

	watchChan := qc.kvClient.WatchWithRevision(util.HandoffSegmentPrefix, qc.handoffHandler.revision+1)

	for {
		select {
		case <-ctx.Done():
			return
		case resp, ok := <-watchChan:
			if !ok {
				log.Warn("QueryCoord watch handoff segment loop failed because watch channel is closed")
				panic("QueryCoord watch handoff segment loop failed because watch channel is closed")
			}
			if err := resp.Err(); err != nil {
				// https://github.com/etcd-io/etcd/issues/8980
				if err == v3rpc.ErrCompacted {
					qc.handoffHandler, err = newHandoffHandler(qc.loopCtx, qc.kvClient, qc.meta, qc.cluster, qc.scheduler, qc.broker)
					if err != nil {
						log.Error("query coordinator re new handoff handler failed", zap.Error(err))
						panic("failed to handle etcd request, exit..")
					}
					if err2 := qc.handoffHandler.reloadFromKV(); err2 != nil {
						log.Error("reload index checker meta fails when etcd has a compaction error",
							zap.String("etcd error", err.Error()), zap.Error(err2))
						panic("failed to handle etcd request, exit..")
					}
					qc.loopWg.Add(1)
					go qc.handoffNotificationLoop()
					return
				}
				log.Error("received error event from etcd watcher", zap.String("prefix", util.HandoffSegmentPrefix),
					zap.Error(err))
				panic("failed to handle etcd request, exit..")
			}
			for _, event := range resp.Events {
				segmentInfo := &querypb.SegmentInfo{}
				err := proto.Unmarshal(event.Kv.Value, segmentInfo)
				if err != nil {
					log.Error("watchHandoffSegmentLoop: unmarshal failed", zap.Any("error", err.Error()))
					continue
				}
				switch event.Type {
				case mvccpb.PUT:
					qc.handoffHandler.enqueue(segmentInfo)
					log.Info("watchHandoffSegmentLoop: enqueue a handoff request to index checker", zap.Any("segment info", segmentInfo))
				default:
					// do nothing
				}
			}
		}
	}
}

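// loadBalanceSegmentLoop runs every BalanceIntervalSeconds and, for up to 20 randomly chosen collections,
// generates and executes loadBalanceTasks that even out memory usage between the QueryNodes of each replica.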
func (qc *QueryCoord) loadBalanceSegmentLoop() {
	ctx, cancel := context.WithCancel(qc.loopCtx)
	defer cancel()
	defer qc.loopWg.Done()
	log.Info("QueryCoord start load balance segment loop")

	timer := time.NewTicker(time.Duration(Params.QueryCoordCfg.BalanceIntervalSeconds) * time.Second)
	defer timer.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-timer.C:
			startTs := time.Now()
			// do not trigger load balance if the task queue is not empty
			if !qc.scheduler.taskEmpty() {
				continue
			}

			collectionInfos := qc.meta.showCollections()
			// shuffle to avoid always balancing the same collections
			rand.Seed(time.Now().UnixNano())
			rand.Shuffle(len(collectionInfos), func(i, j int) {
				collectionInfos[i], collectionInfos[j] = collectionInfos[j], collectionInfos[i]
			})

			// get mem info of online nodes from the cluster
			nodeID2MemUsageRate := make(map[int64]float64)
			nodeID2MemUsage := make(map[int64]uint64)
			nodeID2TotalMem := make(map[int64]uint64)
			loadBalanceTasks := make([]*loadBalanceTask, 0)
			// balance at most 20 collections in a round
			for i := 0; i < len(collectionInfos) && i < 20; i++ {
				info := collectionInfos[i]
				replicas, err := qc.meta.getReplicasByCollectionID(info.GetCollectionID())
				if err != nil {
					log.Warn("unable to get replicas of collection", zap.Int64("collectionID", info.GetCollectionID()))
					continue
				}
				for _, replica := range replicas {
					loadBalanceTasks = append(loadBalanceTasks, qc.balanceReplica(ctx, replica, nodeID2MemUsageRate, nodeID2MemUsage, nodeID2TotalMem)...)
				}
			}
			for _, t := range loadBalanceTasks {
				err := qc.scheduler.Enqueue(t)
				if err != nil {
					log.Error("loadBalanceSegmentLoop: balance task enqueue failed", zap.Any("task", t), zap.Error(err))
					continue
				}
				err = t.waitToFinish()
				if err != nil {
					// if the task failed, wait for the next balance loop;
					// the collection/partition of the balanced segment may have been released,
					// or some other abnormal error occurred
					log.Error("loadBalanceSegmentLoop: balance task execute failed", zap.Any("task", t), zap.Error(err))
				} else {
					log.Info("loadBalanceSegmentLoop: balance task execute success", zap.Any("task", t))
				}
			}
			log.Info("finish balance loop successfully", zap.Duration("time spent", time.Since(startTs)))
		}
	}
}

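// balanceReplica collects memory usage and segment info for the replica's online nodes, then repeatedly
// generates loadBalanceTasks that move a selected segment from the node with the highest memory usage
// rate to the node with the lowest, until usage is within the configured thresholds.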
// TODO: balanceReplica needs to be optimized, we should not fetch segment info in every balance round
func (qc *QueryCoord) balanceReplica(ctx context.Context, replica *milvuspb.ReplicaInfo, nodeID2MemUsageRate map[int64]float64,
	nodeID2MemUsage map[int64]uint64, nodeID2TotalMem map[int64]uint64) []*loadBalanceTask {
	loadBalanceTasks := make([]*loadBalanceTask, 0)
	// auto balance is executed at the replica level
	onlineNodeIDs := replica.GetNodeIds()
	if len(onlineNodeIDs) == 0 {
		log.Error("loadBalanceSegmentLoop: there are no online QueryNode to balance", zap.Int64("collection", replica.CollectionID), zap.Int64("replica", replica.ReplicaID))
		return loadBalanceTasks
	}
	var availableNodeIDs []int64
	nodeID2SegmentInfos := make(map[int64]map[UniqueID]*querypb.SegmentInfo)
	for _, nodeID := range onlineNodeIDs {
		if _, ok := nodeID2MemUsage[nodeID]; !ok {
			nodeInfo, err := qc.cluster.GetNodeInfoByID(nodeID)
			if err != nil {
				log.Warn("loadBalanceSegmentLoop: get node info from QueryNode failed",
					zap.Int64("nodeID", nodeID), zap.Int64("collection", replica.CollectionID), zap.Int64("replica", replica.ReplicaID),
					zap.Error(err))
				continue
			}
			nodeID2MemUsageRate[nodeID] = nodeInfo.(*queryNode).memUsageRate
			nodeID2MemUsage[nodeID] = nodeInfo.(*queryNode).memUsage
			nodeID2TotalMem[nodeID] = nodeInfo.(*queryNode).totalMem
		}

		updateSegmentInfoDone := true
		leastSegmentInfos := make(map[UniqueID]*querypb.SegmentInfo)
		segmentInfos := qc.meta.getSegmentInfosByNodeAndCollection(nodeID, replica.GetCollectionID())
		for _, segmentInfo := range segmentInfos {
			leastInfo, err := qc.cluster.GetSegmentInfoByID(ctx, segmentInfo.SegmentID)
			if err != nil {
				log.Warn("loadBalanceSegmentLoop: get segment info from QueryNode failed", zap.Int64("nodeID", nodeID),
					zap.Int64("collection", replica.CollectionID), zap.Int64("replica", replica.ReplicaID),
					zap.Error(err))
				updateSegmentInfoDone = false
				break
			}
			leastSegmentInfos[segmentInfo.SegmentID] = leastInfo
		}
		if updateSegmentInfoDone {
			availableNodeIDs = append(availableNodeIDs, nodeID)
			nodeID2SegmentInfos[nodeID] = leastSegmentInfos
		}
	}
	log.Info("loadBalanceSegmentLoop: memory usage rate of all online QueryNode", zap.Int64("collection", replica.CollectionID),
		zap.Int64("replica", replica.ReplicaID), zap.Any("mem rate", nodeID2MemUsageRate))
	if len(availableNodeIDs) <= 1 {
		log.Info("loadBalanceSegmentLoop: there are too few available query nodes to balance",
			zap.Int64("collection", replica.CollectionID), zap.Int64("replica", replica.ReplicaID),
			zap.Int64s("onlineNodeIDs", onlineNodeIDs), zap.Int64s("availableNodeIDs", availableNodeIDs))
		return loadBalanceTasks
	}

	// check which nodes need balancing and determine which segments on those nodes should be migrated to other nodes
	for {
		sort.Slice(availableNodeIDs, func(i, j int) bool {
			return nodeID2MemUsageRate[availableNodeIDs[i]] > nodeID2MemUsageRate[availableNodeIDs[j]]
		})

		// the memoryUsageRate of the source node is the highest among the available query nodes
		sourceNodeID := availableNodeIDs[0]
		dstNodeID := availableNodeIDs[len(availableNodeIDs)-1]

		memUsageRateDiff := nodeID2MemUsageRate[sourceNodeID] - nodeID2MemUsageRate[dstNodeID]
		if nodeID2MemUsageRate[sourceNodeID] <= Params.QueryCoordCfg.OverloadedMemoryThresholdPercentage &&
			memUsageRateDiff <= Params.QueryCoordCfg.MemoryUsageMaxDifferencePercentage {
			break
		}
		// if the memoryUsageRate of the source node exceeds the overloaded threshold, or the memUsageRateDiff
		// exceeds the allowed maximum difference, then migrate segments from the source node to other query nodes
		segmentInfos := nodeID2SegmentInfos[sourceNodeID]
		// select a segment on the source node that needs to be balanced
		selectedSegmentInfo, err := chooseSegmentToBalance(sourceNodeID, dstNodeID, segmentInfos, nodeID2MemUsage, nodeID2TotalMem, nodeID2MemUsageRate)
		if err != nil {
			break
		}
		if selectedSegmentInfo == nil {
			break
		}
		// a segment was selected successfully; continue the loop to check whether other segments can be balanced
		req := &querypb.LoadBalanceRequest{
			Base: &commonpb.MsgBase{
				MsgType: commonpb.MsgType_LoadBalanceSegments,
			},
			BalanceReason:    querypb.TriggerCondition_LoadBalance,
			SourceNodeIDs:    []UniqueID{sourceNodeID},
			DstNodeIDs:       []UniqueID{dstNodeID},
			SealedSegmentIDs: []UniqueID{selectedSegmentInfo.SegmentID},
		}
		baseTask := newBaseTask(qc.loopCtx, querypb.TriggerCondition_LoadBalance)
		balanceTask := &loadBalanceTask{
			baseTask:           baseTask,
			LoadBalanceRequest: req,
			broker:             qc.broker,
			cluster:            qc.cluster,
			meta:               qc.meta,
		}
		log.Info("loadBalanceSegmentLoop: generate a loadBalance task",
			zap.Int64("collection", replica.CollectionID), zap.Int64("replica", replica.ReplicaID),
			zap.Any("task", balanceTask))
		loadBalanceTasks = append(loadBalanceTasks, balanceTask)
		nodeID2MemUsage[sourceNodeID] -= uint64(selectedSegmentInfo.MemSize)
		nodeID2MemUsage[dstNodeID] += uint64(selectedSegmentInfo.MemSize)
		nodeID2MemUsageRate[sourceNodeID] = float64(nodeID2MemUsage[sourceNodeID]) / float64(nodeID2TotalMem[sourceNodeID])
		nodeID2MemUsageRate[dstNodeID] = float64(nodeID2MemUsage[dstNodeID]) / float64(nodeID2TotalMem[dstNodeID])
		delete(nodeID2SegmentInfos[sourceNodeID], selectedSegmentInfo.SegmentID)
		nodeID2SegmentInfos[dstNodeID][selectedSegmentInfo.SegmentID] = selectedSegmentInfo
		continue
	}
	return loadBalanceTasks
}

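// chooseSegmentToBalance picks a segment on the source node whose migration would shrink the memory
// usage gap between the two nodes without pushing the destination node over the overloaded threshold;
// it returns an error if no segment can be moved without overloading the destination node.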
func chooseSegmentToBalance(sourceNodeID int64, dstNodeID int64,
	segmentInfos map[UniqueID]*querypb.SegmentInfo,
	nodeID2MemUsage map[int64]uint64,
	nodeID2TotalMem map[int64]uint64,
	nodeID2MemUsageRate map[int64]float64) (*querypb.SegmentInfo, error) {
	memoryInsufficient := true
	minMemDiffPercentage := 1.0
	var selectedSegmentInfo *querypb.SegmentInfo
	for _, info := range segmentInfos {
		dstNodeMemUsageAfterBalance := nodeID2MemUsage[dstNodeID] + uint64(info.MemSize)
		dstNodeMemUsageRateAfterBalance := float64(dstNodeMemUsageAfterBalance) / float64(nodeID2TotalMem[dstNodeID])
		// if the memUsageRate of the dstNode would exceed OverloadedMemoryThresholdPercentage after balance, the segment cannot be balanced
		if dstNodeMemUsageRateAfterBalance < Params.QueryCoordCfg.OverloadedMemoryThresholdPercentage {
			memoryInsufficient = false
			sourceNodeMemUsageAfterBalance := nodeID2MemUsage[sourceNodeID] - uint64(info.MemSize)
			sourceNodeMemUsageRateAfterBalance := float64(sourceNodeMemUsageAfterBalance) / float64(nodeID2TotalMem[sourceNodeID])
			// assume all query nodes have the same memory capacity
			// if the memUsageRateDiff between the two nodes does not become smaller after balance, there is no need to balance
			diffBeforeBalance := nodeID2MemUsageRate[sourceNodeID] - nodeID2MemUsageRate[dstNodeID]
			diffAfterBalance := dstNodeMemUsageRateAfterBalance - sourceNodeMemUsageRateAfterBalance
			if diffAfterBalance < diffBeforeBalance {
				if math.Abs(diffAfterBalance) < minMemDiffPercentage {
					selectedSegmentInfo = info
				}
			}
		}
	}

	if memoryInsufficient {
		return nil, errors.New("all QueryNode has insufficient memory")
	}

	return selectedSegmentInfo, nil
}