mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-02 03:48:37 +08:00
fix: Clean offline node from resource group after qc restart (#33232)
issue: #33200 #33207 pr#33104 causes offline nodes to be kept in the resource group after qc recovers; those offline nodes are then assigned to new replicas as rwNodes, and requests sent to them fail with NodeNotFound. Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
parent
3d105fcb4d
commit
303470fc35
@ -456,7 +456,7 @@ func (s *Server) startQueryCoord() error {
|
|||||||
s.nodeMgr.Stopping(node.ServerID)
|
s.nodeMgr.Stopping(node.ServerID)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
s.checkReplicas()
|
s.checkNodeStateInRG()
|
||||||
for _, node := range sessions {
|
for _, node := range sessions {
|
||||||
s.handleNodeUp(node.ServerID)
|
s.handleNodeUp(node.ServerID)
|
||||||
}
|
}
|
||||||
@ -778,28 +778,15 @@ func (s *Server) handleNodeDown(node int64) {
|
|||||||
s.meta.ResourceManager.HandleNodeDown(node)
|
s.meta.ResourceManager.HandleNodeDown(node)
|
||||||
}
|
}
|
||||||
|
|
||||||
// checkReplicas checks whether replica contains offline node, and remove those nodes
|
func (s *Server) checkNodeStateInRG() {
|
||||||
func (s *Server) checkReplicas() {
|
for _, rgName := range s.meta.ListResourceGroups() {
|
||||||
for _, collection := range s.meta.CollectionManager.GetAll() {
|
rg := s.meta.ResourceManager.GetResourceGroup(rgName)
|
||||||
log := log.With(zap.Int64("collectionID", collection))
|
for _, node := range rg.GetNodes() {
|
||||||
replicas := s.meta.ReplicaManager.GetByCollection(collection)
|
info := s.nodeMgr.Get(node)
|
||||||
for _, replica := range replicas {
|
if info == nil {
|
||||||
toRemove := make([]int64, 0)
|
s.meta.ResourceManager.HandleNodeDown(node)
|
||||||
for _, node := range replica.GetNodes() {
|
} else if info.IsStoppingState() {
|
||||||
if s.nodeMgr.Get(node) == nil {
|
s.meta.ResourceManager.HandleNodeStopping(node)
|
||||||
toRemove = append(toRemove, node)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(toRemove) > 0 {
|
|
||||||
log := log.With(
|
|
||||||
zap.Int64("replicaID", replica.GetID()),
|
|
||||||
zap.Int64s("offlineNodes", toRemove),
|
|
||||||
)
|
|
||||||
log.Info("some nodes are offline, remove them from replica", zap.Any("toRemove", toRemove))
|
|
||||||
if err := s.meta.ReplicaManager.RemoveNode(replica.GetID(), toRemove...); err != nil {
|
|
||||||
log.Warn("failed to remove offline nodes from replica")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user