commit 1deac692a0 (parent f8ff97fe29)
@@ -115,6 +115,7 @@ type Server struct {
 	activateFunc func() error

 	nodeUpEventChan chan int64
+	notifyNodeUp    chan struct{}
 }

 func NewQueryCoord(ctx context.Context) (*Server, error) {
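Taken together, the hunks show what this commit does: `nodeUpEventChan` (already present, buffered to 10240 entries) queues the IDs of nodes that came up, while the new `notifyNodeUp` channel lets `watchNodes` wake `handleNodeUpLoop` immediately, instead of leaving queued events to wait for the next health-check tick.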
@@ -123,6 +124,7 @@ func NewQueryCoord(ctx context.Context) (*Server, error) {
 		ctx:             ctx,
 		cancel:          cancel,
 		nodeUpEventChan: make(chan int64, 10240),
+		notifyNodeUp:    make(chan struct{}),
 	}
 	server.UpdateStateCode(commonpb.StateCode_Abnormal)
 	server.queryNodeCreator = session.DefaultQueryNodeCreator
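The combination of a buffered event queue, an unbuffered wake-up channel, and a ticker fallback is a common Go pattern. Below is a minimal, self-contained sketch of it; the names (`eventLoop`, `drain`, `wake`) are illustrative, not taken from Milvus.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// eventLoop drains queued node IDs whenever it is woken up, either by a
// notification on wake (low-latency path) or by the periodic ticker fallback.
func eventLoop(ctx context.Context, events chan int64, wake chan struct{}) {
	ticker := time.NewTicker(100 * time.Millisecond) // fallback wake-up
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-wake: // immediate wake-up from the producer
			drain(events)
		case <-ticker.C: // periodic retry in case a wake-up was missed
			drain(events)
		}
	}
}

// drain handles every event currently sitting in the buffered channel.
func drain(events chan int64) {
	for len(events) > 0 {
		fmt.Println("handling node", <-events)
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	events := make(chan int64, 16) // buffered: producers never block on enqueue
	wake := make(chan struct{})    // unbuffered: a send hands off to the loop
	go eventLoop(ctx, events, wake)

	events <- 42
	wake <- struct{}{} // wake the loop now instead of waiting for the tick
	<-ctx.Done()
}
```

Note that the wake-up channel is unbuffered, so a send blocks until the loop is back in its `select`; a 1-buffered channel with a non-blocking send is a common variant when the producer must never stall.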
@@ -628,6 +630,7 @@ func (s *Server) watchNodes(revision int64) {
 			)
 			s.nodeMgr.Add(session.NewNodeInfo(nodeID, addr))
 			s.nodeUpEventChan <- nodeID
+			s.notifyNodeUp <- struct{}{}

 		case sessionutil.SessionUpdateEvent:
 			nodeID := event.Session.ServerID
@@ -651,39 +654,44 @@ func (s *Server) watchNodes(revision int64) {
 }

 func (s *Server) handleNodeUpLoop() {
 	log := log.Ctx(s.ctx).WithRateGroup("qcv2.Server", 1, 60)
 	defer s.wg.Done()
 	// small check interval value can reduce the latency of node up
 	ticker := time.NewTicker(Params.QueryCoordCfg.CheckHealthInterval.GetAsDuration(time.Millisecond))
 	defer ticker.Stop()
 	for {
 		select {
 		case <-s.ctx.Done():
 			log.Info("handle node up loop exit due to context done")
 			return
+		case <-s.notifyNodeUp:
+			s.tryHandleNodeUp()
 		case <-ticker.C:
-			ctx, cancel := context.WithTimeout(s.ctx, Params.QueryCoordCfg.CheckHealthRPCTimeout.GetAsDuration(time.Millisecond))
-			defer cancel()
-			reasons, err := s.checkNodeHealth(ctx)
-			if err != nil {
-				log.RatedWarn(10, "unhealthy node exist, node up will be delayed",
-					zap.Int("delayedNodeUpEvents", len(s.nodeUpEventChan)),
-					zap.Int("unhealthyNodeNum", len(reasons)),
-					zap.Strings("unhealthyReason", reasons))
-				return
-			}
-			for len(s.nodeUpEventChan) > 0 {
-				nodeID := <-s.nodeUpEventChan
-				if s.nodeMgr.Get(nodeID) != nil {
-					// only if all nodes are healthy, node up event will be handled
-					s.handleNodeUp(nodeID)
-					s.metricsCacheManager.InvalidateSystemInfoMetrics()
-					s.checkerController.Check()
-				} else {
-					log.Warn("node already down",
-						zap.Int64("nodeID", nodeID))
-				}
-			}
+			s.tryHandleNodeUp()
 		}
 	}
 }

+func (s *Server) tryHandleNodeUp() {
+	log := log.Ctx(s.ctx).WithRateGroup("qcv2.Server", 1, 60)
+	ctx, cancel := context.WithTimeout(s.ctx, Params.QueryCoordCfg.CheckHealthRPCTimeout.GetAsDuration(time.Millisecond))
+	defer cancel()
+	reasons, err := s.checkNodeHealth(ctx)
+	if err != nil {
+		log.RatedWarn(10, "unhealthy node exist, node up will be delayed",
+			zap.Int("delayedNodeUpEvents", len(s.nodeUpEventChan)),
+			zap.Int("unhealthyNodeNum", len(reasons)),
+			zap.Strings("unhealthyReason", reasons))
+		return
+	}
+	for len(s.nodeUpEventChan) > 0 {
+		nodeID := <-s.nodeUpEventChan
+		if s.nodeMgr.Get(nodeID) != nil {
+			// only if all nodes are healthy, node up event will be handled
+			s.handleNodeUp(nodeID)
+			s.metricsCacheManager.InvalidateSystemInfoMetrics()
+			s.checkerController.Check()
+		} else {
+			log.Warn("node already down",
+				zap.Int64("nodeID", nodeID))
+		}
+	}
+}
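Besides adding the fast path, extracting the ticker body into `tryHandleNodeUp` quietly fixes two issues visible in the deleted lines: the `return` on a failed health check used to terminate `handleNodeUpLoop` entirely (now it only ends one `tryHandleNodeUp` call), and `defer cancel()` sat inside the `for` loop, where deferred calls pile up until the enclosing function returns. The sketch below demonstrates that second point; it is a generic illustration, not Milvus code.

```go
package main

import "fmt"

// leaky shows that defer runs at function return, not at loop-iteration end:
// all three deferred calls fire only after the loop has finished.
func leaky() {
	for i := 0; i < 3; i++ {
		defer fmt.Println("cancel", i) // accumulates until leaky returns
	}
	fmt.Println("loop done")
}

// scoped moves the deferred call into a helper function, so it runs once per
// iteration — the effect of extracting the loop body into tryHandleNodeUp.
func scoped() {
	for i := 0; i < 3; i++ {
		func(i int) {
			defer fmt.Println("cancel", i) // runs when this call returns
			// ... per-iteration work ...
		}(i)
	}
	fmt.Println("loop done")
}

func main() {
	leaky()  // prints "loop done", then cancel 2, 1, 0
	scoped() // prints cancel 0, 1, 2, then "loop done"
}
```

The remaining hunks are from the accompanying `ServerSuite` test, which exercises the new wake-up path.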
@@ -177,6 +177,7 @@ func (suite *ServerSuite) TestNodeUp() {
 	suite.NoError(err)
 	defer node1.Stop()

+	suite.server.notifyNodeUp <- struct{}{}
 	suite.Eventually(func() bool {
 		node := suite.server.nodeMgr.Get(node1.ID)
 		if node == nil {
@@ -191,9 +192,8 @@ func (suite *ServerSuite) TestNodeUp() {
 		return true
 	}, 5*time.Second, time.Second)

-	// mock node1 lost connection
-	fakeLostConnectionErr := errors.New("fake lost connection error")
-	node1.EXPECT().GetComponentStates(mock.Anything, mock.Anything).Return(nil, fakeLostConnectionErr)
+	// mock unhealthy node
+	suite.server.nodeMgr.Add(session.NewNodeInfo(1001, "localhost"))

 	node2 := mocks.NewMockQueryNode(suite.T(), suite.server.etcdCli, 101)
 	node2.EXPECT().GetDataDistribution(mock.Anything, mock.Anything).Return(&querypb.GetDataDistributionResponse{Status: merr.Status(nil)}, nil).Maybe()
@@ -202,6 +202,7 @@ func (suite *ServerSuite) TestNodeUp() {
 	defer node2.Stop()

 	// expect node2 won't be add to qc, due to unhealthy nodes exist
+	suite.server.notifyNodeUp <- struct{}{}
 	suite.Eventually(func() bool {
 		node := suite.server.nodeMgr.Get(node2.ID)
 		if node == nil {
@@ -216,8 +217,9 @@ func (suite *ServerSuite) TestNodeUp() {
 		return false
 	}, 5*time.Second, time.Second)

-	// mock node1 down, so no unhealthy nodes exist
-	suite.server.nodeMgr.Remove(node1.ID)
+	// mock unhealthy node down, so no unhealthy nodes exist
+	suite.server.nodeMgr.Remove(1001)
+	suite.server.notifyNodeUp <- struct{}{}

 	// expect node2 will be add to qc
 	suite.Eventually(func() bool {
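The test hunks above rely on testify's `Eventually`, which polls a condition at a fixed tick until it returns true or the overall deadline expires; the suite method `suite.Eventually(...)` wraps the same assertion. A minimal standalone usage, for reference:

```go
package main

import (
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
)

// TestEventually polls the condition every 100ms until it returns true or the
// 5s deadline expires — the same mechanism TestNodeUp uses to wait for a node
// to be registered (or to assert that it never is).
func TestEventually(t *testing.T) {
	start := time.Now()
	assert.Eventually(t, func() bool {
		// condition becomes true after roughly 300ms
		return time.Since(start) > 300*time.Millisecond
	}, 5*time.Second, 100*time.Millisecond)
}
```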