fix nodeup block (#23634)

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
wei liu 2023-04-25 19:20:37 +08:00 committed by GitHub
parent f8ff97fe29
commit 1deac692a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 40 additions and 30 deletions

View File

@ -115,6 +115,7 @@ type Server struct {
activateFunc func() error
nodeUpEventChan chan int64
notifyNodeUp chan struct{}
}
func NewQueryCoord(ctx context.Context) (*Server, error) {
@ -123,6 +124,7 @@ func NewQueryCoord(ctx context.Context) (*Server, error) {
ctx: ctx,
cancel: cancel,
nodeUpEventChan: make(chan int64, 10240),
notifyNodeUp: make(chan struct{}),
}
server.UpdateStateCode(commonpb.StateCode_Abnormal)
server.queryNodeCreator = session.DefaultQueryNodeCreator
@ -628,6 +630,7 @@ func (s *Server) watchNodes(revision int64) {
)
s.nodeMgr.Add(session.NewNodeInfo(nodeID, addr))
s.nodeUpEventChan <- nodeID
s.notifyNodeUp <- struct{}{}
case sessionutil.SessionUpdateEvent:
nodeID := event.Session.ServerID
@ -651,39 +654,44 @@ func (s *Server) watchNodes(revision int64) {
}
// handleNodeUpLoop is the long-running loop that triggers handling of pending
// node-up events. It calls s.wg.Done() on exit and only returns once s.ctx is
// done.
//
// Handling is attempted in two situations:
//   - a signal arrives on s.notifyNodeUp (sent by watchNodes right after it
//     queues a node ID on s.nodeUpEventChan), and
//   - on every tick of the CheckHealthInterval ticker, which retries events
//     that an earlier attempt left queued.
//
// The health check and queue draining live in tryHandleNodeUp. Keeping them
// there means a failed health check only skips the current attempt instead of
// terminating this loop, and the per-attempt timeout context is released when
// that call returns rather than being deferred until the loop exits.
func (s *Server) handleNodeUpLoop() {
	log := log.Ctx(s.ctx).WithRateGroup("qcv2.Server", 1, 60)
	defer s.wg.Done()
	// small check interval value can reduce the latency of node up
	ticker := time.NewTicker(Params.QueryCoordCfg.CheckHealthInterval.GetAsDuration(time.Millisecond))
	defer ticker.Stop()
	for {
		select {
		case <-s.ctx.Done():
			log.Info("handle node up loop exit due to context done")
			return
		case <-s.notifyNodeUp:
			// a node was just queued: try to handle it without waiting for the next tick
			s.tryHandleNodeUp()
		case <-ticker.C:
			// periodic retry for events delayed by a previous failed health check
			s.tryHandleNodeUp()
		}
	}
}
// tryHandleNodeUp makes one attempt to process the node-up events queued on
// s.nodeUpEventChan.
//
// It first runs checkNodeHealth under a context bounded by
// CheckHealthRPCTimeout. If that check returns an error, a rate-limited
// warning is logged and the queued events are left untouched for a later
// attempt. Otherwise the queue is drained: each node ID still known to
// s.nodeMgr is passed to handleNodeUp, followed by a metrics-cache
// invalidation and a checker run; IDs no longer present are logged as
// already down and dropped.
func (s *Server) tryHandleNodeUp() {
	logger := log.Ctx(s.ctx).WithRateGroup("qcv2.Server", 1, 60)

	rpcTimeout := Params.QueryCoordCfg.CheckHealthRPCTimeout.GetAsDuration(time.Millisecond)
	checkCtx, cancelCheck := context.WithTimeout(s.ctx, rpcTimeout)
	defer cancelCheck()

	unhealthyReasons, checkErr := s.checkNodeHealth(checkCtx)
	if checkErr != nil {
		logger.RatedWarn(10, "unhealthy node exist, node up will be delayed",
			zap.Int("delayedNodeUpEvents", len(s.nodeUpEventChan)),
			zap.Int("unhealthyNodeNum", len(unhealthyReasons)),
			zap.Strings("unhealthyReason", unhealthyReasons))
		return
	}

	// The queue length is re-read on every iteration, so the loop stops as
	// soon as the channel is observed empty.
	for len(s.nodeUpEventChan) > 0 {
		upNodeID := <-s.nodeUpEventChan
		if s.nodeMgr.Get(upNodeID) == nil {
			logger.Warn("node already down",
				zap.Int64("nodeID", upNodeID))
			continue
		}

		// only if all nodes are healthy, node up event will be handled
		s.handleNodeUp(upNodeID)
		s.metricsCacheManager.InvalidateSystemInfoMetrics()
		s.checkerController.Check()
	}
}

View File

@ -177,6 +177,7 @@ func (suite *ServerSuite) TestNodeUp() {
suite.NoError(err)
defer node1.Stop()
suite.server.notifyNodeUp <- struct{}{}
suite.Eventually(func() bool {
node := suite.server.nodeMgr.Get(node1.ID)
if node == nil {
@ -191,9 +192,8 @@ func (suite *ServerSuite) TestNodeUp() {
return true
}, 5*time.Second, time.Second)
// mock node1 lost connection
fakeLostConnectionErr := errors.New("fake lost connection error")
node1.EXPECT().GetComponentStates(mock.Anything, mock.Anything).Return(nil, fakeLostConnectionErr)
// mock unhealthy node
suite.server.nodeMgr.Add(session.NewNodeInfo(1001, "localhost"))
node2 := mocks.NewMockQueryNode(suite.T(), suite.server.etcdCli, 101)
node2.EXPECT().GetDataDistribution(mock.Anything, mock.Anything).Return(&querypb.GetDataDistributionResponse{Status: merr.Status(nil)}, nil).Maybe()
@ -202,6 +202,7 @@ func (suite *ServerSuite) TestNodeUp() {
defer node2.Stop()
// expect node2 won't be added to qc, because unhealthy nodes exist
suite.server.notifyNodeUp <- struct{}{}
suite.Eventually(func() bool {
node := suite.server.nodeMgr.Get(node2.ID)
if node == nil {
@ -216,8 +217,9 @@ func (suite *ServerSuite) TestNodeUp() {
return false
}, 5*time.Second, time.Second)
// mock node1 down, so no unhealthy nodes exist
suite.server.nodeMgr.Remove(node1.ID)
// mock unhealthy node down, so no unhealthy nodes exist
suite.server.nodeMgr.Remove(1001)
suite.server.notifyNodeUp <- struct{}{}
// expect node2 will be added to qc
suite.Eventually(func() bool {