stop heartbeat if reach heartbeat limit (#26728)

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
wei liu 2023-09-04 17:51:48 +08:00 committed by GitHub
parent 622077f9ad
commit 1097776477
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 52 additions and 12 deletions

View File

@ -167,7 +167,7 @@ rootCoord:
# Related configuration of proxy, used to validate client requests and reduce the returned results.
proxy:
timeTickInterval: 200 # ms, the interval that proxy synchronize the time tick
healthCheckTimetout: 3000 # ms, the interval that to do component healthy check
healthCheckTimeout: 3000 # ms, the timeout of each component health check
msgStream:
timeTick:
bufSize: 512

View File

@ -213,7 +213,7 @@ func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) {
b.metricsUpdateTs.Range(func(node int64, lastUpdateTs int64) bool {
if now-lastUpdateTs > checkQueryNodeHealthInterval.Milliseconds() {
futures = append(futures, pool.Submit(func() (any, error) {
checkInterval := Params.ProxyCfg.HealthCheckTimetout.GetAsDuration(time.Millisecond)
checkInterval := Params.ProxyCfg.HealthCheckTimeout.GetAsDuration(time.Millisecond)
ctx, cancel := context.WithTimeout(context.Background(), checkInterval)
defer cancel()
@ -263,11 +263,25 @@ func (b *LookAsideBalancer) trySetQueryNodeUnReachable(node int64, err error) bo
failures.Inc()
b.failedHeartBeatCounter.Insert(node, failures)
log.Info("get component status failed",
zap.Int64("node", node),
zap.Int64("times", failures.Load()),
zap.Error(err))
if failures.Load() < Params.ProxyCfg.RetryTimesOnHealthCheck.GetAsInt64() {
log.Warn("get component status failed",
zap.Int64("node", node),
zap.Int64("times", failures.Load()),
zap.Error(err))
return false
}
// if the total duration of consecutive heartbeat failures reaches session.ttl, remove the offline query node
limit := Params.CommonCfg.SessionTTL.GetAsDuration(time.Second).Seconds() /
Params.ProxyCfg.HealthCheckTimeout.GetAsDuration(time.Millisecond).Seconds()
if failures.Load() > Params.ProxyCfg.RetryTimesOnHealthCheck.GetAsInt64() && float64(failures.Load()) >= limit {
log.Info("the heartbeat failures has reach it's upper limit, remove the query node",
zap.Int64("nodeID", node))
// stop the heartbeat
b.metricsUpdateTs.GetAndRemove(node)
b.metricsMap.GetAndRemove(node)
b.executingTaskTotalNQ.GetAndRemove(node)
b.unreachableQueryNodes.Remove(node)
return false
}

View File

@ -359,6 +359,31 @@ func (suite *LookAsideBalancerSuite) TestNodeRecover() {
}, 5*time.Second, 100*time.Millisecond)
}
// TestNodeOffline checks how the balancer treats a query node whose health
// check keeps reporting an abnormal state: the node must first be flagged as
// unreachable, and must later be dropped from both the heartbeat bookkeeping
// (metricsUpdateTs) and the unreachable set.
func (suite *LookAsideBalancerSuite) TestNodeOffline() {
	const offlineNodeID int64 = 3

	// Shrink the session TTL and the health-check timeout so the test finishes quickly.
	// NOTE(review): these overrides are not restored afterwards — confirm that
	// later tests do not depend on the default values.
	Params.Save(Params.CommonCfg.SessionTTL.Key, "10")
	Params.Save(Params.ProxyCfg.HealthCheckTimeout.Key, "1000")

	// Mock a query node that reports an abnormal state on every health check;
	// unlike the recovery scenario, it never becomes healthy again.
	offlineNode := mocks.NewMockQueryNode(suite.T())
	suite.clientMgr.EXPECT().GetClient(mock.Anything, offlineNodeID).Return(offlineNode, nil)
	abnormalStates := &milvuspb.ComponentStates{
		State: &milvuspb.ComponentInfo{
			StateCode: commonpb.StateCode_Abnormal,
		},
	}
	offlineNode.EXPECT().GetComponentStates(mock.Anything).Return(abnormalStates, nil)

	// Register the node so the balancer has a heartbeat timestamp for it.
	suite.balancer.metricsUpdateTs.Insert(offlineNodeID, time.Now().UnixMilli())

	markedUnreachable := func() bool {
		return suite.balancer.unreachableQueryNodes.Contain(offlineNodeID)
	}
	heartbeatStopped := func() bool {
		return !suite.balancer.metricsUpdateTs.Contain(offlineNodeID)
	}
	unreachableCleared := func() bool {
		return !suite.balancer.unreachableQueryNodes.Contain(offlineNodeID)
	}

	// Phase 1: the node is flagged as unreachable.
	suite.Eventually(markedUnreachable, 5*time.Second, 100*time.Millisecond)
	// Phase 2: the node's heartbeat entry is removed.
	suite.Eventually(heartbeatStopped, 10*time.Second, 100*time.Millisecond)
	// Phase 3: the node no longer lingers in the unreachable set.
	suite.Eventually(unreachableCleared, time.Second, 100*time.Millisecond)
}
func TestLookAsideBalancerSuite(t *testing.T) {
suite.Run(t, new(LookAsideBalancerSuite))
}

View File

@ -851,7 +851,7 @@ type proxyConfig struct {
SoPath ParamItem `refreshable:"false"`
TimeTickInterval ParamItem `refreshable:"false"`
HealthCheckTimetout ParamItem `refreshable:"true"`
HealthCheckTimeout ParamItem `refreshable:"true"`
MsgStreamTimeTickBufSize ParamItem `refreshable:"true"`
MaxNameLength ParamItem `refreshable:"true"`
MaxUsernameLength ParamItem `refreshable:"true"`
@ -884,15 +884,16 @@ func (p *proxyConfig) init(base *BaseTable) {
}
p.TimeTickInterval.Init(base.mgr)
p.HealthCheckTimetout = ParamItem{
Key: "proxy.healthCheckTimetout",
p.HealthCheckTimeout = ParamItem{
Key: "proxy.healthCheckTimeout",
FallbackKeys: []string{"proxy.healthCheckTimetout"},
Version: "2.3.0",
DefaultValue: "3000",
PanicIfEmpty: true,
Doc: "ms, the interval that to do component healthy check",
Export: true,
}
p.HealthCheckTimetout.Init(base.mgr)
p.HealthCheckTimeout.Init(base.mgr)
p.MsgStreamTimeTickBufSize = ParamItem{
Key: "proxy.msgStream.timeTick.bufSize",

View File

@ -131,7 +131,7 @@ func TestComponentParam(t *testing.T) {
t.Logf("TimeTickInterval: %v", &Params.TimeTickInterval)
t.Logf("healthCheckTimetout: %v", &Params.HealthCheckTimetout)
t.Logf("healthCheckTimeout: %v", &Params.HealthCheckTimeout)
t.Logf("MsgStreamTimeTickBufSize: %d", Params.MsgStreamTimeTickBufSize.GetAsInt64())
@ -163,7 +163,7 @@ func TestComponentParam(t *testing.T) {
assert.Equal(t, Params.CheckQueryNodeHealthInterval.GetAsInt(), 1000)
assert.Equal(t, Params.CostMetricsExpireTime.GetAsInt(), 1000)
assert.Equal(t, Params.RetryTimesOnReplica.GetAsInt(), 2)
assert.EqualValues(t, Params.HealthCheckTimetout.GetAsInt64(), 3000)
assert.EqualValues(t, Params.HealthCheckTimeout.GetAsInt64(), 3000)
})
// t.Run("test proxyConfig panic", func(t *testing.T) {