mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-11-30 02:48:45 +08:00
stop heartbeat if reach heartbeat limit (#26728)
Signed-off-by: Wei Liu <wei.liu@zilliz.com>
This commit is contained in:
parent
622077f9ad
commit
1097776477
@ -167,7 +167,7 @@ rootCoord:
|
||||
# Related configuration of proxy, used to validate client requests and reduce the returned results.
|
||||
proxy:
|
||||
timeTickInterval: 200 # ms, the interval that proxy synchronize the time tick
|
||||
healthCheckTimetout: 3000 # ms, the interval that to do component healthy check
|
||||
healthCheckTimeout: 3000 # ms, the timeout of each component health check
|
||||
msgStream:
|
||||
timeTick:
|
||||
bufSize: 512
|
||||
|
@ -213,7 +213,7 @@ func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) {
|
||||
b.metricsUpdateTs.Range(func(node int64, lastUpdateTs int64) bool {
|
||||
if now-lastUpdateTs > checkQueryNodeHealthInterval.Milliseconds() {
|
||||
futures = append(futures, pool.Submit(func() (any, error) {
|
||||
checkInterval := Params.ProxyCfg.HealthCheckTimetout.GetAsDuration(time.Millisecond)
|
||||
checkInterval := Params.ProxyCfg.HealthCheckTimeout.GetAsDuration(time.Millisecond)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), checkInterval)
|
||||
defer cancel()
|
||||
|
||||
@ -263,11 +263,25 @@ func (b *LookAsideBalancer) trySetQueryNodeUnReachable(node int64, err error) bo
|
||||
failures.Inc()
|
||||
b.failedHeartBeatCounter.Insert(node, failures)
|
||||
|
||||
log.Info("get component status failed",
|
||||
zap.Int64("node", node),
|
||||
zap.Int64("times", failures.Load()),
|
||||
zap.Error(err))
|
||||
|
||||
if failures.Load() < Params.ProxyCfg.RetryTimesOnHealthCheck.GetAsInt64() {
|
||||
log.Warn("get component status failed",
|
||||
zap.Int64("node", node),
|
||||
zap.Int64("times", failures.Load()),
|
||||
zap.Error(err))
|
||||
return false
|
||||
}
|
||||
// if the total time of consecutive heartbeat failures reaches the session.ttl, remove the offline query node
|
||||
limit := Params.CommonCfg.SessionTTL.GetAsDuration(time.Second).Seconds() /
|
||||
Params.ProxyCfg.HealthCheckTimeout.GetAsDuration(time.Millisecond).Seconds()
|
||||
if failures.Load() > Params.ProxyCfg.RetryTimesOnHealthCheck.GetAsInt64() && float64(failures.Load()) >= limit {
|
||||
log.Info("the heartbeat failures has reach it's upper limit, remove the query node",
|
||||
zap.Int64("nodeID", node))
|
||||
// stop the heartbeat
|
||||
b.metricsUpdateTs.GetAndRemove(node)
|
||||
b.metricsMap.GetAndRemove(node)
|
||||
b.executingTaskTotalNQ.GetAndRemove(node)
|
||||
b.unreachableQueryNodes.Remove(node)
|
||||
return false
|
||||
}
|
||||
|
||||
|
@ -359,6 +359,31 @@ func (suite *LookAsideBalancerSuite) TestNodeRecover() {
|
||||
}, 5*time.Second, 100*time.Millisecond)
|
||||
}
|
||||
|
||||
// TestNodeOffline checks that a query node whose component state stays
// Abnormal is first marked unreachable and is then dropped from the balancer:
// its entry disappears from metricsUpdateTs and, after that, from
// unreachableQueryNodes.
func (suite *LookAsideBalancerSuite) TestNodeOffline() {
|
||||
// SessionTTL is read in seconds and HealthCheckTimeout in milliseconds when the
// failure limit is computed, so these values give a limit of 10s / 1s = 10 failures.
Params.Save(Params.CommonCfg.SessionTTL.Key, "10")
|
||||
Params.Save(Params.ProxyCfg.HealthCheckTimeout.Key, "1000")
|
||||
// mock a qn that stays abnormal and never recovers
|
||||
qn3 := mocks.NewMockQueryNode(suite.T())
|
||||
suite.clientMgr.EXPECT().GetClient(mock.Anything, int64(3)).Return(qn3, nil)
|
||||
qn3.EXPECT().GetComponentStates(mock.Anything).Return(&milvuspb.ComponentStates{
|
||||
State: &milvuspb.ComponentInfo{
|
||||
StateCode: commonpb.StateCode_Abnormal,
|
||||
},
|
||||
}, nil)
|
||||
|
||||
// register node 3 so that the health check loop starts probing it
suite.balancer.metricsUpdateTs.Insert(3, time.Now().UnixMilli())
|
||||
// step 1: the node is reported as unreachable
suite.Eventually(func() bool {
|
||||
return suite.balancer.unreachableQueryNodes.Contain(3)
|
||||
}, 5*time.Second, 100*time.Millisecond)
|
||||
|
||||
// step 2: the node's heartbeat timestamp entry is removed
suite.Eventually(func() bool {
|
||||
return !suite.balancer.metricsUpdateTs.Contain(3)
|
||||
}, 10*time.Second, 100*time.Millisecond)
|
||||
// step 3: the node is no longer tracked as unreachable either
suite.Eventually(func() bool {
|
||||
return !suite.balancer.unreachableQueryNodes.Contain(3)
|
||||
}, time.Second, 100*time.Millisecond)
|
||||
}
|
||||
|
||||
// TestLookAsideBalancerSuite is the `go test` entry point; it passes a new
// LookAsideBalancerSuite to suite.Run.
func TestLookAsideBalancerSuite(t *testing.T) {
|
||||
suite.Run(t, new(LookAsideBalancerSuite))
|
||||
}
|
||||
|
@ -851,7 +851,7 @@ type proxyConfig struct {
|
||||
SoPath ParamItem `refreshable:"false"`
|
||||
|
||||
TimeTickInterval ParamItem `refreshable:"false"`
|
||||
HealthCheckTimetout ParamItem `refreshable:"true"`
|
||||
HealthCheckTimeout ParamItem `refreshable:"true"`
|
||||
MsgStreamTimeTickBufSize ParamItem `refreshable:"true"`
|
||||
MaxNameLength ParamItem `refreshable:"true"`
|
||||
MaxUsernameLength ParamItem `refreshable:"true"`
|
||||
@ -884,15 +884,16 @@ func (p *proxyConfig) init(base *BaseTable) {
|
||||
}
|
||||
p.TimeTickInterval.Init(base.mgr)
|
||||
|
||||
p.HealthCheckTimetout = ParamItem{
|
||||
Key: "proxy.healthCheckTimetout",
|
||||
p.HealthCheckTimeout = ParamItem{
|
||||
Key: "proxy.healthCheckTimeout",
|
||||
FallbackKeys: []string{"proxy.healthCheckTimetout"},
|
||||
Version: "2.3.0",
|
||||
DefaultValue: "3000",
|
||||
PanicIfEmpty: true,
|
||||
Doc: "ms, the interval that to do component healthy check",
|
||||
Export: true,
|
||||
}
|
||||
p.HealthCheckTimetout.Init(base.mgr)
|
||||
p.HealthCheckTimeout.Init(base.mgr)
|
||||
|
||||
p.MsgStreamTimeTickBufSize = ParamItem{
|
||||
Key: "proxy.msgStream.timeTick.bufSize",
|
||||
|
@ -131,7 +131,7 @@ func TestComponentParam(t *testing.T) {
|
||||
|
||||
t.Logf("TimeTickInterval: %v", &Params.TimeTickInterval)
|
||||
|
||||
t.Logf("healthCheckTimetout: %v", &Params.HealthCheckTimetout)
|
||||
t.Logf("healthCheckTimeout: %v", &Params.HealthCheckTimeout)
|
||||
|
||||
t.Logf("MsgStreamTimeTickBufSize: %d", Params.MsgStreamTimeTickBufSize.GetAsInt64())
|
||||
|
||||
@ -163,7 +163,7 @@ func TestComponentParam(t *testing.T) {
|
||||
assert.Equal(t, Params.CheckQueryNodeHealthInterval.GetAsInt(), 1000)
|
||||
assert.Equal(t, Params.CostMetricsExpireTime.GetAsInt(), 1000)
|
||||
assert.Equal(t, Params.RetryTimesOnReplica.GetAsInt(), 2)
|
||||
assert.EqualValues(t, Params.HealthCheckTimetout.GetAsInt64(), 3000)
|
||||
assert.EqualValues(t, Params.HealthCheckTimeout.GetAsInt64(), 3000)
|
||||
})
|
||||
|
||||
// t.Run("test proxyConfig panic", func(t *testing.T) {
|
||||
|
Loading…
Reference in New Issue
Block a user