Fix read queue metrics and memory protection (#19787)

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
bigsheeper 2022-10-14 18:05:24 +08:00 committed by GitHub
parent a1db9038fb
commit 3a5aaeb7ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 56 additions and 22 deletions

View File

@ -846,10 +846,6 @@ func (s *Server) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest
zap.String("metric_type", metricType))
if metricType == metricsinfo.SystemInfoMetrics {
ret, err := s.metricsCacheManager.GetSystemInfoMetrics()
if err == nil && ret != nil {
return ret, nil
}
log.Debug("failed to get system info metrics from cache, recompute instead",
zap.Error(err))
@ -862,8 +858,6 @@ func (s *Server) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest
zap.Any("metrics", metrics), // TODO(dragondriver): necessary? may be very large
zap.Error(err))
s.metricsCacheManager.UpdateSystemInfoMetrics(metrics)
return metrics, nil
}

View File

@ -573,21 +573,15 @@ func (s *Server) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest
return resp, nil
}
metrics, err := s.metricsCacheManager.GetSystemInfoMetrics()
resp.Response, err = s.getSystemInfoMetrics(ctx, req)
if err != nil {
log.Warn("failed to read metrics from cache, re-calculate it", zap.Error(err))
metrics = resp
metrics.Response, err = s.getSystemInfoMetrics(ctx, req)
if err != nil {
msg := "failed to get system info metrics"
log.Warn(msg, zap.Error(err))
resp.Status = utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, msg, err)
return resp, nil
}
msg := "failed to get system info metrics"
log.Warn(msg, zap.Error(err))
resp.Status = utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, msg, err)
return resp, nil
}
s.metricsCacheManager.UpdateSystemInfoMetrics(metrics)
return metrics, nil
return resp, nil
}
func (s *Server) GetReplicas(ctx context.Context, req *milvuspb.GetReplicasRequest) (*milvuspb.GetReplicasResponse, error) {

View File

@ -64,6 +64,7 @@ func (stNode *serviceTimeNode) Operate(in []flowgraph.Msg) []flowgraph.Msg {
// should not happen, QueryNode should addTSafe before start flow graph
panic(fmt.Errorf("serviceTimeNode setTSafe timeout, collectionID = %d, err = %s", stNode.collectionID, err))
}
rateCol.updateTSafe(stNode.vChannel, serviceTimeMsg.timeRange.timestampMax)
p, _ := tsoutil.ParseTS(serviceTimeMsg.timeRange.timestampMax)
log.RatedDebug(10.0, "update tSafe:",
zap.Any("collectionID", stNode.collectionID),

View File

@ -47,6 +47,8 @@ func (q *queryTask) PreExecute(ctx context.Context) error {
if !funcutil.CheckCtxValid(q.Ctx()) {
return errors.New("search context timeout1$")
}
q.SetStep(TaskStepPreExecute)
rateCol.rtCounter.increaseQueueTime(q)
return nil
}

View File

@ -78,7 +78,6 @@ func (b *baseReadTask) SetStep(step TaskStep) {
b.tr.Record("enqueueStart")
case TaskStepPreExecute:
b.queueDur = b.tr.Record("enqueueEnd")
rateCol.rtCounter.increaseQueueTime(b)
}
}

View File

@ -60,8 +60,10 @@ type searchTask struct {
func (s *searchTask) PreExecute(ctx context.Context) error {
s.SetStep(TaskStepPreExecute)
rateCol.rtCounter.increaseQueueTime(s)
for _, t := range s.otherTasks {
t.SetStep(TaskStepPreExecute)
rateCol.rtCounter.increaseQueueTime(t)
}
s.combinePlaceHolderGroups()
return nil

View File

@ -541,7 +541,7 @@ func (q *QuotaCenter) memoryToWaterLevel() float64 {
zap.Float64("QueryNodeMemoryHighWaterLevel", queryNodeMemoryHighWaterLevel))
return 0
}
p := (memoryWaterLevel - queryNodeMemoryLowWaterLevel) / (queryNodeMemoryHighWaterLevel - queryNodeMemoryLowWaterLevel)
p := (queryNodeMemoryHighWaterLevel - memoryWaterLevel) / (queryNodeMemoryHighWaterLevel - queryNodeMemoryLowWaterLevel)
if p < factor {
factor = p
}
@ -558,7 +558,7 @@ func (q *QuotaCenter) memoryToWaterLevel() float64 {
zap.Float64("DataNodeMemoryHighWaterLevel", dataNodeMemoryHighWaterLevel))
return 0
}
p := (memoryWaterLevel - dataNodeMemoryLowWaterLevel) / (dataNodeMemoryHighWaterLevel - dataNodeMemoryLowWaterLevel)
p := (dataNodeMemoryHighWaterLevel - memoryWaterLevel) / (dataNodeMemoryHighWaterLevel - dataNodeMemoryLowWaterLevel)
if p < factor {
factor = p
}

View File

@ -344,7 +344,7 @@ func TestQuotaCenter(t *testing.T) {
Params.QuotaConfig.ForceDenyWriting = forceBak
})
t.Run("test memoryToWaterLevel", func(t *testing.T) {
t.Run("test memoryToWaterLevel basic", func(t *testing.T) {
quotaCenter := NewQuotaCenter(pcm, &queryCoordMockForQuota{}, &dataCoordMockForQuota{}, core.tsoAllocator)
factor := quotaCenter.memoryToWaterLevel()
assert.Equal(t, float64(1), factor)
@ -356,6 +356,48 @@ func TestQuotaCenter(t *testing.T) {
assert.Equal(t, float64(0), factor)
})
t.Run("test memoryToWaterLevel factors", func(t *testing.T) {
quotaCenter := NewQuotaCenter(pcm, &queryCoordMockForQuota{}, &dataCoordMockForQuota{}, core.tsoAllocator)
type memCase struct {
lowWater float64
highWater float64
memUsage uint64
memTotal uint64
expectedFactor float64
}
memCases := []memCase{
{0.8, 0.9, 80, 100, 1},
{0.8, 0.9, 82, 100, 0.8},
{0.8, 0.9, 85, 100, 0.5},
{0.8, 0.9, 88, 100, 0.2},
{0.8, 0.9, 90, 100, 0},
{0.85, 0.95, 85, 100, 1},
{0.85, 0.95, 87, 100, 0.8},
{0.85, 0.95, 90, 100, 0.5},
{0.85, 0.95, 93, 100, 0.2},
{0.85, 0.95, 95, 100, 0},
}
lowBackup := Params.QuotaConfig.DataNodeMemoryLowWaterLevel
highBackup := Params.QuotaConfig.DataNodeMemoryHighWaterLevel
for i, c := range memCases {
Params.QuotaConfig.QueryNodeMemoryLowWaterLevel = c.lowWater
Params.QuotaConfig.QueryNodeMemoryHighWaterLevel = c.highWater
quotaCenter.queryNodeMetrics = []*metricsinfo.QueryNodeQuotaMetrics{{
Hms: metricsinfo.HardwareMetrics{MemoryUsage: c.memUsage, Memory: c.memTotal}}}
factor := quotaCenter.memoryToWaterLevel()
if math.Abs(factor-c.expectedFactor) > 0.000001 {
t.Errorf("case %d failed: waterLever[low:%v, high:%v], memMetric[used:%d, total:%d], expectedFactor: %f, actualFactor: %f",
i, c.lowWater, c.highWater, c.memUsage, c.memTotal, c.expectedFactor, factor)
}
}
Params.QuotaConfig.QueryNodeMemoryLowWaterLevel = lowBackup
Params.QuotaConfig.QueryNodeMemoryHighWaterLevel = highBackup
})
t.Run("test diskQuotaExceeded", func(t *testing.T) {
quotaCenter := NewQuotaCenter(pcm, &queryCoordMockForQuota{}, &dataCoordMockForQuota{}, core.tsoAllocator)