Add metrics for garbage collection (#27303)

Also fix the compaction latency metric, which was recorded in seconds, to use milliseconds

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
This commit is contained in:
congqixia 2023-09-22 18:47:25 +08:00 committed by GitHub
parent 8f4aaa2da8
commit 1d76565894
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 36 additions and 4 deletions

View File

@ -18,6 +18,7 @@ package datacoord
import (
"context"
"fmt"
"path"
"sort"
"strings"
@ -33,7 +34,9 @@ import (
"github.com/milvus-io/milvus/internal/storage"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/metrics"
"github.com/milvus-io/milvus/pkg/util/metautil"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
@ -141,9 +144,10 @@ func (gc *garbageCollector) scan() {
prefixes = append(prefixes, path.Join(gc.option.cli.RootPath(), common.SegmentInsertLogPath))
prefixes = append(prefixes, path.Join(gc.option.cli.RootPath(), common.SegmentStatslogPath))
prefixes = append(prefixes, path.Join(gc.option.cli.RootPath(), common.SegmentDeltaLogPath))
labels := []string{metrics.InsertFileLabel, metrics.StatFileLabel, metrics.DeleteFileLabel}
var removedKeys []string
for _, prefix := range prefixes {
for idx, prefix := range prefixes {
startTs := time.Now()
infoKeys, modTimes, err := gc.option.cli.ListWithPrefix(ctx, prefix, true)
if err != nil {
@ -152,8 +156,12 @@ func (gc *garbageCollector) scan() {
zap.Error(err),
)
}
cost := time.Since(startTs)
segmentMap, filesMap := getMetaMap()
log.Info("gc scan finish list object", zap.String("prefix", prefix), zap.Duration("time spent", time.Since(startTs)), zap.Int("keys", len(infoKeys)))
metrics.GarbageCollectorListLatency.
WithLabelValues(fmt.Sprint(paramtable.GetNodeID()), labels[idx]).
Observe(float64(cost.Milliseconds()))
log.Info("gc scan finish list object", zap.String("prefix", prefix), zap.Duration("time spent", cost), zap.Int("keys", len(infoKeys)))
for i, infoKey := range infoKeys {
total++
_, has := filesMap[infoKey]
@ -191,6 +199,7 @@ func (gc *garbageCollector) scan() {
}
}
}
metrics.GarbageCollectorRunCount.WithLabelValues(fmt.Sprint(paramtable.GetNodeID())).Add(1)
log.Info("scan file to do garbage collection",
zap.Int("total", total),
zap.Int("valid", valid),

View File

@ -712,7 +712,7 @@ func (t *compactionTask) compact() (*datapb.CompactionResult, error) {
)
log.Info("compact overall elapse", zap.Duration("elapse", time.Since(compactStart)))
metrics.DataNodeCompactionLatency.WithLabelValues(fmt.Sprint(paramtable.GetNodeID())).Observe(t.tr.ElapseSpan().Seconds())
metrics.DataNodeCompactionLatency.WithLabelValues(fmt.Sprint(paramtable.GetNodeID())).Observe(float64(t.tr.ElapseSpan().Milliseconds()))
metrics.DataNodeCompactionLatencyInQueue.WithLabelValues(fmt.Sprint(paramtable.GetNodeID())).Observe(float64(durInQueue.Milliseconds()))
return pack, nil

View File

@ -149,6 +149,26 @@ var (
Buckets: buckets,
}, []string{segmentFileTypeLabelName})
/* garbage collector related metrics */
// GarbageCollectorListLatency is a histogram of the time spent listing
// objects in storage while garbage collecting, in milliseconds.
// It is labeled by node ID and segment file type, and takes its bucket
// boundaries from longTaskBuckets.
GarbageCollectorListLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: milvusNamespace,
Subsystem: typeutil.DataCoordRole,
Name: "gc_list_latency",
Help: "latency of list objects in storage while garbage collecting (in milliseconds)",
Buckets: longTaskBuckets,
}, []string{nodeIDLabelName, segmentFileTypeLabelName})
// GarbageCollectorRunCount is a counter of garbage collection runs,
// labeled by node ID.
GarbageCollectorRunCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: milvusNamespace,
Subsystem: typeutil.DataCoordRole,
Name: "gc_run_count",
Help: "garbage collection running count",
}, []string{nodeIDLabelName})
/* hard to implement, commented now
DataCoordSegmentSizeRatio = prometheus.NewHistogramVec(
prometheus.HistogramOpts{

View File

@ -154,7 +154,7 @@ var (
Subsystem: typeutil.DataNodeRole,
Name: "compaction_latency",
Help: "latency of compaction operation",
Buckets: []float64{0.001, 0.1, 0.5, 1, 5, 10, 20, 50, 100, 250, 500, 1000, 3600, 5000, 10000}, // unit seconds
Buckets: longTaskBuckets,
}, []string{
nodeIDLabelName,
})

View File

@ -95,6 +95,9 @@ var (
// [1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 1.31072e+05]
buckets = prometheus.ExponentialBuckets(1, 2, 18)
// longTaskBuckets holds histogram bucket boundaries, in milliseconds, for
// long-running task durations. The boundaries range from 1 ms up to
// 10,000,000 ms (about 2.8 hours), so values observed against these buckets
// must also be expressed in milliseconds.
longTaskBuckets = []float64{1, 100, 500, 1000, 5000, 10000, 20000, 50000, 100000, 250000, 500000, 1000000, 3600000, 5000000, 10000000} // unit milliseconds
NumNodes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: milvusNamespace,