Fix bug for statistical metrics is incorrect (#19357)

Signed-off-by: cai.zhang <cai.zhang@zilliz.com>

Signed-off-by: cai.zhang <cai.zhang@zilliz.com>
This commit is contained in:
cai.zhang 2022-09-28 11:02:54 +08:00 committed by GitHub
parent adeaac4f9a
commit b648034cee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 138 additions and 19 deletions

View File

@ -20,8 +20,11 @@ import (
"context"
"errors"
"fmt"
"strconv"
"sync"
"github.com/prometheus/client_golang/prometheus"
"github.com/golang/protobuf/proto"
"go.uber.org/zap"
@ -173,6 +176,37 @@ func (mt *metaTable) updateSegIndexMeta(segIdx *model.SegmentIndex, updateFunc f
return updateFunc(model.CloneSegmentIndex(segIdx))
}
func (mt *metaTable) updateIndexTasksMetrics() {
taskMetrics := make(map[UniqueID]map[commonpb.IndexState]int)
for _, segIdx := range mt.buildID2SegmentIndex {
if segIdx.IsDeleted {
continue
}
if _, ok := taskMetrics[segIdx.CollectionID]; !ok {
taskMetrics[segIdx.CollectionID] = make(map[commonpb.IndexState]int)
taskMetrics[segIdx.CollectionID][commonpb.IndexState_Unissued] = 0
taskMetrics[segIdx.CollectionID][commonpb.IndexState_InProgress] = 0
taskMetrics[segIdx.CollectionID][commonpb.IndexState_Finished] = 0
taskMetrics[segIdx.CollectionID][commonpb.IndexState_Failed] = 0
}
taskMetrics[segIdx.CollectionID][segIdx.IndexState]++
}
for collID, m := range taskMetrics {
for k, v := range m {
switch k {
case commonpb.IndexState_Unissued:
metrics.IndexCoordIndexTaskNum.WithLabelValues(strconv.FormatInt(collID, 10), metrics.UnissuedIndexTaskLabel).Set(float64(v))
case commonpb.IndexState_InProgress:
metrics.IndexCoordIndexTaskNum.WithLabelValues(strconv.FormatInt(collID, 10), metrics.InProgressIndexTaskLabel).Set(float64(v))
case commonpb.IndexState_Finished:
metrics.IndexCoordIndexTaskNum.WithLabelValues(strconv.FormatInt(collID, 10), metrics.FinishedIndexTaskLabel).Set(float64(v))
case commonpb.IndexState_Failed:
metrics.IndexCoordIndexTaskNum.WithLabelValues(strconv.FormatInt(collID, 10), metrics.FailedIndexTaskLabel).Set(float64(v))
}
}
}
}
func (mt *metaTable) GetAllIndexMeta() map[int64]*model.SegmentIndex {
mt.segmentIndexLock.RLock()
defer mt.segmentIndexLock.RUnlock()
@ -273,7 +307,7 @@ func (mt *metaTable) AddIndex(segIndex *model.SegmentIndex) error {
}
segIndex.IndexState = commonpb.IndexState_Unissued
metrics.IndexCoordIndexTaskCounter.WithLabelValues(metrics.UnissuedIndexTaskLabel).Inc()
metrics.IndexCoordIndexTaskNum.WithLabelValues(strconv.FormatInt(segIndex.CollectionID, 10), metrics.UnissuedIndexTaskLabel).Inc()
if err := mt.saveSegmentIndexMeta(segIndex); err != nil {
// no need to reload, no reason to compare version fail
log.Error("IndexCoord metaTable save index meta failed", zap.Int64("buildID", buildID),
@ -372,11 +406,14 @@ func (mt *metaTable) BuildIndex(buildID UniqueID) error {
log.Error("IndexCoord metaTable BuildIndex fail", zap.Int64("buildID", segIdx.BuildID), zap.Error(err))
return err
}
metrics.IndexCoordIndexTaskCounter.WithLabelValues(metrics.UnissuedIndexTaskLabel).Dec()
metrics.IndexCoordIndexTaskCounter.WithLabelValues(metrics.InProgressIndexTaskLabel).Inc()
return nil
}
return mt.updateSegIndexMeta(segIdx, updateFunc)
if err := mt.updateSegIndexMeta(segIdx, updateFunc); err != nil {
return err
}
mt.updateIndexTasksMetrics()
return nil
}
// GetIndexesForCollection gets all indexes info with the specified collection.
@ -905,6 +942,13 @@ func (mt *metaTable) RemoveIndex(collID, indexID UniqueID) error {
}
delete(mt.collectionIndexes[collID], indexID)
if len(mt.collectionIndexes[collID]) == 0 {
delete(mt.collectionIndexes, collID)
metrics.IndexCoordIndexTaskNum.Delete(prometheus.Labels{"collection_id": strconv.FormatInt(collID, 10), "index_task_status": metrics.UnissuedIndexTaskLabel})
metrics.IndexCoordIndexTaskNum.Delete(prometheus.Labels{"collection_id": strconv.FormatInt(collID, 10), "index_task_status": metrics.InProgressIndexTaskLabel})
metrics.IndexCoordIndexTaskNum.Delete(prometheus.Labels{"collection_id": strconv.FormatInt(collID, 10), "index_task_status": metrics.FinishedIndexTaskLabel})
metrics.IndexCoordIndexTaskNum.Delete(prometheus.Labels{"collection_id": strconv.FormatInt(collID, 10), "index_task_status": metrics.FailedIndexTaskLabel})
}
log.Info("IndexCoord meta table remove index success", zap.Int64("collID", collID), zap.Int64("indexID", indexID))
return nil
}
@ -923,8 +967,12 @@ func (mt *metaTable) RemoveSegmentIndex(collID, partID, segID, buildID UniqueID)
if !ok {
return nil
}
delete(mt.segmentIndexes[segIdx.SegmentID], segIdx.IndexID)
delete(mt.segmentIndexes[segID], segIdx.IndexID)
delete(mt.buildID2SegmentIndex, buildID)
if len(mt.segmentIndexes[segID]) == 0 {
delete(mt.segmentIndexes, segID)
}
return nil
}
@ -968,7 +1016,11 @@ func (mt *metaTable) ResetMeta(buildID UniqueID) error {
return mt.alterSegmentIndexes([]*model.SegmentIndex{segIdx})
}
return mt.updateSegIndexMeta(segIdx, updateFunc)
if err := mt.updateSegIndexMeta(segIdx, updateFunc); err != nil {
return err
}
mt.updateIndexTasksMetrics()
return nil
}
func (mt *metaTable) FinishTask(taskInfo *indexpb.IndexTaskInfo) error {
@ -987,7 +1039,12 @@ func (mt *metaTable) FinishTask(taskInfo *indexpb.IndexTaskInfo) error {
return mt.alterSegmentIndexes([]*model.SegmentIndex{segIdx})
}
return mt.updateSegIndexMeta(segIdx, updateFunc)
if err := mt.updateSegIndexMeta(segIdx, updateFunc); err != nil {
return err
}
mt.updateIndexTasksMetrics()
return nil
}
func (mt *metaTable) MarkSegmentsIndexAsDeletedByBuildID(buildIDs []UniqueID) error {

View File

@ -1064,13 +1064,49 @@ func TestMetaTable_ResetNodeID(t *testing.T) {
func TestMetaTable_ResetMeta(t *testing.T) {
t.Run("success", func(t *testing.T) {
mt := constructMetaTable(&indexcoord.Catalog{
Txn: &mockETCDKV{
multiSave: func(m map[string]string) error {
return nil
mt := &metaTable{
catalog: &indexcoord.Catalog{Txn: NewMockEtcdKV()},
buildID2SegmentIndex: map[UniqueID]*model.SegmentIndex{
buildID: {
SegmentID: segID,
CollectionID: collID,
PartitionID: partID,
NumRows: 1024,
IndexID: indexID,
BuildID: buildID,
NodeID: 1,
IndexVersion: 1,
IndexState: commonpb.IndexState_InProgress,
FailReason: "",
IsDeleted: false,
CreateTime: 1,
IndexFilePaths: nil,
IndexSize: 0,
WriteHandoff: false,
},
},
})
segmentIndexes: map[UniqueID]map[UniqueID]*model.SegmentIndex{
segID: {
indexID: {
SegmentID: segID,
CollectionID: collID,
PartitionID: partID,
NumRows: 1024,
IndexID: indexID,
BuildID: buildID,
NodeID: 1,
IndexVersion: 1,
IndexState: commonpb.IndexState_InProgress,
FailReason: "",
IsDeleted: false,
CreateTime: 1,
IndexFilePaths: nil,
IndexSize: 0,
WriteHandoff: false,
},
},
},
}
err := mt.ResetMeta(buildID)
assert.NoError(t, err)
assert.Equal(t, int64(0), mt.buildID2SegmentIndex[buildID].NodeID)
@ -1138,6 +1174,32 @@ func TestMetaTable_FinishTask(t *testing.T) {
assert.ElementsMatch(t, []string{"file3", "file4"}, mt.buildID2SegmentIndex[buildID+1].IndexFilePaths)
})
t.Run("state failed", func(t *testing.T) {
mt := constructMetaTable(&indexcoord.Catalog{
Txn: &mockETCDKV{
save: func(s string, s2 string) error {
return nil
},
multiSave: func(m map[string]string) error {
return nil
},
},
})
err := mt.AddIndex(segIdx)
assert.NoError(t, err)
err = mt.FinishTask(&indexpb.IndexTaskInfo{
BuildID: buildID + 1,
State: commonpb.IndexState_Failed,
IndexFiles: []string{},
SerializedSize: 0,
FailReason: "failed",
})
assert.NoError(t, err)
assert.Equal(t, commonpb.IndexState_Failed, mt.buildID2SegmentIndex[buildID+1].IndexState)
})
t.Run("fail", func(t *testing.T) {
mt := constructMetaTable(&indexcoord.Catalog{
Txn: &mockETCDKV{

View File

@ -27,25 +27,25 @@ var (
prometheus.CounterOpts{
Namespace: milvusNamespace,
Subsystem: typeutil.IndexCoordRole,
Name: "indexreq_count",
Name: "index_req_count",
Help: "number of building index requests ",
}, []string{statusLabelName})
// IndexCoordIndexTaskCounter records the number of index tasks of each type.
IndexCoordIndexTaskCounter = prometheus.NewGaugeVec(
// IndexCoordIndexTaskNum records the number of index tasks of each type.
IndexCoordIndexTaskNum = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: milvusNamespace,
Subsystem: typeutil.IndexCoordRole,
Name: "indextask_count",
Name: "index_task_count",
Help: "number of index tasks of each type",
}, []string{indexTaskStatusLabelName})
}, []string{collectionIDLabelName, indexTaskStatusLabelName})
// IndexCoordIndexNodeNum records the number of IndexNodes managed by IndexCoord.
IndexCoordIndexNodeNum = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: milvusNamespace,
Subsystem: typeutil.IndexCoordRole,
Name: "indexnode_num",
Name: "index_node_num",
Help: "number of IndexNodes managed by IndexCoord",
}, []string{})
)
@ -53,6 +53,6 @@ var (
//RegisterIndexCoord registers IndexCoord metrics
func RegisterIndexCoord(registry *prometheus.Registry) {
registry.MustRegister(IndexCoordIndexRequestCounter)
registry.MustRegister(IndexCoordIndexTaskCounter)
registry.MustRegister(IndexCoordIndexTaskNum)
registry.MustRegister(IndexCoordIndexNodeNum)
}