fix: Compaction task l0 state transfer wrong (#34599)

This bug caused failed L0 compaction tasks never end.

See also: #34460
pr: #34597

---------

Signed-off-by: yangxuan <xuan.yang@zilliz.com>
This commit is contained in:
XuanYang-cn 2024-07-11 20:21:37 +08:00 committed by GitHub
parent 9dc98ac76d
commit be41892c48
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 195 additions and 33 deletions

View File

@ -28,6 +28,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr"
)
@ -65,56 +66,69 @@ func (t *l0CompactionTask) processPipelining() bool {
if t.NeedReAssignNodeID() {
return false
}
log := log.With(zap.Int64("triggerID", t.GetTriggerID()), zap.Int64("nodeID", t.GetNodeID()))
var err error
t.plan, err = t.BuildCompactionRequest()
if err != nil {
err2 := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed), setFailReason(err.Error()))
return err2 == nil
log.Warn("l0CompactionTask failed to build compaction request", zap.Error(err))
err = t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed), setFailReason(err.Error()))
if err != nil {
log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
return false
}
return t.processFailed()
}
err = t.sessions.Compaction(context.Background(), t.GetNodeID(), t.GetPlan())
err = t.sessions.Compaction(context.TODO(), t.GetNodeID(), t.GetPlan())
if err != nil {
log.Warn("Failed to notify compaction tasks to DataNode", zap.Error(err))
log.Warn("l0CompactionTask failed to notify compaction tasks to DataNode", zap.Int64("planID", t.GetPlanID()), zap.Error(err))
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_pipelining), setNodeID(NullNodeID))
return false
}
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_executing))
return false
}
func (t *l0CompactionTask) processExecuting() bool {
log := log.With(zap.Int64("planID", t.GetPlanID()), zap.Int64("nodeID", t.GetNodeID()))
result, err := t.sessions.GetCompactionPlanResult(t.GetNodeID(), t.GetPlanID())
if err != nil || result == nil {
if errors.Is(err, merr.ErrNodeNotFound) {
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_pipelining), setNodeID(NullNodeID))
}
log.Warn("l0CompactionTask failed to get compaction result", zap.Error(err))
return false
}
switch result.GetState() {
case datapb.CompactionTaskState_executing:
if t.checkTimeout() {
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_timeout))
if err == nil {
return t.processTimeout()
if err != nil {
log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
return false
}
return t.processTimeout()
}
return false
case datapb.CompactionTaskState_completed:
t.result = result
saveSuccess := t.saveSegmentMeta()
if !saveSuccess {
if err := t.saveSegmentMeta(); err != nil {
log.Warn("l0CompactionTask failed to save segment meta", zap.Error(err))
return false
}
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_meta_saved))
if err == nil {
return t.processMetaSaved()
if err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_meta_saved)); err != nil {
return false
}
return false
return t.processMetaSaved()
case datapb.CompactionTaskState_failed:
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed))
if err != nil {
log.Warn("fail to updateAndSaveTaskMeta")
if err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed)); err != nil {
log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
return false
}
return false
return t.processFailed()
}
return false
}
@ -244,10 +258,9 @@ func (t *l0CompactionTask) BuildCompactionRequest() (*datapb.CompactionPlan, err
// Select sealed L1 segments for LevelZero compaction that meets the condition:
// dmlPos < triggerInfo.pos
sealedSegments := t.meta.SelectSegments(WithCollection(t.GetCollectionID()), SegmentFilterFunc(func(info *SegmentInfo) bool {
return (t.GetPartitionID() == -1 || info.GetPartitionID() == t.GetPartitionID()) &&
return (t.GetPartitionID() == common.AllPartitionsID || info.GetPartitionID() == t.GetPartitionID()) &&
info.GetInsertChannel() == plan.GetChannel() &&
isFlushState(info.GetState()) &&
//!info.isCompacting &&
!info.GetIsImporting() &&
info.GetLevel() != datapb.SegmentLevel_L0 &&
info.GetStartPosition().GetTimestamp() < t.GetPos().GetTimestamp()
@ -262,8 +275,8 @@ func (t *l0CompactionTask) BuildCompactionRequest() (*datapb.CompactionPlan, err
for _, segInfo := range sealedSegments {
// TODO should allow parallel executing of l0 compaction
if segInfo.isCompacting {
log.Info("l0 compaction candidate segment is compacting")
return nil, merr.WrapErrCompactionPlanConflict("segment is compacting")
log.Info("l0 compaction candidate segment is compacting", zap.Int64("segmentID", segInfo.GetID()))
return nil, merr.WrapErrCompactionPlanConflict(fmt.Sprintf("segment %d is compacting", segInfo.GetID()))
}
}
@ -317,14 +330,17 @@ func (t *l0CompactionTask) processTimeout() bool {
}
func (t *l0CompactionTask) processFailed() bool {
if err := t.sessions.DropCompactionPlan(t.GetNodeID(), &datapb.DropCompactionPlanRequest{
PlanID: t.GetPlanID(),
}); err != nil {
log.Warn("l0CompactionTask processFailed unable to drop compaction plan", zap.Int64("planID", t.GetPlanID()), zap.Error(err))
if t.GetNodeID() != 0 && t.GetNodeID() != NullNodeID {
err := t.sessions.DropCompactionPlan(t.GetNodeID(), &datapb.DropCompactionPlanRequest{
PlanID: t.GetPlanID(),
})
if err != nil {
log.Warn("l0CompactionTask processFailed unable to drop compaction plan", zap.Int64("planID", t.GetPlanID()), zap.Error(err))
}
}
t.resetSegmentCompacting()
log.Info("l0CompactionTask processFailed done", zap.Int64("planID", t.GetPlanID()))
log.Info("l0CompactionTask processFailed done", zap.Int64("taskID", t.GetTriggerID()), zap.Int64("planID", t.GetPlanID()))
return true
}
@ -364,7 +380,7 @@ func (t *l0CompactionTask) SaveTaskMeta() error {
return t.saveTaskMeta(t.CompactionTask)
}
func (t *l0CompactionTask) saveSegmentMeta() bool {
func (t *l0CompactionTask) saveSegmentMeta() error {
result := t.result
plan := t.GetPlan()
var operators []UpdateOperator
@ -383,10 +399,6 @@ func (t *l0CompactionTask) saveSegmentMeta() bool {
log.Info("meta update: update segments info for level zero compaction",
zap.Int64("planID", plan.GetPlanID()),
)
err := t.meta.UpdateSegmentsInfo(operators...)
if err != nil {
log.Info("Failed to saveSegmentMeta for compaction tasks to DataNode", zap.Error(err))
return false
}
return true
return t.meta.UpdateSegmentsInfo(operators...)
}

View File

@ -1,6 +1,9 @@
package datacoord
import (
"context"
"github.com/cockroachdb/errors"
"github.com/samber/lo"
"github.com/stretchr/testify/mock"
@ -121,3 +124,149 @@ func (s *CompactionTaskSuite) TestProcessRefreshPlan_SelectZeroSegmentsL0() {
_, err := task.BuildCompactionRequest()
s.Error(err)
}
func generateTestL0Task(state datapb.CompactionTaskState) *l0CompactionTask {
return &l0CompactionTask{
CompactionTask: &datapb.CompactionTask{
PlanID: 1,
TriggerID: 19530,
CollectionID: 1,
PartitionID: 10,
Type: datapb.CompactionType_Level0DeleteCompaction,
NodeID: NullNodeID,
State: state,
InputSegments: []int64{100, 101},
},
}
}
func (s *CompactionTaskSuite) SetupSubTest() {
s.SetupTest()
}
func (s *CompactionTaskSuite) TestProcessStateTrans() {
s.Run("test pipelining needReassignNodeID", func() {
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
t.NodeID = NullNodeID
got := t.Process()
s.False(got)
s.Equal(datapb.CompactionTaskState_pipelining, t.State)
s.EqualValues(NullNodeID, t.NodeID)
})
s.Run("test pipelining BuildCompactionRequest failed", func() {
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
t.NodeID = 100
channel := "ch-1"
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
t.meta = s.mockMeta
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
[]*SegmentInfo{
{SegmentInfo: &datapb.SegmentInfo{
ID: 200,
Level: datapb.SegmentLevel_L1,
InsertChannel: channel,
}, isCompacting: true},
},
)
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
ID: segID,
Level: datapb.SegmentLevel_L0,
InsertChannel: channel,
State: commonpb.SegmentState_Flushed,
Deltalogs: deltaLogs,
}}
}).Twice()
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil).Once()
s.mockMeta.EXPECT().SetSegmentsCompacting(mock.Anything, false).Return()
t.sessions = s.mockSessMgr
s.mockSessMgr.EXPECT().DropCompactionPlan(mock.Anything, mock.Anything).Return(nil).Once()
got := t.Process()
s.True(got)
s.Equal(datapb.CompactionTaskState_failed, t.State)
})
s.Run("test pipelining Compaction failed", func() {
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
t.NodeID = 100
channel := "ch-1"
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
t.meta = s.mockMeta
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
[]*SegmentInfo{
{SegmentInfo: &datapb.SegmentInfo{
ID: 200,
Level: datapb.SegmentLevel_L1,
InsertChannel: channel,
}},
},
)
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
ID: segID,
Level: datapb.SegmentLevel_L0,
InsertChannel: channel,
State: commonpb.SegmentState_Flushed,
Deltalogs: deltaLogs,
}}
}).Twice()
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil)
t.sessions = s.mockSessMgr
s.mockSessMgr.EXPECT().Compaction(mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, nodeID int64, plan *datapb.CompactionPlan) error {
s.Require().EqualValues(t.NodeID, nodeID)
return errors.New("mock error")
})
got := t.Process()
s.False(got)
s.Equal(datapb.CompactionTaskState_pipelining, t.State)
s.EqualValues(NullNodeID, t.NodeID)
})
s.Run("test pipelining success", func() {
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
t.NodeID = 100
channel := "ch-1"
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
t.meta = s.mockMeta
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
[]*SegmentInfo{
{SegmentInfo: &datapb.SegmentInfo{
ID: 200,
Level: datapb.SegmentLevel_L1,
InsertChannel: channel,
}},
},
)
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
ID: segID,
Level: datapb.SegmentLevel_L0,
InsertChannel: channel,
State: commonpb.SegmentState_Flushed,
Deltalogs: deltaLogs,
}}
}).Twice()
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil).Once()
t.sessions = s.mockSessMgr
s.mockSessMgr.EXPECT().Compaction(mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, nodeID int64, plan *datapb.CompactionPlan) error {
s.Require().EqualValues(t.NodeID, nodeID)
return nil
})
got := t.Process()
s.False(got)
s.Equal(datapb.CompactionTaskState_executing, t.State)
})
}

View File

@ -253,6 +253,7 @@ func (m *CompactionTriggerManager) SubmitL0ViewToScheduler(ctx context.Context,
zap.Int64("taskID", taskID),
zap.Int64("planID", task.GetPlanID()),
zap.String("type", task.GetType().String()),
zap.Int64s("L0 segments", levelZeroSegs),
)
}