fix: L0 compaction task state transfer is wrong (#34597)

This bug caused failed L0 compaction tasks to never end.

See also: #34460

---------

Signed-off-by: yangxuan <xuan.yang@zilliz.com>
This commit is contained in:
XuanYang-cn 2024-07-11 21:39:36 +08:00 committed by GitHub
parent 104d0966b7
commit d7966f46ad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 195 additions and 33 deletions

View File

@ -28,6 +28,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb" "github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus/internal/proto/datapb" "github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/pkg/common"
"github.com/milvus-io/milvus/pkg/log" "github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/merr" "github.com/milvus-io/milvus/pkg/util/merr"
) )
@ -65,56 +66,69 @@ func (t *l0CompactionTask) processPipelining() bool {
if t.NeedReAssignNodeID() { if t.NeedReAssignNodeID() {
return false return false
} }
log := log.With(zap.Int64("triggerID", t.GetTriggerID()), zap.Int64("nodeID", t.GetNodeID()))
var err error var err error
t.plan, err = t.BuildCompactionRequest() t.plan, err = t.BuildCompactionRequest()
if err != nil { if err != nil {
err2 := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed), setFailReason(err.Error())) log.Warn("l0CompactionTask failed to build compaction request", zap.Error(err))
return err2 == nil err = t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed), setFailReason(err.Error()))
if err != nil {
log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
return false
}
return t.processFailed()
} }
err = t.sessions.Compaction(context.Background(), t.GetNodeID(), t.GetPlan())
err = t.sessions.Compaction(context.TODO(), t.GetNodeID(), t.GetPlan())
if err != nil { if err != nil {
log.Warn("Failed to notify compaction tasks to DataNode", zap.Error(err)) log.Warn("l0CompactionTask failed to notify compaction tasks to DataNode", zap.Int64("planID", t.GetPlanID()), zap.Error(err))
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_pipelining), setNodeID(NullNodeID)) t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_pipelining), setNodeID(NullNodeID))
return false return false
} }
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_executing)) t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_executing))
return false return false
} }
func (t *l0CompactionTask) processExecuting() bool { func (t *l0CompactionTask) processExecuting() bool {
log := log.With(zap.Int64("planID", t.GetPlanID()), zap.Int64("nodeID", t.GetNodeID()))
result, err := t.sessions.GetCompactionPlanResult(t.GetNodeID(), t.GetPlanID()) result, err := t.sessions.GetCompactionPlanResult(t.GetNodeID(), t.GetPlanID())
if err != nil || result == nil { if err != nil || result == nil {
if errors.Is(err, merr.ErrNodeNotFound) { if errors.Is(err, merr.ErrNodeNotFound) {
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_pipelining), setNodeID(NullNodeID)) t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_pipelining), setNodeID(NullNodeID))
} }
log.Warn("l0CompactionTask failed to get compaction result", zap.Error(err))
return false return false
} }
switch result.GetState() { switch result.GetState() {
case datapb.CompactionTaskState_executing: case datapb.CompactionTaskState_executing:
if t.checkTimeout() { if t.checkTimeout() {
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_timeout)) err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_timeout))
if err == nil { if err != nil {
return t.processTimeout() log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
return false
} }
return t.processTimeout()
} }
return false
case datapb.CompactionTaskState_completed: case datapb.CompactionTaskState_completed:
t.result = result t.result = result
saveSuccess := t.saveSegmentMeta() if err := t.saveSegmentMeta(); err != nil {
if !saveSuccess { log.Warn("l0CompactionTask failed to save segment meta", zap.Error(err))
return false return false
} }
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_meta_saved))
if err == nil { if err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_meta_saved)); err != nil {
return t.processMetaSaved() return false
} }
return false return t.processMetaSaved()
case datapb.CompactionTaskState_failed: case datapb.CompactionTaskState_failed:
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed)) if err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed)); err != nil {
if err != nil { log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
log.Warn("fail to updateAndSaveTaskMeta") return false
} }
return false return t.processFailed()
} }
return false return false
} }
@ -244,10 +258,9 @@ func (t *l0CompactionTask) BuildCompactionRequest() (*datapb.CompactionPlan, err
// Select sealed L1 segments for LevelZero compaction that meets the condition: // Select sealed L1 segments for LevelZero compaction that meets the condition:
// dmlPos < triggerInfo.pos // dmlPos < triggerInfo.pos
sealedSegments := t.meta.SelectSegments(WithCollection(t.GetCollectionID()), SegmentFilterFunc(func(info *SegmentInfo) bool { sealedSegments := t.meta.SelectSegments(WithCollection(t.GetCollectionID()), SegmentFilterFunc(func(info *SegmentInfo) bool {
return (t.GetPartitionID() == -1 || info.GetPartitionID() == t.GetPartitionID()) && return (t.GetPartitionID() == common.AllPartitionsID || info.GetPartitionID() == t.GetPartitionID()) &&
info.GetInsertChannel() == plan.GetChannel() && info.GetInsertChannel() == plan.GetChannel() &&
isFlushState(info.GetState()) && isFlushState(info.GetState()) &&
//!info.isCompacting &&
!info.GetIsImporting() && !info.GetIsImporting() &&
info.GetLevel() != datapb.SegmentLevel_L0 && info.GetLevel() != datapb.SegmentLevel_L0 &&
info.GetStartPosition().GetTimestamp() < t.GetPos().GetTimestamp() info.GetStartPosition().GetTimestamp() < t.GetPos().GetTimestamp()
@ -262,8 +275,8 @@ func (t *l0CompactionTask) BuildCompactionRequest() (*datapb.CompactionPlan, err
for _, segInfo := range sealedSegments { for _, segInfo := range sealedSegments {
// TODO should allow parallel executing of l0 compaction // TODO should allow parallel executing of l0 compaction
if segInfo.isCompacting { if segInfo.isCompacting {
log.Info("l0 compaction candidate segment is compacting") log.Info("l0 compaction candidate segment is compacting", zap.Int64("segmentID", segInfo.GetID()))
return nil, merr.WrapErrCompactionPlanConflict("segment is compacting") return nil, merr.WrapErrCompactionPlanConflict(fmt.Sprintf("segment %d is compacting", segInfo.GetID()))
} }
} }
@ -317,14 +330,17 @@ func (t *l0CompactionTask) processTimeout() bool {
} }
func (t *l0CompactionTask) processFailed() bool { func (t *l0CompactionTask) processFailed() bool {
if err := t.sessions.DropCompactionPlan(t.GetNodeID(), &datapb.DropCompactionPlanRequest{ if t.GetNodeID() != 0 && t.GetNodeID() != NullNodeID {
PlanID: t.GetPlanID(), err := t.sessions.DropCompactionPlan(t.GetNodeID(), &datapb.DropCompactionPlanRequest{
}); err != nil { PlanID: t.GetPlanID(),
log.Warn("l0CompactionTask processFailed unable to drop compaction plan", zap.Int64("planID", t.GetPlanID()), zap.Error(err)) })
if err != nil {
log.Warn("l0CompactionTask processFailed unable to drop compaction plan", zap.Int64("planID", t.GetPlanID()), zap.Error(err))
}
} }
t.resetSegmentCompacting() t.resetSegmentCompacting()
log.Info("l0CompactionTask processFailed done", zap.Int64("planID", t.GetPlanID())) log.Info("l0CompactionTask processFailed done", zap.Int64("taskID", t.GetTriggerID()), zap.Int64("planID", t.GetPlanID()))
return true return true
} }
@ -364,7 +380,7 @@ func (t *l0CompactionTask) SaveTaskMeta() error {
return t.saveTaskMeta(t.CompactionTask) return t.saveTaskMeta(t.CompactionTask)
} }
func (t *l0CompactionTask) saveSegmentMeta() bool { func (t *l0CompactionTask) saveSegmentMeta() error {
result := t.result result := t.result
plan := t.GetPlan() plan := t.GetPlan()
var operators []UpdateOperator var operators []UpdateOperator
@ -383,10 +399,6 @@ func (t *l0CompactionTask) saveSegmentMeta() bool {
log.Info("meta update: update segments info for level zero compaction", log.Info("meta update: update segments info for level zero compaction",
zap.Int64("planID", plan.GetPlanID()), zap.Int64("planID", plan.GetPlanID()),
) )
err := t.meta.UpdateSegmentsInfo(operators...)
if err != nil { return t.meta.UpdateSegmentsInfo(operators...)
log.Info("Failed to saveSegmentMeta for compaction tasks to DataNode", zap.Error(err))
return false
}
return true
} }

View File

@ -1,6 +1,9 @@
package datacoord package datacoord
import ( import (
"context"
"github.com/cockroachdb/errors"
"github.com/samber/lo" "github.com/samber/lo"
"github.com/stretchr/testify/mock" "github.com/stretchr/testify/mock"
@ -121,3 +124,149 @@ func (s *CompactionTaskSuite) TestProcessRefreshPlan_SelectZeroSegmentsL0() {
_, err := task.BuildCompactionRequest() _, err := task.BuildCompactionRequest()
s.Error(err) s.Error(err)
} }
func generateTestL0Task(state datapb.CompactionTaskState) *l0CompactionTask {
return &l0CompactionTask{
CompactionTask: &datapb.CompactionTask{
PlanID: 1,
TriggerID: 19530,
CollectionID: 1,
PartitionID: 10,
Type: datapb.CompactionType_Level0DeleteCompaction,
NodeID: NullNodeID,
State: state,
InputSegments: []int64{100, 101},
},
}
}
func (s *CompactionTaskSuite) SetupSubTest() {
s.SetupTest()
}
func (s *CompactionTaskSuite) TestProcessStateTrans() {
s.Run("test pipelining needReassignNodeID", func() {
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
t.NodeID = NullNodeID
got := t.Process()
s.False(got)
s.Equal(datapb.CompactionTaskState_pipelining, t.State)
s.EqualValues(NullNodeID, t.NodeID)
})
s.Run("test pipelining BuildCompactionRequest failed", func() {
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
t.NodeID = 100
channel := "ch-1"
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
t.meta = s.mockMeta
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
[]*SegmentInfo{
{SegmentInfo: &datapb.SegmentInfo{
ID: 200,
Level: datapb.SegmentLevel_L1,
InsertChannel: channel,
}, isCompacting: true},
},
)
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
ID: segID,
Level: datapb.SegmentLevel_L0,
InsertChannel: channel,
State: commonpb.SegmentState_Flushed,
Deltalogs: deltaLogs,
}}
}).Twice()
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil).Once()
s.mockMeta.EXPECT().SetSegmentsCompacting(mock.Anything, false).Return()
t.sessions = s.mockSessMgr
s.mockSessMgr.EXPECT().DropCompactionPlan(mock.Anything, mock.Anything).Return(nil).Once()
got := t.Process()
s.True(got)
s.Equal(datapb.CompactionTaskState_failed, t.State)
})
s.Run("test pipelining Compaction failed", func() {
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
t.NodeID = 100
channel := "ch-1"
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
t.meta = s.mockMeta
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
[]*SegmentInfo{
{SegmentInfo: &datapb.SegmentInfo{
ID: 200,
Level: datapb.SegmentLevel_L1,
InsertChannel: channel,
}},
},
)
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
ID: segID,
Level: datapb.SegmentLevel_L0,
InsertChannel: channel,
State: commonpb.SegmentState_Flushed,
Deltalogs: deltaLogs,
}}
}).Twice()
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil)
t.sessions = s.mockSessMgr
s.mockSessMgr.EXPECT().Compaction(mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, nodeID int64, plan *datapb.CompactionPlan) error {
s.Require().EqualValues(t.NodeID, nodeID)
return errors.New("mock error")
})
got := t.Process()
s.False(got)
s.Equal(datapb.CompactionTaskState_pipelining, t.State)
s.EqualValues(NullNodeID, t.NodeID)
})
s.Run("test pipelining success", func() {
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
t.NodeID = 100
channel := "ch-1"
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
t.meta = s.mockMeta
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
[]*SegmentInfo{
{SegmentInfo: &datapb.SegmentInfo{
ID: 200,
Level: datapb.SegmentLevel_L1,
InsertChannel: channel,
}},
},
)
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
ID: segID,
Level: datapb.SegmentLevel_L0,
InsertChannel: channel,
State: commonpb.SegmentState_Flushed,
Deltalogs: deltaLogs,
}}
}).Twice()
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil).Once()
t.sessions = s.mockSessMgr
s.mockSessMgr.EXPECT().Compaction(mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, nodeID int64, plan *datapb.CompactionPlan) error {
s.Require().EqualValues(t.NodeID, nodeID)
return nil
})
got := t.Process()
s.False(got)
s.Equal(datapb.CompactionTaskState_executing, t.State)
})
}

View File

@ -255,6 +255,7 @@ func (m *CompactionTriggerManager) SubmitL0ViewToScheduler(ctx context.Context,
zap.Int64("taskID", taskID), zap.Int64("taskID", taskID),
zap.Int64("planID", task.GetPlanID()), zap.Int64("planID", task.GetPlanID()),
zap.String("type", task.GetType().String()), zap.String("type", task.GetType().String()),
zap.Int64s("L0 segments", levelZeroSegs),
) )
} }