mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-01 03:18:29 +08:00
fix: Compaction task l0 state transfer wrong (#34599)
This bug caused failed L0 compaction tasks never end. See also: #34460 pr: #34597 --------- Signed-off-by: yangxuan <xuan.yang@zilliz.com>
This commit is contained in:
parent
9dc98ac76d
commit
be41892c48
@ -28,6 +28,7 @@ import (
|
||||
|
||||
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
||||
"github.com/milvus-io/milvus/internal/proto/datapb"
|
||||
"github.com/milvus-io/milvus/pkg/common"
|
||||
"github.com/milvus-io/milvus/pkg/log"
|
||||
"github.com/milvus-io/milvus/pkg/util/merr"
|
||||
)
|
||||
@ -65,56 +66,69 @@ func (t *l0CompactionTask) processPipelining() bool {
|
||||
if t.NeedReAssignNodeID() {
|
||||
return false
|
||||
}
|
||||
|
||||
log := log.With(zap.Int64("triggerID", t.GetTriggerID()), zap.Int64("nodeID", t.GetNodeID()))
|
||||
var err error
|
||||
t.plan, err = t.BuildCompactionRequest()
|
||||
if err != nil {
|
||||
err2 := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed), setFailReason(err.Error()))
|
||||
return err2 == nil
|
||||
log.Warn("l0CompactionTask failed to build compaction request", zap.Error(err))
|
||||
err = t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed), setFailReason(err.Error()))
|
||||
if err != nil {
|
||||
log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
|
||||
return false
|
||||
}
|
||||
|
||||
return t.processFailed()
|
||||
}
|
||||
err = t.sessions.Compaction(context.Background(), t.GetNodeID(), t.GetPlan())
|
||||
|
||||
err = t.sessions.Compaction(context.TODO(), t.GetNodeID(), t.GetPlan())
|
||||
if err != nil {
|
||||
log.Warn("Failed to notify compaction tasks to DataNode", zap.Error(err))
|
||||
log.Warn("l0CompactionTask failed to notify compaction tasks to DataNode", zap.Int64("planID", t.GetPlanID()), zap.Error(err))
|
||||
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_pipelining), setNodeID(NullNodeID))
|
||||
return false
|
||||
}
|
||||
|
||||
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_executing))
|
||||
return false
|
||||
}
|
||||
|
||||
func (t *l0CompactionTask) processExecuting() bool {
|
||||
log := log.With(zap.Int64("planID", t.GetPlanID()), zap.Int64("nodeID", t.GetNodeID()))
|
||||
result, err := t.sessions.GetCompactionPlanResult(t.GetNodeID(), t.GetPlanID())
|
||||
if err != nil || result == nil {
|
||||
if errors.Is(err, merr.ErrNodeNotFound) {
|
||||
t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_pipelining), setNodeID(NullNodeID))
|
||||
}
|
||||
log.Warn("l0CompactionTask failed to get compaction result", zap.Error(err))
|
||||
return false
|
||||
}
|
||||
switch result.GetState() {
|
||||
case datapb.CompactionTaskState_executing:
|
||||
if t.checkTimeout() {
|
||||
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_timeout))
|
||||
if err == nil {
|
||||
return t.processTimeout()
|
||||
if err != nil {
|
||||
log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
|
||||
return false
|
||||
}
|
||||
return t.processTimeout()
|
||||
}
|
||||
return false
|
||||
case datapb.CompactionTaskState_completed:
|
||||
t.result = result
|
||||
saveSuccess := t.saveSegmentMeta()
|
||||
if !saveSuccess {
|
||||
if err := t.saveSegmentMeta(); err != nil {
|
||||
log.Warn("l0CompactionTask failed to save segment meta", zap.Error(err))
|
||||
return false
|
||||
}
|
||||
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_meta_saved))
|
||||
if err == nil {
|
||||
return t.processMetaSaved()
|
||||
|
||||
if err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_meta_saved)); err != nil {
|
||||
return false
|
||||
}
|
||||
return false
|
||||
return t.processMetaSaved()
|
||||
case datapb.CompactionTaskState_failed:
|
||||
err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed))
|
||||
if err != nil {
|
||||
log.Warn("fail to updateAndSaveTaskMeta")
|
||||
if err := t.updateAndSaveTaskMeta(setState(datapb.CompactionTaskState_failed)); err != nil {
|
||||
log.Warn("l0CompactionTask failed to updateAndSaveTaskMeta", zap.Error(err))
|
||||
return false
|
||||
}
|
||||
return false
|
||||
return t.processFailed()
|
||||
}
|
||||
return false
|
||||
}
|
||||
@ -244,10 +258,9 @@ func (t *l0CompactionTask) BuildCompactionRequest() (*datapb.CompactionPlan, err
|
||||
// Select sealed L1 segments for LevelZero compaction that meets the condition:
|
||||
// dmlPos < triggerInfo.pos
|
||||
sealedSegments := t.meta.SelectSegments(WithCollection(t.GetCollectionID()), SegmentFilterFunc(func(info *SegmentInfo) bool {
|
||||
return (t.GetPartitionID() == -1 || info.GetPartitionID() == t.GetPartitionID()) &&
|
||||
return (t.GetPartitionID() == common.AllPartitionsID || info.GetPartitionID() == t.GetPartitionID()) &&
|
||||
info.GetInsertChannel() == plan.GetChannel() &&
|
||||
isFlushState(info.GetState()) &&
|
||||
//!info.isCompacting &&
|
||||
!info.GetIsImporting() &&
|
||||
info.GetLevel() != datapb.SegmentLevel_L0 &&
|
||||
info.GetStartPosition().GetTimestamp() < t.GetPos().GetTimestamp()
|
||||
@ -262,8 +275,8 @@ func (t *l0CompactionTask) BuildCompactionRequest() (*datapb.CompactionPlan, err
|
||||
for _, segInfo := range sealedSegments {
|
||||
// TODO should allow parallel executing of l0 compaction
|
||||
if segInfo.isCompacting {
|
||||
log.Info("l0 compaction candidate segment is compacting")
|
||||
return nil, merr.WrapErrCompactionPlanConflict("segment is compacting")
|
||||
log.Info("l0 compaction candidate segment is compacting", zap.Int64("segmentID", segInfo.GetID()))
|
||||
return nil, merr.WrapErrCompactionPlanConflict(fmt.Sprintf("segment %d is compacting", segInfo.GetID()))
|
||||
}
|
||||
}
|
||||
|
||||
@ -317,14 +330,17 @@ func (t *l0CompactionTask) processTimeout() bool {
|
||||
}
|
||||
|
||||
func (t *l0CompactionTask) processFailed() bool {
|
||||
if err := t.sessions.DropCompactionPlan(t.GetNodeID(), &datapb.DropCompactionPlanRequest{
|
||||
PlanID: t.GetPlanID(),
|
||||
}); err != nil {
|
||||
log.Warn("l0CompactionTask processFailed unable to drop compaction plan", zap.Int64("planID", t.GetPlanID()), zap.Error(err))
|
||||
if t.GetNodeID() != 0 && t.GetNodeID() != NullNodeID {
|
||||
err := t.sessions.DropCompactionPlan(t.GetNodeID(), &datapb.DropCompactionPlanRequest{
|
||||
PlanID: t.GetPlanID(),
|
||||
})
|
||||
if err != nil {
|
||||
log.Warn("l0CompactionTask processFailed unable to drop compaction plan", zap.Int64("planID", t.GetPlanID()), zap.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
t.resetSegmentCompacting()
|
||||
log.Info("l0CompactionTask processFailed done", zap.Int64("planID", t.GetPlanID()))
|
||||
log.Info("l0CompactionTask processFailed done", zap.Int64("taskID", t.GetTriggerID()), zap.Int64("planID", t.GetPlanID()))
|
||||
return true
|
||||
}
|
||||
|
||||
@ -364,7 +380,7 @@ func (t *l0CompactionTask) SaveTaskMeta() error {
|
||||
return t.saveTaskMeta(t.CompactionTask)
|
||||
}
|
||||
|
||||
func (t *l0CompactionTask) saveSegmentMeta() bool {
|
||||
func (t *l0CompactionTask) saveSegmentMeta() error {
|
||||
result := t.result
|
||||
plan := t.GetPlan()
|
||||
var operators []UpdateOperator
|
||||
@ -383,10 +399,6 @@ func (t *l0CompactionTask) saveSegmentMeta() bool {
|
||||
log.Info("meta update: update segments info for level zero compaction",
|
||||
zap.Int64("planID", plan.GetPlanID()),
|
||||
)
|
||||
err := t.meta.UpdateSegmentsInfo(operators...)
|
||||
if err != nil {
|
||||
log.Info("Failed to saveSegmentMeta for compaction tasks to DataNode", zap.Error(err))
|
||||
return false
|
||||
}
|
||||
return true
|
||||
|
||||
return t.meta.UpdateSegmentsInfo(operators...)
|
||||
}
|
||||
|
@ -1,6 +1,9 @@
|
||||
package datacoord
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/cockroachdb/errors"
|
||||
"github.com/samber/lo"
|
||||
"github.com/stretchr/testify/mock"
|
||||
|
||||
@ -121,3 +124,149 @@ func (s *CompactionTaskSuite) TestProcessRefreshPlan_SelectZeroSegmentsL0() {
|
||||
_, err := task.BuildCompactionRequest()
|
||||
s.Error(err)
|
||||
}
|
||||
|
||||
func generateTestL0Task(state datapb.CompactionTaskState) *l0CompactionTask {
|
||||
return &l0CompactionTask{
|
||||
CompactionTask: &datapb.CompactionTask{
|
||||
PlanID: 1,
|
||||
TriggerID: 19530,
|
||||
CollectionID: 1,
|
||||
PartitionID: 10,
|
||||
Type: datapb.CompactionType_Level0DeleteCompaction,
|
||||
NodeID: NullNodeID,
|
||||
State: state,
|
||||
InputSegments: []int64{100, 101},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (s *CompactionTaskSuite) SetupSubTest() {
|
||||
s.SetupTest()
|
||||
}
|
||||
|
||||
func (s *CompactionTaskSuite) TestProcessStateTrans() {
|
||||
s.Run("test pipelining needReassignNodeID", func() {
|
||||
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
|
||||
t.NodeID = NullNodeID
|
||||
got := t.Process()
|
||||
s.False(got)
|
||||
s.Equal(datapb.CompactionTaskState_pipelining, t.State)
|
||||
s.EqualValues(NullNodeID, t.NodeID)
|
||||
})
|
||||
|
||||
s.Run("test pipelining BuildCompactionRequest failed", func() {
|
||||
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
|
||||
t.NodeID = 100
|
||||
channel := "ch-1"
|
||||
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
|
||||
|
||||
t.meta = s.mockMeta
|
||||
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
|
||||
[]*SegmentInfo{
|
||||
{SegmentInfo: &datapb.SegmentInfo{
|
||||
ID: 200,
|
||||
Level: datapb.SegmentLevel_L1,
|
||||
InsertChannel: channel,
|
||||
}, isCompacting: true},
|
||||
},
|
||||
)
|
||||
|
||||
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
|
||||
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
|
||||
ID: segID,
|
||||
Level: datapb.SegmentLevel_L0,
|
||||
InsertChannel: channel,
|
||||
State: commonpb.SegmentState_Flushed,
|
||||
Deltalogs: deltaLogs,
|
||||
}}
|
||||
}).Twice()
|
||||
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil).Once()
|
||||
s.mockMeta.EXPECT().SetSegmentsCompacting(mock.Anything, false).Return()
|
||||
|
||||
t.sessions = s.mockSessMgr
|
||||
s.mockSessMgr.EXPECT().DropCompactionPlan(mock.Anything, mock.Anything).Return(nil).Once()
|
||||
|
||||
got := t.Process()
|
||||
s.True(got)
|
||||
s.Equal(datapb.CompactionTaskState_failed, t.State)
|
||||
})
|
||||
|
||||
s.Run("test pipelining Compaction failed", func() {
|
||||
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
|
||||
t.NodeID = 100
|
||||
channel := "ch-1"
|
||||
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
|
||||
|
||||
t.meta = s.mockMeta
|
||||
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
|
||||
[]*SegmentInfo{
|
||||
{SegmentInfo: &datapb.SegmentInfo{
|
||||
ID: 200,
|
||||
Level: datapb.SegmentLevel_L1,
|
||||
InsertChannel: channel,
|
||||
}},
|
||||
},
|
||||
)
|
||||
|
||||
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
|
||||
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
|
||||
ID: segID,
|
||||
Level: datapb.SegmentLevel_L0,
|
||||
InsertChannel: channel,
|
||||
State: commonpb.SegmentState_Flushed,
|
||||
Deltalogs: deltaLogs,
|
||||
}}
|
||||
}).Twice()
|
||||
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil)
|
||||
|
||||
t.sessions = s.mockSessMgr
|
||||
s.mockSessMgr.EXPECT().Compaction(mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, nodeID int64, plan *datapb.CompactionPlan) error {
|
||||
s.Require().EqualValues(t.NodeID, nodeID)
|
||||
return errors.New("mock error")
|
||||
})
|
||||
|
||||
got := t.Process()
|
||||
s.False(got)
|
||||
s.Equal(datapb.CompactionTaskState_pipelining, t.State)
|
||||
s.EqualValues(NullNodeID, t.NodeID)
|
||||
})
|
||||
|
||||
s.Run("test pipelining success", func() {
|
||||
t := generateTestL0Task(datapb.CompactionTaskState_pipelining)
|
||||
t.NodeID = 100
|
||||
channel := "ch-1"
|
||||
deltaLogs := []*datapb.FieldBinlog{getFieldBinlogIDs(101, 3)}
|
||||
|
||||
t.meta = s.mockMeta
|
||||
s.mockMeta.EXPECT().SelectSegments(mock.Anything, mock.Anything).Return(
|
||||
[]*SegmentInfo{
|
||||
{SegmentInfo: &datapb.SegmentInfo{
|
||||
ID: 200,
|
||||
Level: datapb.SegmentLevel_L1,
|
||||
InsertChannel: channel,
|
||||
}},
|
||||
},
|
||||
)
|
||||
|
||||
s.mockMeta.EXPECT().GetHealthySegment(mock.Anything).RunAndReturn(func(segID int64) *SegmentInfo {
|
||||
return &SegmentInfo{SegmentInfo: &datapb.SegmentInfo{
|
||||
ID: segID,
|
||||
Level: datapb.SegmentLevel_L0,
|
||||
InsertChannel: channel,
|
||||
State: commonpb.SegmentState_Flushed,
|
||||
Deltalogs: deltaLogs,
|
||||
}}
|
||||
}).Twice()
|
||||
s.mockMeta.EXPECT().SaveCompactionTask(mock.Anything).Return(nil).Once()
|
||||
|
||||
t.sessions = s.mockSessMgr
|
||||
s.mockSessMgr.EXPECT().Compaction(mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, nodeID int64, plan *datapb.CompactionPlan) error {
|
||||
s.Require().EqualValues(t.NodeID, nodeID)
|
||||
return nil
|
||||
})
|
||||
|
||||
got := t.Process()
|
||||
s.False(got)
|
||||
s.Equal(datapb.CompactionTaskState_executing, t.State)
|
||||
})
|
||||
}
|
||||
|
@ -253,6 +253,7 @@ func (m *CompactionTriggerManager) SubmitL0ViewToScheduler(ctx context.Context,
|
||||
zap.Int64("taskID", taskID),
|
||||
zap.Int64("planID", task.GetPlanID()),
|
||||
zap.String("type", task.GetType().String()),
|
||||
zap.Int64s("L0 segments", levelZeroSegs),
|
||||
)
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user