Mirror of https://gitee.com/milvus-io/milvus.git, synced 2024-12-02 03:48:37 +08:00
Remove recollect segment stats during starting datacoord (#27410)
Signed-off-by: jaime <yun.zhang@zilliz.com>
This commit is contained in:
parent ec1fe3549e
commit e386a62fae
@@ -136,17 +136,6 @@ func (c *Cluster) Import(ctx context.Context, nodeID int64, it *datapb.ImportTas
     c.sessionManager.Import(ctx, nodeID, it)
 }
 
-// ReCollectSegmentStats triggers a ReCollectSegmentStats call from session manager.
-func (c *Cluster) ReCollectSegmentStats(ctx context.Context) error {
-    for _, node := range c.sessionManager.getLiveNodeIDs() {
-        err := c.sessionManager.ReCollectSegmentStats(ctx, node)
-        if err != nil {
-            return err
-        }
-    }
-    return nil
-}
-
 // GetSessions returns all sessions
 func (c *Cluster) GetSessions() []*Session {
     return c.sessionManager.GetSessions()
@@ -618,66 +618,3 @@ func TestCluster_Import(t *testing.T) {
     })
     time.Sleep(500 * time.Millisecond)
 }
-
-func TestCluster_ReCollectSegmentStats(t *testing.T) {
-    kv := getWatchKV(t)
-    defer func() {
-        kv.RemoveWithPrefix("")
-        kv.Close()
-    }()
-
-    t.Run("recollect succeed", func(t *testing.T) {
-        ctx, cancel := context.WithCancel(context.TODO())
-        defer cancel()
-        mockSessionCreator := func(ctx context.Context, addr string, nodeID int64) (types.DataNodeClient, error) {
-            return newMockDataNodeClient(1, nil)
-        }
-        sessionManager := NewSessionManager(withSessionCreator(mockSessionCreator))
-        channelManager, err := NewChannelManager(kv, newMockHandler())
-        assert.NoError(t, err)
-        cluster := NewCluster(sessionManager, channelManager)
-        defer cluster.Close()
-        addr := "localhost:8080"
-        info := &NodeInfo{
-            Address: addr,
-            NodeID:  1,
-        }
-        nodes := []*NodeInfo{info}
-        err = cluster.Startup(ctx, nodes)
-        assert.NoError(t, err)
-
-        err = cluster.Watch("chan-1", 1)
-        assert.NoError(t, err)
-
-        assert.NotPanics(t, func() {
-            cluster.ReCollectSegmentStats(ctx)
-        })
-        time.Sleep(500 * time.Millisecond)
-    })
-
-    t.Run("recollect failed", func(t *testing.T) {
-        ctx, cancel := context.WithCancel(context.TODO())
-        defer cancel()
-        sessionManager := NewSessionManager()
-        channelManager, err := NewChannelManager(kv, newMockHandler())
-        assert.NoError(t, err)
-        cluster := NewCluster(sessionManager, channelManager)
-        defer cluster.Close()
-        addr := "localhost:8080"
-        info := &NodeInfo{
-            Address: addr,
-            NodeID:  1,
-        }
-        nodes := []*NodeInfo{info}
-        err = cluster.Startup(ctx, nodes)
-        assert.NoError(t, err)
-
-        err = cluster.Watch("chan-1", 1)
-        assert.NoError(t, err)
-
-        assert.NotPanics(t, func() {
-            cluster.ReCollectSegmentStats(ctx)
-        })
-        time.Sleep(500 * time.Millisecond)
-    })
-}
@@ -394,12 +394,6 @@ func (s *Server) startDataCoord() {
         s.compactionTrigger.start()
     }
     s.startServerLoop()
-    // DataCoord (re)starts successfully and starts to collection segment stats
-    // data from all DataNode.
-    // This will prevent DataCoord from missing out any important segment stats
-    // data while offline.
-    log.Info("DataCoord (re)starts successfully and re-collecting segment stats from DataNodes")
-    s.reCollectSegmentStats(s.ctx)
     s.stateCode.Store(commonpb.StateCode_Healthy)
     sessionutil.SaveServerInfo(typeutil.DataCoordRole, s.session.ServerID)
 }
@@ -1112,25 +1106,3 @@ func (s *Server) loadCollectionFromRootCoord(ctx context.Context, collectionID i
     s.meta.AddCollection(collInfo)
     return nil
 }
-
-func (s *Server) reCollectSegmentStats(ctx context.Context) {
-    if s.channelManager == nil {
-        log.Error("null channel manager found, which should NOT happen in non-testing environment")
-        return
-    }
-    nodes := s.sessionManager.getLiveNodeIDs()
-    log.Info("re-collecting segment stats from DataNodes",
-        zap.Int64s("DataNode IDs", nodes))
-
-    reCollectFunc := func() error {
-        err := s.cluster.ReCollectSegmentStats(ctx)
-        if err != nil {
-            return err
-        }
-        return nil
-    }
-
-    if err := retry.Do(ctx, reCollectFunc, retry.Attempts(20), retry.Sleep(time.Millisecond*100), retry.MaxSleepTime(5*time.Second)); err != nil {
-        panic(err)
-    }
-}
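The removed startup hook above is the behavioral core of this change: DataCoord blocked in startDataCoord until every live DataNode answered the re-collect call, and after 20 failed attempts it panicked, aborting startup. Below is a minimal, self-contained sketch of that bounded-retry-then-panic pattern. It uses only the standard library as a stand-in for the retry.Do helper; the option names (Attempts, Sleep, MaxSleepTime) and the 20 / 100ms / 5s values come from the removed lines, everything else is illustrative.

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

// retryDo is a stand-in (not the Milvus retry util) for the removed
// retry.Do(ctx, fn, Attempts, Sleep, MaxSleepTime) call: retry fn up to
// attempts times, doubling the sleep between tries up to maxSleep, and
// give up early if ctx is cancelled.
func retryDo(ctx context.Context, fn func() error, attempts int, sleep, maxSleep time.Duration) error {
    var err error
    for i := 0; i < attempts; i++ {
        if err = fn(); err == nil {
            return nil
        }
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-time.After(sleep):
        }
        if sleep *= 2; sleep > maxSleep {
            sleep = maxSleep
        }
    }
    return err
}

func main() {
    ctx := context.Background()
    // Stand-in for cluster.ReCollectSegmentStats: fail as an unreachable DataNode would.
    reCollect := func() error { return errors.New("DataNode unreachable") }

    if err := retryDo(ctx, reCollect, 20, 100*time.Millisecond, 5*time.Second); err != nil {
        // This is the coupling the commit removes: a persistently failing
        // DataNode turned into a DataCoord startup panic here.
        fmt.Println("startup would panic here:", err)
    }
}

With the call removed, DataCoord reaches the Healthy state without waiting on any DataNode for this step, so startup no longer depends on DataNode liveness here.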
@@ -41,7 +41,6 @@ const (
     flushTimeout = 15 * time.Second
     // TODO: evaluate and update import timeout.
     importTimeout = 3 * time.Hour
-    reCollectTimeout = 5 * time.Second
 )
 
 // SessionManager provides the grpc interfaces of cluster
@@ -227,32 +226,6 @@ func (c *SessionManager) execImport(ctx context.Context, nodeID int64, itr *data
     log.Info("success to import", zap.Int64("node", nodeID), zap.Any("import task", itr))
 }
-
-// ReCollectSegmentStats collects segment stats info from DataNodes, after DataCoord reboots.
-func (c *SessionManager) ReCollectSegmentStats(ctx context.Context, nodeID int64) error {
-    cli, err := c.getClient(ctx, nodeID)
-    if err != nil {
-        log.Warn("failed to get dataNode client", zap.Int64("DataNode ID", nodeID), zap.Error(err))
-        return err
-    }
-    ctx, cancel := context.WithTimeout(ctx, reCollectTimeout)
-    defer cancel()
-    resp, err := cli.ResendSegmentStats(ctx, &datapb.ResendSegmentStatsRequest{
-        Base: commonpbutil.NewMsgBase(
-            commonpbutil.WithMsgType(commonpb.MsgType_ResendSegmentStats),
-            commonpbutil.WithSourceID(paramtable.GetNodeID()),
-        ),
-    })
-    if err := VerifyResponse(resp, err); err != nil {
-        log.Warn("re-collect segment stats call failed",
-            zap.Int64("DataNode ID", nodeID), zap.Error(err))
-        return err
-    }
-    log.Info("re-collect segment stats call succeeded",
-        zap.Int64("DataNode ID", nodeID),
-        zap.Int64s("segment stat collected", resp.GetSegResent()))
-    return nil
-}
 
 func (c *SessionManager) GetCompactionState() map[int64]*datapb.CompactionStateResult {
     wg := sync.WaitGroup{}
     ctx := context.Background()
@@ -192,26 +192,6 @@ func (fm *flowgraphManager) getChannel(segID UniqueID) (Channel, error) {
     return nil, fmt.Errorf("cannot find segment %d in all flowgraphs", segID)
 }
-
-// resendTT loops through flow graphs, looks for segments that are not flushed,
-// and sends them to that flow graph's `resendTTCh` channel so stats of
-// these segments will be resent.
-func (fm *flowgraphManager) resendTT() []UniqueID {
-    var unFlushedSegments []UniqueID
-    fm.flowgraphs.Range(func(key string, fg *dataSyncService) bool {
-        segIDs := fg.channel.listNotFlushedSegmentIDs()
-        if len(segIDs) > 0 {
-            log.Info("un-flushed segments found, stats will be resend",
-                zap.Int64s("segment IDs", segIDs))
-            unFlushedSegments = append(unFlushedSegments, segIDs...)
-            fg.resendTTCh <- resendTTMsg{
-                segmentIDs: segIDs,
-            }
-        }
-        return true
-    })
-    return unFlushedSegments
-}
 
 func (fm *flowgraphManager) getFlowgraphService(vchan string) (*dataSyncService, bool) {
     return fm.flowgraphs.Get(vchan)
 }
@@ -162,17 +162,13 @@ func (node *DataNode) FlushSegments(ctx context.Context, req *datapb.FlushSegmen
     return merr.Success(), nil
 }
 
-// ResendSegmentStats resend un-flushed segment stats back upstream to DataCoord by resending DataNode time tick message.
+// ResendSegmentStats . ResendSegmentStats resend un-flushed segment stats back upstream to DataCoord by resending DataNode time tick message.
 // It returns a list of segments to be sent.
+// Deprecated in 2.3.2, reversed it just for compatibility during rolling back
 func (node *DataNode) ResendSegmentStats(ctx context.Context, req *datapb.ResendSegmentStatsRequest) (*datapb.ResendSegmentStatsResponse, error) {
-    log.Info("start resending segment stats, if any",
-        zap.Int64("DataNode ID", paramtable.GetNodeID()))
-    segResent := node.flowgraphManager.resendTT()
-    log.Info("found segment(s) with stats to resend",
-        zap.Int64s("segment IDs", segResent))
     return &datapb.ResendSegmentStatsResponse{
         Status: merr.Success(),
-        SegResent: segResent,
+        SegResent: make([]int64, 0),
     }, nil
 }
 
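After this change the RPC survives only for rollback compatibility: the DataNode handler no longer walks the flowgraphs and always answers with an empty SegResent list. The hypothetical helper below (not part of the codebase; imports elided) sketches what a DataCoord-side caller, such as the removed SessionManager.ReCollectSegmentStats above, would now observe. Only the request/response shapes and helper calls are taken from this diff; the client wiring is assumed.

// checkResendSegmentStats is an illustrative, hypothetical helper showing the
// caller's view of the deprecated RPC after this commit. Types come from
// datapb/commonpb exactly as used in the removed SessionManager code.
func checkResendSegmentStats(ctx context.Context, cli types.DataNodeClient) ([]int64, error) {
    resp, err := cli.ResendSegmentStats(ctx, &datapb.ResendSegmentStatsRequest{
        Base: commonpbutil.NewMsgBase(
            commonpbutil.WithMsgType(commonpb.MsgType_ResendSegmentStats),
            commonpbutil.WithSourceID(paramtable.GetNodeID()),
        ),
    })
    if err := VerifyResponse(resp, err); err != nil {
        return nil, err
    }
    // Before 2.3.2 this carried the IDs of un-flushed segments whose stats the
    // DataNode re-sent; after this commit it is always an empty slice.
    return resp.GetSegResent(), nil
}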
@@ -804,13 +804,13 @@ func (s *DataNodeServicesSuite) TestResendSegmentStats() {
     resp, err := s.node.ResendSegmentStats(s.ctx, req)
     s.Assert().NoError(err)
     s.Assert().True(merr.Ok(resp.GetStatus()))
-    s.Assert().ElementsMatch([]UniqueID{0, 1, 2}, resp.GetSegResent())
+    s.Assert().Empty(resp.GetSegResent())
 
     // Duplicate call.
     resp, err = s.node.ResendSegmentStats(s.ctx, req)
     s.Assert().NoError(err)
     s.Assert().True(merr.Ok(resp.GetStatus()))
-    s.Assert().ElementsMatch([]UniqueID{0, 1, 2}, resp.GetSegResent())
+    s.Assert().Empty(resp.GetSegResent())
 }
 
 func (s *DataNodeServicesSuite) TestFlushChannels() {
@@ -111,6 +111,7 @@ service DataNode {
   // https://wiki.lfaidata.foundation/display/MIL/MEP+24+--+Support+bulk+load
   rpc Import(ImportTaskRequest) returns(common.Status) {}
 
+  // Deprecated
   rpc ResendSegmentStats(ResendSegmentStatsRequest) returns(ResendSegmentStatsResponse) {}
 
   rpc AddImportSegment(AddImportSegmentRequest) returns(AddImportSegmentResponse) {}
@@ -7346,6 +7346,7 @@ type DataNodeClient interface {
     SyncSegments(ctx context.Context, in *SyncSegmentsRequest, opts ...grpc.CallOption) (*commonpb.Status, error)
     // https://wiki.lfaidata.foundation/display/MIL/MEP+24+--+Support+bulk+load
     Import(ctx context.Context, in *ImportTaskRequest, opts ...grpc.CallOption) (*commonpb.Status, error)
+    // Deprecated
     ResendSegmentStats(ctx context.Context, in *ResendSegmentStatsRequest, opts ...grpc.CallOption) (*ResendSegmentStatsResponse, error)
     AddImportSegment(ctx context.Context, in *AddImportSegmentRequest, opts ...grpc.CallOption) (*AddImportSegmentResponse, error)
     FlushChannels(ctx context.Context, in *FlushChannelsRequest, opts ...grpc.CallOption) (*commonpb.Status, error)
@@ -7510,6 +7511,7 @@ type DataNodeServer interface {
     SyncSegments(context.Context, *SyncSegmentsRequest) (*commonpb.Status, error)
     // https://wiki.lfaidata.foundation/display/MIL/MEP+24+--+Support+bulk+load
     Import(context.Context, *ImportTaskRequest) (*commonpb.Status, error)
+    // Deprecated
     ResendSegmentStats(context.Context, *ResendSegmentStatsRequest) (*ResendSegmentStatsResponse, error)
     AddImportSegment(context.Context, *AddImportSegmentRequest) (*AddImportSegmentResponse, error)
     FlushChannels(context.Context, *FlushChannelsRequest) (*commonpb.Status, error)