// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
2021-10-14 15:44:34 +08:00
|
|
|
|
|
|
|
package datacoord
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2021-11-05 22:25:00 +08:00
|
|
|
"fmt"
|
2021-10-14 15:44:34 +08:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
2023-09-21 09:45:27 +08:00
|
|
|
"go.uber.org/zap"
|
|
|
|
|
2023-06-09 01:28:37 +08:00
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
2021-10-14 15:44:34 +08:00
|
|
|
grpcdatanodeclient "github.com/milvus-io/milvus/internal/distributed/datanode/client"
|
|
|
|
"github.com/milvus-io/milvus/internal/proto/datapb"
|
|
|
|
"github.com/milvus-io/milvus/internal/types"
|
2023-04-06 19:14:32 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/log"
|
2023-07-28 10:23:02 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/metrics"
|
2023-04-06 19:14:32 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/util/commonpbutil"
|
|
|
|
"github.com/milvus-io/milvus/pkg/util/paramtable"
|
2023-07-13 14:12:29 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/util/retry"
|
2023-07-24 10:23:01 +08:00
|
|
|
"github.com/milvus-io/milvus/pkg/util/typeutil"
|
2021-10-14 15:44:34 +08:00
|
|
|
)
|
|
|
|
|
2021-11-05 22:25:00 +08:00
|
|
|
// Timeouts applied to the RPCs that SessionManager sends to DataNodes.
const (
	// flushTimeout bounds a single FlushSegments call.
	flushTimeout time.Duration = 15 * time.Second

	// importTimeout bounds a single Import call.
	// TODO: evaluate and update import timeout.
	importTimeout time.Duration = 3 * time.Hour

	// reCollectTimeout bounds a single ResendSegmentStats call.
	reCollectTimeout time.Duration = 5 * time.Second
)
|
2021-10-14 15:44:34 +08:00
|
|
|
|
|
|
|
// SessionManager provides the grpc interfaces of cluster
|
|
|
|
type SessionManager struct {
|
|
|
|
sessions struct {
|
|
|
|
sync.RWMutex
|
|
|
|
data map[int64]*Session
|
|
|
|
}
|
|
|
|
sessionCreator dataNodeCreatorFunc
|
|
|
|
}
|
|
|
|
|
|
|
|
// SessionOpt provides a way to set params in SessionManager
|
|
|
|
type SessionOpt func(c *SessionManager)
|
|
|
|
|
|
|
|
func withSessionCreator(creator dataNodeCreatorFunc) SessionOpt {
|
|
|
|
return func(c *SessionManager) { c.sessionCreator = creator }
|
|
|
|
}
|
|
|
|
|
|
|
|
func defaultSessionCreator() dataNodeCreatorFunc {
|
2023-08-17 20:20:20 +08:00
|
|
|
return func(ctx context.Context, addr string, nodeID int64) (types.DataNode, error) {
|
|
|
|
return grpcdatanodeclient.NewClient(ctx, addr, nodeID)
|
2021-10-14 15:44:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// NewSessionManager creates a new SessionManager
|
|
|
|
func NewSessionManager(options ...SessionOpt) *SessionManager {
|
|
|
|
m := &SessionManager{
|
|
|
|
sessions: struct {
|
|
|
|
sync.RWMutex
|
|
|
|
data map[int64]*Session
|
|
|
|
}{data: make(map[int64]*Session)},
|
|
|
|
sessionCreator: defaultSessionCreator(),
|
|
|
|
}
|
|
|
|
for _, opt := range options {
|
|
|
|
opt(m)
|
|
|
|
}
|
|
|
|
return m
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddSession creates a new session
|
|
|
|
func (c *SessionManager) AddSession(node *NodeInfo) {
|
|
|
|
c.sessions.Lock()
|
|
|
|
defer c.sessions.Unlock()
|
|
|
|
|
|
|
|
session := NewSession(node, c.sessionCreator)
|
|
|
|
c.sessions.data[node.NodeID] = session
|
2023-07-28 10:23:02 +08:00
|
|
|
metrics.DataCoordNumDataNodes.WithLabelValues().Set(float64(len(c.sessions.data)))
|
2021-10-14 15:44:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// DeleteSession removes the node session
|
|
|
|
func (c *SessionManager) DeleteSession(node *NodeInfo) {
|
|
|
|
c.sessions.Lock()
|
|
|
|
defer c.sessions.Unlock()
|
|
|
|
|
|
|
|
if session, ok := c.sessions.data[node.NodeID]; ok {
|
|
|
|
session.Dispose()
|
|
|
|
delete(c.sessions.data, node.NodeID)
|
|
|
|
}
|
2023-07-28 10:23:02 +08:00
|
|
|
metrics.DataCoordNumDataNodes.WithLabelValues().Set(float64(len(c.sessions.data)))
|
2021-10-14 15:44:34 +08:00
|
|
|
}
|
|
|
|
|
2022-08-19 19:50:50 +08:00
|
|
|
// getLiveNodeIDs returns IDs of all live DataNodes.
|
|
|
|
func (c *SessionManager) getLiveNodeIDs() []int64 {
|
|
|
|
c.sessions.RLock()
|
|
|
|
defer c.sessions.RUnlock()
|
|
|
|
|
|
|
|
ret := make([]int64, 0, len(c.sessions.data))
|
|
|
|
for id := range c.sessions.data {
|
|
|
|
ret = append(ret, id)
|
|
|
|
}
|
|
|
|
return ret
|
|
|
|
}
|
|
|
|
|
2021-10-14 15:44:34 +08:00
|
|
|
// GetSessions gets all node sessions
|
|
|
|
func (c *SessionManager) GetSessions() []*Session {
|
|
|
|
c.sessions.RLock()
|
|
|
|
defer c.sessions.RUnlock()
|
|
|
|
|
|
|
|
ret := make([]*Session, 0, len(c.sessions.data))
|
|
|
|
for _, s := range c.sessions.data {
|
|
|
|
ret = append(ret, s)
|
|
|
|
}
|
|
|
|
return ret
|
|
|
|
}
|
|
|
|
|
|
|
|
// Flush is a grpc interface. It will send req to nodeID asynchronously
|
|
|
|
func (c *SessionManager) Flush(ctx context.Context, nodeID int64, req *datapb.FlushSegmentsRequest) {
|
|
|
|
go c.execFlush(ctx, nodeID, req)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (c *SessionManager) execFlush(ctx context.Context, nodeID int64, req *datapb.FlushSegmentsRequest) {
|
2021-11-05 22:25:00 +08:00
|
|
|
cli, err := c.getClient(ctx, nodeID)
|
2021-10-14 15:44:34 +08:00
|
|
|
if err != nil {
|
2022-05-06 17:49:51 +08:00
|
|
|
log.Warn("failed to get dataNode client", zap.Int64("dataNode ID", nodeID), zap.Error(err))
|
2021-10-14 15:44:34 +08:00
|
|
|
return
|
|
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, flushTimeout)
|
|
|
|
defer cancel()
|
|
|
|
|
|
|
|
resp, err := cli.FlushSegments(ctx, req)
|
|
|
|
if err := VerifyResponse(resp, err); err != nil {
|
2022-05-06 17:49:51 +08:00
|
|
|
log.Error("flush call (perhaps partially) failed", zap.Int64("dataNode ID", nodeID), zap.Error(err))
|
|
|
|
} else {
|
|
|
|
log.Info("flush call succeeded", zap.Int64("dataNode ID", nodeID))
|
2021-10-14 15:44:34 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-08-23 15:50:52 +08:00
|
|
|
// Compaction is a grpc interface. It will send request to DataNode with provided `nodeID` synchronously.
|
|
|
|
func (c *SessionManager) Compaction(nodeID int64, plan *datapb.CompactionPlan) error {
|
2023-07-18 14:25:20 +08:00
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), Params.DataCoordCfg.CompactionRPCTimeout.GetAsDuration(time.Second))
|
2021-11-05 22:25:00 +08:00
|
|
|
defer cancel()
|
|
|
|
cli, err := c.getClient(ctx, nodeID)
|
|
|
|
if err != nil {
|
|
|
|
log.Warn("failed to get client", zap.Int64("nodeID", nodeID), zap.Error(err))
|
2022-08-23 15:50:52 +08:00
|
|
|
return err
|
2021-11-05 22:25:00 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
resp, err := cli.Compaction(ctx, plan)
|
|
|
|
if err := VerifyResponse(resp, err); err != nil {
|
|
|
|
log.Warn("failed to execute compaction", zap.Int64("node", nodeID), zap.Error(err), zap.Int64("planID", plan.GetPlanID()))
|
2022-08-23 15:50:52 +08:00
|
|
|
return err
|
2021-11-05 22:25:00 +08:00
|
|
|
}
|
|
|
|
|
2022-03-02 15:35:55 +08:00
|
|
|
log.Info("success to execute compaction", zap.Int64("node", nodeID), zap.Any("planID", plan.GetPlanID()))
|
2022-08-23 15:50:52 +08:00
|
|
|
return nil
|
2021-11-05 22:25:00 +08:00
|
|
|
}
|
|
|
|
|
2022-09-27 16:02:53 +08:00
|
|
|
// SyncSegments is a grpc interface. It will send request to DataNode with provided `nodeID` synchronously.
|
|
|
|
func (c *SessionManager) SyncSegments(nodeID int64, req *datapb.SyncSegmentsRequest) error {
|
2023-07-13 14:12:29 +08:00
|
|
|
log := log.With(
|
|
|
|
zap.Int64("nodeID", nodeID),
|
|
|
|
zap.Int64("planID", req.GetPlanID()),
|
|
|
|
)
|
2023-07-18 14:25:20 +08:00
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), Params.DataCoordCfg.CompactionRPCTimeout.GetAsDuration(time.Second))
|
2022-09-27 16:02:53 +08:00
|
|
|
defer cancel()
|
|
|
|
cli, err := c.getClient(ctx, nodeID)
|
|
|
|
if err != nil {
|
2023-07-13 14:12:29 +08:00
|
|
|
log.Warn("failed to get client", zap.Error(err))
|
2022-09-27 16:02:53 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2023-07-13 14:12:29 +08:00
|
|
|
err = retry.Do(context.Background(), func() error {
|
2023-07-18 14:25:20 +08:00
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), Params.DataCoordCfg.CompactionRPCTimeout.GetAsDuration(time.Second))
|
|
|
|
defer cancel()
|
|
|
|
|
2023-07-13 14:12:29 +08:00
|
|
|
resp, err := cli.SyncSegments(ctx, req)
|
|
|
|
if err := VerifyResponse(resp, err); err != nil {
|
|
|
|
log.Warn("failed to sync segments", zap.Error(err))
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
})
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
log.Warn("failed to sync segments after retry", zap.Error(err))
|
2022-09-27 16:02:53 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2023-07-13 14:12:29 +08:00
|
|
|
log.Info("success to sync segments")
|
2022-09-27 16:02:53 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-04-01 11:33:28 +08:00
|
|
|
// Import is a grpc interface. It will send request to DataNode with provided `nodeID` asynchronously.
|
|
|
|
func (c *SessionManager) Import(ctx context.Context, nodeID int64, itr *datapb.ImportTaskRequest) {
|
|
|
|
go c.execImport(ctx, nodeID, itr)
|
|
|
|
}
|
|
|
|
|
|
|
|
// execImport gets the corresponding DataNode with its ID and calls its Import method.
|
|
|
|
func (c *SessionManager) execImport(ctx context.Context, nodeID int64, itr *datapb.ImportTaskRequest) {
|
|
|
|
cli, err := c.getClient(ctx, nodeID)
|
|
|
|
if err != nil {
|
|
|
|
log.Warn("failed to get client for import", zap.Int64("nodeID", nodeID), zap.Error(err))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, importTimeout)
|
|
|
|
defer cancel()
|
|
|
|
resp, err := cli.Import(ctx, itr)
|
|
|
|
if err := VerifyResponse(resp, err); err != nil {
|
|
|
|
log.Warn("failed to import", zap.Int64("node", nodeID), zap.Error(err))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Info("success to import", zap.Int64("node", nodeID), zap.Any("import task", itr))
|
|
|
|
}
|
|
|
|
|
2022-05-25 14:34:00 +08:00
|
|
|
// ReCollectSegmentStats collects segment stats info from DataNodes, after DataCoord reboots.
|
2023-02-27 10:41:46 +08:00
|
|
|
func (c *SessionManager) ReCollectSegmentStats(ctx context.Context, nodeID int64) error {
|
2022-05-25 14:34:00 +08:00
|
|
|
cli, err := c.getClient(ctx, nodeID)
|
|
|
|
if err != nil {
|
|
|
|
log.Warn("failed to get dataNode client", zap.Int64("DataNode ID", nodeID), zap.Error(err))
|
2023-02-27 10:41:46 +08:00
|
|
|
return err
|
2022-05-25 14:34:00 +08:00
|
|
|
}
|
|
|
|
ctx, cancel := context.WithTimeout(ctx, reCollectTimeout)
|
|
|
|
defer cancel()
|
|
|
|
resp, err := cli.ResendSegmentStats(ctx, &datapb.ResendSegmentStatsRequest{
|
2022-10-21 15:57:28 +08:00
|
|
|
Base: commonpbutil.NewMsgBase(
|
|
|
|
commonpbutil.WithMsgType(commonpb.MsgType_ResendSegmentStats),
|
2022-11-04 14:25:38 +08:00
|
|
|
commonpbutil.WithSourceID(paramtable.GetNodeID()),
|
2022-10-21 15:57:28 +08:00
|
|
|
),
|
2022-05-25 14:34:00 +08:00
|
|
|
})
|
|
|
|
if err := VerifyResponse(resp, err); err != nil {
|
2023-02-27 10:41:46 +08:00
|
|
|
log.Warn("re-collect segment stats call failed",
|
2022-05-25 14:34:00 +08:00
|
|
|
zap.Int64("DataNode ID", nodeID), zap.Error(err))
|
2023-02-27 10:41:46 +08:00
|
|
|
return err
|
2022-05-25 14:34:00 +08:00
|
|
|
}
|
2023-02-27 10:41:46 +08:00
|
|
|
log.Info("re-collect segment stats call succeeded",
|
|
|
|
zap.Int64("DataNode ID", nodeID),
|
|
|
|
zap.Int64s("segment stat collected", resp.GetSegResent()))
|
|
|
|
return nil
|
2022-05-25 14:34:00 +08:00
|
|
|
}
|
|
|
|
|
2022-08-23 15:50:52 +08:00
|
|
|
func (c *SessionManager) GetCompactionState() map[int64]*datapb.CompactionStateResult {
|
|
|
|
wg := sync.WaitGroup{}
|
|
|
|
ctx := context.Background()
|
|
|
|
|
2023-07-24 10:23:01 +08:00
|
|
|
plans := typeutil.NewConcurrentMap[int64, *datapb.CompactionStateResult]()
|
2022-08-23 15:50:52 +08:00
|
|
|
c.sessions.RLock()
|
|
|
|
for nodeID, s := range c.sessions.data {
|
2022-10-27 19:39:31 +08:00
|
|
|
wg.Add(1)
|
2022-08-23 15:50:52 +08:00
|
|
|
go func(nodeID int64, s *Session) {
|
|
|
|
defer wg.Done()
|
|
|
|
cli, err := s.GetOrCreateClient(ctx)
|
|
|
|
if err != nil {
|
|
|
|
log.Info("Cannot Create Client", zap.Int64("NodeID", nodeID))
|
|
|
|
return
|
|
|
|
}
|
2023-07-18 14:25:20 +08:00
|
|
|
ctx, cancel := context.WithTimeout(ctx, Params.DataCoordCfg.CompactionRPCTimeout.GetAsDuration(time.Second))
|
2022-08-23 15:50:52 +08:00
|
|
|
defer cancel()
|
|
|
|
resp, err := cli.GetCompactionState(ctx, &datapb.CompactionStateRequest{
|
2022-10-21 15:57:28 +08:00
|
|
|
Base: commonpbutil.NewMsgBase(
|
|
|
|
commonpbutil.WithMsgType(commonpb.MsgType_GetSystemConfigs),
|
2022-11-04 14:25:38 +08:00
|
|
|
commonpbutil.WithSourceID(paramtable.GetNodeID()),
|
2022-10-21 15:57:28 +08:00
|
|
|
),
|
2022-08-23 15:50:52 +08:00
|
|
|
})
|
|
|
|
if err != nil {
|
|
|
|
log.Info("Get State failed", zap.Error(err))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
if resp.GetStatus().GetErrorCode() != commonpb.ErrorCode_Success {
|
|
|
|
log.Info("Get State failed", zap.String("Reason", resp.GetStatus().GetReason()))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
for _, rst := range resp.GetResults() {
|
2023-07-24 10:23:01 +08:00
|
|
|
plans.Insert(rst.PlanID, rst)
|
2022-08-23 15:50:52 +08:00
|
|
|
}
|
|
|
|
}(nodeID, s)
|
|
|
|
}
|
|
|
|
c.sessions.RUnlock()
|
|
|
|
wg.Wait()
|
|
|
|
|
|
|
|
rst := make(map[int64]*datapb.CompactionStateResult)
|
2023-07-24 10:23:01 +08:00
|
|
|
plans.Range(func(planID int64, result *datapb.CompactionStateResult) bool {
|
|
|
|
rst[planID] = result
|
2022-08-23 15:50:52 +08:00
|
|
|
return true
|
|
|
|
})
|
|
|
|
|
|
|
|
return rst
|
|
|
|
}
|
|
|
|
|
2023-09-12 21:07:19 +08:00
|
|
|
func (c *SessionManager) FlushChannels(ctx context.Context, nodeID int64, req *datapb.FlushChannelsRequest) error {
|
|
|
|
log := log.Ctx(ctx).With(zap.Int64("nodeID", nodeID))
|
|
|
|
cli, err := c.getClient(ctx, nodeID)
|
|
|
|
if err != nil {
|
|
|
|
log.Warn("failed to get client", zap.Error(err))
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
resp, err := cli.FlushChannels(ctx, req)
|
|
|
|
err = VerifyResponse(resp, err)
|
|
|
|
if err != nil {
|
|
|
|
log.Warn("SessionManager.FlushChannels failed", zap.Error(err))
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
log.Info("SessionManager.FlushChannels successfully")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2021-11-05 22:25:00 +08:00
|
|
|
func (c *SessionManager) getClient(ctx context.Context, nodeID int64) (types.DataNode, error) {
|
|
|
|
c.sessions.RLock()
|
|
|
|
session, ok := c.sessions.data[nodeID]
|
|
|
|
c.sessions.RUnlock()
|
|
|
|
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("can not find session of node %d", nodeID)
|
|
|
|
}
|
|
|
|
|
|
|
|
return session.GetOrCreateClient(ctx)
|
|
|
|
}
|
|
|
|
|
2021-10-14 15:44:34 +08:00
|
|
|
// Close release sessions
|
|
|
|
func (c *SessionManager) Close() {
|
|
|
|
c.sessions.Lock()
|
|
|
|
defer c.sessions.Unlock()
|
|
|
|
|
|
|
|
for _, s := range c.sessions.data {
|
|
|
|
s.Dispose()
|
|
|
|
}
|
|
|
|
c.sessions.data = nil
|
|
|
|
}
|