2021-10-27 19:00:48 +08:00
|
|
|
// Licensed to the LF AI & Data foundation under one
|
|
|
|
// or more contributor license agreements. See the NOTICE file
|
|
|
|
// distributed with this work for additional information
|
|
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
|
|
// to you under the Apache License, Version 2.0 (the
|
|
|
|
// "License"); you may not use this file except in compliance
|
2021-07-14 14:15:55 +08:00
|
|
|
// with the License. You may obtain a copy of the License at
|
|
|
|
//
|
2021-10-27 19:00:48 +08:00
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
2021-07-14 14:15:55 +08:00
|
|
|
//
|
2021-10-27 19:00:48 +08:00
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2021-07-14 14:15:55 +08:00
|
|
|
|
|
|
|
package indexcoord
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"sync"
|
2022-07-07 14:44:21 +08:00
|
|
|
|
2022-08-25 15:48:54 +08:00
|
|
|
"go.uber.org/zap"
|
|
|
|
|
2022-09-16 16:56:49 +08:00
|
|
|
"github.com/milvus-io/milvus/api/commonpb"
|
|
|
|
"github.com/milvus-io/milvus/api/milvuspb"
|
2021-07-14 14:15:55 +08:00
|
|
|
grpcindexnodeclient "github.com/milvus-io/milvus/internal/distributed/indexnode/client"
|
|
|
|
"github.com/milvus-io/milvus/internal/log"
|
2022-08-25 15:48:54 +08:00
|
|
|
"github.com/milvus-io/milvus/internal/metastore/model"
|
2022-07-13 16:16:25 +08:00
|
|
|
"github.com/milvus-io/milvus/internal/metrics"
|
|
|
|
"github.com/milvus-io/milvus/internal/proto/indexpb"
|
2021-07-14 14:15:55 +08:00
|
|
|
"github.com/milvus-io/milvus/internal/types"
|
|
|
|
)
|
|
|
|
|
2021-09-26 18:55:58 +08:00
|
|
|
// NodeManager is used by IndexCoord to manage the client of IndexNode.
|
2021-07-14 14:15:55 +08:00
|
|
|
type NodeManager struct {
|
|
|
|
nodeClients map[UniqueID]types.IndexNode
|
|
|
|
pq *PriorityQueue
|
2021-12-23 21:35:52 +08:00
|
|
|
lock sync.RWMutex
|
|
|
|
ctx context.Context
|
2021-07-14 14:15:55 +08:00
|
|
|
}
|
|
|
|
|
2021-09-26 18:55:58 +08:00
|
|
|
// NewNodeManager is used to create a new NodeManager.
|
2021-12-23 21:35:52 +08:00
|
|
|
func NewNodeManager(ctx context.Context) *NodeManager {
|
2021-07-14 14:15:55 +08:00
|
|
|
return &NodeManager{
|
|
|
|
nodeClients: make(map[UniqueID]types.IndexNode),
|
2021-12-09 14:19:40 +08:00
|
|
|
pq: &PriorityQueue{
|
|
|
|
policy: PeekClientV1,
|
|
|
|
},
|
|
|
|
lock: sync.RWMutex{},
|
2021-12-23 21:35:52 +08:00
|
|
|
ctx: ctx,
|
2021-07-14 14:15:55 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-12-15 18:27:32 +08:00
|
|
|
// setClient sets IndexNode client to node manager.
|
2022-08-25 15:48:54 +08:00
|
|
|
func (nm *NodeManager) setClient(nodeID UniqueID, client types.IndexNode) {
|
2021-08-17 17:54:10 +08:00
|
|
|
log.Debug("IndexCoord NodeManager setClient", zap.Int64("nodeID", nodeID))
|
2021-07-23 10:44:12 +08:00
|
|
|
item := &PQItem{
|
|
|
|
key: nodeID,
|
|
|
|
priority: 0,
|
2021-11-01 17:16:05 +08:00
|
|
|
weight: 0,
|
2021-12-09 20:45:04 +08:00
|
|
|
totalMem: 0,
|
2021-07-23 10:44:12 +08:00
|
|
|
}
|
2021-12-23 21:35:52 +08:00
|
|
|
nm.lock.Lock()
|
2021-07-23 10:44:12 +08:00
|
|
|
nm.nodeClients[nodeID] = client
|
2022-08-31 10:23:04 +08:00
|
|
|
log.Debug("IndexNode NodeManager setClient success", zap.Int64("nodeID", nodeID), zap.Int("IndexNode num", len(nm.nodeClients)))
|
2021-12-23 21:35:52 +08:00
|
|
|
nm.lock.Unlock()
|
2021-07-23 10:44:12 +08:00
|
|
|
nm.pq.Push(item)
|
|
|
|
}
|
|
|
|
|
2021-09-26 18:55:58 +08:00
|
|
|
// RemoveNode removes the unused client of IndexNode.
|
2021-07-14 14:15:55 +08:00
|
|
|
func (nm *NodeManager) RemoveNode(nodeID UniqueID) {
|
|
|
|
log.Debug("IndexCoord", zap.Any("Remove node with ID", nodeID))
|
2021-12-23 21:35:52 +08:00
|
|
|
nm.lock.Lock()
|
2021-07-14 14:15:55 +08:00
|
|
|
delete(nm.nodeClients, nodeID)
|
2021-12-23 21:35:52 +08:00
|
|
|
nm.lock.Unlock()
|
2021-07-14 14:15:55 +08:00
|
|
|
nm.pq.Remove(nodeID)
|
2022-03-15 21:51:21 +08:00
|
|
|
metrics.IndexCoordIndexNodeNum.WithLabelValues().Dec()
|
2021-07-14 14:15:55 +08:00
|
|
|
}
|
|
|
|
|
2021-09-26 18:55:58 +08:00
|
|
|
// AddNode adds the client of IndexNode.
|
2021-07-14 14:15:55 +08:00
|
|
|
func (nm *NodeManager) AddNode(nodeID UniqueID, address string) error {
|
2022-07-07 14:44:21 +08:00
|
|
|
|
2021-07-14 14:15:55 +08:00
|
|
|
log.Debug("IndexCoord addNode", zap.Any("nodeID", nodeID), zap.Any("node address", address))
|
|
|
|
if nm.pq.CheckExist(nodeID) {
|
2021-09-26 21:23:57 +08:00
|
|
|
log.Warn("IndexCoord", zap.Any("Node client already exist with ID:", nodeID))
|
2021-07-14 14:15:55 +08:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
nodeClient, err := grpcindexnodeclient.NewClient(context.TODO(), address)
|
|
|
|
if err != nil {
|
2021-08-17 17:54:10 +08:00
|
|
|
log.Error("IndexCoord NodeManager", zap.Any("Add node err", err))
|
2021-07-14 14:15:55 +08:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
err = nodeClient.Init()
|
|
|
|
if err != nil {
|
2021-08-17 17:54:10 +08:00
|
|
|
log.Error("IndexCoord NodeManager", zap.Any("Add node err", err))
|
2021-07-14 14:15:55 +08:00
|
|
|
return err
|
|
|
|
}
|
2022-03-15 21:51:21 +08:00
|
|
|
metrics.IndexCoordIndexNodeNum.WithLabelValues().Inc()
|
2022-08-25 15:48:54 +08:00
|
|
|
nm.setClient(nodeID, nodeClient)
|
|
|
|
return nil
|
2021-07-14 14:15:55 +08:00
|
|
|
}
|
|
|
|
|
2021-09-26 18:55:58 +08:00
|
|
|
// PeekClient peeks the client with the least load.
|
2022-08-25 15:48:54 +08:00
|
|
|
func (nm *NodeManager) PeekClient(meta *model.SegmentIndex) (UniqueID, types.IndexNode) {
|
2022-07-19 14:24:29 +08:00
|
|
|
log.Info("IndexCoord peek client")
|
2022-07-13 16:16:25 +08:00
|
|
|
allClients := nm.GetAllClients()
|
|
|
|
if len(allClients) == 0 {
|
2022-07-07 14:44:21 +08:00
|
|
|
log.Error("there is no IndexNode online")
|
|
|
|
return -1, nil
|
|
|
|
}
|
2022-07-13 16:16:25 +08:00
|
|
|
|
|
|
|
// Note: In order to quickly end other goroutines, an error is returned when the client is successfully selected
|
|
|
|
ctx, cancel := context.WithCancel(nm.ctx)
|
|
|
|
var (
|
|
|
|
peekNodeID = UniqueID(0)
|
|
|
|
nodeMutex = sync.Mutex{}
|
|
|
|
wg = sync.WaitGroup{}
|
|
|
|
)
|
|
|
|
|
|
|
|
for nodeID, client := range allClients {
|
|
|
|
nodeID := nodeID
|
|
|
|
client := client
|
|
|
|
wg.Add(1)
|
|
|
|
go func() {
|
|
|
|
defer wg.Done()
|
2022-08-25 15:48:54 +08:00
|
|
|
resp, err := client.GetJobStats(ctx, &indexpb.GetJobStatsRequest{})
|
2022-07-13 16:16:25 +08:00
|
|
|
if err != nil {
|
|
|
|
log.Warn("get IndexNode slots failed", zap.Int64("nodeID", nodeID), zap.Error(err))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if resp.Status.ErrorCode != commonpb.ErrorCode_Success {
|
|
|
|
log.Warn("get IndexNode slots failed", zap.Int64("nodeID", nodeID),
|
|
|
|
zap.String("reason", resp.Status.Reason))
|
|
|
|
return
|
|
|
|
}
|
2022-08-25 15:48:54 +08:00
|
|
|
if resp.TaskSlots > 0 {
|
2022-07-13 16:16:25 +08:00
|
|
|
nodeMutex.Lock()
|
|
|
|
defer nodeMutex.Unlock()
|
|
|
|
log.Info("peek client success", zap.Int64("nodeID", nodeID))
|
|
|
|
if peekNodeID == 0 {
|
|
|
|
peekNodeID = nodeID
|
|
|
|
}
|
|
|
|
cancel()
|
|
|
|
// Note: In order to quickly end other goroutines, an error is returned when the client is successfully selected
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
cancel()
|
|
|
|
if peekNodeID != 0 {
|
2022-07-19 14:24:29 +08:00
|
|
|
log.Info("IndexCoord peek client success", zap.Int64("nodeID", peekNodeID))
|
2022-07-13 16:16:25 +08:00
|
|
|
return peekNodeID, allClients[peekNodeID]
|
2021-12-04 11:39:34 +08:00
|
|
|
}
|
2022-07-07 14:44:21 +08:00
|
|
|
|
2022-07-19 14:24:29 +08:00
|
|
|
log.Warn("IndexCoord peek client fail")
|
2022-07-07 14:44:21 +08:00
|
|
|
return 0, nil
|
2021-12-04 11:39:34 +08:00
|
|
|
}
|
|
|
|
|
2022-07-13 16:16:25 +08:00
|
|
|
func (nm *NodeManager) GetAllClients() map[UniqueID]types.IndexNode {
|
|
|
|
nm.lock.RLock()
|
|
|
|
defer nm.lock.RUnlock()
|
|
|
|
|
|
|
|
allClients := make(map[UniqueID]types.IndexNode, len(nm.nodeClients))
|
|
|
|
for nodeID, client := range nm.nodeClients {
|
|
|
|
allClients[nodeID] = client
|
|
|
|
}
|
|
|
|
|
|
|
|
return allClients
|
|
|
|
}
|
|
|
|
|
2022-08-25 15:48:54 +08:00
|
|
|
func (nm *NodeManager) GetClientByID(nodeID UniqueID) (types.IndexNode, bool) {
|
|
|
|
nm.lock.RLock()
|
|
|
|
defer nm.lock.RUnlock()
|
|
|
|
|
|
|
|
client, ok := nm.nodeClients[nodeID]
|
|
|
|
return client, ok
|
|
|
|
}
|
|
|
|
|
2021-12-15 16:55:26 +08:00
|
|
|
// indexNodeGetMetricsResponse record the metrics information of IndexNode.
|
2021-08-19 10:28:10 +08:00
|
|
|
type indexNodeGetMetricsResponse struct {
|
|
|
|
resp *milvuspb.GetMetricsResponse
|
|
|
|
err error
|
|
|
|
}
|
|
|
|
|
2021-12-15 17:07:10 +08:00
|
|
|
// getMetrics get metrics information of all IndexNode.
|
2021-08-19 10:28:10 +08:00
|
|
|
func (nm *NodeManager) getMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) []indexNodeGetMetricsResponse {
|
2021-12-23 21:35:52 +08:00
|
|
|
var clients []types.IndexNode
|
2021-08-19 10:28:10 +08:00
|
|
|
nm.lock.RLock()
|
2021-12-23 21:35:52 +08:00
|
|
|
for _, node := range nm.nodeClients {
|
|
|
|
clients = append(clients, node)
|
|
|
|
}
|
|
|
|
nm.lock.RUnlock()
|
2021-08-19 10:28:10 +08:00
|
|
|
|
|
|
|
ret := make([]indexNodeGetMetricsResponse, 0, len(nm.nodeClients))
|
2021-12-23 21:35:52 +08:00
|
|
|
for _, node := range clients {
|
2021-08-19 10:28:10 +08:00
|
|
|
resp, err := node.GetMetrics(ctx, req)
|
|
|
|
ret = append(ret, indexNodeGetMetricsResponse{
|
|
|
|
resp: resp,
|
|
|
|
err: err,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
return ret
|
|
|
|
}
|