9e96ed4873

Related to #35927. This PR addresses several issues:
- Use the `ResetTraceConfig` method instead of initializing a new config in the update event handler
- Implement a dynamic stats.Handler to receive tracing config update events
- Update the `enable_trace` flag when `ResetTraceConfig` is invoked
- Change `enable_trace` to `std::atomic<bool>` to avoid a data race

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
412 lines · 13 KiB · Go
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package streamingnode

import (
	"context"
	"fmt"
	"net"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/cockroachdb/errors"
	grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
	"github.com/tikv/client-go/v2/txnkv"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.uber.org/zap"
	"google.golang.org/grpc"
	"google.golang.org/grpc/keepalive"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	dcc "github.com/milvus-io/milvus/internal/distributed/datacoord/client"
	rcc "github.com/milvus-io/milvus/internal/distributed/rootcoord/client"
	etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
	tikvkv "github.com/milvus-io/milvus/internal/kv/tikv"
	"github.com/milvus-io/milvus/internal/storage"
	streamingnodeserver "github.com/milvus-io/milvus/internal/streamingnode/server"
	"github.com/milvus-io/milvus/internal/types"
	"github.com/milvus-io/milvus/internal/util/componentutil"
	"github.com/milvus-io/milvus/internal/util/dependency"
	kvfactory "github.com/milvus-io/milvus/internal/util/dependency/kv"
	"github.com/milvus-io/milvus/internal/util/sessionutil"
	streamingserviceinterceptor "github.com/milvus-io/milvus/internal/util/streamingutil/service/interceptor"
	"github.com/milvus-io/milvus/pkg/kv"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/tracer"
	"github.com/milvus-io/milvus/pkg/util"
	"github.com/milvus-io/milvus/pkg/util/funcutil"
	"github.com/milvus-io/milvus/pkg/util/interceptor"
	"github.com/milvus-io/milvus/pkg/util/logutil"
	"github.com/milvus-io/milvus/pkg/util/paramtable"
	"github.com/milvus-io/milvus/pkg/util/retry"
	"github.com/milvus-io/milvus/pkg/util/tikv"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)

// Server is the grpc server of streamingnode.
type Server struct {
	stopOnce       sync.Once
	grpcServerChan chan struct{}

	// session of current server.
	session *sessionutil.Session
	metaKV  kv.MetaKv

	// server
	streamingnode *streamingnodeserver.Server

	// rpc
	grpcServer *grpc.Server
	lis        net.Listener

	factory dependency.Factory

	// component client
	etcdCli      *clientv3.Client
	tikvCli      *txnkv.Client
	rootCoord    types.RootCoordClient
	dataCoord    types.DataCoordClient
	chunkManager storage.ChunkManager
}

// NewServer creates a new StreamingNode server.
func NewServer(f dependency.Factory) (*Server, error) {
	return &Server{
		stopOnce:       sync.Once{},
		factory:        f,
		grpcServerChan: make(chan struct{}),
	}, nil
}

// Run runs the server.
func (s *Server) Run() error {
	// TODO: We should set a timeout for the process startup.
	// But it is not implemented yet.
	ctx := context.Background()

	if err := s.init(ctx); err != nil {
		return err
	}
	log.Info("streamingnode init done ...")

	if err := s.start(ctx); err != nil {
		return err
	}
	log.Info("streamingnode start done ...")
	return nil
}

// Stop stops the server; should be called after Run returns.
func (s *Server) Stop() (err error) {
	s.stopOnce.Do(s.stop)
	return nil
}

// stop stops the server.
func (s *Server) stop() {
	addr, _ := s.getAddress()
	log.Info("streamingnode stop", zap.String("Address", addr))

	// Unregister current server from etcd.
	log.Info("streamingnode unregister session from etcd...")
	if err := s.session.GoingStop(); err != nil {
		log.Warn("streamingnode unregister session failed", zap.Error(err))
	}

	// Stop StreamingNode service.
	log.Info("streamingnode stop service...")
	s.streamingnode.Stop()

	// Stop grpc server.
	log.Info("streamingnode stop grpc server...")
	s.grpcServer.GracefulStop()

	// Stop the session.
	log.Info("streamingnode stop session...")
	s.session.Stop()

	// Stop rootCoord client.
	log.Info("streamingnode stop rootCoord client...")
	if err := s.rootCoord.Close(); err != nil {
		log.Warn("streamingnode stop rootCoord client failed", zap.Error(err))
	}

	// Stop tikv client.
	if s.tikvCli != nil {
		if err := s.tikvCli.Close(); err != nil {
			log.Warn("streamingnode stop tikv client failed", zap.Error(err))
		}
	}

	// Wait for grpc server to stop.
	log.Info("wait for grpc server stop...")
	<-s.grpcServerChan
	log.Info("streamingnode stop done")
}

// Health checks the health status of streamingnode.
func (s *Server) Health(ctx context.Context) commonpb.StateCode {
	return s.streamingnode.Health(ctx)
}

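// init initializes the streamingnode server: it creates the etcd client, meta
// KV, chunk manager, listener, session and coordinator clients, sets up the
// gRPC server, and builds the inner StreamingNode service.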
func (s *Server) init(ctx context.Context) (err error) {
	defer func() {
		if err != nil {
			log.Error("StreamingNode init failed", zap.Error(err))
			return
		}
		log.Info("init StreamingNode server finished")
	}()

	// Create etcd client.
	s.etcdCli, _ = kvfactory.GetEtcdAndPath()

	if err := s.initMeta(); err != nil {
		return err
	}
	if err := s.initChunkManager(ctx); err != nil {
		return err
	}
	if err := s.allocateAddress(); err != nil {
		return err
	}
	if err := s.initSession(ctx); err != nil {
		return err
	}
	if err := s.initRootCoord(ctx); err != nil {
		return err
	}
	if err := s.initDataCoord(ctx); err != nil {
		return err
	}
	s.initGRPCServer()

	// Create StreamingNode service.
	s.streamingnode = streamingnodeserver.NewServerBuilder().
		WithETCD(s.etcdCli).
		WithChunkManager(s.chunkManager).
		WithGRPCServer(s.grpcServer).
		WithRootCoordClient(s.rootCoord).
		WithDataCoordClient(s.dataCoord).
		WithSession(s.session).
		WithMetaKV(s.metaKV).
		Build()
	if err := s.streamingnode.Init(ctx); err != nil {
		return errors.Wrap(err, "StreamingNode service init failed")
	}
	return nil
}

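// start starts the inner StreamingNode service and the gRPC server, then
// registers the session to etcd.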
func (s *Server) start(ctx context.Context) (err error) {
	defer func() {
		if err != nil {
			log.Error("StreamingNode start failed", zap.Error(err))
			return
		}
		log.Info("start StreamingNode server finished")
	}()

	// Start StreamingNode service.
	s.streamingnode.Start()

	// Start grpc server.
	if err := s.startGRPCServer(ctx); err != nil {
		return errors.Wrap(err, "StreamingNode start gRPC server failed")
	}

	// Register current server to etcd.
	s.registerSessionToETCD()
	return nil
}

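// initSession creates and initializes the server session; must be called
// after allocateAddress.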
func (s *Server) initSession(ctx context.Context) error {
	s.session = sessionutil.NewSession(ctx)
	if s.session == nil {
		return errors.New("session is nil, the etcd client connection may have failed")
	}
	addr, err := s.getAddress()
	if err != nil {
		return err
	}
	s.session.Init(typeutil.StreamingNodeRole, addr, false, true)
	paramtable.SetNodeID(s.session.ServerID)
	log.Info("StreamingNode init session", zap.Int64("nodeID", paramtable.GetNodeID()), zap.String("node address", addr))
	return nil
}

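// initMeta connects to the configured metadata store (tikv or etcd) and
// creates the meta KV client.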
func (s *Server) initMeta() error {
	params := paramtable.Get()
	metaType := params.MetaStoreCfg.MetaStoreType.GetValue()
	log.Info("StreamingNode connecting to metadata store", zap.String("metaType", metaType))
	metaRootPath := ""
	if metaType == util.MetaStoreTypeTiKV {
		var err error
		s.tikvCli, err = tikv.GetTiKVClient(&paramtable.Get().TiKVCfg)
		if err != nil {
			log.Warn("StreamingNode init tikv client failed", zap.Error(err))
			return err
		}
		metaRootPath = params.TiKVCfg.MetaRootPath.GetValue()
		s.metaKV = tikvkv.NewTiKV(s.tikvCli, metaRootPath,
			tikvkv.WithRequestTimeout(paramtable.Get().ServiceParam.TiKVCfg.RequestTimeout.GetAsDuration(time.Millisecond)))
	} else if metaType == util.MetaStoreTypeEtcd {
		metaRootPath = params.EtcdCfg.MetaRootPath.GetValue()
		s.metaKV = etcdkv.NewEtcdKV(s.etcdCli, metaRootPath,
			etcdkv.WithRequestTimeout(paramtable.Get().ServiceParam.EtcdCfg.RequestTimeout.GetAsDuration(time.Millisecond)))
	}
	return nil
}

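// initRootCoord creates the RootCoord client and waits until RootCoord is healthy.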
func (s *Server) initRootCoord(ctx context.Context) (err error) {
	log.Info("StreamingNode connect to rootCoord...")
	s.rootCoord, err = rcc.NewClient(ctx)
	if err != nil {
		return errors.Wrap(err, "StreamingNode create RootCoord client failed")
	}

	log.Info("StreamingNode wait for RootCoord ready")
	err = componentutil.WaitForComponentHealthy(ctx, s.rootCoord, "RootCoord", 1000000, time.Millisecond*200)
	if err != nil {
		return errors.Wrap(err, "StreamingNode wait for RootCoord ready failed")
	}
	return nil
}

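// initDataCoord creates the DataCoord client and waits until DataCoord is healthy.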
func (s *Server) initDataCoord(ctx context.Context) (err error) {
	log.Info("StreamingNode connect to dataCoord...")
	s.dataCoord, err = dcc.NewClient(ctx)
	if err != nil {
		return errors.Wrap(err, "StreamingNode create DataCoord client failed")
	}

	log.Info("StreamingNode wait for DataCoord ready")
	err = componentutil.WaitForComponentHealthy(ctx, s.dataCoord, "DataCoord", 1000000, time.Millisecond*200)
	if err != nil {
		return errors.Wrap(err, "StreamingNode wait for DataCoord ready failed")
	}
	return nil
}

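// initChunkManager initializes the dependency factory and creates the
// persistent storage chunk manager.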
func (s *Server) initChunkManager(ctx context.Context) (err error) {
	log.Info("StreamingNode init chunk manager...")
	s.factory.Init(paramtable.Get())
	manager, err := s.factory.NewPersistentStorageChunkManager(ctx)
	if err != nil {
		return errors.Wrap(err, "StreamingNode create chunk manager failed")
	}
	s.chunkManager = manager
	return nil
}

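// initGRPCServer builds the gRPC server with keepalive settings, message size
// limits, validation/tracing interceptors, and the dynamic OTel stats handler.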
func (s *Server) initGRPCServer() {
	log.Info("create StreamingNode server...")
	cfg := &paramtable.Get().StreamingNodeGrpcServerCfg
	kaep := keepalive.EnforcementPolicy{
		MinTime:             5 * time.Second, // If a client pings more than once every 5 seconds, terminate the connection
		PermitWithoutStream: true,            // Allow pings even when there are no active streams
	}
	kasp := keepalive.ServerParameters{
		Time:    60 * time.Second, // Ping the client if it is idle for 60 seconds to ensure the connection is still active
		Timeout: 10 * time.Second, // Wait 10 seconds for the ping ack before assuming the connection is dead
	}

	serverIDGetter := func() int64 {
		return s.session.ServerID
	}
	s.grpcServer = grpc.NewServer(
		grpc.KeepaliveEnforcementPolicy(kaep),
		grpc.KeepaliveParams(kasp),
		grpc.MaxRecvMsgSize(cfg.ServerMaxRecvSize.GetAsInt()),
		grpc.MaxSendMsgSize(cfg.ServerMaxSendSize.GetAsInt()),
		grpc.UnaryInterceptor(grpc_middleware.ChainUnaryServer(
			logutil.UnaryTraceLoggerInterceptor,
			interceptor.ClusterValidationUnaryServerInterceptor(),
			interceptor.ServerIDValidationUnaryServerInterceptor(serverIDGetter),
			streamingserviceinterceptor.NewStreamingServiceUnaryServerInterceptor(),
		)),
		grpc.StreamInterceptor(grpc_middleware.ChainStreamServer(
			logutil.StreamTraceLoggerInterceptor,
			interceptor.ClusterValidationStreamServerInterceptor(),
			interceptor.ServerIDValidationStreamServerInterceptor(serverIDGetter),
			streamingserviceinterceptor.NewStreamingServiceStreamServerInterceptor(),
		)),
		grpc.StatsHandler(tracer.GetDynamicOtelGrpcServerStatsHandler()),
	)
}

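// The grpc.StatsHandler wired in above is the dynamic handler introduced by
// this commit: tracer.GetDynamicOtelGrpcServerStatsHandler returns a wrapper
// whose underlying OTel handler is rebuilt when ResetTraceConfig is invoked,
// so tracing can be reconfigured at runtime without restarting the gRPC
// server. A minimal sketch of the pattern (hypothetical names, not the actual
// tracer implementation) delegates every stats.Handler callback through an
// atomically swappable inner handler:
//
//	type dynamicStatsHandler struct {
//		inner atomic.Pointer[stats.Handler] // "google.golang.org/grpc/stats"
//	}
//
//	func (h *dynamicStatsHandler) swap(next stats.Handler) {
//		h.inner.Store(&next)
//	}
//
//	func (h *dynamicStatsHandler) HandleRPC(ctx context.Context, s stats.RPCStats) {
//		(*h.inner.Load()).HandleRPC(ctx, s)
//	}
//
// TagRPC, TagConn, and HandleConn delegate the same way; a tracing config
// update event handler can then build a fresh OTel server handler and call
// swap without touching the running grpc.Server.
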
// allocateAddress allocates an available address for the streamingnode grpc server.
func (s *Server) allocateAddress() (err error) {
	port := paramtable.Get().StreamingNodeGrpcServerCfg.Port.GetAsInt()

	retry.Do(context.Background(), func() error {
		addr := ":" + strconv.Itoa(port)
		s.lis, err = net.Listen("tcp", addr)
		if err != nil {
			if port != 0 {
				// Set port to 0 so the OS picks the next available port.
				log.Warn("StreamingNode configured port is in use, falling back to an OS-assigned port", zap.Error(err))
				port = 0
			}
		}
		return err
	}, retry.Attempts(10))
	return err
}

// getAddress returns the address of streamingnode grpc server.
// must be called after allocateAddress.
func (s *Server) getAddress() (string, error) {
	if s.lis == nil {
		return "", errors.New("StreamingNode grpc server is not initialized")
	}
	ip := paramtable.Get().StreamingNodeGrpcServerCfg.IP
	return fmt.Sprintf("%s:%d", ip, s.lis.Addr().(*net.TCPAddr).Port), nil
}

// startGRPCServer starts the grpc server.
func (s *Server) startGRPCServer(ctx context.Context) error {
	errCh := make(chan error, 1)
	go func() {
		defer close(s.grpcServerChan)

		if err := s.grpcServer.Serve(s.lis); err != nil {
			select {
			case errCh <- err:
				// failure at initial startup.
			default:
				// failure at runtime.
				panic(errors.Wrapf(err, "grpc server stopped with unexpected error"))
			}
		}
	}()
	funcutil.CheckGrpcReady(ctx, errCh)
	return <-errCh
}

// registerSessionToETCD registers current server to etcd.
func (s *Server) registerSessionToETCD() {
	s.session.Register()
	// start liveness check
	s.session.LivenessCheck(context.Background(), func() {
		log.Error("StreamingNode disconnected from etcd, process will exit", zap.Int64("Server Id", paramtable.GetNodeID()))
		os.Exit(1)
	})
}