milvus/internal/distributed/streamingnode/service.go
congqixia 9e96ed4873
fix: Fix tracing config update logic (#35928)
Related to #35927

This PR addresses several issues:
- Use the `ResetTraceConfig` method in the update event handler instead of initializing a new config
- Implement a dynamic stats.Handler to receive tracing config update events (see the sketch below)
- Update the `enable_trace` flag when `ResetTraceConfig` is invoked
- Change `enable_trace` to `std::atomic<bool>` to avoid data races

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
2024-09-05 14:27:04 +08:00
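
For context, here is a minimal, hypothetical sketch of the dynamic stats.Handler idea: a delegate handler held behind an atomic.Pointer that a tracing-config update can swap while the gRPC server keeps one stable stats.Handler. This is an illustration only, not the actual implementation behind tracer.GetDynamicOtelGrpcServerStatsHandler; all names in it are made up.

package tracer

import (
	"context"
	"sync/atomic"

	"google.golang.org/grpc/stats"
)

// dynamicStatsHandler delegates each stats.Handler call to an inner handler
// that can be swapped atomically when the tracing config changes.
// Illustrative sketch; names do not match the real milvus code.
type dynamicStatsHandler struct {
	inner atomic.Pointer[stats.Handler]
}

// setInnerHandler installs a new delegate; intended to be called from the
// tracing-config update event handler.
func (h *dynamicStatsHandler) setInnerHandler(handler stats.Handler) {
	h.inner.Store(&handler)
}

func (h *dynamicStatsHandler) delegate() stats.Handler {
	if p := h.inner.Load(); p != nil {
		return *p
	}
	return nil
}

// TagRPC implements stats.Handler.
func (h *dynamicStatsHandler) TagRPC(ctx context.Context, info *stats.RPCTagInfo) context.Context {
	if d := h.delegate(); d != nil {
		return d.TagRPC(ctx, info)
	}
	return ctx
}

// HandleRPC implements stats.Handler.
func (h *dynamicStatsHandler) HandleRPC(ctx context.Context, rs stats.RPCStats) {
	if d := h.delegate(); d != nil {
		d.HandleRPC(ctx, rs)
	}
}

// TagConn implements stats.Handler.
func (h *dynamicStatsHandler) TagConn(ctx context.Context, info *stats.ConnTagInfo) context.Context {
	if d := h.delegate(); d != nil {
		return d.TagConn(ctx, info)
	}
	return ctx
}

// HandleConn implements stats.Handler.
func (h *dynamicStatsHandler) HandleConn(ctx context.Context, cs stats.ConnStats) {
	if d := h.delegate(); d != nil {
		d.HandleConn(ctx, cs)
	}
}

In service.go below, a handler of this style is installed once via grpc.StatsHandler(tracer.GetDynamicOtelGrpcServerStatsHandler()), so later tracing config updates only need to swap the delegate rather than rebuild the gRPC server.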

412 lines
13 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package streamingnode

import (
	"context"
	"fmt"
	"net"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/cockroachdb/errors"
	grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
	"github.com/tikv/client-go/v2/txnkv"
	clientv3 "go.etcd.io/etcd/client/v3"
	"go.uber.org/zap"
	"google.golang.org/grpc"
	"google.golang.org/grpc/keepalive"

	"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
	dcc "github.com/milvus-io/milvus/internal/distributed/datacoord/client"
	rcc "github.com/milvus-io/milvus/internal/distributed/rootcoord/client"
	etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
	tikvkv "github.com/milvus-io/milvus/internal/kv/tikv"
	"github.com/milvus-io/milvus/internal/storage"
	streamingnodeserver "github.com/milvus-io/milvus/internal/streamingnode/server"
	"github.com/milvus-io/milvus/internal/types"
	"github.com/milvus-io/milvus/internal/util/componentutil"
	"github.com/milvus-io/milvus/internal/util/dependency"
	kvfactory "github.com/milvus-io/milvus/internal/util/dependency/kv"
	"github.com/milvus-io/milvus/internal/util/sessionutil"
	streamingserviceinterceptor "github.com/milvus-io/milvus/internal/util/streamingutil/service/interceptor"
	"github.com/milvus-io/milvus/pkg/kv"
	"github.com/milvus-io/milvus/pkg/log"
	"github.com/milvus-io/milvus/pkg/tracer"
	"github.com/milvus-io/milvus/pkg/util"
	"github.com/milvus-io/milvus/pkg/util/funcutil"
	"github.com/milvus-io/milvus/pkg/util/interceptor"
	"github.com/milvus-io/milvus/pkg/util/logutil"
	"github.com/milvus-io/milvus/pkg/util/paramtable"
	"github.com/milvus-io/milvus/pkg/util/retry"
	"github.com/milvus-io/milvus/pkg/util/tikv"
	"github.com/milvus-io/milvus/pkg/util/typeutil"
)

// Server is the grpc server of streamingnode.
type Server struct {
	stopOnce       sync.Once
	grpcServerChan chan struct{}

	// session of current server.
	session *sessionutil.Session
	metaKV  kv.MetaKv

	// server
	streamingnode *streamingnodeserver.Server

	// rpc
	grpcServer *grpc.Server
	lis        net.Listener

	factory dependency.Factory

	// component client
	etcdCli      *clientv3.Client
	tikvCli      *txnkv.Client
	rootCoord    types.RootCoordClient
	dataCoord    types.DataCoordClient
	chunkManager storage.ChunkManager
}

// NewServer creates a new StreamingNode server.
func NewServer(f dependency.Factory) (*Server, error) {
	return &Server{
		stopOnce:       sync.Once{},
		factory:        f,
		grpcServerChan: make(chan struct{}),
	}, nil
}

// Run runs the server.
func (s *Server) Run() error {
	// TODO: We should set a timeout for the process startup.
	// But currently, it is not implemented.
	ctx := context.Background()
	if err := s.init(ctx); err != nil {
		return err
	}
	log.Info("streamingnode init done ...")

	if err := s.start(ctx); err != nil {
		return err
	}
	log.Info("streamingnode start done ...")
	return nil
}

// Stop stops the server, should be called after Run returns.
func (s *Server) Stop() (err error) {
	s.stopOnce.Do(s.stop)
	return nil
}

// stop stops the server.
func (s *Server) stop() {
	addr, _ := s.getAddress()
	log.Info("streamingnode stop", zap.String("Address", addr))

	// Unregister current server from etcd.
	log.Info("streamingnode unregister session from etcd...")
	if err := s.session.GoingStop(); err != nil {
		log.Warn("streamingnode unregister session failed", zap.Error(err))
	}

	// Stop StreamingNode service.
	log.Info("streamingnode stop service...")
	s.streamingnode.Stop()

	// Stop grpc server.
	log.Info("streamingnode stop grpc server...")
	s.grpcServer.GracefulStop()

	// Stop the session.
	log.Info("streamingnode stop session...")
	s.session.Stop()

	// Stop rootCoord client.
	log.Info("streamingnode stop rootCoord client...")
	if err := s.rootCoord.Close(); err != nil {
		log.Warn("streamingnode stop rootCoord client failed", zap.Error(err))
	}

	// Stop tikv client.
	if s.tikvCli != nil {
		if err := s.tikvCli.Close(); err != nil {
			log.Warn("streamingnode stop tikv client failed", zap.Error(err))
		}
	}

	// Wait for grpc server to stop.
	log.Info("wait for grpc server stop...")
	<-s.grpcServerChan
	log.Info("streamingnode stop done")
}

// Health checks the health status of streamingnode.
func (s *Server) Health(ctx context.Context) commonpb.StateCode {
	return s.streamingnode.Health(ctx)
}

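// init initializes the server dependencies: etcd client, meta KV, chunk manager,
// listener, session, RootCoord/DataCoord clients, the gRPC server and the
// StreamingNode service itself.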
func (s *Server) init(ctx context.Context) (err error) {
	defer func() {
		if err != nil {
			log.Error("StreamingNode init failed", zap.Error(err))
			return
		}
		log.Info("init StreamingNode server finished")
	}()

	// Create etcd client.
	s.etcdCli, _ = kvfactory.GetEtcdAndPath()

	if err := s.initMeta(); err != nil {
		return err
	}
	if err := s.initChunkManager(ctx); err != nil {
		return err
	}
	if err := s.allocateAddress(); err != nil {
		return err
	}
	if err := s.initSession(ctx); err != nil {
		return err
	}
	if err := s.initRootCoord(ctx); err != nil {
		return err
	}
	if err := s.initDataCoord(ctx); err != nil {
		return err
	}
	s.initGRPCServer()

	// Create StreamingNode service.
	s.streamingnode = streamingnodeserver.NewServerBuilder().
		WithETCD(s.etcdCli).
		WithChunkManager(s.chunkManager).
		WithGRPCServer(s.grpcServer).
		WithRootCoordClient(s.rootCoord).
		WithDataCoordClient(s.dataCoord).
		WithSession(s.session).
		WithMetaKV(s.metaKV).
		Build()
	if err := s.streamingnode.Init(ctx); err != nil {
		return errors.Wrap(err, "StreamingNode service init failed")
	}
	return nil
}

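// start starts the StreamingNode service and the gRPC server, then registers
// the session to etcd.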
func (s *Server) start(ctx context.Context) (err error) {
	defer func() {
		if err != nil {
			log.Error("StreamingNode start failed", zap.Error(err))
			return
		}
		log.Info("start StreamingNode server finished")
	}()

	// Start StreamingNode service.
	s.streamingnode.Start()

	// Start grpc server.
	if err := s.startGRPCServer(ctx); err != nil {
		return errors.Wrap(err, "StreamingNode start gRPC server fail")
	}

	// Register current server to etcd.
	s.registerSessionToETCD()
	return nil
}

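// initSession creates the etcd session for the StreamingNode role and assigns
// the server's node ID.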
func (s *Server) initSession(ctx context.Context) error {
	s.session = sessionutil.NewSession(ctx)
	if s.session == nil {
		return errors.New("session is nil, the etcd client connection may have failed")
	}
	addr, err := s.getAddress()
	if err != nil {
		return err
	}
	s.session.Init(typeutil.StreamingNodeRole, addr, false, true)
	paramtable.SetNodeID(s.session.ServerID)
	log.Info("StreamingNode init session", zap.Int64("nodeID", paramtable.GetNodeID()), zap.String("node address", addr))
	return nil
}

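// initMeta initializes the meta KV on top of the configured metadata store
// (TiKV or etcd).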
func (s *Server) initMeta() error {
	params := paramtable.Get()
	metaType := params.MetaStoreCfg.MetaStoreType.GetValue()
	log.Info("StreamingNode connecting to metadata store", zap.String("metaType", metaType))
	metaRootPath := ""
	if metaType == util.MetaStoreTypeTiKV {
		var err error
		s.tikvCli, err = tikv.GetTiKVClient(&paramtable.Get().TiKVCfg)
		if err != nil {
			log.Warn("StreamingNode init tikv client failed", zap.Error(err))
			return err
		}
		metaRootPath = params.TiKVCfg.MetaRootPath.GetValue()
		s.metaKV = tikvkv.NewTiKV(s.tikvCli, metaRootPath,
			tikvkv.WithRequestTimeout(paramtable.Get().ServiceParam.TiKVCfg.RequestTimeout.GetAsDuration(time.Millisecond)))
	} else if metaType == util.MetaStoreTypeEtcd {
		metaRootPath = params.EtcdCfg.MetaRootPath.GetValue()
		s.metaKV = etcdkv.NewEtcdKV(s.etcdCli, metaRootPath,
			etcdkv.WithRequestTimeout(paramtable.Get().ServiceParam.EtcdCfg.RequestTimeout.GetAsDuration(time.Millisecond)))
	}
	return nil
}

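// initRootCoord creates the RootCoord client and blocks until RootCoord
// reports healthy.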
func (s *Server) initRootCoord(ctx context.Context) (err error) {
	log.Info("StreamingNode connect to rootCoord...")
	s.rootCoord, err = rcc.NewClient(ctx)
	if err != nil {
		return errors.Wrap(err, "StreamingNode try to new RootCoord client failed")
	}

	log.Info("StreamingNode try to wait for RootCoord ready")
	err = componentutil.WaitForComponentHealthy(ctx, s.rootCoord, "RootCoord", 1000000, time.Millisecond*200)
	if err != nil {
		return errors.Wrap(err, "StreamingNode wait for RootCoord ready failed")
	}
	return nil
}

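// initDataCoord creates the DataCoord client and blocks until DataCoord
// reports healthy.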
func (s *Server) initDataCoord(ctx context.Context) (err error) {
	log.Info("StreamingNode connect to dataCoord...")
	s.dataCoord, err = dcc.NewClient(ctx)
	if err != nil {
		return errors.Wrap(err, "StreamingNode try to new DataCoord client failed")
	}

	log.Info("StreamingNode try to wait for DataCoord ready")
	err = componentutil.WaitForComponentHealthy(ctx, s.dataCoord, "DataCoord", 1000000, time.Millisecond*200)
	if err != nil {
		return errors.Wrap(err, "StreamingNode wait for DataCoord ready failed")
	}
	return nil
}

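// initChunkManager initializes the persistent storage chunk manager via the
// dependency factory.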
func (s *Server) initChunkManager(ctx context.Context) (err error) {
	log.Info("StreamingNode init chunk manager...")
	s.factory.Init(paramtable.Get())
	manager, err := s.factory.NewPersistentStorageChunkManager(ctx)
	if err != nil {
		return errors.Wrap(err, "StreamingNode try to new chunk manager failed")
	}
	s.chunkManager = manager
	return nil
}

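// initGRPCServer creates the gRPC server with keepalive policies, validation
// and tracing interceptor chains, and the dynamic OTel stats handler.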
func (s *Server) initGRPCServer() {
	log.Info("create StreamingNode server...")
	cfg := &paramtable.Get().StreamingNodeGrpcServerCfg
	kaep := keepalive.EnforcementPolicy{
		MinTime:             5 * time.Second, // If a client pings more than once every 5 seconds, terminate the connection
		PermitWithoutStream: true,            // Allow pings even when there are no active streams
	}
	kasp := keepalive.ServerParameters{
		Time:    60 * time.Second, // Ping the client if it is idle for 60 seconds to ensure the connection is still active
		Timeout: 10 * time.Second, // Wait 10 seconds for the ping ack before assuming the connection is dead
	}

	serverIDGetter := func() int64 {
		return s.session.ServerID
	}
	s.grpcServer = grpc.NewServer(
		grpc.KeepaliveEnforcementPolicy(kaep),
		grpc.KeepaliveParams(kasp),
		grpc.MaxRecvMsgSize(cfg.ServerMaxRecvSize.GetAsInt()),
		grpc.MaxSendMsgSize(cfg.ServerMaxSendSize.GetAsInt()),
		grpc.UnaryInterceptor(grpc_middleware.ChainUnaryServer(
			logutil.UnaryTraceLoggerInterceptor,
			interceptor.ClusterValidationUnaryServerInterceptor(),
			interceptor.ServerIDValidationUnaryServerInterceptor(serverIDGetter),
			streamingserviceinterceptor.NewStreamingServiceUnaryServerInterceptor(),
		)),
		grpc.StreamInterceptor(grpc_middleware.ChainStreamServer(
			logutil.StreamTraceLoggerInterceptor,
			interceptor.ClusterValidationStreamServerInterceptor(),
			interceptor.ServerIDValidationStreamServerInterceptor(serverIDGetter),
			streamingserviceinterceptor.NewStreamingServiceStreamServerInterceptor(),
		)),
		grpc.StatsHandler(tracer.GetDynamicOtelGrpcServerStatsHandler()),
	)
}

// allocateAddress allocates an available address for the streamingnode grpc server.
func (s *Server) allocateAddress() (err error) {
	port := paramtable.Get().StreamingNodeGrpcServerCfg.Port.GetAsInt()

	retry.Do(context.Background(), func() error {
		addr := ":" + strconv.Itoa(port)
		s.lis, err = net.Listen("tcp", addr)
		if err != nil {
			if port != 0 {
				// Set port=0 to let the OS pick the next available port.
				log.Warn("StreamingNode suggested port is in use, try to get one from the os", zap.Error(err))
				port = 0
			}
		}
		return err
	}, retry.Attempts(10))
	return err
}

// getAddress returns the address of the streamingnode grpc server.
// Must be called after allocateAddress.
func (s *Server) getAddress() (string, error) {
	if s.lis == nil {
		return "", errors.New("StreamingNode grpc server is not initialized")
	}
	ip := paramtable.Get().StreamingNodeGrpcServerCfg.IP
	return fmt.Sprintf("%s:%d", ip, s.lis.Addr().(*net.TCPAddr).Port), nil
}

// startGRPCServer starts the grpc server.
func (s *Server) startGRPCServer(ctx context.Context) error {
	errCh := make(chan error, 1)
	go func() {
		defer close(s.grpcServerChan)

		if err := s.grpcServer.Serve(s.lis); err != nil {
			select {
			case errCh <- err:
				// failure at initial startup.
			default:
				// failure at runtime.
				panic(errors.Wrapf(err, "grpc server stop with unexpected error"))
			}
		}
	}()
	funcutil.CheckGrpcReady(ctx, errCh)
	return <-errCh
}

// registerSessionToETCD registers the current server to etcd.
func (s *Server) registerSessionToETCD() {
	s.session.Register()
	// Start the liveness check.
	s.session.LivenessCheck(context.Background(), func() {
		log.Error("StreamingNode disconnected from etcd, process will exit", zap.Int64("Server Id", paramtable.GetNodeID()))
		os.Exit(1)
	})
}