chyezh 9871966415
enhance: segment alloc interceptor (#34996)
#33285

- add segment alloc interceptor for streamingnode.
- add manual alloc segment rpc for datacoord.

---------

Signed-off-by: chyezh <chyezh@outlook.com>
2024-08-04 07:40:15 +08:00

388 lines
12 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package streamingnode
import (
"context"
"fmt"
"net"
"os"
"strconv"
"sync"
"time"
"github.com/cockroachdb/errors"
grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware"
"github.com/tikv/client-go/v2/txnkv"
clientv3 "go.etcd.io/etcd/client/v3"
"go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc"
"go.uber.org/zap"
"google.golang.org/grpc"
"google.golang.org/grpc/keepalive"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
dcc "github.com/milvus-io/milvus/internal/distributed/datacoord/client"
rcc "github.com/milvus-io/milvus/internal/distributed/rootcoord/client"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
tikvkv "github.com/milvus-io/milvus/internal/kv/tikv"
streamingnodeserver "github.com/milvus-io/milvus/internal/streamingnode/server"
"github.com/milvus-io/milvus/internal/types"
"github.com/milvus-io/milvus/internal/util/componentutil"
kvfactory "github.com/milvus-io/milvus/internal/util/dependency/kv"
"github.com/milvus-io/milvus/internal/util/sessionutil"
streamingserviceinterceptor "github.com/milvus-io/milvus/internal/util/streamingutil/service/interceptor"
"github.com/milvus-io/milvus/pkg/kv"
"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/tracer"
"github.com/milvus-io/milvus/pkg/util"
"github.com/milvus-io/milvus/pkg/util/funcutil"
"github.com/milvus-io/milvus/pkg/util/interceptor"
"github.com/milvus-io/milvus/pkg/util/logutil"
"github.com/milvus-io/milvus/pkg/util/paramtable"
"github.com/milvus-io/milvus/pkg/util/retry"
"github.com/milvus-io/milvus/pkg/util/tikv"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)
// Server is the grpc server of streamingnode.
// It bundles the listener, the grpc server, the session and the coordinator
// clients that are handed to the wrapped streamingnode service.
type Server struct {
// stopOnce makes Stop run the shutdown sequence at most once.
stopOnce sync.Once
// grpcServerChan is closed when the goroutine running the grpc Serve loop exits.
grpcServerChan chan struct{}
// session of current server.
session *sessionutil.Session
// metaKV is the metadata store; it is backed by TiKV or etcd depending on the configured meta store type.
metaKV kv.MetaKv
// server
// streamingnode is the wrapped streamingnode service built during init.
streamingnode *streamingnodeserver.Server
// rpc
// grpcServer serves requests accepted on lis.
grpcServer *grpc.Server
// lis is the tcp listener opened by allocateAddress.
lis net.Listener
// component client
// etcdCli is obtained from kvfactory during init.
etcdCli *clientv3.Client
// tikvCli is only set when the meta store type is TiKV; it stays nil otherwise.
tikvCli *txnkv.Client
// rootCoord and dataCoord are the coordinator clients created during init.
rootCoord types.RootCoordClient
dataCoord types.DataCoordClient
}
// NewServer creates a new StreamingNode server.
// The returned server is not initialized yet; call Run to initialize and
// start it. The returned error is always nil.
func NewServer() (*Server, error) {
	srv := new(Server)
	// Closed by the grpc serving goroutine when it exits; stop waits on it.
	srv.grpcServerChan = make(chan struct{})
	return srv, nil
}
// Run initializes the server and then starts it.
// It returns the first error produced by either phase, or nil when both
// phases succeed.
func (s *Server) Run() error {
	err := s.init()
	if err != nil {
		return err
	}
	log.Info("streamingnode init done ...")

	err = s.start()
	if err != nil {
		return err
	}
	log.Info("streamingnode start done ...")
	return nil
}
// Stop stops the server. It should be called after Run has returned.
// The shutdown sequence runs at most once no matter how often Stop is
// called, and the returned error is always nil.
func (s *Server) Stop() (err error) {
	shutdown := func() {
		s.stop()
	}
	s.stopOnce.Do(shutdown)
	return nil
}
// stop runs the shutdown sequence of the server.
//
// The order is: mark the session as going-to-stop, stop the streamingnode
// service, gracefully stop the grpc server, stop the session, close the
// coordinator clients and the optional TiKV client, and finally wait for the
// grpc serving goroutine to exit. Failures of individual steps are logged as
// warnings and do not abort the remaining steps.
func (s *Server) stop() {
	// The address is only used for logging, so a lookup failure is ignored.
	addr, _ := s.getAddress()
	log.Info("streamingnode stop", zap.String("Address", addr))

	// Unregister current server from etcd.
	log.Info("streamingnode unregister session from etcd...")
	if err := s.session.GoingStop(); err != nil {
		log.Warn("streamingnode unregister session failed", zap.Error(err))
	}

	// Stop StreamingNode service.
	log.Info("streamingnode stop service...")
	s.streamingnode.Stop()

	// Stop grpc server.
	log.Info("streamingnode stop grpc server...")
	s.grpcServer.GracefulStop()

	// Stop all session
	log.Info("streamingnode stop session...")
	s.session.Stop()

	// Stop rootCoord client.
	log.Info("streamingnode stop rootCoord client...")
	if err := s.rootCoord.Close(); err != nil {
		log.Warn("streamingnode stop rootCoord client failed", zap.Error(err))
	}

	// Stop dataCoord client. It is created in initDataCoord next to the
	// rootCoord client and must be released the same way, otherwise its
	// connection is leaked on shutdown.
	if s.dataCoord != nil {
		log.Info("streamingnode stop dataCoord client...")
		if err := s.dataCoord.Close(); err != nil {
			log.Warn("streamingnode stop dataCoord client failed", zap.Error(err))
		}
	}

	// Stop tikv. The client only exists when TiKV is the configured meta store.
	if s.tikvCli != nil {
		if err := s.tikvCli.Close(); err != nil {
			log.Warn("streamingnode stop tikv client failed", zap.Error(err))
		}
	}

	// Wait for grpc server to stop: the channel is closed by the serving
	// goroutine once Serve has returned.
	log.Info("wait for grpc server stop...")
	<-s.grpcServerChan
	log.Info("streamingnode stop done")
}
// Health reports the health state of the wrapped streamingnode service.
func (s *Server) Health(ctx context.Context) commonpb.StateCode {
	state := s.streamingnode.Health(ctx)
	return state
}
// init prepares everything the server needs before it can be started:
// the etcd client, the metadata store, the listening address, the session,
// the coordinator clients, the grpc server and finally the streamingnode
// service itself. The outcome is logged once when init returns.
func (s *Server) init() (err error) {
	defer func() {
		if err == nil {
			log.Info("init StreamingNode server finished")
			return
		}
		log.Error("StreamingNode init failed", zap.Error(err))
	}()

	// Create etcd client. The second return value is not needed here.
	s.etcdCli, _ = kvfactory.GetEtcdAndPath()

	// The steps below run in order; the first failure aborts init.
	steps := []func() error{
		s.initMeta,
		s.allocateAddress,
		s.initSession,
		s.initRootCoord,
		s.initDataCoord,
	}
	for _, step := range steps {
		if err = step(); err != nil {
			return err
		}
	}
	s.initGRPCServer()

	// Create StreamingNode service.
	builder := streamingnodeserver.NewServerBuilder().
		WithETCD(s.etcdCli).
		WithGRPCServer(s.grpcServer).
		WithRootCoordClient(s.rootCoord).
		WithDataCoordClient(s.dataCoord).
		WithSession(s.session).
		WithMetaKV(s.metaKV)
	s.streamingnode = builder.Build()

	if initErr := s.streamingnode.Init(context.Background()); initErr != nil {
		return errors.Wrap(initErr, "StreamingNode service init failed")
	}
	return nil
}
// start brings the initialized server online: it starts the streamingnode
// service, starts serving grpc and then registers the session to etcd.
// The outcome is logged once when start returns.
func (s *Server) start() (err error) {
	defer func() {
		if err == nil {
			log.Info("start StreamingNode server finished")
			return
		}
		log.Error("StreamingNode start failed", zap.Error(err))
	}()

	// Start StreamingNode service.
	s.streamingnode.Start()

	// Start grpc server.
	if serveErr := s.startGPRCServer(); serveErr != nil {
		return errors.Wrap(serveErr, "StreamingNode start gRPC server fail")
	}

	// Register current server to etcd.
	s.registerSessionToETCD()
	return nil
}
// initSession creates the session of this server, initializes it with the
// streamingnode role and the server address, and publishes the session's
// server id as the process-wide node id.
// It must run after allocateAddress because it needs the listening address.
func (s *Server) initSession() error {
	sess := sessionutil.NewSession(context.Background())
	s.session = sess
	if sess == nil {
		return errors.New("session is nil, the etcd client connection may have failed")
	}

	address, err := s.getAddress()
	if err != nil {
		return err
	}

	sess.Init(typeutil.StreamingNodeRole, address, false, true)
	paramtable.SetNodeID(sess.ServerID)
	log.Info("StreamingNode init session",
		zap.Int64("nodeID", paramtable.GetNodeID()),
		zap.String("node address", address))
	return nil
}
// initMeta creates the metadata store (s.metaKV) according to the configured
// meta store type.
//
// For TiKV a TiKV client is created and kept in s.tikvCli; for etcd the
// already created s.etcdCli is reused. Any other meta store type is rejected
// with an error instead of silently leaving s.metaKV nil.
func (s *Server) initMeta() error {
	params := paramtable.Get()
	metaType := params.MetaStoreCfg.MetaStoreType.GetValue()
	log.Info("streamingnode connecting to metadata store", zap.String("metaType", metaType))

	switch metaType {
	case util.MetaStoreTypeTiKV:
		tikvCli, err := tikv.GetTiKVClient(&params.TiKVCfg)
		// Keep the original assignment semantics: the field is written even on failure.
		s.tikvCli = tikvCli
		if err != nil {
			log.Warn("Streamingnode init tikv client failed", zap.Error(err))
			return err
		}
		metaRootPath := params.TiKVCfg.MetaRootPath.GetValue()
		s.metaKV = tikvkv.NewTiKV(s.tikvCli, metaRootPath,
			tikvkv.WithRequestTimeout(params.ServiceParam.TiKVCfg.RequestTimeout.GetAsDuration(time.Millisecond)))
	case util.MetaStoreTypeEtcd:
		metaRootPath := params.EtcdCfg.MetaRootPath.GetValue()
		s.metaKV = etcdkv.NewEtcdKV(s.etcdCli, metaRootPath,
			etcdkv.WithRequestTimeout(params.ServiceParam.EtcdCfg.RequestTimeout.GetAsDuration(time.Millisecond)))
	default:
		// Without this branch the server would continue with a nil metaKV.
		return fmt.Errorf("not supported meta store: %s", metaType)
	}
	return nil
}
// initRootCoord creates the RootCoord client and blocks until RootCoord
// reports healthy, polling up to 1000000 times at a 200ms interval.
// Both failure modes are returned wrapped with a descriptive message.
func (s *Server) initRootCoord() (err error) {
	ctx := context.Background()

	log.Info("StreamingNode connect to rootCoord...")
	if s.rootCoord, err = rcc.NewClient(ctx); err != nil {
		return errors.Wrap(err, "StreamingNode try to new RootCoord client failed")
	}

	log.Info("StreamingNode try to wait for RootCoord ready")
	const pollInterval = 200 * time.Millisecond
	if err = componentutil.WaitForComponentHealthy(ctx, s.rootCoord, "RootCoord", 1000000, pollInterval); err != nil {
		return errors.Wrap(err, "StreamingNode wait for RootCoord ready failed")
	}
	return nil
}
// initDataCoord creates the DataCoord client and blocks until DataCoord
// reports healthy, polling up to 1000000 times at a 200ms interval.
// Both failure modes are returned wrapped with a descriptive message.
func (s *Server) initDataCoord() (err error) {
	ctx := context.Background()

	log.Info("StreamingNode connect to dataCoord...")
	if s.dataCoord, err = dcc.NewClient(ctx); err != nil {
		return errors.Wrap(err, "StreamingNode try to new DataCoord client failed")
	}

	log.Info("StreamingNode try to wait for DataCoord ready")
	const pollInterval = 200 * time.Millisecond
	if err = componentutil.WaitForComponentHealthy(ctx, s.dataCoord, "DataCoord", 1000000, pollInterval); err != nil {
		return errors.Wrap(err, "StreamingNode wait for DataCoord ready failed")
	}
	return nil
}
// initGRPCServer builds the grpc server of streamingnode and stores it in
// s.grpcServer. It configures keepalive, message size limits taken from the
// streamingnode grpc server config, and the unary/stream interceptor chains.
// The server is only created here; serving starts in startGPRCServer.
func (s *Server) initGRPCServer() {
	log.Info("create StreamingNode server...")
	cfg := &paramtable.Get().StreamingNodeGrpcServerCfg

	enforcement := keepalive.EnforcementPolicy{
		MinTime:             5 * time.Second, // If a client pings more than once every 5 seconds, terminate the connection
		PermitWithoutStream: true,            // Allow pings even when there are no active streams
	}
	serverParams := keepalive.ServerParameters{
		Time:    60 * time.Second, // Ping the client if it is idle for 60 seconds to ensure the connection is still active
		Timeout: 10 * time.Second, // Wait 10 second for the ping ack before assuming the connection is dead
	}

	// Resolved lazily on every call, so the interceptors always see the
	// server id currently held by the session.
	currentServerID := func() int64 {
		return s.session.ServerID
	}
	traceOpts := tracer.GetInterceptorOpts()

	serverOpts := []grpc.ServerOption{
		grpc.KeepaliveEnforcementPolicy(enforcement),
		grpc.KeepaliveParams(serverParams),
		grpc.MaxRecvMsgSize(cfg.ServerMaxRecvSize.GetAsInt()),
		grpc.MaxSendMsgSize(cfg.ServerMaxSendSize.GetAsInt()),
		grpc.UnaryInterceptor(grpc_middleware.ChainUnaryServer(
			otelgrpc.UnaryServerInterceptor(traceOpts...),
			logutil.UnaryTraceLoggerInterceptor,
			interceptor.ClusterValidationUnaryServerInterceptor(),
			interceptor.ServerIDValidationUnaryServerInterceptor(currentServerID),
			streamingserviceinterceptor.NewStreamingServiceUnaryServerInterceptor(),
		)),
		grpc.StreamInterceptor(grpc_middleware.ChainStreamServer(
			otelgrpc.StreamServerInterceptor(traceOpts...),
			logutil.StreamTraceLoggerInterceptor,
			interceptor.ClusterValidationStreamServerInterceptor(),
			interceptor.ServerIDValidationStreamServerInterceptor(currentServerID),
			streamingserviceinterceptor.NewStreamingServiceStreamServerInterceptor(),
		)),
	}
	s.grpcServer = grpc.NewServer(serverOpts...)
}
// allocateAddress allocates an available address for the streamingnode grpc
// server and stores the resulting listener in s.lis.
//
// The configured port is tried first. If listening on it fails, the next
// attempts use port 0 so that the OS picks a free port. At most 10 attempts
// are made. The value returned by retry.Do itself is discarded; the error of
// the last listen attempt is carried out through the named result.
func (s *Server) allocateAddress() (err error) {
	port := paramtable.Get().StreamingNodeGrpcServerCfg.Port.GetAsInt()

	tryListen := func() error {
		s.lis, err = net.Listen("tcp", ":"+strconv.Itoa(port))
		if err != nil && port != 0 {
			// set port=0 to get next available port by os
			log.Warn("StreamingNode suggested port is in used, try to get by os", zap.Error(err))
			port = 0
		}
		return err
	}
	retry.Do(context.Background(), tryListen, retry.Attempts(10))
	return err
}
// getAddress returns the address of the streamingnode grpc server in the
// form "ip:port", where ip comes from the streamingnode grpc server config
// and port is the one the listener is actually bound to.
// It must be called after allocateAddress; otherwise an error is returned.
func (s *Server) getAddress() (string, error) {
	listener := s.lis
	if listener == nil {
		return "", errors.New("StreamingNode grpc server is not initialized")
	}

	host := paramtable.Get().StreamingNodeGrpcServerCfg.IP
	// The listener is opened with network "tcp" in allocateAddress.
	tcpAddr := listener.Addr().(*net.TCPAddr)
	address := fmt.Sprintf("%s:%d", host, tcpAddr.Port)
	return address, nil
}
// startGPRCServer starts the grpc server.
// It launches Serve on s.lis in a background goroutine, calls
// funcutil.CheckGrpcReady with errCh and then returns the first value
// received from errCh.
// NOTE(review): the identifier is spelled "GPRC"; it is kept because start
// calls it by this name.
// NOTE(review): this relies on CheckGrpcReady eventually delivering a value
// into errCh (presumably nil when the server is considered ready) — confirm
// against funcutil. If that helper sends unconditionally, a Serve failure
// that already filled the one-slot buffer could make it block; verify.
func (s *Server) startGPRCServer() error {
// One-slot buffer so that the serving goroutine can report a failure without blocking.
errCh := make(chan error, 1)
go func() {
// stop waits on grpcServerChan, so close it once Serve has returned.
defer close(s.grpcServerChan)
if err := s.grpcServer.Serve(s.lis); err != nil {
select {
case errCh <- err:
// failure at initial startup: the buffer is still free, hand the error to the caller.
default:
// failure at runtime: the buffer is already occupied, so the error cannot be reported.
panic(errors.Wrapf(err, "grpc server stop with unexpected error"))
}
}
}()
funcutil.CheckGrpcReady(context.Background(), errCh)
return <-errCh
}
// registerSessionToETCD registers the session of the current server and
// starts its liveness check. When the liveness callback fires, the error is
// logged and the process exits with status 1.
func (s *Server) registerSessionToETCD() {
	s.session.Register()

	// start liveness check
	onSessionLost := func() {
		log.Error("StreamingNode disconnected from etcd, process will exit", zap.Int64("Server Id", paramtable.GetNodeID()))
		os.Exit(1)
	}
	s.session.LivenessCheck(context.Background(), onSessionLost)
}