Add datacoord Server comment and improve para (#6182)

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
This commit is contained in:
congqixia 2021-06-29 10:46:13 +08:00 committed by GitHub
parent d53b232adf
commit c49ce4ddc2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 45 additions and 11 deletions

View File

@ -133,10 +133,13 @@ func (c *cluster) refresh(dataNodes []*datapb.DataNodeInfo) error {
} }
// paraRun parallel run, with max Parallel limit // paraRun parallel run, with max Parallel limit
func parraRun(works []func(), maxRunner int) { func paraRun(works []func(), maxRunner int) {
wg := sync.WaitGroup{} wg := sync.WaitGroup{}
ch := make(chan func()) ch := make(chan func())
wg.Add(len(works)) wg.Add(len(works))
if maxRunner > len(works) {
maxRunner = len(works)
}
for i := 0; i < maxRunner; i++ { for i := 0; i < maxRunner; i++ {
go func() { go func() {
@ -210,6 +213,7 @@ func (c *cluster) watch(nodes []*datapb.DataNodeInfo) ([]*datapb.DataNodeInfo, e
mut.Lock() mut.Lock()
errs = append(errs, err) errs = append(errs, err)
mut.Unlock() mut.Unlock()
return
} }
if resp.ErrorCode != commonpb.ErrorCode_Success { if resp.ErrorCode != commonpb.ErrorCode_Success {
log.Warn("watch channels failed", zap.String("address", n.Address), zap.Error(err)) log.Warn("watch channels failed", zap.String("address", n.Address), zap.Error(err))
@ -225,8 +229,11 @@ func (c *cluster) watch(nodes []*datapb.DataNodeInfo) ([]*datapb.DataNodeInfo, e
} }
}) })
} }
parraRun(works, 3) paraRun(works, 20)
return nodes, retry.ErrorList(errs) if len(errs) > 0 {
return nodes, retry.ErrorList(errs)
}
return nodes, nil
} }
func (c *cluster) register(n *datapb.DataNodeInfo) { func (c *cluster) register(n *datapb.DataNodeInfo) {

View File

@ -19,7 +19,7 @@ import (
const serverNotServingErrMsg = "server is not serving" const serverNotServingErrMsg = "server is not serving"
func (s *Server) isClosed() bool { func (s *Server) isClosed() bool {
return atomic.LoadInt64(&s.isServing) != 2 return atomic.LoadInt64(&s.isServing) != ServerStateHealthy
} }
func (s *Server) GetTimeTickChannel(ctx context.Context) (*milvuspb.StringResponse, error) { func (s *Server) GetTimeTickChannel(ctx context.Context) (*milvuspb.StringResponse, error) {
@ -354,9 +354,9 @@ func (s *Server) GetComponentStates(ctx context.Context) (*internalpb.ComponentS
} }
state := atomic.LoadInt64(&s.isServing) state := atomic.LoadInt64(&s.isServing)
switch state { switch state {
case 1: case ServerStateInitializing:
resp.State.StateCode = internalpb.StateCode_Initializing resp.State.StateCode = internalpb.StateCode_Initializing
case 2: case ServerStateHealthy:
resp.State.StateCode = internalpb.StateCode_Healthy resp.State.StateCode = internalpb.StateCode_Healthy
default: default:
resp.State.StateCode = internalpb.StateCode_Abnormal resp.State.StateCode = internalpb.StateCode_Abnormal

View File

@ -48,15 +48,29 @@ type (
Timestamp = typeutil.Timestamp Timestamp = typeutil.Timestamp
) )
// ServerState type alias
type ServerState = int64
const (
// ServerStateStopped state stands for just created or stopped `Server` instance
ServerStateStopped ServerState = 0
// ServerStateInitializing state stands for initializing `Server` instance
ServerStateInitializing ServerState = 1
// ServerStateHealthy state stands for healthy `Server` instance
ServerStateHealthy ServerState = 2
)
type dataNodeCreatorFunc func(ctx context.Context, addr string) (types.DataNode, error) type dataNodeCreatorFunc func(ctx context.Context, addr string) (types.DataNode, error)
type rootCoordCreatorFunc func(ctx context.Context, metaRootPath string, etcdEndpoints []string) (types.RootCoord, error) type rootCoordCreatorFunc func(ctx context.Context, metaRootPath string, etcdEndpoints []string) (types.RootCoord, error)
// Server implements `types.Datacoord`
// handles Data Coordinator related jobs
type Server struct { type Server struct {
ctx context.Context ctx context.Context
serverLoopCtx context.Context serverLoopCtx context.Context
serverLoopCancel context.CancelFunc serverLoopCancel context.CancelFunc
serverLoopWg sync.WaitGroup serverLoopWg sync.WaitGroup
isServing int64 isServing ServerState
kvClient *etcdkv.EtcdKV kvClient *etcdkv.EtcdKV
meta *meta meta *meta
@ -79,6 +93,7 @@ type Server struct {
rootCoordClientCreator rootCoordCreatorFunc rootCoordClientCreator rootCoordCreatorFunc
} }
// CreateServer creates a `Server` instance
func CreateServer(ctx context.Context, factory msgstream.Factory) (*Server, error) { func CreateServer(ctx context.Context, factory msgstream.Factory) (*Server, error) {
rand.Seed(time.Now().UnixNano()) rand.Seed(time.Now().UnixNano())
s := &Server{ s := &Server{
@ -107,11 +122,19 @@ func (s *Server) Register() error {
return nil return nil
} }
// Init changes the server state to Initializing
func (s *Server) Init() error { func (s *Server) Init() error {
atomic.StoreInt64(&s.isServing, 1) atomic.StoreInt64(&s.isServing, ServerStateInitializing)
return nil return nil
} }
// Start initializes `Server` members and starts loops; the following steps are taken:
// 1. initialize message factory parameters
// 2. initialize root coord client, meta, datanode cluster, segment info channel,
//    allocator, segment manager
// 3. start service discovery and server loops, which include the message stream handler (segment statistics, datanode tt),
//    datanodes etcd watch, etcd alive check and flush completed status check
// 4. set server state to Healthy
func (s *Server) Start() error { func (s *Server) Start() error {
var err error var err error
m := map[string]interface{}{ m := map[string]interface{}{
@ -151,7 +174,7 @@ func (s *Server) Start() error {
s.startServerLoop() s.startServerLoop()
atomic.StoreInt64(&s.isServing, 2) atomic.StoreInt64(&s.isServing, ServerStateHealthy)
log.Debug("DataCoordinator startup success") log.Debug("DataCoordinator startup success")
return nil return nil
} }
@ -482,12 +505,16 @@ func (s *Server) initRootCoordClient() error {
return s.rootCoordClient.Start() return s.rootCoordClient.Start()
} }
// Stop performs the Server finalization process:
// it checks whether the server status is healthy; if not, it just quits.
// If the Server is healthy, it sets the server state to stopped, releases the etcd session,
// stops the message stream client and stops the server loops.
func (s *Server) Stop() error { func (s *Server) Stop() error {
if !atomic.CompareAndSwapInt64(&s.isServing, 2, 0) { if !atomic.CompareAndSwapInt64(&s.isServing, ServerStateHealthy, ServerStateStopped) {
return nil return nil
} }
log.Debug("DataCoord server shutdown") log.Debug("DataCoord server shutdown")
atomic.StoreInt64(&s.isServing, 0) atomic.StoreInt64(&s.isServing, ServerStateStopped)
s.cluster.releaseSessions() s.cluster.releaseSessions()
s.segmentInfoStream.Close() s.segmentInfoStream.Close()
s.flushMsgStream.Close() s.flushMsgStream.Close()