milvus/internal/util/proxyutil/proxy_watcher.go
yihao.dai 4204c0473e
fix: [2.5] Ignore 2.6 Proxy to avoid timetick lag (#43519)
During the rolling upgrade from 2.5 to 2.6, the 2.5 coordinator detects
newly started 2.6 proxies. However, 2.6 proxies do not sync timetick,
which leads to timetick delay. This PR ignores 2.6 proxies to prevent
ttDelay during the upgrade process.

issue: https://github.com/milvus-io/milvus/issues/43518

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
2025-07-23 22:40:54 +08:00

237 lines
6.8 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package proxyutil
import (
"context"
"fmt"
"path"
"sync"
"time"
"github.com/blang/semver/v4"
"go.etcd.io/etcd/api/v3/mvccpb"
v3rpc "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
clientv3 "go.etcd.io/etcd/client/v3"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
"github.com/milvus-io/milvus/internal/json"
"github.com/milvus-io/milvus/internal/util/sessionutil"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/util/lifetime"
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
type ProxyWatcherInterface interface {
AddSessionFunc(fns ...func(*sessionutil.Session))
DelSessionFunc(fns ...func(*sessionutil.Session))
WatchProxy(ctx context.Context) error
Stop()
}
// ProxyWatcher manages proxy clients
type ProxyWatcher struct {
wg errgroup.Group
lock sync.Mutex
etcdCli *clientv3.Client
initSessionsFunc []func([]*sessionutil.Session)
addSessionsFunc []func(*sessionutil.Session)
delSessionsFunc []func(*sessionutil.Session)
closeOnce sync.Once
closeCh lifetime.SafeChan
}
// NewProxyWatcher helper function to create a proxyWatcher
// fns are the custom getSessions function list
func NewProxyWatcher(client *clientv3.Client, fns ...func([]*sessionutil.Session)) *ProxyWatcher {
p := &ProxyWatcher{
lock: sync.Mutex{},
etcdCli: client,
closeCh: lifetime.NewSafeChan(),
}
p.initSessionsFunc = append(p.initSessionsFunc, fns...)
return p
}
// AddSessionFunc adds functions to addSessions function list
func (p *ProxyWatcher) AddSessionFunc(fns ...func(*sessionutil.Session)) {
p.lock.Lock()
defer p.lock.Unlock()
p.addSessionsFunc = append(p.addSessionsFunc, fns...)
}
// DelSessionFunc add functions to delSessions function list
func (p *ProxyWatcher) DelSessionFunc(fns ...func(*sessionutil.Session)) {
p.lock.Lock()
defer p.lock.Unlock()
p.delSessionsFunc = append(p.delSessionsFunc, fns...)
}
// WatchProxy starts a goroutine to watch proxy session changes on etcd
func (p *ProxyWatcher) WatchProxy(ctx context.Context) error {
childCtx, cancel := context.WithTimeout(ctx, paramtable.Get().ServiceParam.EtcdCfg.RequestTimeout.GetAsDuration(time.Millisecond))
defer cancel()
sessions, rev, err := p.getSessionsOnEtcd(childCtx)
if err != nil {
return err
}
log.Info("succeed to init sessions on etcd", zap.Any("sessions", sessions), zap.Int64("revision", rev))
// all init function should be clear meta firstly.
for _, f := range p.initSessionsFunc {
f(sessions)
}
eventCh := p.etcdCli.Watch(
ctx,
path.Join(paramtable.Get().EtcdCfg.MetaRootPath.GetValue(), sessionutil.DefaultServiceRoot, typeutil.ProxyRole),
clientv3.WithPrefix(),
clientv3.WithCreatedNotify(),
clientv3.WithPrevKV(),
clientv3.WithRev(rev+1),
)
p.wg.Go(func() error {
p.startWatchEtcd(ctx, eventCh)
return nil
})
return nil
}
func (p *ProxyWatcher) startWatchEtcd(ctx context.Context, eventCh clientv3.WatchChan) {
log.Info("start to watch etcd")
for {
select {
case <-ctx.Done():
log.Warn("stop watching etcd loop")
return
case <-p.closeCh.CloseCh():
log.Warn("stop watching etcd loop")
return
case event, ok := <-eventCh:
if !ok {
log.Warn("stop watching etcd loop due to closed etcd event channel")
panic("stop watching etcd loop due to closed etcd event channel")
}
if err := event.Err(); err != nil {
if err == v3rpc.ErrCompacted {
err2 := p.WatchProxy(ctx)
if err2 != nil {
log.Error("re watch proxy fails when etcd has a compaction error",
zap.Error(err), zap.Error(err2))
panic("failed to handle etcd request, exit..")
}
return
}
log.Error("Watch proxy service failed", zap.Error(err))
panic(err)
}
for _, e := range event.Events {
var err error
switch e.Type {
case mvccpb.PUT:
err = p.handlePutEvent(e)
case mvccpb.DELETE:
err = p.handleDeleteEvent(e)
}
if err != nil {
log.Warn("failed to handle proxy event", zap.Any("event", e), zap.Error(err))
}
}
}
}
}
func (p *ProxyWatcher) handlePutEvent(e *clientv3.Event) error {
session, err := p.parseSession(e.Kv.Value)
if err != nil {
return err
}
log.Ctx(context.TODO()).Debug("received proxy put event with session", zap.Any("session", session))
rangeChecker := semver.MustParseRange(">=2.6.0-dev")
if rangeChecker(session.Version) {
log.Info("new proxy with no timetick join, ignored",
zap.String("version", session.Version.String()),
zap.Int64("serverID", session.ServerID),
zap.String("address", session.Address))
return nil
}
for _, f := range p.addSessionsFunc {
f(session)
}
return nil
}
func (p *ProxyWatcher) handleDeleteEvent(e *clientv3.Event) error {
session, err := p.parseSession(e.PrevKv.Value)
if err != nil {
return err
}
log.Ctx(context.TODO()).Debug("received proxy delete event with session", zap.Any("session", session))
for _, f := range p.delSessionsFunc {
f(session)
}
return nil
}
func (p *ProxyWatcher) parseSession(value []byte) (*sessionutil.Session, error) {
session := new(sessionutil.Session)
err := json.Unmarshal(value, session)
if err != nil {
return nil, err
}
return session, nil
}
func (p *ProxyWatcher) getSessionsOnEtcd(ctx context.Context) ([]*sessionutil.Session, int64, error) {
resp, err := p.etcdCli.Get(
ctx,
path.Join(paramtable.Get().EtcdCfg.MetaRootPath.GetValue(), sessionutil.DefaultServiceRoot, typeutil.ProxyRole),
clientv3.WithPrefix(),
clientv3.WithSort(clientv3.SortByKey, clientv3.SortAscend),
)
if err != nil {
return nil, 0, fmt.Errorf("proxy manager failed to watch proxy with error %w", err)
}
var sessions []*sessionutil.Session
for _, v := range resp.Kvs {
session, err := p.parseSession(v.Value)
if err != nil {
log.Warn("failed to unmarshal session", zap.Error(err))
return nil, 0, err
}
sessions = append(sessions, session)
}
return sessions, resp.Header.Revision, nil
}
// Stop stops the ProxyManager
func (p *ProxyWatcher) Stop() {
p.closeOnce.Do(func() {
p.closeCh.Close()
p.wg.Wait()
})
}