mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
### **User description** AddProxyClients now removes clients not in the new snapshot before adding new ones. This ensures proper cleanup when ProxyWatcher re-watche etcd. issue: https://github.com/milvus-io/milvus/issues/46397 pr: https://github.com/milvus-io/milvus/pull/46398 ___ ### **PR Type** Bug fix ___ ### **Description** - Rename `AddProxyClients` to `SetProxyClients` for clearer semantics - Implement stale client cleanup before adding new proxy clients - Remove proxy clients not present in new etcd snapshot - Update all callers in querycoord and rootcoord servers - Regenerate mock files with mockery v2.53.3 ___ ### Diagram Walkthrough ```mermaid flowchart LR A["ProxyWatcher detects<br/>etcd change"] -->|calls| B["SetProxyClients<br/>with new snapshot"] B -->|removes| C["Stale clients<br/>not in snapshot"] C -->|closes| D["Cleanup resources"] B -->|adds| E["New proxy clients<br/>from snapshot"] ``` <details><summary><h3>File Walkthrough</h3></summary> <table><thead><tr><th></th><th align="left">Relevant files</th></tr></thead><tbody><tr><td><strong>Bug fix</strong></td><td><details><summary>3 files</summary><table> <tr> <td><strong>proxy_client_manager.go</strong><dd><code>Rename AddProxyClients to SetProxyClients with cleanup</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-1a13e14654661bffe70ce626777d527871fcae62361a5fc18b7dca93e66afe1e">+22/-2</a> </td> </tr> <tr> <td><strong>server.go</strong><dd><code>Update ProxyWatcher to use SetProxyClients</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-87d7712e6df027656ba9d433fb77b702c185486130879be54204da2b8f092230">+1/-1</a> </td> </tr> <tr> <td><strong>root_coord.go</strong><dd><code>Update ProxyWatcher initialization to SetProxyClients</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-8fab0705c4ddb5f98e5955d3c3013fa795c87237a8525e189c3296d98dcce47f">+2/-2</a> </td> </tr> 
</table></details></td></tr><tr><td><strong>Tests</strong></td><td><details><summary>1 files</summary><table> <tr> <td><strong>proxy_client_manager_test.go</strong><dd><code>Update test for SetProxyClients stale removal</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-fb82a84e67ec187c1bfbdc4335c18b504b2f4392758d859e54115684ea8a526d">+26/-10</a> </td> </tr> </table></details></td></tr><tr><td><strong>Miscellaneous</strong></td><td><details><summary>7 files</summary><table> <tr> <td><strong>mock_proxy_client_manager.go</strong><dd><code>Regenerate mock with SetProxyClients method</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-8cc3cfe21d2694f58ebe7f2d44e12c467d9e83ac9edb37bcb6c7262e7b2ca09d">+78/-38</a> </td> </tr> <tr> <td><strong>mock_proxy_watcher.go</strong><dd><code>Regenerate mock with mockery v2.53.3</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-0c22e89cbee68c397abee52e797166fbac754b439eb1304baf53dd207e8f11d2">+9/-5</a> </td> </tr> <tr> <td><strong>mock_global_id_allocator.go</strong><dd><code>Regenerate mock with mockery v2.53.3</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-1822c38f0a6c4225134bee554bf518d0b9184040ea5bb3d9e926515ae80655c8">+15/-3</a> </td> </tr> <tr> <td><strong>mock_grpc_client.go</strong><dd><code>Regenerate mock with mockery v2.53.3</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-6137f17eb7eef693d72c6181f519f7b1a87669a00f5268cbd7c3399567451e64">+33/-13</a> </td> </tr> <tr> <td><strong>allocator.go</strong><dd><code>Regenerate mock with mockery v2.53.3</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-e2482bb9748ad163ba57c65251355035f45c140f12214cd96cc5da88376fcc39">+26/-6</a> </td> </tr> <tr> <td><strong>mock_factory.go</strong><dd><code>Regenerate mock with mockery v2.53.3</code> </dd></td> <td><a 
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-bc91a5fcdc0bd52eaaaf805d15985fe578debbb2576eb0c97d08e0573d0132e6">+18/-2</a> </td> </tr> <tr> <td><strong>mock_session.go</strong><dd><code>Regenerate mock with mockery v2.53.3</code> </dd></td> <td><a href="https://github.com/milvus-io/milvus/pull/46491/files#diff-850f08ed9571fd6f3f84bd846fa29dab2d52be7c8f9d9b358fc667c6970a90e6">+79/-19</a> </td> </tr> </table></details></td></tr></tbody></table> </details> ___ Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
420 lines
13 KiB
Go
420 lines
13 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package proxyutil
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
|
|
"github.com/cockroachdb/errors"
|
|
"github.com/samber/lo"
|
|
"go.uber.org/zap"
|
|
"golang.org/x/sync/errgroup"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
|
|
grpcproxyclient "github.com/milvus-io/milvus/internal/distributed/proxy/client"
|
|
"github.com/milvus-io/milvus/internal/types"
|
|
"github.com/milvus-io/milvus/internal/util/sessionutil"
|
|
"github.com/milvus-io/milvus/pkg/v2/log"
|
|
"github.com/milvus-io/milvus/pkg/v2/metrics"
|
|
"github.com/milvus-io/milvus/pkg/v2/proto/proxypb"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/commonpbutil"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/merr"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/metricsinfo"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
|
|
)
|
|
|
|
type ExpireCacheConfig struct {
|
|
msgType commonpb.MsgType
|
|
}
|
|
|
|
func (c ExpireCacheConfig) Apply(req *proxypb.InvalidateCollMetaCacheRequest) {
|
|
if req.GetBase() == nil {
|
|
req.Base = commonpbutil.NewMsgBase()
|
|
}
|
|
req.Base.MsgType = c.msgType
|
|
}
|
|
|
|
func DefaultExpireCacheConfig() ExpireCacheConfig {
|
|
return ExpireCacheConfig{}
|
|
}
|
|
|
|
// ExpireCacheOpt is a functional option that mutates an ExpireCacheConfig
// in place.
type ExpireCacheOpt func(c *ExpireCacheConfig)
|
|
|
|
func SetMsgType(msgType commonpb.MsgType) ExpireCacheOpt {
|
|
return func(c *ExpireCacheConfig) {
|
|
c.msgType = msgType
|
|
}
|
|
}
|
|
|
|
// ProxyCreator builds a proxy client for the proxy identified by nodeID and
// reachable at addr.
type ProxyCreator func(ctx context.Context, addr string, nodeID int64) (types.ProxyClient, error)
|
|
|
|
func DefaultProxyCreator(ctx context.Context, addr string, nodeID int64) (types.ProxyClient, error) {
|
|
cli, err := grpcproxyclient.NewClient(ctx, addr, nodeID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return cli, nil
|
|
}
|
|
|
|
// ProxyClientManagerHelper bundles the hooks a ProxyClientManager invokes
// while managing clients.
type ProxyClientManagerHelper struct {
	// afterConnect is called once a newly created client has been stored.
	afterConnect func()
}
|
|
|
|
var defaultClientManagerHelper = ProxyClientManagerHelper{
|
|
afterConnect: func() {},
|
|
}
|
|
|
|
type ProxyClientManagerInterface interface {
|
|
AddProxyClient(session *sessionutil.Session)
|
|
SetProxyClients(session []*sessionutil.Session)
|
|
GetProxyClients() *typeutil.ConcurrentMap[int64, types.ProxyClient]
|
|
DelProxyClient(s *sessionutil.Session)
|
|
GetProxyCount() int
|
|
|
|
InvalidateCollectionMetaCache(ctx context.Context, request *proxypb.InvalidateCollMetaCacheRequest, opts ...ExpireCacheOpt) error
|
|
InvalidateShardLeaderCache(ctx context.Context, request *proxypb.InvalidateShardLeaderCacheRequest) error
|
|
InvalidateCredentialCache(ctx context.Context, request *proxypb.InvalidateCredCacheRequest) error
|
|
UpdateCredentialCache(ctx context.Context, request *proxypb.UpdateCredCacheRequest) error
|
|
RefreshPolicyInfoCache(ctx context.Context, req *proxypb.RefreshPolicyInfoCacheRequest) error
|
|
GetProxyMetrics(ctx context.Context) ([]*milvuspb.GetMetricsResponse, error)
|
|
SetRates(ctx context.Context, request *proxypb.SetRatesRequest) error
|
|
GetComponentStates(ctx context.Context) (map[int64]*milvuspb.ComponentStates, error)
|
|
}
|
|
|
|
type ProxyClientManager struct {
|
|
creator ProxyCreator
|
|
proxyClient *typeutil.ConcurrentMap[int64, types.ProxyClient]
|
|
helper ProxyClientManagerHelper
|
|
}
|
|
|
|
func NewProxyClientManager(creator ProxyCreator) *ProxyClientManager {
|
|
return &ProxyClientManager{
|
|
creator: creator,
|
|
proxyClient: typeutil.NewConcurrentMap[int64, types.ProxyClient](),
|
|
helper: defaultClientManagerHelper,
|
|
}
|
|
}
|
|
|
|
// SetProxyClients sets proxy clients from a full snapshot of sessions.
|
|
// It removes stale clients not in the new snapshot and adds new ones.
|
|
// This is called during initial setup or when re-watching after etcd error.
|
|
func (p *ProxyClientManager) SetProxyClients(sessions []*sessionutil.Session) {
|
|
aliveSessions := lo.KeyBy(sessions, func(session *sessionutil.Session) int64 {
|
|
return session.ServerID
|
|
})
|
|
|
|
// Remove stale clients not in the alive sessions
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
if _, ok := aliveSessions[key]; !ok {
|
|
if cli, loaded := p.proxyClient.GetAndRemove(key); loaded {
|
|
cli.Close()
|
|
log.Info("remove stale proxy client", zap.Int64("serverID", key))
|
|
}
|
|
}
|
|
return true
|
|
})
|
|
|
|
// Add new clients
|
|
for _, session := range sessions {
|
|
p.AddProxyClient(session)
|
|
}
|
|
}
|
|
|
|
func (p *ProxyClientManager) GetProxyClients() *typeutil.ConcurrentMap[int64, types.ProxyClient] {
|
|
return p.proxyClient
|
|
}
|
|
|
|
func (p *ProxyClientManager) AddProxyClient(session *sessionutil.Session) {
|
|
_, ok := p.proxyClient.Get(session.ServerID)
|
|
if ok {
|
|
return
|
|
}
|
|
|
|
p.connect(session)
|
|
p.updateProxyNumMetric()
|
|
}
|
|
|
|
// GetProxyCount returns number of proxy clients.
|
|
func (p *ProxyClientManager) GetProxyCount() int {
|
|
return p.proxyClient.Len()
|
|
}
|
|
|
|
// mutex.Lock is required before calling this method.
|
|
func (p *ProxyClientManager) updateProxyNumMetric() {
|
|
metrics.RootCoordProxyCounter.WithLabelValues().Set(float64(p.proxyClient.Len()))
|
|
}
|
|
|
|
func (p *ProxyClientManager) connect(session *sessionutil.Session) {
|
|
pc, err := p.creator(context.Background(), session.Address, session.ServerID)
|
|
if err != nil {
|
|
log.Warn("failed to create proxy client", zap.String("address", session.Address), zap.Int64("serverID", session.ServerID), zap.Error(err))
|
|
return
|
|
}
|
|
|
|
_, ok := p.proxyClient.GetOrInsert(session.GetServerID(), pc)
|
|
if ok {
|
|
pc.Close()
|
|
return
|
|
}
|
|
log.Info("succeed to create proxy client", zap.String("address", session.Address), zap.Int64("serverID", session.ServerID))
|
|
p.helper.afterConnect()
|
|
}
|
|
|
|
func (p *ProxyClientManager) DelProxyClient(s *sessionutil.Session) {
|
|
cli, ok := p.proxyClient.GetAndRemove(s.GetServerID())
|
|
if ok {
|
|
cli.Close()
|
|
}
|
|
|
|
p.updateProxyNumMetric()
|
|
log.Info("remove proxy client", zap.String("proxy address", s.Address), zap.Int64("proxy id", s.ServerID))
|
|
}
|
|
|
|
func (p *ProxyClientManager) InvalidateCollectionMetaCache(ctx context.Context, request *proxypb.InvalidateCollMetaCacheRequest, opts ...ExpireCacheOpt) error {
|
|
c := DefaultExpireCacheConfig()
|
|
for _, opt := range opts {
|
|
opt(&c)
|
|
}
|
|
c.Apply(request)
|
|
|
|
if p.proxyClient.Len() == 0 {
|
|
log.Warn("proxy client is empty, InvalidateCollectionMetaCache will not send to any client")
|
|
return nil
|
|
}
|
|
|
|
group := &errgroup.Group{}
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
k, v := key, value
|
|
group.Go(func() error {
|
|
sta, err := v.InvalidateCollectionMetaCache(ctx, request)
|
|
if err != nil {
|
|
if errors.Is(err, merr.ErrNodeNotFound) {
|
|
log.Warn("InvalidateCollectionMetaCache failed due to proxy service not found", zap.Error(err))
|
|
return nil
|
|
}
|
|
|
|
if errors.Is(err, merr.ErrServiceUnimplemented) {
|
|
return nil
|
|
}
|
|
|
|
return fmt.Errorf("InvalidateCollectionMetaCache failed, proxyID = %d, err = %s", k, err)
|
|
}
|
|
if sta.ErrorCode != commonpb.ErrorCode_Success {
|
|
return fmt.Errorf("InvalidateCollectionMetaCache failed, proxyID = %d, err = %s", k, sta.Reason)
|
|
}
|
|
return nil
|
|
})
|
|
return true
|
|
})
|
|
return group.Wait()
|
|
}
|
|
|
|
// InvalidateCredentialCache TODO: too many codes similar to InvalidateCollectionMetaCache.
|
|
func (p *ProxyClientManager) InvalidateCredentialCache(ctx context.Context, request *proxypb.InvalidateCredCacheRequest) error {
|
|
if p.proxyClient.Len() == 0 {
|
|
log.Warn("proxy client is empty, InvalidateCredentialCache will not send to any client")
|
|
return nil
|
|
}
|
|
|
|
group := &errgroup.Group{}
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
k, v := key, value
|
|
group.Go(func() error {
|
|
sta, err := v.InvalidateCredentialCache(ctx, request)
|
|
if err != nil {
|
|
return fmt.Errorf("InvalidateCredentialCache failed, proxyID = %d, err = %s", k, err)
|
|
}
|
|
if sta.ErrorCode != commonpb.ErrorCode_Success {
|
|
return fmt.Errorf("InvalidateCredentialCache failed, proxyID = %d, err = %s", k, sta.Reason)
|
|
}
|
|
return nil
|
|
})
|
|
return true
|
|
})
|
|
|
|
return group.Wait()
|
|
}
|
|
|
|
// UpdateCredentialCache TODO: too many codes similar to InvalidateCollectionMetaCache.
|
|
func (p *ProxyClientManager) UpdateCredentialCache(ctx context.Context, request *proxypb.UpdateCredCacheRequest) error {
|
|
if p.proxyClient.Len() == 0 {
|
|
log.Warn("proxy client is empty, UpdateCredentialCache will not send to any client")
|
|
return nil
|
|
}
|
|
|
|
group := &errgroup.Group{}
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
k, v := key, value
|
|
group.Go(func() error {
|
|
sta, err := v.UpdateCredentialCache(ctx, request)
|
|
if err != nil {
|
|
return fmt.Errorf("UpdateCredentialCache failed, proxyID = %d, err = %s", k, err)
|
|
}
|
|
if sta.ErrorCode != commonpb.ErrorCode_Success {
|
|
return fmt.Errorf("UpdateCredentialCache failed, proxyID = %d, err = %s", k, sta.Reason)
|
|
}
|
|
return nil
|
|
})
|
|
return true
|
|
})
|
|
return group.Wait()
|
|
}
|
|
|
|
// RefreshPolicyInfoCache TODO: too many codes similar to InvalidateCollectionMetaCache.
|
|
func (p *ProxyClientManager) RefreshPolicyInfoCache(ctx context.Context, req *proxypb.RefreshPolicyInfoCacheRequest) error {
|
|
if p.proxyClient.Len() == 0 {
|
|
log.Warn("proxy client is empty, RefreshPrivilegeInfoCache will not send to any client")
|
|
return nil
|
|
}
|
|
|
|
group := &errgroup.Group{}
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
k, v := key, value
|
|
group.Go(func() error {
|
|
status, err := v.RefreshPolicyInfoCache(ctx, req)
|
|
if err != nil {
|
|
return fmt.Errorf("RefreshPolicyInfoCache failed, proxyID = %d, err = %s", k, err)
|
|
}
|
|
if status.GetErrorCode() != commonpb.ErrorCode_Success {
|
|
return merr.Error(status)
|
|
}
|
|
return nil
|
|
})
|
|
return true
|
|
})
|
|
return group.Wait()
|
|
}
|
|
|
|
// GetProxyMetrics sends requests to proxies to get metrics.
|
|
func (p *ProxyClientManager) GetProxyMetrics(ctx context.Context) ([]*milvuspb.GetMetricsResponse, error) {
|
|
if p.proxyClient.Len() == 0 {
|
|
log.Warn("proxy client is empty, GetMetrics will not send to any client")
|
|
return nil, nil
|
|
}
|
|
|
|
req, err := metricsinfo.ConstructRequestByMetricType(metricsinfo.SystemInfoMetrics)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
group := &errgroup.Group{}
|
|
var metricRspsMu sync.Mutex
|
|
metricRsps := make([]*milvuspb.GetMetricsResponse, 0)
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
k, v := key, value
|
|
group.Go(func() error {
|
|
rsp, err := v.GetProxyMetrics(ctx, req)
|
|
if err != nil {
|
|
return fmt.Errorf("GetMetrics failed, proxyID = %d, err = %s", k, err)
|
|
}
|
|
if rsp.GetStatus().GetErrorCode() != commonpb.ErrorCode_Success {
|
|
return fmt.Errorf("GetMetrics failed, proxyID = %d, err = %s", k, rsp.GetStatus().GetReason())
|
|
}
|
|
metricRspsMu.Lock()
|
|
metricRsps = append(metricRsps, rsp)
|
|
metricRspsMu.Unlock()
|
|
return nil
|
|
})
|
|
return true
|
|
})
|
|
err = group.Wait()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return metricRsps, nil
|
|
}
|
|
|
|
// SetRates notifies Proxy to limit rates of requests.
|
|
func (p *ProxyClientManager) SetRates(ctx context.Context, request *proxypb.SetRatesRequest) error {
|
|
if p.proxyClient.Len() == 0 {
|
|
log.Warn("proxy client is empty, SetRates will not send to any client")
|
|
return nil
|
|
}
|
|
|
|
group := &errgroup.Group{}
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
k, v := key, value
|
|
group.Go(func() error {
|
|
sta, err := v.SetRates(ctx, request)
|
|
if err != nil {
|
|
return fmt.Errorf("SetRates failed, proxyID = %d, err = %s", k, err)
|
|
}
|
|
if sta.GetErrorCode() != commonpb.ErrorCode_Success {
|
|
return fmt.Errorf("SetRates failed, proxyID = %d, err = %s", k, sta.Reason)
|
|
}
|
|
return nil
|
|
})
|
|
return true
|
|
})
|
|
return group.Wait()
|
|
}
|
|
|
|
func (p *ProxyClientManager) GetComponentStates(ctx context.Context) (map[int64]*milvuspb.ComponentStates, error) {
|
|
group, ctx := errgroup.WithContext(ctx)
|
|
states := make(map[int64]*milvuspb.ComponentStates)
|
|
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
k, v := key, value
|
|
group.Go(func() error {
|
|
sta, err := v.GetComponentStates(ctx, &milvuspb.GetComponentStatesRequest{})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
states[k] = sta
|
|
return nil
|
|
})
|
|
return true
|
|
})
|
|
err := group.Wait()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return states, nil
|
|
}
|
|
|
|
func (p *ProxyClientManager) InvalidateShardLeaderCache(ctx context.Context, request *proxypb.InvalidateShardLeaderCacheRequest) error {
|
|
if p.proxyClient.Len() == 0 {
|
|
log.Warn("proxy client is empty, InvalidateShardLeaderCache will not send to any client")
|
|
return nil
|
|
}
|
|
|
|
group := &errgroup.Group{}
|
|
p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
|
|
k, v := key, value
|
|
group.Go(func() error {
|
|
sta, err := v.InvalidateShardLeaderCache(ctx, request)
|
|
if err != nil {
|
|
if errors.Is(err, merr.ErrNodeNotFound) {
|
|
log.Warn("InvalidateShardLeaderCache failed due to proxy service not found", zap.Error(err))
|
|
return nil
|
|
}
|
|
return fmt.Errorf("InvalidateShardLeaderCache failed, proxyID = %d, err = %s", k, err)
|
|
}
|
|
if sta.ErrorCode != commonpb.ErrorCode_Success {
|
|
return fmt.Errorf("InvalidateShardLeaderCache failed, proxyID = %d, err = %s", k, sta.Reason)
|
|
}
|
|
return nil
|
|
})
|
|
return true
|
|
})
|
|
return group.Wait()
|
|
}
|