milvus/internal/util/proxyutil/proxy_client_manager.go
yihao.dai 44d915a43b
fix: [2.5] Remove stale proxy clients on rewatch etcd (#46491)
### **User description**
AddProxyClients now removes clients not in the new snapshot before
adding new ones. This ensures proper cleanup when ProxyWatcher re-watches
etcd.

issue: https://github.com/milvus-io/milvus/issues/46397

pr: https://github.com/milvus-io/milvus/pull/46398


___

### **PR Type**
Bug fix


___

### **Description**
- Rename `AddProxyClients` to `SetProxyClients` for clearer semantics

- Implement stale client cleanup before adding new proxy clients

- Remove proxy clients not present in new etcd snapshot

- Update all callers in querycoord and rootcoord servers

- Regenerate mock files with mockery v2.53.3


___

### Diagram Walkthrough


```mermaid
flowchart LR
  A["ProxyWatcher detects<br/>etcd change"] -->|calls| B["SetProxyClients<br/>with new snapshot"]
  B -->|removes| C["Stale clients<br/>not in snapshot"]
  C -->|closes| D["Cleanup resources"]
  B -->|adds| E["New proxy clients<br/>from snapshot"]
```



<details><summary><h3>File Walkthrough</h3></summary>

<table><thead><tr><th></th><th align="left">Relevant
files</th></tr></thead><tbody><tr><td><strong>Bug
fix</strong></td><td><details><summary>3 files</summary><table>
<tr>
<td><strong>proxy_client_manager.go</strong><dd><code>Rename
AddProxyClients to SetProxyClients with cleanup</code>&nbsp; &nbsp;
&nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-1a13e14654661bffe70ce626777d527871fcae62361a5fc18b7dca93e66afe1e">+22/-2</a>&nbsp;
&nbsp; </td>

</tr>

<tr>
<td><strong>server.go</strong><dd><code>Update ProxyWatcher to use
SetProxyClients</code>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-87d7712e6df027656ba9d433fb77b702c185486130879be54204da2b8f092230">+1/-1</a>&nbsp;
&nbsp; &nbsp; </td>

</tr>

<tr>
<td><strong>root_coord.go</strong><dd><code>Update ProxyWatcher
initialization to SetProxyClients</code>&nbsp; &nbsp; &nbsp; &nbsp;
</dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-8fab0705c4ddb5f98e5955d3c3013fa795c87237a8525e189c3296d98dcce47f">+2/-2</a>&nbsp;
&nbsp; &nbsp; </td>

</tr>

</table></details></td></tr><tr><td><strong>Tests</strong></td><td><details><summary>1
files</summary><table>
<tr>
<td><strong>proxy_client_manager_test.go</strong><dd><code>Update test
for SetProxyClients stale removal</code>&nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-fb82a84e67ec187c1bfbdc4335c18b504b2f4392758d859e54115684ea8a526d">+26/-10</a>&nbsp;
</td>

</tr>

</table></details></td></tr><tr><td><strong>Miscellaneous</strong></td><td><details><summary>7
files</summary><table>
<tr>
<td><strong>mock_proxy_client_manager.go</strong><dd><code>Regenerate
mock with SetProxyClients method</code>&nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
</dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-8cc3cfe21d2694f58ebe7f2d44e12c467d9e83ac9edb37bcb6c7262e7b2ca09d">+78/-38</a>&nbsp;
</td>

</tr>

<tr>
<td><strong>mock_proxy_watcher.go</strong><dd><code>Regenerate mock with
mockery v2.53.3</code>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-0c22e89cbee68c397abee52e797166fbac754b439eb1304baf53dd207e8f11d2">+9/-5</a>&nbsp;
&nbsp; &nbsp; </td>

</tr>

<tr>
<td><strong>mock_global_id_allocator.go</strong><dd><code>Regenerate
mock with mockery v2.53.3</code>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-1822c38f0a6c4225134bee554bf518d0b9184040ea5bb3d9e926515ae80655c8">+15/-3</a>&nbsp;
&nbsp; </td>

</tr>

<tr>
<td><strong>mock_grpc_client.go</strong><dd><code>Regenerate mock with
mockery v2.53.3</code>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-6137f17eb7eef693d72c6181f519f7b1a87669a00f5268cbd7c3399567451e64">+33/-13</a>&nbsp;
</td>

</tr>

<tr>
<td><strong>allocator.go</strong><dd><code>Regenerate mock with mockery
v2.53.3</code>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-e2482bb9748ad163ba57c65251355035f45c140f12214cd96cc5da88376fcc39">+26/-6</a>&nbsp;
&nbsp; </td>

</tr>

<tr>
<td><strong>mock_factory.go</strong><dd><code>Regenerate mock with
mockery v2.53.3</code>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-bc91a5fcdc0bd52eaaaf805d15985fe578debbb2576eb0c97d08e0573d0132e6">+18/-2</a>&nbsp;
&nbsp; </td>

</tr>

<tr>
<td><strong>mock_session.go</strong><dd><code>Regenerate mock with
mockery v2.53.3</code>&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;
&nbsp; &nbsp; &nbsp; &nbsp; </dd></td>
<td><a
href="https://github.com/milvus-io/milvus/pull/46491/files#diff-850f08ed9571fd6f3f84bd846fa29dab2d52be7c8f9d9b358fc667c6970a90e6">+79/-19</a>&nbsp;
</td>

</tr>
</table></details></td></tr></tbody></table>

</details>

___

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
2025-12-23 21:11:19 +08:00

420 lines
13 KiB
Go

// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package proxyutil
import (
"context"
"fmt"
"sync"
"github.com/cockroachdb/errors"
"github.com/samber/lo"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
grpcproxyclient "github.com/milvus-io/milvus/internal/distributed/proxy/client"
"github.com/milvus-io/milvus/internal/types"
"github.com/milvus-io/milvus/internal/util/sessionutil"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/metrics"
"github.com/milvus-io/milvus/pkg/v2/proto/proxypb"
"github.com/milvus-io/milvus/pkg/v2/util/commonpbutil"
"github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/metricsinfo"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
// ExpireCacheConfig holds the settings that Apply writes into an
// InvalidateCollMetaCacheRequest before the request is sent out.
type ExpireCacheConfig struct {
// msgType is the message type that Apply stores in the request's MsgBase.
msgType commonpb.MsgType
}
// Apply stamps the configured message type onto req. If req has no MsgBase
// yet, a new one is created and attached first. The message type is written
// unconditionally, so a config whose type was never set stores the zero value.
func (c ExpireCacheConfig) Apply(req *proxypb.InvalidateCollMetaCacheRequest) {
	base := req.GetBase()
	if base == nil {
		base = commonpbutil.NewMsgBase()
		req.Base = base
	}
	base.MsgType = c.msgType
}
// DefaultExpireCacheConfig returns a zero-valued ExpireCacheConfig, i.e. one
// on which no option has been applied yet.
func DefaultExpireCacheConfig() ExpireCacheConfig {
	var cfg ExpireCacheConfig
	return cfg
}
// ExpireCacheOpt is a functional option that mutates an ExpireCacheConfig.
type ExpireCacheOpt func(c *ExpireCacheConfig)
// SetMsgType returns an option that records msgType in the config, so that
// Apply later writes it into the outgoing request.
func SetMsgType(msgType commonpb.MsgType) ExpireCacheOpt {
	return func(cfg *ExpireCacheConfig) { cfg.msgType = msgType }
}
// ProxyCreator builds a proxy client for the proxy at addr identified by
// nodeID, or returns an error if the client cannot be created.
type ProxyCreator func(ctx context.Context, addr string, nodeID int64) (types.ProxyClient, error)
// DefaultProxyCreator is the ProxyCreator backed by the gRPC proxy client
// package. On failure it returns an untyped nil client together with the error.
func DefaultProxyCreator(ctx context.Context, addr string, nodeID int64) (types.ProxyClient, error) {
	client, createErr := grpcproxyclient.NewClient(ctx, addr, nodeID)
	if createErr != nil {
		// Return a literal nil rather than the (possibly typed) failed client.
		return nil, createErr
	}

	return client, nil
}
// ProxyClientManagerHelper carries hook functions used by ProxyClientManager.
type ProxyClientManagerHelper struct {
// afterConnect is invoked by connect once a newly created proxy client has
// been stored in the manager.
afterConnect func()
}
// defaultClientManagerHelper is the helper installed by NewProxyClientManager;
// its afterConnect hook does nothing.
var defaultClientManagerHelper = ProxyClientManagerHelper{
afterConnect: func() {},
}
// ProxyClientManagerInterface describes the operations for tracking proxy
// clients and for sending requests to the tracked proxies. The notes on each
// method describe what the ProxyClientManager implementation in this file does.
type ProxyClientManagerInterface interface {
// AddProxyClient registers a client for the given session unless one is
// already tracked for its server ID.
AddProxyClient(session *sessionutil.Session)
// SetProxyClients replaces the tracked set with a full snapshot of sessions:
// clients absent from the snapshot are closed and removed, new ones are added.
SetProxyClients(session []*sessionutil.Session)
// GetProxyClients exposes the underlying map of server ID to proxy client.
GetProxyClients() *typeutil.ConcurrentMap[int64, types.ProxyClient]
// DelProxyClient removes and closes the client tracked for the given session.
DelProxyClient(s *sessionutil.Session)
// GetProxyCount reports how many proxy clients are tracked.
GetProxyCount() int
// The methods below send the request to every tracked proxy concurrently.
InvalidateCollectionMetaCache(ctx context.Context, request *proxypb.InvalidateCollMetaCacheRequest, opts ...ExpireCacheOpt) error
InvalidateShardLeaderCache(ctx context.Context, request *proxypb.InvalidateShardLeaderCacheRequest) error
InvalidateCredentialCache(ctx context.Context, request *proxypb.InvalidateCredCacheRequest) error
UpdateCredentialCache(ctx context.Context, request *proxypb.UpdateCredCacheRequest) error
RefreshPolicyInfoCache(ctx context.Context, req *proxypb.RefreshPolicyInfoCacheRequest) error
// GetProxyMetrics collects one metrics response per tracked proxy.
GetProxyMetrics(ctx context.Context) ([]*milvuspb.GetMetricsResponse, error)
SetRates(ctx context.Context, request *proxypb.SetRatesRequest) error
// GetComponentStates collects component states keyed by proxy server ID.
GetComponentStates(ctx context.Context) (map[int64]*milvuspb.ComponentStates, error)
}
// ProxyClientManager tracks one client per proxy, keyed by server ID, and
// sends requests to all of them.
type ProxyClientManager struct {
// creator builds the client used to talk to a newly added proxy.
creator ProxyCreator
// proxyClient maps a proxy's server ID to its client.
proxyClient *typeutil.ConcurrentMap[int64, types.ProxyClient]
// helper holds hooks such as afterConnect.
helper ProxyClientManagerHelper
}
// NewProxyClientManager returns a manager with an empty client map that uses
// creator to build proxy clients and the default (no-op) helper hooks.
func NewProxyClientManager(creator ProxyCreator) *ProxyClientManager {
	manager := new(ProxyClientManager)
	manager.creator = creator
	manager.proxyClient = typeutil.NewConcurrentMap[int64, types.ProxyClient]()
	manager.helper = defaultClientManagerHelper
	return manager
}
// SetProxyClients sets proxy clients from a full snapshot of sessions.
// It removes stale clients not in the new snapshot and adds new ones.
// This is called during initial setup or when re-watching after etcd error.
//
// The proxy-count metric is refreshed once at the end, so it reflects the
// removals even when the snapshot contains no proxy that needs a new client.
func (p *ProxyClientManager) SetProxyClients(sessions []*sessionutil.Session) {
	aliveSessions := lo.KeyBy(sessions, func(session *sessionutil.Session) int64 {
		return session.ServerID
	})

	// Remove stale clients not in the alive sessions.
	p.proxyClient.Range(func(serverID int64, _ types.ProxyClient) bool {
		if _, alive := aliveSessions[serverID]; alive {
			return true
		}
		// GetAndRemove reports whether this call actually removed the entry, so
		// a client is closed at most once even if another caller removed it first.
		if cli, loaded := p.proxyClient.GetAndRemove(serverID); loaded {
			cli.Close()
			log.Info("remove stale proxy client", zap.Int64("serverID", serverID))
		}
		return true
	})

	// Add new clients; sessions that already have a client are skipped.
	for _, session := range sessions {
		p.AddProxyClient(session)
	}

	// AddProxyClient returns before touching the metric when the client already
	// exists, so without this call the gauge would keep counting removed proxies.
	p.updateProxyNumMetric()
}
// GetProxyClients returns the manager's own client map (not a copy), keyed by
// proxy server ID.
func (p *ProxyClientManager) GetProxyClients() *typeutil.ConcurrentMap[int64, types.ProxyClient] {
	clients := p.proxyClient
	return clients
}
// AddProxyClient connects to the proxy described by session and refreshes the
// proxy-count metric. It is a no-op when a client for the session's server ID
// is already tracked.
func (p *ProxyClientManager) AddProxyClient(session *sessionutil.Session) {
	if _, exists := p.proxyClient.Get(session.ServerID); exists {
		return
	}

	p.connect(session)
	p.updateProxyNumMetric()
}
// GetProxyCount reports how many proxy clients are currently tracked.
func (p *ProxyClientManager) GetProxyCount() int {
	count := p.proxyClient.Len()
	return count
}
// updateProxyNumMetric publishes the current number of tracked proxy clients
// to the RootCoordProxyCounter gauge.
func (p *ProxyClientManager) updateProxyNumMetric() {
	proxyNum := float64(p.proxyClient.Len())
	metrics.RootCoordProxyCounter.WithLabelValues().Set(proxyNum)
}
// connect creates a client for the proxy described by session and stores it
// under the session's server ID. A creation failure is logged and otherwise
// ignored. If a client for that ID was stored in the meantime, the freshly
// created one is closed and discarded; otherwise the afterConnect hook runs.
func (p *ProxyClientManager) connect(session *sessionutil.Session) {
	addr, serverID := session.Address, session.ServerID

	client, err := p.creator(context.Background(), addr, serverID)
	if err != nil {
		log.Warn("failed to create proxy client", zap.String("address", addr), zap.Int64("serverID", serverID), zap.Error(err))
		return
	}

	if _, loaded := p.proxyClient.GetOrInsert(session.GetServerID(), client); loaded {
		// Someone else registered this proxy first; keep theirs.
		client.Close()
		return
	}

	log.Info("succeed to create proxy client", zap.String("address", addr), zap.Int64("serverID", serverID))
	p.helper.afterConnect()
}
// DelProxyClient removes the client tracked for the session's server ID and
// closes it if one was present. The proxy-count metric is refreshed and the
// removal is logged whether or not a client was found.
func (p *ProxyClientManager) DelProxyClient(s *sessionutil.Session) {
	if removed, found := p.proxyClient.GetAndRemove(s.GetServerID()); found {
		removed.Close()
	}

	p.updateProxyNumMetric()
	log.Info("remove proxy client", zap.String("proxy address", s.Address), zap.Int64("proxy id", s.ServerID))
}
// InvalidateCollectionMetaCache applies opts to a default ExpireCacheConfig,
// stamps the result onto request, and sends request to every tracked proxy
// concurrently. It returns nil when no proxy is tracked. ErrNodeNotFound and
// ErrServiceUnimplemented from a proxy are tolerated; any other RPC error or
// non-success status is returned as reported by the errgroup's Wait.
func (p *ProxyClientManager) InvalidateCollectionMetaCache(ctx context.Context, request *proxypb.InvalidateCollMetaCacheRequest, opts ...ExpireCacheOpt) error {
	cfg := DefaultExpireCacheConfig()
	for _, applyOpt := range opts {
		applyOpt(&cfg)
	}
	cfg.Apply(request)

	if p.proxyClient.Len() == 0 {
		log.Warn("proxy client is empty, InvalidateCollectionMetaCache will not send to any client")
		return nil
	}

	// invalidate performs the call against a single proxy.
	invalidate := func(proxyID int64, cli types.ProxyClient) error {
		status, err := cli.InvalidateCollectionMetaCache(ctx, request)
		switch {
		case err == nil:
			if status.ErrorCode != commonpb.ErrorCode_Success {
				return fmt.Errorf("InvalidateCollectionMetaCache failed, proxyID = %d, err = %s", proxyID, status.Reason)
			}
			return nil
		case errors.Is(err, merr.ErrNodeNotFound):
			log.Warn("InvalidateCollectionMetaCache failed due to proxy service not found", zap.Error(err))
			return nil
		case errors.Is(err, merr.ErrServiceUnimplemented):
			return nil
		default:
			return fmt.Errorf("InvalidateCollectionMetaCache failed, proxyID = %d, err = %s", proxyID, err)
		}
	}

	var group errgroup.Group
	p.proxyClient.Range(func(proxyID int64, cli types.ProxyClient) bool {
		group.Go(func() error { return invalidate(proxyID, cli) })
		return true
	})
	return group.Wait()
}
// InvalidateCredentialCache sends request to every tracked proxy concurrently
// and returns the error reported by the errgroup's Wait, if any. It returns
// nil when no proxy is tracked.
// TODO: largely duplicates InvalidateCollectionMetaCache.
func (p *ProxyClientManager) InvalidateCredentialCache(ctx context.Context, request *proxypb.InvalidateCredCacheRequest) error {
	if p.proxyClient.Len() == 0 {
		log.Warn("proxy client is empty, InvalidateCredentialCache will not send to any client")
		return nil
	}

	var group errgroup.Group
	p.proxyClient.Range(func(proxyID int64, cli types.ProxyClient) bool {
		group.Go(func() error {
			status, err := cli.InvalidateCredentialCache(ctx, request)
			switch {
			case err != nil:
				return fmt.Errorf("InvalidateCredentialCache failed, proxyID = %d, err = %s", proxyID, err)
			case status.ErrorCode != commonpb.ErrorCode_Success:
				return fmt.Errorf("InvalidateCredentialCache failed, proxyID = %d, err = %s", proxyID, status.Reason)
			}
			return nil
		})
		return true
	})
	return group.Wait()
}
// UpdateCredentialCache sends request to every tracked proxy concurrently and
// returns the error reported by the errgroup's Wait, if any. It returns nil
// when no proxy is tracked.
// TODO: largely duplicates InvalidateCollectionMetaCache.
func (p *ProxyClientManager) UpdateCredentialCache(ctx context.Context, request *proxypb.UpdateCredCacheRequest) error {
	if p.proxyClient.Len() == 0 {
		log.Warn("proxy client is empty, UpdateCredentialCache will not send to any client")
		return nil
	}

	// update performs the call against a single proxy.
	update := func(proxyID int64, cli types.ProxyClient) error {
		status, err := cli.UpdateCredentialCache(ctx, request)
		if err != nil {
			return fmt.Errorf("UpdateCredentialCache failed, proxyID = %d, err = %s", proxyID, err)
		}
		if status.ErrorCode == commonpb.ErrorCode_Success {
			return nil
		}
		return fmt.Errorf("UpdateCredentialCache failed, proxyID = %d, err = %s", proxyID, status.Reason)
	}

	var group errgroup.Group
	p.proxyClient.Range(func(proxyID int64, cli types.ProxyClient) bool {
		group.Go(func() error { return update(proxyID, cli) })
		return true
	})
	return group.Wait()
}
// RefreshPolicyInfoCache sends req to every tracked proxy concurrently and
// returns the error reported by the errgroup's Wait, if any. It returns nil
// when no proxy is tracked. An RPC error is reported with the proxy ID; a
// non-success status is converted with merr.Error.
// TODO: too many codes similar to InvalidateCollectionMetaCache.
func (p *ProxyClientManager) RefreshPolicyInfoCache(ctx context.Context, req *proxypb.RefreshPolicyInfoCacheRequest) error {
	if p.proxyClient.Len() == 0 {
		// The warning names this method so it can be matched to the operation
		// that was skipped, consistent with the other fan-out methods.
		log.Warn("proxy client is empty, RefreshPolicyInfoCache will not send to any client")
		return nil
	}

	group := &errgroup.Group{}
	p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
		k, v := key, value
		group.Go(func() error {
			status, err := v.RefreshPolicyInfoCache(ctx, req)
			if err != nil {
				return fmt.Errorf("RefreshPolicyInfoCache failed, proxyID = %d, err = %s", k, err)
			}
			if status.GetErrorCode() != commonpb.ErrorCode_Success {
				return merr.Error(status)
			}
			return nil
		})
		return true
	})
	return group.Wait()
}
// GetProxyMetrics requests system-info metrics from every tracked proxy
// concurrently and returns the collected responses. It returns (nil, nil)
// when no proxy is tracked, and (nil, err) if building the request fails or
// any proxy reports an error or a non-success status.
func (p *ProxyClientManager) GetProxyMetrics(ctx context.Context) ([]*milvuspb.GetMetricsResponse, error) {
	if p.proxyClient.Len() == 0 {
		log.Warn("proxy client is empty, GetMetrics will not send to any client")
		return nil, nil
	}

	req, err := metricsinfo.ConstructRequestByMetricType(metricsinfo.SystemInfoMetrics)
	if err != nil {
		return nil, err
	}

	var (
		group     errgroup.Group
		mu        sync.Mutex // guards responses
		responses = make([]*milvuspb.GetMetricsResponse, 0)
	)

	// collect queries a single proxy and appends its response on success.
	collect := func(proxyID int64, cli types.ProxyClient) error {
		rsp, err := cli.GetProxyMetrics(ctx, req)
		if err != nil {
			return fmt.Errorf("GetMetrics failed, proxyID = %d, err = %s", proxyID, err)
		}
		if code := rsp.GetStatus().GetErrorCode(); code != commonpb.ErrorCode_Success {
			return fmt.Errorf("GetMetrics failed, proxyID = %d, err = %s", proxyID, rsp.GetStatus().GetReason())
		}

		mu.Lock()
		defer mu.Unlock()
		responses = append(responses, rsp)
		return nil
	}

	p.proxyClient.Range(func(proxyID int64, cli types.ProxyClient) bool {
		group.Go(func() error { return collect(proxyID, cli) })
		return true
	})

	if waitErr := group.Wait(); waitErr != nil {
		return nil, waitErr
	}
	return responses, nil
}
// SetRates sends the rate-limit request to every tracked proxy concurrently
// and returns the error reported by the errgroup's Wait, if any. It returns
// nil when no proxy is tracked.
func (p *ProxyClientManager) SetRates(ctx context.Context, request *proxypb.SetRatesRequest) error {
	if p.proxyClient.Len() == 0 {
		log.Warn("proxy client is empty, SetRates will not send to any client")
		return nil
	}

	var group errgroup.Group
	p.proxyClient.Range(func(proxyID int64, cli types.ProxyClient) bool {
		group.Go(func() error {
			status, err := cli.SetRates(ctx, request)
			if err != nil {
				return fmt.Errorf("SetRates failed, proxyID = %d, err = %s", proxyID, err)
			}
			if status.GetErrorCode() == commonpb.ErrorCode_Success {
				return nil
			}
			return fmt.Errorf("SetRates failed, proxyID = %d, err = %s", proxyID, status.Reason)
		})
		return true
	})
	return group.Wait()
}
// GetComponentStates queries every tracked proxy concurrently for its
// component states and returns them keyed by proxy server ID. The queries
// share a context derived from ctx via errgroup.WithContext. If any query
// fails, the error from the errgroup's Wait is returned and no states are
// returned. With no tracked proxies the result is an empty, non-nil map.
func (p *ProxyClientManager) GetComponentStates(ctx context.Context) (map[int64]*milvuspb.ComponentStates, error) {
	group, ctx := errgroup.WithContext(ctx)

	// statesMu guards states: the goroutines below run concurrently, and Go
	// maps must not be written from multiple goroutines without synchronization.
	var statesMu sync.Mutex
	states := make(map[int64]*milvuspb.ComponentStates)

	p.proxyClient.Range(func(key int64, value types.ProxyClient) bool {
		k, v := key, value
		group.Go(func() error {
			sta, err := v.GetComponentStates(ctx, &milvuspb.GetComponentStatesRequest{})
			if err != nil {
				return err
			}
			statesMu.Lock()
			states[k] = sta
			statesMu.Unlock()
			return nil
		})
		return true
	})

	if err := group.Wait(); err != nil {
		return nil, err
	}
	return states, nil
}
// InvalidateShardLeaderCache sends request to every tracked proxy
// concurrently. It returns nil when no proxy is tracked. ErrNodeNotFound from
// a proxy is logged and tolerated; any other RPC error or non-success status
// is returned as reported by the errgroup's Wait.
func (p *ProxyClientManager) InvalidateShardLeaderCache(ctx context.Context, request *proxypb.InvalidateShardLeaderCacheRequest) error {
	if p.proxyClient.Len() == 0 {
		log.Warn("proxy client is empty, InvalidateShardLeaderCache will not send to any client")
		return nil
	}

	// invalidate performs the call against a single proxy.
	invalidate := func(proxyID int64, cli types.ProxyClient) error {
		status, err := cli.InvalidateShardLeaderCache(ctx, request)
		switch {
		case err == nil:
			if status.ErrorCode != commonpb.ErrorCode_Success {
				return fmt.Errorf("InvalidateShardLeaderCache failed, proxyID = %d, err = %s", proxyID, status.Reason)
			}
			return nil
		case errors.Is(err, merr.ErrNodeNotFound):
			log.Warn("InvalidateShardLeaderCache failed due to proxy service not found", zap.Error(err))
			return nil
		default:
			return fmt.Errorf("InvalidateShardLeaderCache failed, proxyID = %d, err = %s", proxyID, err)
		}
	}

	var group errgroup.Group
	p.proxyClient.Range(func(proxyID int64, cli types.ProxyClient) bool {
		group.Go(func() error { return invalidate(proxyID, cli) })
		return true
	})
	return group.Wait()
}