fix: [2.6] Ensure the proxy's shard-leader cache remains stable for coord down test (#45909)

issue: #45847 
master pr: #45908 

After a collection is successfully loaded, the shard-leader state on the
QC may still not be marked as serviceable. It becomes serviceable only
after the scheduled distribution update runs, which will also invalidate
the shard-leader cache on the proxy. Therefore, even if queries are
already executable, the shard-leader mapping on the proxy may still
change afterward.

Before killing the mixcoord, wait until every shard leader reports itself as
serviceable and then for one more distribution pull interval, so that the
proxy’s shard-leader cache is as stable as possible.

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
This commit is contained in:
cai.zhang 2025-11-28 17:29:08 +08:00 committed by GitHub
parent da601d2a3c
commit 07727e7f25
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -32,6 +32,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/common"
"github.com/milvus-io/milvus/pkg/v2/log"
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
"github.com/milvus-io/milvus/pkg/v2/util/funcutil"
"github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/metric"
@ -174,6 +175,24 @@ func (s *CoordDownSearch) checkCollections() bool {
return notLoaded == 0
}
// waitLeaderServiceable blocks until every leader view reported by every
// streaming/query node client of the cluster is marked serviceable.
//
// The wait is bounded: if some leader is still not serviceable when the
// deadline expires, the suite is failed immediately instead of polling
// forever and hanging the whole test run.
func (s *CoordDownSearch) waitLeaderServiceable() {
	const (
		waitTimeout  = 2 * time.Minute
		pollInterval = 100 * time.Millisecond
	)

	// One deadline bounds both the individual RPCs and the polling loop.
	ctx, cancel := context.WithTimeout(context.Background(), waitTimeout)
	defer cancel()

	allLeadersServiceable := func() bool {
		for _, client := range s.Cluster.GetAllStreamingAndQueryNodesClient() {
			resp, err := client.GetDataDistribution(ctx, &querypb.GetDataDistributionRequest{})
			if !s.NoError(err) {
				// The failure is already recorded on the suite; there is no
				// usable response to inspect for this node, so move on.
				continue
			}
			for _, view := range resp.GetLeaderViews() {
				if !view.GetStatus().GetServiceable() {
					return false
				}
			}
		}
		return true
	}

	ticker := time.NewTicker(pollInterval)
	defer ticker.Stop()

	for !allLeadersServiceable() {
		select {
		case <-ctx.Done():
			s.FailNow("timed out waiting for all shard leaders to become serviceable")
			return
		case <-ticker.C:
		}
	}
}
func (s *CoordDownSearch) search(collectionName string, dim int, consistencyLevel commonpb.ConsistencyLevel) {
c := s.Cluster
var err error
@ -275,6 +294,9 @@ func (s *CoordDownSearch) setupData() {
wg.Wait()
log.Info("=========================Data injection finished=========================")
s.checkCollections()
s.waitLeaderServiceable()
// wait for the QC to pull the new distribution
time.Sleep(paramtable.Get().QueryCoordCfg.DistPullInterval.GetAsDuration(time.Millisecond) + time.Second)
log.Info(fmt.Sprintf("=========================start to search %s=========================", searchName))
s.search(searchName, Dim, commonpb.ConsistencyLevel_Eventually)
log.Info("=========================Search finished=========================")