mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-08 01:58:34 +08:00
Fix bug for IndexNode pod kill test (#18177)
Signed-off-by: Cai.Zhang <cai.zhang@zilliz.com>
This commit is contained in:
parent
b9d7027522
commit
a47a414dce
@ -166,6 +166,7 @@ func TestIndexBuilder(t *testing.T) {
|
|||||||
Err: false,
|
Err: false,
|
||||||
},
|
},
|
||||||
nodeManager: &NodeManager{
|
nodeManager: &NodeManager{
|
||||||
|
ctx: ctx,
|
||||||
nodeClients: map[UniqueID]types.IndexNode{
|
nodeClients: map[UniqueID]types.IndexNode{
|
||||||
4: &indexnode.Mock{
|
4: &indexnode.Mock{
|
||||||
Err: false,
|
Err: false,
|
||||||
@ -295,7 +296,9 @@ func TestIndexBuilder_Error(t *testing.T) {
|
|||||||
Fail: false,
|
Fail: false,
|
||||||
Err: false,
|
Err: false,
|
||||||
},
|
},
|
||||||
nodeManager: &NodeManager{},
|
nodeManager: &NodeManager{
|
||||||
|
ctx: ctx,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
mt := &metaTable{
|
mt := &metaTable{
|
||||||
indexBuildID2Meta: map[UniqueID]*Meta{
|
indexBuildID2Meta: map[UniqueID]*Meta{
|
||||||
@ -340,6 +343,7 @@ func TestIndexBuilder_Error(t *testing.T) {
|
|||||||
Err: false,
|
Err: false,
|
||||||
},
|
},
|
||||||
nodeManager: &NodeManager{
|
nodeManager: &NodeManager{
|
||||||
|
ctx: ctx,
|
||||||
nodeClients: map[UniqueID]types.IndexNode{
|
nodeClients: map[UniqueID]types.IndexNode{
|
||||||
1: &indexnode.Mock{
|
1: &indexnode.Mock{
|
||||||
Err: false,
|
Err: false,
|
||||||
@ -391,6 +395,7 @@ func TestIndexBuilder_Error(t *testing.T) {
|
|||||||
Err: true,
|
Err: true,
|
||||||
},
|
},
|
||||||
nodeManager: &NodeManager{
|
nodeManager: &NodeManager{
|
||||||
|
ctx: ctx,
|
||||||
nodeClients: map[UniqueID]types.IndexNode{
|
nodeClients: map[UniqueID]types.IndexNode{
|
||||||
1: &indexnode.Mock{
|
1: &indexnode.Mock{
|
||||||
Err: false,
|
Err: false,
|
||||||
|
|||||||
@ -20,17 +20,14 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"github.com/milvus-io/milvus/internal/proto/commonpb"
|
|
||||||
"github.com/milvus-io/milvus/internal/proto/indexpb"
|
|
||||||
|
|
||||||
"github.com/milvus-io/milvus/internal/metrics"
|
|
||||||
|
|
||||||
"go.uber.org/zap"
|
|
||||||
|
|
||||||
grpcindexnodeclient "github.com/milvus-io/milvus/internal/distributed/indexnode/client"
|
grpcindexnodeclient "github.com/milvus-io/milvus/internal/distributed/indexnode/client"
|
||||||
"github.com/milvus-io/milvus/internal/log"
|
"github.com/milvus-io/milvus/internal/log"
|
||||||
|
"github.com/milvus-io/milvus/internal/metrics"
|
||||||
|
"github.com/milvus-io/milvus/internal/proto/commonpb"
|
||||||
|
"github.com/milvus-io/milvus/internal/proto/indexpb"
|
||||||
"github.com/milvus-io/milvus/internal/proto/milvuspb"
|
"github.com/milvus-io/milvus/internal/proto/milvuspb"
|
||||||
"github.com/milvus-io/milvus/internal/types"
|
"github.com/milvus-io/milvus/internal/types"
|
||||||
|
"go.uber.org/zap"
|
||||||
)
|
)
|
||||||
|
|
||||||
// NodeManager is used by IndexCoord to manage the client of IndexNode.
|
// NodeManager is used by IndexCoord to manage the client of IndexNode.
|
||||||
@ -56,7 +53,7 @@ func NewNodeManager(ctx context.Context) *NodeManager {
|
|||||||
// setClient sets IndexNode client to node manager.
|
// setClient sets IndexNode client to node manager.
|
||||||
func (nm *NodeManager) setClient(nodeID UniqueID, client types.IndexNode) error {
|
func (nm *NodeManager) setClient(nodeID UniqueID, client types.IndexNode) error {
|
||||||
log.Debug("IndexCoord NodeManager setClient", zap.Int64("nodeID", nodeID))
|
log.Debug("IndexCoord NodeManager setClient", zap.Int64("nodeID", nodeID))
|
||||||
defer log.Debug("IndexNode NodeManager setclient success", zap.Any("nodeID", nodeID))
|
defer log.Debug("IndexNode NodeManager setClient success", zap.Any("nodeID", nodeID))
|
||||||
item := &PQItem{
|
item := &PQItem{
|
||||||
key: nodeID,
|
key: nodeID,
|
||||||
priority: 0,
|
priority: 0,
|
||||||
@ -105,32 +102,71 @@ func (nm *NodeManager) AddNode(nodeID UniqueID, address string) error {
|
|||||||
|
|
||||||
// PeekClient peeks the client with the least load.
|
// PeekClient peeks the client with the least load.
|
||||||
func (nm *NodeManager) PeekClient(meta *Meta) (UniqueID, types.IndexNode) {
|
func (nm *NodeManager) PeekClient(meta *Meta) (UniqueID, types.IndexNode) {
|
||||||
nm.lock.RLock()
|
allClients := nm.GetAllClients()
|
||||||
defer nm.lock.RUnlock()
|
|
||||||
|
|
||||||
if len(nm.nodeClients) == 0 {
|
if len(allClients) == 0 {
|
||||||
log.Error("there is no IndexNode online")
|
log.Error("there is no IndexNode online")
|
||||||
return -1, nil
|
return -1, nil
|
||||||
}
|
}
|
||||||
for nodeID, client := range nm.nodeClients {
|
|
||||||
resp, err := client.GetTaskSlots(nm.ctx, &indexpb.GetTaskSlotsRequest{})
|
// Note: In order to quickly end other goroutines, an error is returned when the client is successfully selected
|
||||||
if err != nil {
|
ctx, cancel := context.WithCancel(nm.ctx)
|
||||||
log.Warn("get IndexNode slots failed", zap.Int64("nodeID", nodeID), zap.Error(err))
|
var (
|
||||||
continue
|
peekNodeID = UniqueID(0)
|
||||||
}
|
nodeMutex = sync.Mutex{}
|
||||||
if resp.Status.ErrorCode != commonpb.ErrorCode_Success {
|
wg = sync.WaitGroup{}
|
||||||
log.Warn("get IndexNode slots failed", zap.Int64("nodeID", nodeID),
|
)
|
||||||
zap.String("reason", resp.Status.Reason))
|
|
||||||
continue
|
for nodeID, client := range allClients {
|
||||||
}
|
nodeID := nodeID
|
||||||
if resp.Slots > 0 {
|
client := client
|
||||||
return nodeID, client
|
wg.Add(1)
|
||||||
}
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
resp, err := client.GetTaskSlots(ctx, &indexpb.GetTaskSlotsRequest{})
|
||||||
|
if err != nil {
|
||||||
|
log.Warn("get IndexNode slots failed", zap.Int64("nodeID", nodeID), zap.Error(err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if resp.Status.ErrorCode != commonpb.ErrorCode_Success {
|
||||||
|
log.Warn("get IndexNode slots failed", zap.Int64("nodeID", nodeID),
|
||||||
|
zap.String("reason", resp.Status.Reason))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if resp.Slots > 0 {
|
||||||
|
nodeMutex.Lock()
|
||||||
|
defer nodeMutex.Unlock()
|
||||||
|
log.Info("peek client success", zap.Int64("nodeID", nodeID))
|
||||||
|
if peekNodeID == 0 {
|
||||||
|
peekNodeID = nodeID
|
||||||
|
}
|
||||||
|
cancel()
|
||||||
|
// Note: In order to quickly end other goroutines, an error is returned when the client is successfully selected
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
cancel()
|
||||||
|
if peekNodeID != 0 {
|
||||||
|
return peekNodeID, allClients[peekNodeID]
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0, nil
|
return 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (nm *NodeManager) GetAllClients() map[UniqueID]types.IndexNode {
|
||||||
|
nm.lock.RLock()
|
||||||
|
defer nm.lock.RUnlock()
|
||||||
|
|
||||||
|
allClients := make(map[UniqueID]types.IndexNode, len(nm.nodeClients))
|
||||||
|
for nodeID, client := range nm.nodeClients {
|
||||||
|
allClients[nodeID] = client
|
||||||
|
}
|
||||||
|
|
||||||
|
return allClients
|
||||||
|
}
|
||||||
|
|
||||||
// indexNodeGetMetricsResponse record the metrics information of IndexNode.
|
// indexNodeGetMetricsResponse record the metrics information of IndexNode.
|
||||||
type indexNodeGetMetricsResponse struct {
|
type indexNodeGetMetricsResponse struct {
|
||||||
resp *milvuspb.GetMetricsResponse
|
resp *milvuspb.GetMetricsResponse
|
||||||
|
|||||||
@ -18,40 +18,145 @@ package indexcoord
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/milvus-io/milvus/internal/indexnode"
|
||||||
"github.com/milvus-io/milvus/internal/proto/commonpb"
|
"github.com/milvus-io/milvus/internal/proto/commonpb"
|
||||||
"github.com/milvus-io/milvus/internal/proto/indexpb"
|
"github.com/milvus-io/milvus/internal/proto/indexpb"
|
||||||
"github.com/milvus-io/milvus/internal/proto/schemapb"
|
"github.com/milvus-io/milvus/internal/proto/schemapb"
|
||||||
|
"github.com/milvus-io/milvus/internal/types"
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestNodeManager_PeekClient(t *testing.T) {
|
func TestNodeManager_PeekClient(t *testing.T) {
|
||||||
nm := NewNodeManager(context.Background())
|
t.Run("success", func(t *testing.T) {
|
||||||
meta := &Meta{
|
nm := NewNodeManager(context.Background())
|
||||||
indexMeta: &indexpb.IndexMeta{
|
meta := &Meta{
|
||||||
Req: &indexpb.BuildIndexRequest{
|
indexMeta: &indexpb.IndexMeta{
|
||||||
DataPaths: []string{"PeekClient-1", "PeekClient-2"},
|
Req: &indexpb.BuildIndexRequest{
|
||||||
NumRows: 1000,
|
DataPaths: []string{"PeekClient-1", "PeekClient-2"},
|
||||||
TypeParams: []*commonpb.KeyValuePair{
|
NumRows: 1000,
|
||||||
{
|
TypeParams: []*commonpb.KeyValuePair{
|
||||||
Key: "dim",
|
{
|
||||||
Value: "128",
|
Key: "dim",
|
||||||
|
Value: "128",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
FieldSchema: &schemapb.FieldSchema{
|
||||||
|
DataType: schemapb.DataType_FloatVector,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
FieldSchema: &schemapb.FieldSchema{
|
},
|
||||||
DataType: schemapb.DataType_FloatVector,
|
}
|
||||||
|
nodeID, client := nm.PeekClient(meta)
|
||||||
|
assert.Equal(t, int64(-1), nodeID)
|
||||||
|
assert.Nil(t, client)
|
||||||
|
err := nm.AddNode(1, "indexnode-1")
|
||||||
|
assert.Nil(t, err)
|
||||||
|
nm.pq.SetMemory(1, 100)
|
||||||
|
nodeID2, client2 := nm.PeekClient(meta)
|
||||||
|
assert.Equal(t, int64(0), nodeID2)
|
||||||
|
assert.Nil(t, client2)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("multiple unavailable IndexNode", func(t *testing.T) {
|
||||||
|
nm := &NodeManager{
|
||||||
|
ctx: context.TODO(),
|
||||||
|
nodeClients: map[UniqueID]types.IndexNode{
|
||||||
|
1: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||||
|
},
|
||||||
|
}, errors.New("error")
|
||||||
|
},
|
||||||
|
},
|
||||||
|
2: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||||
|
},
|
||||||
|
}, errors.New("error")
|
||||||
|
},
|
||||||
|
},
|
||||||
|
3: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||||
|
},
|
||||||
|
}, errors.New("error")
|
||||||
|
},
|
||||||
|
},
|
||||||
|
4: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||||
|
},
|
||||||
|
}, errors.New("error")
|
||||||
|
},
|
||||||
|
},
|
||||||
|
5: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||||
|
Reason: "fail reason",
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
6: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||||
|
Reason: "fail reason",
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
7: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_UnexpectedError,
|
||||||
|
Reason: "fail reason",
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
8: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Slots: 1,
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_Success,
|
||||||
|
Reason: "",
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
|
},
|
||||||
|
9: &indexnode.MockIndexNode{
|
||||||
|
GetTaskSlotsMock: func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return &indexpb.GetTaskSlotsResponse{
|
||||||
|
Slots: 10,
|
||||||
|
Status: &commonpb.Status{
|
||||||
|
ErrorCode: commonpb.ErrorCode_Success,
|
||||||
|
Reason: "",
|
||||||
|
},
|
||||||
|
}, nil
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
}
|
||||||
}
|
|
||||||
nodeID, client := nm.PeekClient(meta)
|
nodeID, client := nm.PeekClient(&Meta{})
|
||||||
assert.Equal(t, int64(-1), nodeID)
|
assert.NotNil(t, client)
|
||||||
assert.Nil(t, client)
|
assert.Contains(t, []UniqueID{8, 9}, nodeID)
|
||||||
err := nm.AddNode(1, "indexnode-1")
|
})
|
||||||
assert.Nil(t, err)
|
|
||||||
nm.pq.SetMemory(1, 100)
|
|
||||||
nodeID2, client2 := nm.PeekClient(meta)
|
|
||||||
assert.Equal(t, int64(0), nodeID2)
|
|
||||||
assert.Nil(t, client2)
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -22,6 +22,8 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
|
"github.com/milvus-io/milvus/internal/types"
|
||||||
|
|
||||||
"go.uber.org/zap"
|
"go.uber.org/zap"
|
||||||
|
|
||||||
"github.com/golang/protobuf/proto"
|
"github.com/golang/protobuf/proto"
|
||||||
@ -39,6 +41,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Mock is an alternative to IndexNode, it will return specific results based on specific parameters.
|
// Mock is an alternative to IndexNode, it will return specific results based on specific parameters.
|
||||||
|
// Deprecated, use MockIndexNode
|
||||||
type Mock struct {
|
type Mock struct {
|
||||||
Build bool
|
Build bool
|
||||||
Failure bool
|
Failure bool
|
||||||
@ -386,3 +389,23 @@ func getMockSystemInfoMetrics(
|
|||||||
ComponentName: metricsinfo.ConstructComponentName(typeutil.IndexNodeRole, Params.IndexNodeCfg.GetNodeID()),
|
ComponentName: metricsinfo.ConstructComponentName(typeutil.IndexNodeRole, Params.IndexNodeCfg.GetNodeID()),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type MockIndexNode struct {
|
||||||
|
types.IndexNode
|
||||||
|
|
||||||
|
CreateIndexMock func(ctx context.Context, req *indexpb.CreateIndexRequest) (*commonpb.Status, error)
|
||||||
|
GetTaskSlotsMock func(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error)
|
||||||
|
GetMetricsMock func(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (min *MockIndexNode) CreateIndex(ctx context.Context, req *indexpb.CreateIndexRequest) (*commonpb.Status, error) {
|
||||||
|
return min.CreateIndexMock(ctx, req)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (min *MockIndexNode) GetTaskSlots(ctx context.Context, req *indexpb.GetTaskSlotsRequest) (*indexpb.GetTaskSlotsResponse, error) {
|
||||||
|
return min.GetTaskSlotsMock(ctx, req)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (min *MockIndexNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
|
||||||
|
return min.GetMetricsMock(ctx, req)
|
||||||
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user