resource group impl (#21609)

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
wei liu 2023-01-30 10:19:48 +08:00 committed by GitHub
parent 66027790a2
commit 73c44d4b29
63 changed files with 5547 additions and 514 deletions

go.mod

@ -27,7 +27,7 @@ require (
github.com/klauspost/compress v1.14.4 github.com/klauspost/compress v1.14.4
github.com/lingdor/stackerror v0.0.0-20191119040541-976d8885ed76 github.com/lingdor/stackerror v0.0.0-20191119040541-976d8885ed76
github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230112125535-5f87a812202c github.com/milvus-io/milvus-proto/go-api v0.0.0-20230129073344-87a125853a0b
github.com/minio/minio-go/v7 v7.0.17 github.com/minio/minio-go/v7 v7.0.17
github.com/panjf2000/ants/v2 v2.4.8 github.com/panjf2000/ants/v2 v2.4.8
github.com/pkg/errors v0.9.1 github.com/pkg/errors v0.9.1

go.sum

@ -491,8 +491,8 @@ github.com/mgutz/ansi v0.0.0-20200706080929-d51e80ef957d/go.mod h1:01TrycV0kFyex
github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8= github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b h1:TfeY0NxYxZzUfIfYe5qYDBzt4ZYRqzUjTR6CvUzjat8=
github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4= github.com/milvus-io/gorocksdb v0.0.0-20220624081344-8c5f4212846b/go.mod h1:iwW+9cWfIzzDseEBCCeDSN5SD16Tidvy8cwQ7ZY8Qj4=
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230112125535-5f87a812202c h1:74uRPm5WWagMe8bItOQ8QFuXcrUIWuWGAQ1GrwVM4J4= github.com/milvus-io/milvus-proto/go-api v0.0.0-20230129073344-87a125853a0b h1:HoJ3J70COnaR3WQTA4gN70DkiaMRPkyLI6yXrPqpFiU=
github.com/milvus-io/milvus-proto/go-api v0.0.0-20230112125535-5f87a812202c/go.mod h1:148qnlmZ0Fdm1Fq+Mj/OW2uDoEP25g3mjh0vMGtkgmk= github.com/milvus-io/milvus-proto/go-api v0.0.0-20230129073344-87a125853a0b/go.mod h1:148qnlmZ0Fdm1Fq+Mj/OW2uDoEP25g3mjh0vMGtkgmk=
github.com/milvus-io/pulsar-client-go v0.6.10 h1:eqpJjU+/QX0iIhEo3nhOqMNXL+TyInAs1IAHZCrCM/A= github.com/milvus-io/pulsar-client-go v0.6.10 h1:eqpJjU+/QX0iIhEo3nhOqMNXL+TyInAs1IAHZCrCM/A=
github.com/milvus-io/pulsar-client-go v0.6.10/go.mod h1:lQqCkgwDF8YFYjKA+zOheTk1tev2B+bKj5j7+nm8M1w= github.com/milvus-io/pulsar-client-go v0.6.10/go.mod h1:lQqCkgwDF8YFYjKA+zOheTk1tev2B+bKj5j7+nm8M1w=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs= github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=

View File

@ -372,7 +372,7 @@ const char descriptor_table_protodef_common_2eproto[] PROTOBUF_SECTION_VARIABLE(
"\n\n\006Sealed\020\003\022\013\n\007Flushed\020\004\022\014\n\010Flushing\020\005\022\013" "\n\n\006Sealed\020\003\022\013\n\007Flushed\020\004\022\014\n\010Flushing\020\005\022\013"
"\n\007Dropped\020\006\022\r\n\tImporting\020\007*>\n\017Placeholde" "\n\007Dropped\020\006\022\r\n\tImporting\020\007*>\n\017Placeholde"
"rType\022\010\n\004None\020\000\022\020\n\014BinaryVector\020d\022\017\n\013Flo" "rType\022\010\n\004None\020\000\022\020\n\014BinaryVector\020d\022\017\n\013Flo"
"atVector\020e*\277\016\n\007MsgType\022\r\n\tUndefined\020\000\022\024\n" "atVector\020e*\300\016\n\007MsgType\022\r\n\tUndefined\020\000\022\024\n"
"\020CreateCollection\020d\022\022\n\016DropCollection\020e\022" "\020CreateCollection\020d\022\022\n\016DropCollection\020e\022"
"\021\n\rHasCollection\020f\022\026\n\022DescribeCollection" "\021\n\rHasCollection\020f\022\026\n\022DescribeCollection"
"\020g\022\023\n\017ShowCollections\020h\022\024\n\020GetSystemConf" "\020g\022\023\n\017ShowCollections\020h\022\024\n\020GetSystemConf"
@ -416,53 +416,53 @@ const char descriptor_table_protodef_common_2eproto[] PROTOBUF_SECTION_VARIABLE(
"\n\020OperatePrivilege\020\306\014\022\020\n\013SelectGrant\020\307\014\022" "\n\020OperatePrivilege\020\306\014\022\020\n\013SelectGrant\020\307\014\022"
"\033\n\026RefreshPolicyInfoCache\020\310\014\022\017\n\nListPoli" "\033\n\026RefreshPolicyInfoCache\020\310\014\022\017\n\nListPoli"
"cy\020\311\014\022\030\n\023CreateResourceGroup\020\244\r\022\026\n\021DropR" "cy\020\311\014\022\030\n\023CreateResourceGroup\020\244\r\022\026\n\021DropR"
"esourceGroup\020\245\r\022\026\n\021ListResourceGroup\020\246\r\022" "esourceGroup\020\245\r\022\027\n\022ListResourceGroups\020\246\r"
"\032\n\025DescribeResourceGroup\020\247\r\022\021\n\014TransferN" "\022\032\n\025DescribeResourceGroup\020\247\r\022\021\n\014Transfer"
"ode\020\250\r\022\024\n\017TransferReplica\020\251\r*\"\n\007DslType\022" "Node\020\250\r\022\024\n\017TransferReplica\020\251\r*\"\n\007DslType"
"\007\n\003Dsl\020\000\022\016\n\nBoolExprV1\020\001*B\n\017CompactionSt" "\022\007\n\003Dsl\020\000\022\016\n\nBoolExprV1\020\001*B\n\017CompactionS"
"ate\022\021\n\rUndefiedState\020\000\022\r\n\tExecuting\020\001\022\r\n" "tate\022\021\n\rUndefiedState\020\000\022\r\n\tExecuting\020\001\022\r"
"\tCompleted\020\002*X\n\020ConsistencyLevel\022\n\n\006Stro" "\n\tCompleted\020\002*X\n\020ConsistencyLevel\022\n\n\006Str"
"ng\020\000\022\013\n\007Session\020\001\022\013\n\007Bounded\020\002\022\016\n\nEventu" "ong\020\000\022\013\n\007Session\020\001\022\013\n\007Bounded\020\002\022\016\n\nEvent"
"ally\020\003\022\016\n\nCustomized\020\004*\236\001\n\013ImportState\022\021" "ually\020\003\022\016\n\nCustomized\020\004*\236\001\n\013ImportState\022"
"\n\rImportPending\020\000\022\020\n\014ImportFailed\020\001\022\021\n\rI" "\021\n\rImportPending\020\000\022\020\n\014ImportFailed\020\001\022\021\n\r"
"mportStarted\020\002\022\023\n\017ImportPersisted\020\005\022\021\n\rI" "ImportStarted\020\002\022\023\n\017ImportPersisted\020\005\022\021\n\r"
"mportFlushed\020\010\022\023\n\017ImportCompleted\020\006\022\032\n\026I" "ImportFlushed\020\010\022\023\n\017ImportCompleted\020\006\022\032\n\026"
"mportFailedAndCleaned\020\007*2\n\nObjectType\022\016\n" "ImportFailedAndCleaned\020\007*2\n\nObjectType\022\016"
"\nCollection\020\000\022\n\n\006Global\020\001\022\010\n\004User\020\002*\233\005\n\017" "\n\nCollection\020\000\022\n\n\006Global\020\001\022\010\n\004User\020\002*\233\005\n"
"ObjectPrivilege\022\020\n\014PrivilegeAll\020\000\022\035\n\031Pri" "\017ObjectPrivilege\022\020\n\014PrivilegeAll\020\000\022\035\n\031Pr"
"vilegeCreateCollection\020\001\022\033\n\027PrivilegeDro" "ivilegeCreateCollection\020\001\022\033\n\027PrivilegeDr"
"pCollection\020\002\022\037\n\033PrivilegeDescribeCollec" "opCollection\020\002\022\037\n\033PrivilegeDescribeColle"
"tion\020\003\022\034\n\030PrivilegeShowCollections\020\004\022\021\n\r" "ction\020\003\022\034\n\030PrivilegeShowCollections\020\004\022\021\n"
"PrivilegeLoad\020\005\022\024\n\020PrivilegeRelease\020\006\022\027\n" "\rPrivilegeLoad\020\005\022\024\n\020PrivilegeRelease\020\006\022\027"
"\023PrivilegeCompaction\020\007\022\023\n\017PrivilegeInser" "\n\023PrivilegeCompaction\020\007\022\023\n\017PrivilegeInse"
"t\020\010\022\023\n\017PrivilegeDelete\020\t\022\032\n\026PrivilegeGet" "rt\020\010\022\023\n\017PrivilegeDelete\020\t\022\032\n\026PrivilegeGe"
"Statistics\020\n\022\030\n\024PrivilegeCreateIndex\020\013\022\030" "tStatistics\020\n\022\030\n\024PrivilegeCreateIndex\020\013\022"
"\n\024PrivilegeIndexDetail\020\014\022\026\n\022PrivilegeDro" "\030\n\024PrivilegeIndexDetail\020\014\022\026\n\022PrivilegeDr"
"pIndex\020\r\022\023\n\017PrivilegeSearch\020\016\022\022\n\016Privile" "opIndex\020\r\022\023\n\017PrivilegeSearch\020\016\022\022\n\016Privil"
"geFlush\020\017\022\022\n\016PrivilegeQuery\020\020\022\030\n\024Privile" "egeFlush\020\017\022\022\n\016PrivilegeQuery\020\020\022\030\n\024Privil"
"geLoadBalance\020\021\022\023\n\017PrivilegeImport\020\022\022\034\n\030" "egeLoadBalance\020\021\022\023\n\017PrivilegeImport\020\022\022\034\n"
"PrivilegeCreateOwnership\020\023\022\027\n\023PrivilegeU" "\030PrivilegeCreateOwnership\020\023\022\027\n\023Privilege"
"pdateUser\020\024\022\032\n\026PrivilegeDropOwnership\020\025\022" "UpdateUser\020\024\022\032\n\026PrivilegeDropOwnership\020\025"
"\034\n\030PrivilegeSelectOwnership\020\026\022\034\n\030Privile" "\022\034\n\030PrivilegeSelectOwnership\020\026\022\034\n\030Privil"
"geManageOwnership\020\027\022\027\n\023PrivilegeSelectUs" "egeManageOwnership\020\027\022\027\n\023PrivilegeSelectU"
"er\020\030\022\023\n\017PrivilegeUpsert\020\031*S\n\tStateCode\022\020" "ser\020\030\022\023\n\017PrivilegeUpsert\020\031*S\n\tStateCode\022"
"\n\014Initializing\020\000\022\013\n\007Healthy\020\001\022\014\n\010Abnorma" "\020\n\014Initializing\020\000\022\013\n\007Healthy\020\001\022\014\n\010Abnorm"
"l\020\002\022\013\n\007StandBy\020\003\022\014\n\010Stopping\020\004*c\n\tLoadSt" "al\020\002\022\013\n\007StandBy\020\003\022\014\n\010Stopping\020\004*c\n\tLoadS"
"ate\022\025\n\021LoadStateNotExist\020\000\022\024\n\020LoadStateN" "tate\022\025\n\021LoadStateNotExist\020\000\022\024\n\020LoadState"
"otLoad\020\001\022\024\n\020LoadStateLoading\020\002\022\023\n\017LoadSt" "NotLoad\020\001\022\024\n\020LoadStateLoading\020\002\022\023\n\017LoadS"
"ateLoaded\020\003:^\n\021privilege_ext_obj\022\037.googl" "tateLoaded\020\003:^\n\021privilege_ext_obj\022\037.goog"
"e.protobuf.MessageOptions\030\351\007 \001(\0132!.milvu" "le.protobuf.MessageOptions\030\351\007 \001(\0132!.milv"
"s.proto.common.PrivilegeExtBf\n\016io.milvus" "us.proto.common.PrivilegeExtBf\n\016io.milvu"
".grpcB\013CommonProtoP\001Z1github.com/milvus-" "s.grpcB\013CommonProtoP\001Z1github.com/milvus"
"io/milvus-proto/go-api/commonpb\240\001\001\252\002\016IO." "-io/milvus-proto/go-api/commonpb\240\001\001\252\002\016IO"
"Milvus.Grpcb\006proto3" ".Milvus.Grpcb\006proto3"
; ;
static const ::_pbi::DescriptorTable* const descriptor_table_common_2eproto_deps[1] = { static const ::_pbi::DescriptorTable* const descriptor_table_common_2eproto_deps[1] = {
&::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto, &::descriptor_table_google_2fprotobuf_2fdescriptor_2eproto,
}; };
static ::_pbi::once_flag descriptor_table_common_2eproto_once; static ::_pbi::once_flag descriptor_table_common_2eproto_once;
const ::_pbi::DescriptorTable descriptor_table_common_2eproto = { const ::_pbi::DescriptorTable descriptor_table_common_2eproto = {
false, false, 5859, descriptor_table_protodef_common_2eproto, false, false, 5860, descriptor_table_protodef_common_2eproto,
"common.proto", "common.proto",
&descriptor_table_common_2eproto_once, descriptor_table_common_2eproto_deps, 1, 11, &descriptor_table_common_2eproto_once, descriptor_table_common_2eproto_deps, 1, 11,
schemas, file_default_instances, TableStruct_common_2eproto::offsets, schemas, file_default_instances, TableStruct_common_2eproto::offsets,

View File

@ -354,7 +354,7 @@ enum MsgType : int {
ListPolicy = 1609, ListPolicy = 1609,
CreateResourceGroup = 1700, CreateResourceGroup = 1700,
DropResourceGroup = 1701, DropResourceGroup = 1701,
ListResourceGroup = 1702, ListResourceGroups = 1702,
DescribeResourceGroup = 1703, DescribeResourceGroup = 1703,
TransferNode = 1704, TransferNode = 1704,
TransferReplica = 1705, TransferReplica = 1705,

View File

@ -869,25 +869,25 @@ func (s *Server) RenameCollection(ctx context.Context, req *milvuspb.RenameColle
} }
func (s *Server) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) { func (s *Server) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) {
return nil, nil return s.proxy.CreateResourceGroup(ctx, req)
} }
func (s *Server) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) { func (s *Server) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) {
return nil, nil return s.proxy.DropResourceGroup(ctx, req)
} }
func (s *Server) DescribeResourceGroup(ctx context.Context, req *milvuspb.DescribeResourceGroupRequest) (*milvuspb.DescribeResourceGroupResponse, error) { func (s *Server) DescribeResourceGroup(ctx context.Context, req *milvuspb.DescribeResourceGroupRequest) (*milvuspb.DescribeResourceGroupResponse, error) {
return nil, nil return s.proxy.DescribeResourceGroup(ctx, req)
} }
func (s *Server) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error) { func (s *Server) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error) {
return nil, nil return s.proxy.TransferNode(ctx, req)
} }
func (s *Server) TransferReplica(ctx context.Context, req *milvuspb.TransferReplicaRequest) (*commonpb.Status, error) { func (s *Server) TransferReplica(ctx context.Context, req *milvuspb.TransferReplicaRequest) (*commonpb.Status, error) {
return nil, nil return s.proxy.TransferReplica(ctx, req)
} }
func (s *Server) ListResourceGroup(ctx context.Context, req *milvuspb.ListResourceGroupRequest) (*milvuspb.ListResourceGroupResponse, error) { func (s *Server) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
return nil, nil return s.proxy.ListResourceGroups(ctx, req)
} }

View File

@ -293,7 +293,7 @@ func (m *MockRootCoord) RenameCollection(ctx context.Context, req *milvuspb.Rena
return nil, nil return nil, nil
} }
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
type MockQueryCoord struct { type MockQueryCoord struct {
MockBase MockBase
initErr error initErr error
@ -408,6 +408,30 @@ func (m *MockQueryCoord) CheckHealth(ctx context.Context, req *milvuspb.CheckHea
}, nil }, nil
} }
func (m *MockQueryCoord) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) {
return nil, nil
}
func (m *MockQueryCoord) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) {
return nil, nil
}
func (m *MockQueryCoord) DescribeResourceGroup(ctx context.Context, req *querypb.DescribeResourceGroupRequest) (*querypb.DescribeResourceGroupResponse, error) {
return nil, nil
}
func (m *MockQueryCoord) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error) {
return nil, nil
}
func (m *MockQueryCoord) TransferReplica(ctx context.Context, req *querypb.TransferReplicaRequest) (*commonpb.Status, error) {
return nil, nil
}
func (m *MockQueryCoord) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
return nil, nil
}
// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
type MockDataCoord struct { type MockDataCoord struct {
MockBase MockBase
@ -935,7 +959,7 @@ func (m *MockProxy) TransferReplica(ctx context.Context, req *milvuspb.TransferR
return nil, nil return nil, nil
} }
func (m *MockProxy) ListResourceGroup(ctx context.Context, req *milvuspb.ListResourceGroupRequest) (*milvuspb.ListResourceGroupResponse, error) { func (m *MockProxy) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
return nil, nil return nil, nil
} }
@ -1380,6 +1404,36 @@ func Test_NewServer(t *testing.T) {
assert.Nil(t, err) assert.Nil(t, err)
}) })
t.Run("CreateResourceGroup", func(t *testing.T) {
_, err := server.CreateResourceGroup(ctx, nil)
assert.Nil(t, err)
})
t.Run("DropResourceGroup", func(t *testing.T) {
_, err := server.DropResourceGroup(ctx, nil)
assert.Nil(t, err)
})
t.Run("TransferNode", func(t *testing.T) {
_, err := server.TransferNode(ctx, nil)
assert.Nil(t, err)
})
t.Run("TransferReplica", func(t *testing.T) {
_, err := server.TransferReplica(ctx, nil)
assert.Nil(t, err)
})
t.Run("ListResourceGroups", func(t *testing.T) {
_, err := server.ListResourceGroups(ctx, nil)
assert.Nil(t, err)
})
t.Run("DescribeResourceGroup", func(t *testing.T) {
_, err := server.DescribeResourceGroup(ctx, nil)
assert.Nil(t, err)
})
err = server.Stop() err = server.Stop()
assert.Nil(t, err) assert.Nil(t, err)

View File

@ -418,3 +418,111 @@ func (c *Client) CheckHealth(ctx context.Context, req *milvuspb.CheckHealthReque
} }
return ret.(*milvuspb.CheckHealthResponse), err return ret.(*milvuspb.CheckHealthResponse), err
} }
func (c *Client) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) {
req = typeutil.Clone(req)
commonpbutil.UpdateMsgBase(
req.GetBase(),
commonpbutil.FillMsgBaseFromClient(paramtable.GetNodeID(), commonpbutil.WithTargetID(c.sess.ServerID)),
)
ret, err := c.grpcClient.ReCall(ctx, func(client querypb.QueryCoordClient) (any, error) {
if !funcutil.CheckCtxValid(ctx) {
return nil, ctx.Err()
}
return client.CreateResourceGroup(ctx, req)
})
if err != nil || ret == nil {
return nil, err
}
return ret.(*commonpb.Status), err
}
func (c *Client) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) {
req = typeutil.Clone(req)
commonpbutil.UpdateMsgBase(
req.GetBase(),
commonpbutil.FillMsgBaseFromClient(paramtable.GetNodeID(), commonpbutil.WithTargetID(c.sess.ServerID)),
)
ret, err := c.grpcClient.ReCall(ctx, func(client querypb.QueryCoordClient) (any, error) {
if !funcutil.CheckCtxValid(ctx) {
return nil, ctx.Err()
}
return client.DropResourceGroup(ctx, req)
})
if err != nil || ret == nil {
return nil, err
}
return ret.(*commonpb.Status), err
}
func (c *Client) DescribeResourceGroup(ctx context.Context, req *querypb.DescribeResourceGroupRequest) (*querypb.DescribeResourceGroupResponse, error) {
req = typeutil.Clone(req)
commonpbutil.UpdateMsgBase(
req.GetBase(),
commonpbutil.FillMsgBaseFromClient(paramtable.GetNodeID(), commonpbutil.WithTargetID(c.sess.ServerID)),
)
ret, err := c.grpcClient.ReCall(ctx, func(client querypb.QueryCoordClient) (any, error) {
if !funcutil.CheckCtxValid(ctx) {
return nil, ctx.Err()
}
return client.DescribeResourceGroup(ctx, req)
})
if err != nil || ret == nil {
return nil, err
}
return ret.(*querypb.DescribeResourceGroupResponse), err
}
func (c *Client) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error) {
req = typeutil.Clone(req)
commonpbutil.UpdateMsgBase(
req.GetBase(),
commonpbutil.FillMsgBaseFromClient(paramtable.GetNodeID(), commonpbutil.WithTargetID(c.sess.ServerID)),
)
ret, err := c.grpcClient.ReCall(ctx, func(client querypb.QueryCoordClient) (any, error) {
if !funcutil.CheckCtxValid(ctx) {
return nil, ctx.Err()
}
return client.TransferNode(ctx, req)
})
if err != nil || ret == nil {
return nil, err
}
return ret.(*commonpb.Status), err
}
func (c *Client) TransferReplica(ctx context.Context, req *querypb.TransferReplicaRequest) (*commonpb.Status, error) {
req = typeutil.Clone(req)
commonpbutil.UpdateMsgBase(
req.GetBase(),
commonpbutil.FillMsgBaseFromClient(paramtable.GetNodeID(), commonpbutil.WithTargetID(c.sess.ServerID)),
)
ret, err := c.grpcClient.ReCall(ctx, func(client querypb.QueryCoordClient) (any, error) {
if !funcutil.CheckCtxValid(ctx) {
return nil, ctx.Err()
}
return client.TransferReplica(ctx, req)
})
if err != nil || ret == nil {
return nil, err
}
return ret.(*commonpb.Status), err
}
func (c *Client) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
req = typeutil.Clone(req)
commonpbutil.UpdateMsgBase(
req.GetBase(),
commonpbutil.FillMsgBaseFromClient(paramtable.GetNodeID(), commonpbutil.WithTargetID(c.sess.ServerID)),
)
ret, err := c.grpcClient.ReCall(ctx, func(client querypb.QueryCoordClient) (any, error) {
if !funcutil.CheckCtxValid(ctx) {
return nil, ctx.Err()
}
return client.ListResourceGroups(ctx, req)
})
if err != nil || ret == nil {
return nil, err
}
return ret.(*milvuspb.ListResourceGroupsResponse), err
}

View File

@ -124,6 +124,24 @@ func Test_NewClient(t *testing.T) {
r20, err := client.CheckHealth(ctx, nil) r20, err := client.CheckHealth(ctx, nil)
retCheck(retNotNil, r20, err) retCheck(retNotNil, r20, err)
r21, err := client.CreateResourceGroup(ctx, nil)
retCheck(retNotNil, r21, err)
r22, err := client.DropResourceGroup(ctx, nil)
retCheck(retNotNil, r22, err)
r23, err := client.TransferNode(ctx, nil)
retCheck(retNotNil, r23, err)
r24, err := client.TransferReplica(ctx, nil)
retCheck(retNotNil, r24, err)
r26, err := client.ListResourceGroups(ctx, nil)
retCheck(retNotNil, r26, err)
r27, err := client.DescribeResourceGroup(ctx, nil)
retCheck(retNotNil, r27, err)
} }
client.grpcClient = &mock.GRPCClientBase[querypb.QueryCoordClient]{ client.grpcClient = &mock.GRPCClientBase[querypb.QueryCoordClient]{

View File

@ -364,3 +364,27 @@ func (s *Server) GetShardLeaders(ctx context.Context, req *querypb.GetShardLeade
func (s *Server) CheckHealth(ctx context.Context, req *milvuspb.CheckHealthRequest) (*milvuspb.CheckHealthResponse, error) { func (s *Server) CheckHealth(ctx context.Context, req *milvuspb.CheckHealthRequest) (*milvuspb.CheckHealthResponse, error) {
return s.queryCoord.CheckHealth(ctx, req) return s.queryCoord.CheckHealth(ctx, req)
} }
func (s *Server) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) {
return s.queryCoord.CreateResourceGroup(ctx, req)
}
func (s *Server) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) {
return s.queryCoord.DropResourceGroup(ctx, req)
}
func (s *Server) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error) {
return s.queryCoord.TransferNode(ctx, req)
}
func (s *Server) TransferReplica(ctx context.Context, req *querypb.TransferReplicaRequest) (*commonpb.Status, error) {
return s.queryCoord.TransferReplica(ctx, req)
}
func (s *Server) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
return s.queryCoord.ListResourceGroups(ctx, req)
}
func (s *Server) DescribeResourceGroup(ctx context.Context, req *querypb.DescribeResourceGroupRequest) (*querypb.DescribeResourceGroupResponse, error) {
return s.queryCoord.DescribeResourceGroup(ctx, req)
}

View File

@ -162,6 +162,34 @@ func (m *MockQueryCoord) CheckHealth(ctx context.Context, req *milvuspb.CheckHea
}, m.err }, m.err
} }
func (m *MockQueryCoord) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) {
return m.status, nil
}
func (m *MockQueryCoord) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) {
return m.status, nil
}
func (m *MockQueryCoord) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error) {
return m.status, nil
}
func (m *MockQueryCoord) TransferReplica(ctx context.Context, req *querypb.TransferReplicaRequest) (*commonpb.Status, error) {
return m.status, nil
}
func (m *MockQueryCoord) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
return &milvuspb.ListResourceGroupsResponse{
Status: m.status,
}, nil
}
func (m *MockQueryCoord) DescribeResourceGroup(ctx context.Context, req *querypb.DescribeResourceGroupRequest) (*querypb.DescribeResourceGroupResponse, error) {
return &querypb.DescribeResourceGroupResponse{
Status: m.status,
}, nil
}
// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
type MockRootCoord struct { type MockRootCoord struct {
types.RootCoord types.RootCoord
@ -371,6 +399,43 @@ func Test_NewServer(t *testing.T) {
assert.Equal(t, true, ret.IsHealthy) assert.Equal(t, true, ret.IsHealthy)
}) })
t.Run("CreateResourceGroup", func(t *testing.T) {
resp, err := server.CreateResourceGroup(ctx, nil)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, resp.ErrorCode)
})
t.Run("DropResourceGroup", func(t *testing.T) {
resp, err := server.DropResourceGroup(ctx, nil)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, resp.ErrorCode)
})
t.Run("TransferNode", func(t *testing.T) {
resp, err := server.TransferNode(ctx, nil)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, resp.ErrorCode)
})
t.Run("TransferReplica", func(t *testing.T) {
resp, err := server.TransferReplica(ctx, nil)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, resp.ErrorCode)
})
t.Run("ListResourceGroups", func(t *testing.T) {
req := &milvuspb.ListResourceGroupsRequest{}
resp, err := server.ListResourceGroups(ctx, req)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
})
t.Run("DescribeResourceGroup", func(t *testing.T) {
resp, err := server.DescribeResourceGroup(ctx, nil)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, resp.Status.ErrorCode)
})
err = server.Stop() err = server.Stop()
assert.Nil(t, err) assert.Nil(t, err)
} }

View File

@ -159,4 +159,7 @@ type QueryCoordCatalog interface {
ReleasePartition(collection int64, partitions ...int64) error ReleasePartition(collection int64, partitions ...int64) error
ReleaseReplicas(collectionID int64) error ReleaseReplicas(collectionID int64) error
ReleaseReplica(collection, replica int64) error ReleaseReplica(collection, replica int64) error
SaveResourceGroup(rgs ...*querypb.ResourceGroup) error
RemoveResourceGroup(rgName string) error
GetResourceGroups() ([]*querypb.ResourceGroup, error)
} }
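The three new catalog methods give QueryCoord a persistence contract for resource groups. As a rough sketch of that contract only (the real catalog in this PR persists through the metastore KV layer; the package, type name, and in-memory storage below are assumptions), a minimal implementation could look like:

package main

import (
    "fmt"
    "sync"

    "github.com/milvus-io/milvus/internal/proto/querypb"
)

// memResourceGroupCatalog is a hypothetical in-memory stand-in that covers only
// the three new resource-group methods of QueryCoordCatalog.
type memResourceGroupCatalog struct {
    mu  sync.Mutex
    rgs map[string]*querypb.ResourceGroup
}

// SaveResourceGroup upserts one or more resource groups keyed by name.
func (c *memResourceGroupCatalog) SaveResourceGroup(rgs ...*querypb.ResourceGroup) error {
    c.mu.Lock()
    defer c.mu.Unlock()
    for _, rg := range rgs {
        c.rgs[rg.GetName()] = rg
    }
    return nil
}

// RemoveResourceGroup deletes a resource group by name; unknown names are a no-op.
func (c *memResourceGroupCatalog) RemoveResourceGroup(rgName string) error {
    c.mu.Lock()
    defer c.mu.Unlock()
    delete(c.rgs, rgName)
    return nil
}

// GetResourceGroups returns every stored resource group.
func (c *memResourceGroupCatalog) GetResourceGroups() ([]*querypb.ResourceGroup, error) {
    c.mu.Lock()
    defer c.mu.Unlock()
    out := make([]*querypb.ResourceGroup, 0, len(c.rgs))
    for _, rg := range c.rgs {
        out = append(out, rg)
    }
    return out, nil
}

func main() {
    c := &memResourceGroupCatalog{rgs: make(map[string]*querypb.ResourceGroup)}
    _ = c.SaveResourceGroup(&querypb.ResourceGroup{Name: "rg1", Capacity: 1})
    groups, _ := c.GetResourceGroups()
    fmt.Println(len(groups))
}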

View File

@ -36,6 +36,13 @@ service QueryCoord {
rpc GetShardLeaders(GetShardLeadersRequest) returns (GetShardLeadersResponse) {} rpc GetShardLeaders(GetShardLeadersRequest) returns (GetShardLeadersResponse) {}
rpc CheckHealth(milvus.CheckHealthRequest) returns (milvus.CheckHealthResponse) {} rpc CheckHealth(milvus.CheckHealthRequest) returns (milvus.CheckHealthResponse) {}
rpc CreateResourceGroup(milvus.CreateResourceGroupRequest) returns (common.Status) {}
rpc DropResourceGroup(milvus.DropResourceGroupRequest) returns (common.Status) {}
rpc TransferNode(milvus.TransferNodeRequest) returns (common.Status) {}
rpc TransferReplica(TransferReplicaRequest) returns (common.Status) {}
rpc ListResourceGroups(milvus.ListResourceGroupsRequest) returns (milvus.ListResourceGroupsResponse) {}
rpc DescribeResourceGroup(DescribeResourceGroupRequest) returns (DescribeResourceGroupResponse) {}
} }
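The new RPCs surface in the generated querypb.QueryCoordClient, so the coordinator can be exercised directly over gRPC as well as through the proxy. A minimal sketch assuming a locally reachable QueryCoord; the address, dial options, and error handling are illustrative, and field names follow the generated go-api bindings:

package main

import (
    "context"
    "log"

    "google.golang.org/grpc"
    "google.golang.org/grpc/credentials/insecure"

    "github.com/milvus-io/milvus-proto/go-api/milvuspb"
    "github.com/milvus-io/milvus/internal/proto/querypb"
)

func main() {
    // Dial QueryCoord directly; in a real deployment these RPCs arrive via the proxy.
    conn, err := grpc.Dial("localhost:19531", grpc.WithTransportCredentials(insecure.NewCredentials()))
    if err != nil {
        log.Fatal(err)
    }
    defer conn.Close()

    client := querypb.NewQueryCoordClient(conn)
    ctx := context.Background()

    // Create a resource group, then list what exists.
    if _, err := client.CreateResourceGroup(ctx, &milvuspb.CreateResourceGroupRequest{
        ResourceGroup: "rg1",
    }); err != nil {
        log.Fatal(err)
    }
    resp, err := client.ListResourceGroups(ctx, &milvuspb.ListResourceGroupsRequest{})
    if err != nil {
        log.Fatal(err)
    }
    log.Println("resource groups:", resp.GetResourceGroups())
}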
service QueryNode { service QueryNode {
@ -101,6 +108,8 @@ message LoadCollectionRequest {
// fieldID -> indexID // fieldID -> indexID
map<int64, int64> field_indexID = 6; map<int64, int64> field_indexID = 6;
bool refresh = 7; bool refresh = 7;
// resource group names
repeated string resource_groups = 8;
} }
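With the repeated resource_groups field, a load request can pin a collection's replicas to specific resource groups. A hedged sketch of constructing such a request through the generated bindings (the collection ID and group names are placeholders; real callers also fill Base and the other fields of the message):

package main

import (
    "fmt"

    "github.com/milvus-io/milvus/internal/proto/querypb"
)

func main() {
    // Hypothetical load request pinned to two resource groups.
    req := &querypb.LoadCollectionRequest{
        CollectionID:   1001, // placeholder collection ID
        ResourceGroups: []string{"rg1", "rg2"},
    }
    fmt.Println(req.GetResourceGroups())
}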
message ReleaseCollectionRequest { message ReleaseCollectionRequest {
@ -128,6 +137,8 @@ message LoadPartitionsRequest {
// fieldID -> indexID // fieldID -> indexID
map<int64, int64> field_indexID = 7; map<int64, int64> field_indexID = 7;
bool refresh = 8; bool refresh = 8;
// resource group names
repeated string resource_groups = 9;
} }
message ReleasePartitionsRequest { message ReleasePartitionsRequest {
@ -488,6 +499,7 @@ message Replica {
int64 ID = 1; int64 ID = 1;
int64 collectionID = 2; int64 collectionID = 2;
repeated int64 nodes = 3; repeated int64 nodes = 3;
string resource_group = 4;
} }
enum SyncType { enum SyncType {
@ -510,3 +522,39 @@ message SyncDistributionRequest {
repeated SyncAction actions = 4; repeated SyncAction actions = 4;
} }
message ResourceGroup {
string name = 1;
int32 capacity = 2;
repeated int64 nodes = 3;
}
// transfer `num_replica` replicas of `collectionID` from `source_resource_group` to `target_resource_group`
message TransferReplicaRequest {
common.MsgBase base = 1;
string source_resource_group = 2;
string target_resource_group = 3;
int64 collectionID = 4;
int64 num_replica = 5;
}
message DescribeResourceGroupRequest {
common.MsgBase base = 1;
string resource_group = 2;
}
message DescribeResourceGroupResponse {
common.Status status = 1;
ResourceGroupInfo resource_group = 2;
}
message ResourceGroupInfo {
string name = 1;
int32 capacity = 2;
int32 num_available_node = 3;
// collection id -> loaded replica num
map<int64, int32> num_loaded_replica = 4;
// collection id -> number of nodes in other resource groups accessed by this resource group's replicas
map<int64, int32> num_outgoing_node = 5;
// collection id -> number of this resource group's nodes accessed by replicas in other resource groups
map<int64, int32> num_incoming_node = 6;
}
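DescribeResourceGroup returns a ResourceGroupInfo whose maps are keyed by collection ID. A hedged sketch of consuming that response through the generated getters; the helper and the hand-built response are illustrative only:

package main

import (
    "fmt"

    "github.com/milvus-io/milvus/internal/proto/querypb"
)

// printResourceGroup is a hypothetical helper that dumps the per-collection
// counters carried in a DescribeResourceGroupResponse.
func printResourceGroup(resp *querypb.DescribeResourceGroupResponse) {
    info := resp.GetResourceGroup()
    fmt.Printf("name=%s capacity=%d available_nodes=%d\n",
        info.GetName(), info.GetCapacity(), info.GetNumAvailableNode())
    for collectionID, replicas := range info.GetNumLoadedReplica() {
        fmt.Printf("  collection %d: %d loaded replicas, %d outgoing nodes, %d incoming nodes\n",
            collectionID, replicas,
            info.GetNumOutgoingNode()[collectionID],
            info.GetNumIncomingNode()[collectionID])
    }
}

func main() {
    // Stand-alone demonstration with a hand-built response.
    printResourceGroup(&querypb.DescribeResourceGroupResponse{
        ResourceGroup: &querypb.ResourceGroupInfo{
            Name:             "rg1",
            Capacity:         2,
            NumAvailableNode: 2,
            NumLoadedReplica: map[int64]int32{1001: 1},
        },
    })
}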

File diff suppressed because it is too large

View File

@ -3452,6 +3452,10 @@ func (node *Proxy) GetReplicas(ctx context.Context, req *milvuspb.GetReplicasReq
commonpbutil.WithSourceID(paramtable.GetNodeID()), commonpbutil.WithSourceID(paramtable.GetNodeID()),
) )
if req.GetCollectionName() != "" {
req.CollectionID, _ = globalMetaCache.GetCollectionID(ctx, req.GetCollectionName())
}
resp, err := node.queryCoord.GetReplicas(ctx, req) resp, err := node.queryCoord.GetReplicas(ctx, req)
if err != nil { if err != nil {
log.Error("Failed to get replicas from Query Coordinator", log.Error("Failed to get replicas from Query Coordinator",
@ -3758,7 +3762,6 @@ func (node *Proxy) UpdateCredentialCache(ctx context.Context, request *proxypb.U
}, nil }, nil
} }
//
func (node *Proxy) CreateCredential(ctx context.Context, req *milvuspb.CreateCredentialRequest) (*commonpb.Status, error) { func (node *Proxy) CreateCredential(ctx context.Context, req *milvuspb.CreateCredentialRequest) (*commonpb.Status, error) {
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-CreateCredential") ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-CreateCredential")
defer sp.End() defer sp.End()
@ -3823,7 +3826,6 @@ func (node *Proxy) CreateCredential(ctx context.Context, req *milvuspb.CreateCre
return result, err return result, err
} }
//
func (node *Proxy) UpdateCredential(ctx context.Context, req *milvuspb.UpdateCredentialRequest) (*commonpb.Status, error) { func (node *Proxy) UpdateCredential(ctx context.Context, req *milvuspb.UpdateCredentialRequest) (*commonpb.Status, error) {
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-UpdateCredential") ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-UpdateCredential")
defer sp.End() defer sp.End()
@ -3897,7 +3899,6 @@ func (node *Proxy) UpdateCredential(ctx context.Context, req *milvuspb.UpdateCre
return result, err return result, err
} }
//
func (node *Proxy) DeleteCredential(ctx context.Context, req *milvuspb.DeleteCredentialRequest) (*commonpb.Status, error) { func (node *Proxy) DeleteCredential(ctx context.Context, req *milvuspb.DeleteCredentialRequest) (*commonpb.Status, error) {
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-DeleteCredential") ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-DeleteCredential")
defer sp.End() defer sp.End()
@ -4449,42 +4450,391 @@ func (node *Proxy) RenameCollection(ctx context.Context, req *milvuspb.RenameCol
} }
func (node *Proxy) CreateResourceGroup(ctx context.Context, request *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) { func (node *Proxy) CreateResourceGroup(ctx context.Context, request *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) {
if !node.checkHealthy() {
return unhealthyStatus(), nil
}
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-CreateResourceGroup")
defer sp.End()
method := "CreateResourceGroup"
tr := timerecord.NewTimeRecorder(method)
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.TotalLabel).Inc()
t := &CreateResourceGroupTask{
ctx: ctx,
Condition: NewTaskCondition(ctx),
CreateResourceGroupRequest: request,
queryCoord: node.queryCoord,
}
log := log.Ctx(ctx).With(
zap.String("role", typeutil.ProxyRole),
)
log.Debug("CreateResourceGroup received")
if err := node.sched.ddQueue.Enqueue(t); err != nil {
log.Warn("CreateResourceGroup failed to enqueue",
zap.Error(err))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.AbandonLabel).Inc()
return &commonpb.Status{ return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success, ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, nil }, nil
}
log.Debug("CreateResourceGroup enqueued",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
if err := t.WaitToFinish(); err != nil {
log.Warn("CreateResourceGroup failed to WaitToFinish",
zap.Error(err),
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.FailLabel).Inc()
return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, nil
}
log.Debug("CreateResourceGroup done",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.SuccessLabel).Inc()
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
return t.result, nil
} }
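CreateResourceGroupTask and the analogous task types used below are added elsewhere in this PR; judging from the fields populated above, each one forwards its request to QueryCoord once the scheduler runs it. A hedged outline of that shape, not the actual task code, shown without the surrounding package plumbing:

// Hypothetical outline of the task's execution step: forward the request to
// QueryCoord and keep the returned status for the caller (t.result above).
func (t *CreateResourceGroupTask) Execute(ctx context.Context) error {
    var err error
    t.result, err = t.queryCoord.CreateResourceGroup(ctx, t.CreateResourceGroupRequest)
    return err
}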
func (node *Proxy) DropResourceGroup(ctx context.Context, request *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) { func (node *Proxy) DropResourceGroup(ctx context.Context, request *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) {
if !node.checkHealthy() {
return unhealthyStatus(), nil
}
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-DropResourceGroup")
defer sp.End()
method := "DropResourceGroup"
tr := timerecord.NewTimeRecorder(method)
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.TotalLabel).Inc()
t := &DropResourceGroupTask{
ctx: ctx,
Condition: NewTaskCondition(ctx),
DropResourceGroupRequest: request,
queryCoord: node.queryCoord,
}
log := log.Ctx(ctx).With(
zap.String("role", typeutil.ProxyRole),
)
log.Debug("DropResourceGroup received")
if err := node.sched.ddQueue.Enqueue(t); err != nil {
log.Warn("DropResourceGroup failed to enqueue",
zap.Error(err))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.AbandonLabel).Inc()
return &commonpb.Status{ return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success, ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, nil }, nil
}
log.Debug("DropResourceGroup enqueued",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
if err := t.WaitToFinish(); err != nil {
log.Warn("DropResourceGroup failed to WaitToFinish",
zap.Error(err),
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.FailLabel).Inc()
return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, nil
}
log.Debug("DropResourceGroup done",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.SuccessLabel).Inc()
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
return t.result, nil
} }
func (node *Proxy) TransferNode(ctx context.Context, request *milvuspb.TransferNodeRequest) (*commonpb.Status, error) { func (node *Proxy) TransferNode(ctx context.Context, request *milvuspb.TransferNodeRequest) (*commonpb.Status, error) {
if !node.checkHealthy() {
return unhealthyStatus(), nil
}
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-TransferNode")
defer sp.End()
method := "TransferNode"
tr := timerecord.NewTimeRecorder(method)
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.TotalLabel).Inc()
t := &TransferNodeTask{
ctx: ctx,
Condition: NewTaskCondition(ctx),
TransferNodeRequest: request,
queryCoord: node.queryCoord,
}
log := log.Ctx(ctx).With(
zap.String("role", typeutil.ProxyRole),
)
log.Debug("TransferNode received")
if err := node.sched.ddQueue.Enqueue(t); err != nil {
log.Warn("TransferNode failed to enqueue",
zap.Error(err))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.AbandonLabel).Inc()
return &commonpb.Status{ return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success, ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, nil }, nil
}
log.Debug("TransferNode enqueued",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
if err := t.WaitToFinish(); err != nil {
log.Warn("TransferNode failed to WaitToFinish",
zap.Error(err),
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.FailLabel).Inc()
return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, nil
}
log.Debug("TransferNode done",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.SuccessLabel).Inc()
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
return t.result, nil
} }
func (node *Proxy) TransferReplica(ctx context.Context, request *milvuspb.TransferReplicaRequest) (*commonpb.Status, error) { func (node *Proxy) TransferReplica(ctx context.Context, request *milvuspb.TransferReplicaRequest) (*commonpb.Status, error) {
if !node.checkHealthy() {
return unhealthyStatus(), nil
}
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-TransferReplica")
defer sp.End()
method := "TransferReplica"
tr := timerecord.NewTimeRecorder(method)
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.TotalLabel).Inc()
t := &TransferReplicaTask{
ctx: ctx,
Condition: NewTaskCondition(ctx),
TransferReplicaRequest: request,
queryCoord: node.queryCoord,
}
log := log.Ctx(ctx).With(
zap.String("role", typeutil.ProxyRole),
)
log.Debug("TransferReplica received")
if err := node.sched.ddQueue.Enqueue(t); err != nil {
log.Warn("TransferReplica failed to enqueue",
zap.Error(err))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.AbandonLabel).Inc()
return &commonpb.Status{ return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success, ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, nil }, nil
}
log.Debug("TransferReplica enqueued",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
if err := t.WaitToFinish(); err != nil {
log.Warn("TransferReplica failed to WaitToFinish",
zap.Error(err),
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.FailLabel).Inc()
return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, nil
}
log.Debug("TransferReplica done",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.SuccessLabel).Inc()
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
return t.result, nil
} }
func (node *Proxy) ListResourceGroup(ctx context.Context, request *milvuspb.ListResourceGroupRequest) (*milvuspb.ListResourceGroupResponse, error) { func (node *Proxy) ListResourceGroups(ctx context.Context, request *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
return &milvuspb.ListResourceGroupResponse{ if !node.checkHealthy() {
return &milvuspb.ListResourceGroupsResponse{
Status: unhealthyStatus(),
}, nil
}
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-ListResourceGroups")
defer sp.End()
method := "ListResourceGroups"
tr := timerecord.NewTimeRecorder(method)
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.TotalLabel).Inc()
t := &ListResourceGroupsTask{
ctx: ctx,
Condition: NewTaskCondition(ctx),
ListResourceGroupsRequest: request,
queryCoord: node.queryCoord,
}
log := log.Ctx(ctx).With(
zap.String("role", typeutil.ProxyRole),
)
log.Debug("ListResourceGroups received")
if err := node.sched.ddQueue.Enqueue(t); err != nil {
log.Warn("ListResourceGroups failed to enqueue",
zap.Error(err))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.AbandonLabel).Inc()
return &milvuspb.ListResourceGroupsResponse{
Status: &commonpb.Status{ Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success, ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, },
}, nil }, nil
}
log.Debug("ListResourceGroups enqueued",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
if err := t.WaitToFinish(); err != nil {
log.Warn("ListResourceGroups failed to WaitToFinish",
zap.Error(err),
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.FailLabel).Inc()
return &milvuspb.ListResourceGroupsResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
},
}, nil
}
log.Debug("ListResourceGroups done",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.SuccessLabel).Inc()
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
return t.result, nil
} }
func (node *Proxy) DescribeResourceGroup(ctx context.Context, request *milvuspb.DescribeResourceGroupRequest) (*milvuspb.DescribeResourceGroupResponse, error) { func (node *Proxy) DescribeResourceGroup(ctx context.Context, request *milvuspb.DescribeResourceGroupRequest) (*milvuspb.DescribeResourceGroupResponse, error) {
if !node.checkHealthy() {
return &milvuspb.DescribeResourceGroupResponse{
Status: unhealthyStatus(),
}, nil
}
ctx, sp := otel.Tracer(typeutil.ProxyRole).Start(ctx, "Proxy-DescribeResourceGroup")
defer sp.End()
method := "DescribeResourceGroup"
tr := timerecord.NewTimeRecorder(method)
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.TotalLabel).Inc()
t := &DescribeResourceGroupTask{
ctx: ctx,
Condition: NewTaskCondition(ctx),
DescribeResourceGroupRequest: request,
queryCoord: node.queryCoord,
}
log := log.Ctx(ctx).With(
zap.String("role", typeutil.ProxyRole),
)
log.Debug("DescribeResourceGroup received")
if err := node.sched.ddQueue.Enqueue(t); err != nil {
log.Warn("DescribeResourceGroup failed to enqueue",
zap.Error(err))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.AbandonLabel).Inc()
return &milvuspb.DescribeResourceGroupResponse{ return &milvuspb.DescribeResourceGroupResponse{
Status: &commonpb.Status{ Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success, ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
}, },
}, nil }, nil
}
log.Debug("DescribeResourceGroup enqueued",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
if err := t.WaitToFinish(); err != nil {
log.Warn("DescribeResourceGroup failed to WaitToFinish",
zap.Error(err),
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.FailLabel).Inc()
return &milvuspb.DescribeResourceGroupResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_UnexpectedError,
Reason: err.Error(),
},
}, nil
}
log.Debug("DescribeResourceGroup done",
zap.Uint64("BeginTS", t.BeginTs()),
zap.Uint64("EndTS", t.EndTs()))
metrics.ProxyFunctionCall.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method,
metrics.SuccessLabel).Inc()
metrics.ProxyReqLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), method).Observe(float64(tr.ElapseSpan().Milliseconds()))
return t.result, nil
} }

View File

@ -29,6 +29,7 @@ import (
"github.com/milvus-io/milvus/internal/log" "github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/mocks" "github.com/milvus-io/milvus/internal/mocks"
"github.com/milvus-io/milvus/internal/proto/proxypb" "github.com/milvus-io/milvus/internal/proto/proxypb"
"github.com/milvus-io/milvus/internal/util/dependency"
"github.com/milvus-io/milvus/internal/util/paramtable" "github.com/milvus-io/milvus/internal/util/paramtable"
"github.com/milvus-io/milvus/internal/util/sessionutil" "github.com/milvus-io/milvus/internal/util/sessionutil"
) )
@ -198,3 +199,75 @@ func TestProxyRenameCollection(t *testing.T) {
assert.Equal(t, commonpb.ErrorCode_Success, resp.GetErrorCode()) assert.Equal(t, commonpb.ErrorCode_Success, resp.GetErrorCode())
}) })
} }
func TestProxy_ResourceGroup(t *testing.T) {
factory := dependency.NewDefaultFactory(true)
ctx := context.Background()
node, err := NewProxy(ctx, factory)
assert.NoError(t, err)
node.multiRateLimiter = NewMultiRateLimiter()
node.stateCode.Store(commonpb.StateCode_Healthy)
qc := NewQueryCoordMock()
node.SetQueryCoordClient(qc)
tsoAllocatorIns := newMockTsoAllocator()
node.sched, err = newTaskScheduler(node.ctx, tsoAllocatorIns, node.factory)
assert.NoError(t, err)
node.sched.Start()
defer node.sched.Close()
rc := &MockRootCoordClientInterface{}
mgr := newShardClientMgr()
InitMetaCache(ctx, rc, qc, mgr)
t.Run("create resource group", func(t *testing.T) {
resp, err := node.CreateResourceGroup(ctx, &milvuspb.CreateResourceGroupRequest{
ResourceGroup: "rg",
})
assert.NoError(t, err)
assert.Equal(t, resp.ErrorCode, commonpb.ErrorCode_Success)
})
t.Run("drop resource group", func(t *testing.T) {
resp, err := node.DropResourceGroup(ctx, &milvuspb.DropResourceGroupRequest{
ResourceGroup: "rg",
})
assert.NoError(t, err)
assert.Equal(t, resp.ErrorCode, commonpb.ErrorCode_Success)
})
t.Run("transfer node", func(t *testing.T) {
resp, err := node.TransferNode(ctx, &milvuspb.TransferNodeRequest{
SourceResourceGroup: "rg1",
TargetResourceGroup: "rg2",
NumNode: 1,
})
assert.NoError(t, err)
assert.Equal(t, resp.ErrorCode, commonpb.ErrorCode_Success)
})
t.Run("transfer replica", func(t *testing.T) {
resp, err := node.TransferReplica(ctx, &milvuspb.TransferReplicaRequest{
SourceResourceGroup: "rg1",
TargetResourceGroup: "rg2",
NumReplica: 1,
CollectionName: "collection1",
})
assert.NoError(t, err)
assert.Equal(t, resp.ErrorCode, commonpb.ErrorCode_Success)
})
t.Run("list resource group", func(t *testing.T) {
resp, err := node.ListResourceGroups(ctx, &milvuspb.ListResourceGroupsRequest{})
assert.NoError(t, err)
assert.Equal(t, resp.Status.ErrorCode, commonpb.ErrorCode_Success)
})
t.Run("describe resource group", func(t *testing.T) {
resp, err := node.DescribeResourceGroup(ctx, &milvuspb.DescribeResourceGroupRequest{})
assert.NoError(t, err)
assert.Equal(t, resp.Status.ErrorCode, commonpb.ErrorCode_Success)
})
}

View File

@ -51,6 +51,8 @@ import (
type Cache interface { type Cache interface {
// GetCollectionID get collection's id by name. // GetCollectionID get collection's id by name.
GetCollectionID(ctx context.Context, collectionName string) (typeutil.UniqueID, error) GetCollectionID(ctx context.Context, collectionName string) (typeutil.UniqueID, error)
// GetCollectionName get collection's name by id.
GetCollectionName(ctx context.Context, collectionID int64) (string, error)
// GetCollectionInfo get collection's information by name, such as collection id, schema, and etc. // GetCollectionInfo get collection's information by name, such as collection id, schema, and etc.
GetCollectionInfo(ctx context.Context, collectionName string) (*collectionInfo, error) GetCollectionInfo(ctx context.Context, collectionName string) (*collectionInfo, error)
// GetPartitionID get partition's identifier of specific collection. // GetPartitionID get partition's identifier of specific collection.
@ -196,7 +198,7 @@ func (m *MetaCache) GetCollectionID(ctx context.Context, collectionName string)
metrics.ProxyCacheStatsCounter.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), "GeCollectionID", metrics.CacheMissLabel).Inc() metrics.ProxyCacheStatsCounter.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), "GeCollectionID", metrics.CacheMissLabel).Inc()
tr := timerecord.NewTimeRecorder("UpdateCache") tr := timerecord.NewTimeRecorder("UpdateCache")
m.mu.RUnlock() m.mu.RUnlock()
coll, err := m.describeCollection(ctx, collectionName) coll, err := m.describeCollection(ctx, collectionName, 0)
if err != nil { if err != nil {
return 0, err return 0, err
} }
@ -213,6 +215,37 @@ func (m *MetaCache) GetCollectionID(ctx context.Context, collectionName string)
return collInfo.collID, nil return collInfo.collID, nil
} }
// GetCollectionName returns the corresponding collection name for the provided collection id
func (m *MetaCache) GetCollectionName(ctx context.Context, collectionID int64) (string, error) {
m.mu.RLock()
var collInfo *collectionInfo
for _, coll := range m.collInfo {
if coll.collID == collectionID {
collInfo = coll
break
}
}
if collInfo == nil || !collInfo.isCollectionCached() {
metrics.ProxyCacheStatsCounter.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), "GeCollectionName", metrics.CacheMissLabel).Inc()
tr := timerecord.NewTimeRecorder("UpdateCache")
m.mu.RUnlock()
coll, err := m.describeCollection(ctx, "", collectionID)
if err != nil {
return "", err
}
m.mu.Lock()
defer m.mu.Unlock()
m.updateCollection(coll, coll.Schema.Name)
metrics.ProxyUpdateCacheLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(tr.ElapseSpan().Milliseconds()))
return coll.Schema.Name, nil
}
defer m.mu.RUnlock()
metrics.ProxyCacheStatsCounter.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), "GeCollectionName", metrics.CacheHitLabel).Inc()
return collInfo.schema.Name, nil
}
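GetCollectionName is the reverse of GetCollectionID: a linear scan over the cached entries, falling back to DescribeCollection by ID on a miss. A short hedged usage sketch from inside the proxy package (the helper name and error wrapping are illustrative):

// Hypothetical caller inside the proxy package: resolve an ID carried by a
// request back to its collection name so name-keyed code can reuse the cache.
func resolveCollectionName(ctx context.Context, collectionID int64) (string, error) {
    name, err := globalMetaCache.GetCollectionName(ctx, collectionID)
    if err != nil {
        return "", fmt.Errorf("resolve collection %d: %w", collectionID, err)
    }
    return name, nil
}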
// GetCollectionInfo returns the collection information related to provided collection name // GetCollectionInfo returns the collection information related to provided collection name
// If the information is not found, proxy will try to fetch information for other source (RootCoord for now) // If the information is not found, proxy will try to fetch information for other source (RootCoord for now)
func (m *MetaCache) GetCollectionInfo(ctx context.Context, collectionName string) (*collectionInfo, error) { func (m *MetaCache) GetCollectionInfo(ctx context.Context, collectionName string) (*collectionInfo, error) {
@ -224,7 +257,7 @@ func (m *MetaCache) GetCollectionInfo(ctx context.Context, collectionName string
if !ok || !collInfo.isCollectionCached() { if !ok || !collInfo.isCollectionCached() {
tr := timerecord.NewTimeRecorder("UpdateCache") tr := timerecord.NewTimeRecorder("UpdateCache")
metrics.ProxyCacheStatsCounter.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), "GetCollectionInfo", metrics.CacheMissLabel).Inc() metrics.ProxyCacheStatsCounter.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), "GetCollectionInfo", metrics.CacheMissLabel).Inc()
coll, err := m.describeCollection(ctx, collectionName) coll, err := m.describeCollection(ctx, collectionName, 0)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -281,7 +314,7 @@ func (m *MetaCache) GetCollectionSchema(ctx context.Context, collectionName stri
metrics.ProxyCacheStatsCounter.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), "GetCollectionSchema", metrics.CacheMissLabel).Inc() metrics.ProxyCacheStatsCounter.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10), "GetCollectionSchema", metrics.CacheMissLabel).Inc()
tr := timerecord.NewTimeRecorder("UpdateCache") tr := timerecord.NewTimeRecorder("UpdateCache")
m.mu.RUnlock() m.mu.RUnlock()
coll, err := m.describeCollection(ctx, collectionName) coll, err := m.describeCollection(ctx, collectionName, 0)
if err != nil { if err != nil {
log.Warn("Failed to load collection from rootcoord ", log.Warn("Failed to load collection from rootcoord ",
zap.String("collection name ", collectionName), zap.String("collection name ", collectionName),
@ -294,7 +327,7 @@ func (m *MetaCache) GetCollectionSchema(ctx context.Context, collectionName stri
collInfo = m.collInfo[collectionName] collInfo = m.collInfo[collectionName]
metrics.ProxyUpdateCacheLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(tr.ElapseSpan().Milliseconds())) metrics.ProxyUpdateCacheLatency.WithLabelValues(strconv.FormatInt(paramtable.GetNodeID(), 10)).Observe(float64(tr.ElapseSpan().Milliseconds()))
log.Debug("Reload collection from root coordinator ", log.Debug("Reload collection from root coordinator ",
zap.String("collection name ", collectionName), zap.String("collection name", collectionName),
zap.Any("time (milliseconds) take ", tr.ElapseSpan().Milliseconds())) zap.Any("time (milliseconds) take ", tr.ElapseSpan().Milliseconds()))
return collInfo.schema, nil return collInfo.schema, nil
} }
@ -424,12 +457,13 @@ func (m *MetaCache) GetPartitionInfo(ctx context.Context, collectionName string,
} }
// Get the collection information from rootcoord. // Get the collection information from rootcoord.
func (m *MetaCache) describeCollection(ctx context.Context, collectionName string) (*milvuspb.DescribeCollectionResponse, error) { func (m *MetaCache) describeCollection(ctx context.Context, collectionName string, collectionID int64) (*milvuspb.DescribeCollectionResponse, error) {
req := &milvuspb.DescribeCollectionRequest{ req := &milvuspb.DescribeCollectionRequest{
Base: commonpbutil.NewMsgBase( Base: commonpbutil.NewMsgBase(
commonpbutil.WithMsgType(commonpb.MsgType_DescribeCollection), commonpbutil.WithMsgType(commonpb.MsgType_DescribeCollection),
), ),
CollectionName: collectionName, CollectionName: collectionName,
CollectionID: collectionID,
} }
coll, err := m.rootCoord.DescribeCollection(ctx, req) coll, err := m.rootCoord.DescribeCollection(ctx, req)
if err != nil { if err != nil {

View File

@ -127,7 +127,7 @@ func (m *MockRootCoordClientInterface) DescribeCollection(ctx context.Context, i
return nil, errors.New("mocked error") return nil, errors.New("mocked error")
} }
m.IncAccessCount() m.IncAccessCount()
if in.CollectionName == "collection1" { if in.CollectionName == "collection1" || in.CollectionID == 1 {
return &milvuspb.DescribeCollectionResponse{ return &milvuspb.DescribeCollectionResponse{
Status: &commonpb.Status{ Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success, ErrorCode: commonpb.ErrorCode_Success,
@ -135,10 +135,11 @@ func (m *MockRootCoordClientInterface) DescribeCollection(ctx context.Context, i
CollectionID: typeutil.UniqueID(1), CollectionID: typeutil.UniqueID(1),
Schema: &schemapb.CollectionSchema{ Schema: &schemapb.CollectionSchema{
AutoID: true, AutoID: true,
Name: "collection1",
}, },
}, nil }, nil
} }
if in.CollectionName == "collection2" { if in.CollectionName == "collection2" || in.CollectionID == 2 {
return &milvuspb.DescribeCollectionResponse{ return &milvuspb.DescribeCollectionResponse{
Status: &commonpb.Status{ Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success, ErrorCode: commonpb.ErrorCode_Success,
@ -146,6 +147,7 @@ func (m *MockRootCoordClientInterface) DescribeCollection(ctx context.Context, i
CollectionID: typeutil.UniqueID(2), CollectionID: typeutil.UniqueID(2),
Schema: &schemapb.CollectionSchema{ Schema: &schemapb.CollectionSchema{
AutoID: true, AutoID: true,
Name: "collection2",
}, },
}, nil }, nil
} }
@ -230,7 +232,7 @@ func (m *MockQueryCoordClientInterface) ShowCollections(ctx context.Context, req
return rsp, nil return rsp, nil
} }
//Simulate the cache path and the // Simulate the cache path and the
func TestMetaCache_GetCollection(t *testing.T) { func TestMetaCache_GetCollection(t *testing.T) {
ctx := context.Background() ctx := context.Background()
rootCoord := &MockRootCoordClientInterface{} rootCoord := &MockRootCoordClientInterface{}
@ -251,6 +253,7 @@ func TestMetaCache_GetCollection(t *testing.T) {
assert.Equal(t, schema, &schemapb.CollectionSchema{ assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true, AutoID: true,
Fields: []*schemapb.FieldSchema{}, Fields: []*schemapb.FieldSchema{},
Name: "collection1",
}) })
id, err = globalMetaCache.GetCollectionID(ctx, "collection2") id, err = globalMetaCache.GetCollectionID(ctx, "collection2")
assert.Equal(t, rootCoord.GetAccessCount(), 2) assert.Equal(t, rootCoord.GetAccessCount(), 2)
@ -262,6 +265,7 @@ func TestMetaCache_GetCollection(t *testing.T) {
assert.Equal(t, schema, &schemapb.CollectionSchema{ assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true, AutoID: true,
Fields: []*schemapb.FieldSchema{}, Fields: []*schemapb.FieldSchema{},
Name: "collection2",
}) })
// test to get from cache, this should trigger root request // test to get from cache, this should trigger root request
@ -275,10 +279,61 @@ func TestMetaCache_GetCollection(t *testing.T) {
assert.Equal(t, schema, &schemapb.CollectionSchema{ assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true, AutoID: true,
Fields: []*schemapb.FieldSchema{}, Fields: []*schemapb.FieldSchema{},
Name: "collection1",
}) })
} }
func TestMetaCache_GetCollectionName(t *testing.T) {
ctx := context.Background()
rootCoord := &MockRootCoordClientInterface{}
queryCoord := &MockQueryCoordClientInterface{}
mgr := newShardClientMgr()
err := InitMetaCache(ctx, rootCoord, queryCoord, mgr)
assert.Nil(t, err)
collection, err := globalMetaCache.GetCollectionName(ctx, 1)
assert.Nil(t, err)
assert.Equal(t, collection, "collection1")
assert.Equal(t, rootCoord.GetAccessCount(), 1)
// the schema lookup should hit the cache and not access the remote root coord.
schema, err := globalMetaCache.GetCollectionSchema(ctx, "collection1")
assert.Equal(t, rootCoord.GetAccessCount(), 1)
assert.Nil(t, err)
assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true,
Fields: []*schemapb.FieldSchema{},
Name: "collection1",
})
collection, err = globalMetaCache.GetCollectionName(ctx, 1)
assert.Equal(t, rootCoord.GetAccessCount(), 1)
assert.Nil(t, err)
assert.Equal(t, collection, "collection1")
schema, err = globalMetaCache.GetCollectionSchema(ctx, "collection2")
assert.Equal(t, rootCoord.GetAccessCount(), 2)
assert.Nil(t, err)
assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true,
Fields: []*schemapb.FieldSchema{},
Name: "collection2",
})
// read from cache again; this should not trigger another root coord request
collection, err = globalMetaCache.GetCollectionName(ctx, 1)
assert.Equal(t, rootCoord.GetAccessCount(), 2)
assert.Nil(t, err)
assert.Equal(t, collection, "collection1")
schema, err = globalMetaCache.GetCollectionSchema(ctx, "collection1")
assert.Equal(t, rootCoord.GetAccessCount(), 2)
assert.Nil(t, err)
assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true,
Fields: []*schemapb.FieldSchema{},
Name: "collection1",
})
}
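
The access-count assertions above only hold if the meta cache indexes collections by ID as well as by name and reuses a single describeCollection result for both lookups. Below is a minimal sketch of the lookup this test exercises; the locking, field, and helper names are assumptions for illustration and are not taken from this commit.

// illustrative sketch only; not the implementation shipped in this commit
func (m *MetaCache) GetCollectionName(ctx context.Context, collectionID int64) (string, error) {
	// fast path: scan the name-keyed cache for a matching ID
	m.mu.RLock()
	for name, info := range m.collInfo {
		if info.collID == collectionID {
			m.mu.RUnlock()
			return name, nil
		}
	}
	m.mu.RUnlock()

	// slow path: describe the collection by ID (the only call that bumps the mock's access count)
	coll, err := m.describeCollection(ctx, "", collectionID)
	if err != nil {
		return "", err
	}
	// cache the result under its name so a later GetCollectionSchema call stays local
	// (updateCollection is assumed here for brevity)
	m.updateCollection(coll, coll.GetSchema().GetName())
	return coll.GetSchema().GetName(), nil
}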
func TestMetaCache_GetCollectionFailure(t *testing.T) { func TestMetaCache_GetCollectionFailure(t *testing.T) {
ctx := context.Background() ctx := context.Background()
rootCoord := &MockRootCoordClientInterface{} rootCoord := &MockRootCoordClientInterface{}
@ -299,6 +354,7 @@ func TestMetaCache_GetCollectionFailure(t *testing.T) {
assert.Equal(t, schema, &schemapb.CollectionSchema{ assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true, AutoID: true,
Fields: []*schemapb.FieldSchema{}, Fields: []*schemapb.FieldSchema{},
Name: "collection1",
}) })
rootCoord.Error = true rootCoord.Error = true
@ -307,6 +363,7 @@ func TestMetaCache_GetCollectionFailure(t *testing.T) {
assert.Equal(t, schema, &schemapb.CollectionSchema{ assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true, AutoID: true,
Fields: []*schemapb.FieldSchema{}, Fields: []*schemapb.FieldSchema{},
Name: "collection1",
}) })
} }
@ -367,6 +424,7 @@ func TestMetaCache_ConcurrentTest1(t *testing.T) {
assert.Equal(t, schema, &schemapb.CollectionSchema{ assert.Equal(t, schema, &schemapb.CollectionSchema{
AutoID: true, AutoID: true,
Fields: []*schemapb.FieldSchema{}, Fields: []*schemapb.FieldSchema{},
Name: "collection1",
}) })
time.Sleep(10 * time.Millisecond) time.Sleep(10 * time.Millisecond)
} }

View File

@ -8,6 +8,7 @@ import (
) )
type getCollectionIDFunc func(ctx context.Context, collectionName string) (typeutil.UniqueID, error) type getCollectionIDFunc func(ctx context.Context, collectionName string) (typeutil.UniqueID, error)
type getCollectionNameFunc func(ctx context.Context, collectionID int64) (string, error)
type getCollectionSchemaFunc func(ctx context.Context, collectionName string) (*schemapb.CollectionSchema, error) type getCollectionSchemaFunc func(ctx context.Context, collectionName string) (*schemapb.CollectionSchema, error)
type getCollectionInfoFunc func(ctx context.Context, collectionName string) (*collectionInfo, error) type getCollectionInfoFunc func(ctx context.Context, collectionName string) (*collectionInfo, error)
type getUserRoleFunc func(username string) []string type getUserRoleFunc func(username string) []string
@ -16,6 +17,7 @@ type getPartitionIDFunc func(ctx context.Context, collectionName string, partiti
type mockCache struct { type mockCache struct {
Cache Cache
getIDFunc getCollectionIDFunc getIDFunc getCollectionIDFunc
getNameFunc getCollectionNameFunc
getSchemaFunc getCollectionSchemaFunc getSchemaFunc getCollectionSchemaFunc
getInfoFunc getCollectionInfoFunc getInfoFunc getCollectionInfoFunc
getUserRoleFunc getUserRoleFunc getUserRoleFunc getUserRoleFunc
@ -29,6 +31,13 @@ func (m *mockCache) GetCollectionID(ctx context.Context, collectionName string)
return 0, nil return 0, nil
} }
func (m *mockCache) GetCollectionName(ctx context.Context, collectionID int64) (string, error) {
if m.getNameFunc != nil {
return m.getNameFunc(ctx, collectionID)
}
return "", nil
}
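
A hypothetical caller could stub the new hook like this (sketch only; the literal values are made up):

// sketch: stubbing the ID -> name lookup in a test
mc := &mockCache{
	getNameFunc: func(ctx context.Context, collectionID int64) (string, error) {
		return "collection1", nil // fixed name for any ID, purely for illustration
	},
}
name, err := mc.GetCollectionName(context.Background(), 1)
// name == "collection1", err == nil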
func (m *mockCache) GetCollectionSchema(ctx context.Context, collectionName string) (*schemapb.CollectionSchema, error) { func (m *mockCache) GetCollectionSchema(ctx context.Context, collectionName string) (*schemapb.CollectionSchema, error) {
if m.getSchemaFunc != nil { if m.getSchemaFunc != nil {
return m.getSchemaFunc(ctx, collectionName) return m.getSchemaFunc(ctx, collectionName)

View File

@ -22,6 +22,7 @@ import (
"sync" "sync"
"sync/atomic" "sync/atomic"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
"github.com/milvus-io/milvus/internal/util/funcutil" "github.com/milvus-io/milvus/internal/util/funcutil"
"github.com/milvus-io/milvus/internal/util/uniquegenerator" "github.com/milvus-io/milvus/internal/util/uniquegenerator"
@ -423,6 +424,60 @@ func (coord *QueryCoordMock) GetShardLeaders(ctx context.Context, req *querypb.G
}, nil }, nil
} }
func (coord *QueryCoordMock) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) {
return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
Reason: "",
}, nil
}
func (coord *QueryCoordMock) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) {
return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
Reason: "",
}, nil
}
func (coord *QueryCoordMock) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error) {
return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
Reason: "",
}, nil
}
func (coord *QueryCoordMock) TransferReplica(ctx context.Context, req *querypb.TransferReplicaRequest) (*commonpb.Status, error) {
return &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
Reason: "",
}, nil
}
func (coord *QueryCoordMock) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
return &milvuspb.ListResourceGroupsResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
Reason: "",
},
ResourceGroups: []string{meta.DefaultResourceGroupName, "rg"},
}, nil
}
func (coord *QueryCoordMock) DescribeResourceGroup(ctx context.Context, req *querypb.DescribeResourceGroupRequest) (*querypb.DescribeResourceGroupResponse, error) {
return &querypb.DescribeResourceGroupResponse{
Status: &commonpb.Status{
ErrorCode: commonpb.ErrorCode_Success,
Reason: "",
},
ResourceGroup: &querypb.ResourceGroupInfo{
Name: "rg",
Capacity: 2,
NumAvailableNode: 1,
NumOutgoingNode: map[int64]int32{1: 1},
NumIncomingNode: map[int64]int32{2: 2},
},
}, nil
}
func NewQueryCoordMock(opts ...QueryCoordMockOption) *QueryCoordMock { func NewQueryCoordMock(opts ...QueryCoordMockOption) *QueryCoordMock {
coord := &QueryCoordMock{ coord := &QueryCoordMock{
nodeID: UniqueID(uniquegenerator.GetUniqueIntGeneratorIns().GetInt()), nodeID: UniqueID(uniquegenerator.GetUniqueIntGeneratorIns().GetInt()),

View File

@ -39,6 +39,7 @@ import (
"github.com/milvus-io/milvus/internal/util/commonpbutil" "github.com/milvus-io/milvus/internal/util/commonpbutil"
"github.com/milvus-io/milvus/internal/util/paramtable" "github.com/milvus-io/milvus/internal/util/paramtable"
"github.com/milvus-io/milvus/internal/util/typeutil" "github.com/milvus-io/milvus/internal/util/typeutil"
"github.com/samber/lo"
) )
const ( const (
@ -72,6 +73,12 @@ const (
AlterAliasTaskName = "AlterAliasTask" AlterAliasTaskName = "AlterAliasTask"
AlterCollectionTaskName = "AlterCollectionTask" AlterCollectionTaskName = "AlterCollectionTask"
UpsertTaskName = "UpsertTask" UpsertTaskName = "UpsertTask"
CreateResourceGroupTaskName = "CreateResourceGroupTask"
DropResourceGroupTaskName = "DropResourceGroupTask"
TransferNodeTaskName = "TransferNodeTask"
TransferReplicaTaskName = "TransferReplicaTask"
ListResourceGroupsTaskName = "ListResourceGroupsTask"
DescribeResourceGroupTaskName = "DescribeResourceGroupTask"
// minFloat32 minimum float. // minFloat32 minimum float.
minFloat32 = -1 * float32(math.MaxFloat32) minFloat32 = -1 * float32(math.MaxFloat32)
@ -1916,3 +1923,412 @@ func (a *AlterAliasTask) Execute(ctx context.Context) error {
func (a *AlterAliasTask) PostExecute(ctx context.Context) error { func (a *AlterAliasTask) PostExecute(ctx context.Context) error {
return nil return nil
} }
type CreateResourceGroupTask struct {
Condition
*milvuspb.CreateResourceGroupRequest
ctx context.Context
queryCoord types.QueryCoord
result *commonpb.Status
}
func (t *CreateResourceGroupTask) TraceCtx() context.Context {
return t.ctx
}
func (t *CreateResourceGroupTask) ID() UniqueID {
return t.Base.MsgID
}
func (t *CreateResourceGroupTask) SetID(uid UniqueID) {
t.Base.MsgID = uid
}
func (t *CreateResourceGroupTask) Name() string {
return CreateResourceGroupTaskName
}
func (t *CreateResourceGroupTask) Type() commonpb.MsgType {
return t.Base.MsgType
}
func (t *CreateResourceGroupTask) BeginTs() Timestamp {
return t.Base.Timestamp
}
func (t *CreateResourceGroupTask) EndTs() Timestamp {
return t.Base.Timestamp
}
func (t *CreateResourceGroupTask) SetTs(ts Timestamp) {
t.Base.Timestamp = ts
}
func (t *CreateResourceGroupTask) OnEnqueue() error {
t.Base = commonpbutil.NewMsgBase()
return nil
}
func (t *CreateResourceGroupTask) PreExecute(ctx context.Context) error {
t.Base.MsgType = commonpb.MsgType_CreateResourceGroup
t.Base.SourceID = paramtable.GetNodeID()
return nil
}
func (t *CreateResourceGroupTask) Execute(ctx context.Context) error {
var err error
t.result, err = t.queryCoord.CreateResourceGroup(ctx, t.CreateResourceGroupRequest)
return err
}
func (t *CreateResourceGroupTask) PostExecute(ctx context.Context) error {
return nil
}
type DropResourceGroupTask struct {
Condition
*milvuspb.DropResourceGroupRequest
ctx context.Context
queryCoord types.QueryCoord
result *commonpb.Status
}
func (t *DropResourceGroupTask) TraceCtx() context.Context {
return t.ctx
}
func (t *DropResourceGroupTask) ID() UniqueID {
return t.Base.MsgID
}
func (t *DropResourceGroupTask) SetID(uid UniqueID) {
t.Base.MsgID = uid
}
func (t *DropResourceGroupTask) Name() string {
return DropResourceGroupTaskName
}
func (t *DropResourceGroupTask) Type() commonpb.MsgType {
return t.Base.MsgType
}
func (t *DropResourceGroupTask) BeginTs() Timestamp {
return t.Base.Timestamp
}
func (t *DropResourceGroupTask) EndTs() Timestamp {
return t.Base.Timestamp
}
func (t *DropResourceGroupTask) SetTs(ts Timestamp) {
t.Base.Timestamp = ts
}
func (t *DropResourceGroupTask) OnEnqueue() error {
t.Base = commonpbutil.NewMsgBase()
return nil
}
func (t *DropResourceGroupTask) PreExecute(ctx context.Context) error {
t.Base.MsgType = commonpb.MsgType_DropResourceGroup
t.Base.SourceID = paramtable.GetNodeID()
return nil
}
func (t *DropResourceGroupTask) Execute(ctx context.Context) error {
var err error
t.result, err = t.queryCoord.DropResourceGroup(ctx, t.DropResourceGroupRequest)
return err
}
func (t *DropResourceGroupTask) PostExecute(ctx context.Context) error {
return nil
}
type DescribeResourceGroupTask struct {
Condition
*milvuspb.DescribeResourceGroupRequest
ctx context.Context
queryCoord types.QueryCoord
result *milvuspb.DescribeResourceGroupResponse
}
func (t *DescribeResourceGroupTask) TraceCtx() context.Context {
return t.ctx
}
func (t *DescribeResourceGroupTask) ID() UniqueID {
return t.Base.MsgID
}
func (t *DescribeResourceGroupTask) SetID(uid UniqueID) {
t.Base.MsgID = uid
}
func (t *DescribeResourceGroupTask) Name() string {
return DescribeResourceGroupTaskName
}
func (t *DescribeResourceGroupTask) Type() commonpb.MsgType {
return t.Base.MsgType
}
func (t *DescribeResourceGroupTask) BeginTs() Timestamp {
return t.Base.Timestamp
}
func (t *DescribeResourceGroupTask) EndTs() Timestamp {
return t.Base.Timestamp
}
func (t *DescribeResourceGroupTask) SetTs(ts Timestamp) {
t.Base.Timestamp = ts
}
func (t *DescribeResourceGroupTask) OnEnqueue() error {
t.Base = commonpbutil.NewMsgBase()
return nil
}
func (t *DescribeResourceGroupTask) PreExecute(ctx context.Context) error {
t.Base.MsgType = commonpb.MsgType_DescribeResourceGroup
t.Base.SourceID = paramtable.GetNodeID()
return nil
}
func (t *DescribeResourceGroupTask) Execute(ctx context.Context) error {
resp, err := t.queryCoord.DescribeResourceGroup(ctx, &querypb.DescribeResourceGroupRequest{
ResourceGroup: t.ResourceGroup,
})
if err != nil {
return err
}
rgInfo := resp.GetResourceGroup()
getCollectionNameFunc := func(value int32, key int64) string {
name, err := globalMetaCache.GetCollectionName(ctx, key)
if err != nil {
// unreachable logic path
return "unavailable_collection"
}
return name
}
loadReplicas := lo.MapKeys(rgInfo.NumLoadedReplica, getCollectionNameFunc)
outgoingNodes := lo.MapKeys(rgInfo.NumOutgoingNode, getCollectionNameFunc)
incomingNodes := lo.MapKeys(rgInfo.NumIncomingNode, getCollectionNameFunc)
t.result = &milvuspb.DescribeResourceGroupResponse{
Status: resp.Status,
ResourceGroup: &milvuspb.ResourceGroup{
Name: rgInfo.GetName(),
Capacity: rgInfo.GetCapacity(),
NumAvailableNode: rgInfo.NumAvailableNode,
NumLoadedReplica: loadReplicas,
NumOutgoingNode: outgoingNodes,
NumIncomingNode: incomingNodes,
},
}
return err
}
func (t *DescribeResourceGroupTask) PostExecute(ctx context.Context) error {
return nil
}
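
Execute above relies on lo.MapKeys to rewrite the int64 collection-ID keys reported by QueryCoord into collection names for the user-facing response. A standalone illustration of that transform (the IDs and counts are made up, and the name lookup is replaced by a formatter):

// lo.MapKeys keeps the values and passes each (value, key) pair through the iteratee
numLoaded := map[int64]int32{100: 2, 101: 1} // collectionID -> loaded replica count
byName := lo.MapKeys(numLoaded, func(value int32, key int64) string {
	return fmt.Sprintf("collection_%d", key) // stand-in for globalMetaCache.GetCollectionName
})
// byName == map[string]int32{"collection_100": 2, "collection_101": 1}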
type TransferNodeTask struct {
Condition
*milvuspb.TransferNodeRequest
ctx context.Context
queryCoord types.QueryCoord
result *commonpb.Status
}
func (t *TransferNodeTask) TraceCtx() context.Context {
return t.ctx
}
func (t *TransferNodeTask) ID() UniqueID {
return t.Base.MsgID
}
func (t *TransferNodeTask) SetID(uid UniqueID) {
t.Base.MsgID = uid
}
func (t *TransferNodeTask) Name() string {
return TransferNodeTaskName
}
func (t *TransferNodeTask) Type() commonpb.MsgType {
return t.Base.MsgType
}
func (t *TransferNodeTask) BeginTs() Timestamp {
return t.Base.Timestamp
}
func (t *TransferNodeTask) EndTs() Timestamp {
return t.Base.Timestamp
}
func (t *TransferNodeTask) SetTs(ts Timestamp) {
t.Base.Timestamp = ts
}
func (t *TransferNodeTask) OnEnqueue() error {
t.Base = commonpbutil.NewMsgBase()
return nil
}
func (t *TransferNodeTask) PreExecute(ctx context.Context) error {
t.Base.MsgType = commonpb.MsgType_TransferNode
t.Base.SourceID = paramtable.GetNodeID()
return nil
}
func (t *TransferNodeTask) Execute(ctx context.Context) error {
var err error
t.result, err = t.queryCoord.TransferNode(ctx, t.TransferNodeRequest)
return err
}
func (t *TransferNodeTask) PostExecute(ctx context.Context) error {
return nil
}
type TransferReplicaTask struct {
Condition
*milvuspb.TransferReplicaRequest
ctx context.Context
queryCoord types.QueryCoord
result *commonpb.Status
}
func (t *TransferReplicaTask) TraceCtx() context.Context {
return t.ctx
}
func (t *TransferReplicaTask) ID() UniqueID {
return t.Base.MsgID
}
func (t *TransferReplicaTask) SetID(uid UniqueID) {
t.Base.MsgID = uid
}
func (t *TransferReplicaTask) Name() string {
return TransferReplicaTaskName
}
func (t *TransferReplicaTask) Type() commonpb.MsgType {
return t.Base.MsgType
}
func (t *TransferReplicaTask) BeginTs() Timestamp {
return t.Base.Timestamp
}
func (t *TransferReplicaTask) EndTs() Timestamp {
return t.Base.Timestamp
}
func (t *TransferReplicaTask) SetTs(ts Timestamp) {
t.Base.Timestamp = ts
}
func (t *TransferReplicaTask) OnEnqueue() error {
t.Base = commonpbutil.NewMsgBase()
return nil
}
func (t *TransferReplicaTask) PreExecute(ctx context.Context) error {
t.Base.MsgType = commonpb.MsgType_TransferReplica
t.Base.SourceID = paramtable.GetNodeID()
return nil
}
func (t *TransferReplicaTask) Execute(ctx context.Context) error {
var err error
collID, err := globalMetaCache.GetCollectionID(ctx, t.CollectionName)
if err != nil {
return err
}
t.result, err = t.queryCoord.TransferReplica(ctx, &querypb.TransferReplicaRequest{
SourceResourceGroup: t.SourceResourceGroup,
TargetResourceGroup: t.TargetResourceGroup,
CollectionID: collID,
NumReplica: t.NumReplica,
})
return err
}
func (t *TransferReplicaTask) PostExecute(ctx context.Context) error {
return nil
}
type ListResourceGroupsTask struct {
Condition
*milvuspb.ListResourceGroupsRequest
ctx context.Context
queryCoord types.QueryCoord
result *milvuspb.ListResourceGroupsResponse
}
func (t *ListResourceGroupsTask) TraceCtx() context.Context {
return t.ctx
}
func (t *ListResourceGroupsTask) ID() UniqueID {
return t.Base.MsgID
}
func (t *ListResourceGroupsTask) SetID(uid UniqueID) {
t.Base.MsgID = uid
}
func (t *ListResourceGroupsTask) Name() string {
return ListResourceGroupsTaskName
}
func (t *ListResourceGroupsTask) Type() commonpb.MsgType {
return t.Base.MsgType
}
func (t *ListResourceGroupsTask) BeginTs() Timestamp {
return t.Base.Timestamp
}
func (t *ListResourceGroupsTask) EndTs() Timestamp {
return t.Base.Timestamp
}
func (t *ListResourceGroupsTask) SetTs(ts Timestamp) {
t.Base.Timestamp = ts
}
func (t *ListResourceGroupsTask) OnEnqueue() error {
t.Base = commonpbutil.NewMsgBase()
return nil
}
func (t *ListResourceGroupsTask) PreExecute(ctx context.Context) error {
t.Base.MsgType = commonpb.MsgType_ListResourceGroups
t.Base.SourceID = paramtable.GetNodeID()
return nil
}
func (t *ListResourceGroupsTask) Execute(ctx context.Context) error {
var err error
t.result, err = t.queryCoord.ListResourceGroups(ctx, t.ListResourceGroupsRequest)
return err
}
func (t *ListResourceGroupsTask) PostExecute(ctx context.Context) error {
return nil
}
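
All six resource-group tasks above follow the same hook order; a compressed sketch of how a scheduler is expected to drive them is shown below. The real proxy scheduler also handles queueing, timestamp allocation, and result notification, so this is illustrative only.

// illustrative lifecycle only; not the proxy scheduler implementation
func runResourceGroupTask(ctx context.Context, t task) error {
	if err := t.OnEnqueue(); err != nil { // fills in a fresh MsgBase
		return err
	}
	if err := t.PreExecute(ctx); err != nil { // sets MsgType and SourceID
		return err
	}
	if err := t.Execute(ctx); err != nil { // forwards the request to QueryCoord
		return err
	}
	return t.PostExecute(ctx)
}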

View File

@ -28,6 +28,7 @@ import (
"time" "time"
"github.com/milvus-io/milvus/internal/proto/indexpb" "github.com/milvus-io/milvus/internal/proto/indexpb"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
"github.com/golang/protobuf/proto" "github.com/golang/protobuf/proto"
"github.com/milvus-io/milvus-proto/go-api/commonpb" "github.com/milvus-io/milvus-proto/go-api/commonpb"
@ -2538,3 +2539,248 @@ func Test_loadPartitionTask_Execute(t *testing.T) {
assert.Error(t, err) assert.Error(t, err)
}) })
} }
func TestCreateResourceGroupTask(t *testing.T) {
rc := NewRootCoordMock()
rc.Start()
defer rc.Stop()
qc := NewQueryCoordMock()
qc.Start()
defer qc.Stop()
ctx := context.Background()
mgr := newShardClientMgr()
InitMetaCache(ctx, rc, qc, mgr)
createRGReq := &milvuspb.CreateResourceGroupRequest{
Base: &commonpb.MsgBase{
MsgID: 1,
Timestamp: 2,
TargetID: 3,
},
ResourceGroup: "rg",
}
task := &CreateResourceGroupTask{
CreateResourceGroupRequest: createRGReq,
ctx: ctx,
queryCoord: qc,
}
task.PreExecute(ctx)
assert.Equal(t, commonpb.MsgType_CreateResourceGroup, task.Type())
assert.Equal(t, UniqueID(1), task.ID())
assert.Equal(t, Timestamp(2), task.BeginTs())
assert.Equal(t, Timestamp(2), task.EndTs())
assert.Equal(t, paramtable.GetNodeID(), task.Base.GetSourceID())
assert.Equal(t, UniqueID(3), task.Base.GetTargetID())
err := task.Execute(ctx)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, task.result.ErrorCode)
}
func TestDropResourceGroupTask(t *testing.T) {
rc := NewRootCoordMock()
rc.Start()
defer rc.Stop()
qc := NewQueryCoordMock()
qc.Start()
defer qc.Stop()
ctx := context.Background()
mgr := newShardClientMgr()
InitMetaCache(ctx, rc, qc, mgr)
dropRGReq := &milvuspb.DropResourceGroupRequest{
Base: &commonpb.MsgBase{
MsgID: 1,
Timestamp: 2,
TargetID: 3,
},
ResourceGroup: "rg",
}
task := &DropResourceGroupTask{
DropResourceGroupRequest: dropRGReq,
ctx: ctx,
queryCoord: qc,
}
task.PreExecute(ctx)
assert.Equal(t, commonpb.MsgType_DropResourceGroup, task.Type())
assert.Equal(t, UniqueID(1), task.ID())
assert.Equal(t, Timestamp(2), task.BeginTs())
assert.Equal(t, Timestamp(2), task.EndTs())
assert.Equal(t, paramtable.GetNodeID(), task.Base.GetSourceID())
assert.Equal(t, UniqueID(3), task.Base.GetTargetID())
err := task.Execute(ctx)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, task.result.ErrorCode)
}
func TestTransferNodeTask(t *testing.T) {
rc := NewRootCoordMock()
rc.Start()
defer rc.Stop()
qc := NewQueryCoordMock()
qc.Start()
defer qc.Stop()
ctx := context.Background()
mgr := newShardClientMgr()
InitMetaCache(ctx, rc, qc, mgr)
req := &milvuspb.TransferNodeRequest{
Base: &commonpb.MsgBase{
MsgID: 1,
Timestamp: 2,
TargetID: 3,
},
SourceResourceGroup: "rg1",
TargetResourceGroup: "rg2",
NumNode: 1,
}
task := &TransferNodeTask{
TransferNodeRequest: req,
ctx: ctx,
queryCoord: qc,
}
task.PreExecute(ctx)
assert.Equal(t, commonpb.MsgType_TransferNode, task.Type())
assert.Equal(t, UniqueID(1), task.ID())
assert.Equal(t, Timestamp(2), task.BeginTs())
assert.Equal(t, Timestamp(2), task.EndTs())
assert.Equal(t, paramtable.GetNodeID(), task.Base.GetSourceID())
assert.Equal(t, UniqueID(3), task.Base.GetTargetID())
err := task.Execute(ctx)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, task.result.ErrorCode)
}
func TestTransferReplicaTask(t *testing.T) {
rc := &MockRootCoordClientInterface{}
qc := NewQueryCoordMock()
qc.Start()
defer qc.Stop()
ctx := context.Background()
mgr := newShardClientMgr()
InitMetaCache(ctx, rc, qc, mgr)
// warm the meta cache so the collection ID lookup in Execute avoids a remote call to rc
globalMetaCache.GetCollectionSchema(context.Background(), "collection1")
req := &milvuspb.TransferReplicaRequest{
Base: &commonpb.MsgBase{
MsgID: 1,
Timestamp: 2,
TargetID: 3,
},
CollectionName: "collection1",
SourceResourceGroup: "rg1",
TargetResourceGroup: "rg2",
NumReplica: 1,
}
task := &TransferReplicaTask{
TransferReplicaRequest: req,
ctx: ctx,
queryCoord: qc,
}
task.PreExecute(ctx)
assert.Equal(t, commonpb.MsgType_TransferReplica, task.Type())
assert.Equal(t, UniqueID(1), task.ID())
assert.Equal(t, Timestamp(2), task.BeginTs())
assert.Equal(t, Timestamp(2), task.EndTs())
assert.Equal(t, paramtable.GetNodeID(), task.Base.GetSourceID())
assert.Equal(t, UniqueID(3), task.Base.GetTargetID())
err := task.Execute(ctx)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, task.result.ErrorCode)
}
func TestListResourceGroupsTask(t *testing.T) {
rc := &MockRootCoordClientInterface{}
qc := NewQueryCoordMock()
qc.Start()
defer qc.Stop()
ctx := context.Background()
mgr := newShardClientMgr()
InitMetaCache(ctx, rc, qc, mgr)
req := &milvuspb.ListResourceGroupsRequest{
Base: &commonpb.MsgBase{
MsgID: 1,
Timestamp: 2,
TargetID: 3,
},
}
task := &ListResourceGroupsTask{
ListResourceGroupsRequest: req,
ctx: ctx,
queryCoord: qc,
}
task.PreExecute(ctx)
assert.Equal(t, commonpb.MsgType_ListResourceGroups, task.Type())
assert.Equal(t, UniqueID(1), task.ID())
assert.Equal(t, Timestamp(2), task.BeginTs())
assert.Equal(t, Timestamp(2), task.EndTs())
assert.Equal(t, paramtable.GetNodeID(), task.Base.GetSourceID())
assert.Equal(t, UniqueID(3), task.Base.GetTargetID())
err := task.Execute(ctx)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, task.result.Status.ErrorCode)
groups := task.result.GetResourceGroups()
assert.Contains(t, groups, meta.DefaultResourceGroupName)
assert.Contains(t, groups, "rg")
}
func TestDescribeResourceGroupTask(t *testing.T) {
rc := &MockRootCoordClientInterface{}
qc := NewQueryCoordMock()
qc.Start()
defer qc.Stop()
ctx := context.Background()
mgr := newShardClientMgr()
InitMetaCache(ctx, rc, qc, mgr)
// warm the meta cache so the name lookups in Execute avoid remote calls to rc
globalMetaCache.GetCollectionSchema(context.Background(), "collection1")
globalMetaCache.GetCollectionSchema(context.Background(), "collection2")
req := &milvuspb.DescribeResourceGroupRequest{
Base: &commonpb.MsgBase{
MsgID: 1,
Timestamp: 2,
TargetID: 3,
},
ResourceGroup: "rg",
}
task := &DescribeResourceGroupTask{
DescribeResourceGroupRequest: req,
ctx: ctx,
queryCoord: qc,
}
task.PreExecute(ctx)
assert.Equal(t, commonpb.MsgType_DescribeResourceGroup, task.Type())
assert.Equal(t, UniqueID(1), task.ID())
assert.Equal(t, Timestamp(2), task.BeginTs())
assert.Equal(t, Timestamp(2), task.EndTs())
assert.Equal(t, paramtable.GetNodeID(), task.Base.GetSourceID())
assert.Equal(t, UniqueID(3), task.Base.GetTargetID())
err := task.Execute(ctx)
assert.Nil(t, err)
assert.Equal(t, commonpb.ErrorCode_Success, task.result.Status.ErrorCode)
groupInfo := task.result.GetResourceGroup()
outgoingNodeNum := groupInfo.GetNumOutgoingNode()
incomingNodeNum := groupInfo.GetNumIncomingNode()
assert.NotNil(t, outgoingNodeNum["collection1"])
assert.NotNil(t, incomingNodeNum["collection2"])
}

View File

@ -104,7 +104,7 @@ func (b *RowCountBasedBalancer) Balance() ([]SegmentAssignPlan, []ChannelAssignP
} }
func (b *RowCountBasedBalancer) balanceReplica(replica *meta.Replica) ([]SegmentAssignPlan, []ChannelAssignPlan) { func (b *RowCountBasedBalancer) balanceReplica(replica *meta.Replica) ([]SegmentAssignPlan, []ChannelAssignPlan) {
nodes := replica.Nodes.Collect() nodes := replica.GetNodes()
if len(nodes) == 0 { if len(nodes) == 0 {
return nil, nil return nil, nil
} }
@ -112,6 +112,8 @@ func (b *RowCountBasedBalancer) balanceReplica(replica *meta.Replica) ([]Segment
nodesSegments := make(map[int64][]*meta.Segment) nodesSegments := make(map[int64][]*meta.Segment)
stoppingNodesSegments := make(map[int64][]*meta.Segment) stoppingNodesSegments := make(map[int64][]*meta.Segment)
outboundNodes := b.meta.ResourceManager.CheckOutboundNodes(replica)
totalCnt := 0 totalCnt := 0
for _, nid := range nodes { for _, nid := range nodes {
segments := b.dist.SegmentDistManager.GetByCollectionAndNode(replica.GetCollectionID(), nid) segments := b.dist.SegmentDistManager.GetByCollectionAndNode(replica.GetCollectionID(), nid)
@ -125,6 +127,14 @@ func (b *RowCountBasedBalancer) balanceReplica(replica *meta.Replica) ([]Segment
continue continue
} else if isStopping { } else if isStopping {
stoppingNodesSegments[nid] = segments stoppingNodesSegments[nid] = segments
} else if outboundNodes.Contain(nid) {
// the node is stopping or has been moved to another resource group,
// so treat it like a stopping node and move all segments/channels off it
log.RatedInfo(10, "found outbound node, moving all segments/channels off it",
zap.Int64("collectionID", replica.GetCollectionID()),
zap.Int64("replicaID", replica.GetID()),
zap.Int64("node", nid),
)
stoppingNodesSegments[nid] = segments
} else { } else {
nodesSegments[nid] = segments nodesSegments[nid] = segments
} }
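
CheckOutboundNodes itself is not part of this hunk; conceptually it returns the replica members that no longer belong to the replica's resource group, so the balancer can drain them exactly like stopping nodes. A simplified, assumption-labeled sketch of that check:

// simplified sketch; the real ResourceManager.CheckOutboundNodes may differ
func checkOutboundNodes(rgNodes typeutil.UniqueSet, replicaNodes []int64) typeutil.UniqueSet {
	outbound := typeutil.NewUniqueSet()
	for _, node := range replicaNodes {
		// still referenced by the replica, but not owned by its resource group
		if !rgNodes.Contain(node) {
			outbound.Insert(node)
		}
	}
	return outbound
}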
@ -224,7 +234,7 @@ outer:
node.setPriority(node.getPriority() + int(s.GetNumOfRows())) node.setPriority(node.getPriority() + int(s.GetNumOfRows()))
queue.push(node) queue.push(node)
} }
return plans, b.getChannelPlan(replica, stoppingNodesSegments) return plans, b.getChannelPlan(replica, lo.Keys(nodesSegments), lo.Keys(stoppingNodesSegments))
} }
func (b *RowCountBasedBalancer) handleStoppingNodes(replica *meta.Replica, nodeSegments map[int64][]*meta.Segment) ([]SegmentAssignPlan, []ChannelAssignPlan) { func (b *RowCountBasedBalancer) handleStoppingNodes(replica *meta.Replica, nodeSegments map[int64][]*meta.Segment) ([]SegmentAssignPlan, []ChannelAssignPlan) {
@ -271,17 +281,11 @@ func (b *RowCountBasedBalancer) collectionStoppingSegments(stoppingNodesSegments
return segments, removeRowCnt return segments, removeRowCnt
} }
func (b *RowCountBasedBalancer) getChannelPlan(replica *meta.Replica, stoppingNodesSegments map[int64][]*meta.Segment) []ChannelAssignPlan { func (b *RowCountBasedBalancer) getChannelPlan(replica *meta.Replica, onlineNodes []int64, offlineNodes []int64) []ChannelAssignPlan {
// maybe it will have some strategies to balance the channel in the future
// but now, only balance the channel for the stopping nodes.
return b.getChannelPlanForStoppingNodes(replica, stoppingNodesSegments)
}
func (b *RowCountBasedBalancer) getChannelPlanForStoppingNodes(replica *meta.Replica, stoppingNodesSegments map[int64][]*meta.Segment) []ChannelAssignPlan {
channelPlans := make([]ChannelAssignPlan, 0) channelPlans := make([]ChannelAssignPlan, 0)
for nodeID := range stoppingNodesSegments { for _, nodeID := range offlineNodes {
dmChannels := b.dist.ChannelDistManager.GetByCollectionAndNode(replica.GetCollectionID(), nodeID) dmChannels := b.dist.ChannelDistManager.GetByCollectionAndNode(replica.GetCollectionID(), nodeID)
plans := b.AssignChannel(dmChannels, replica.Replica.GetNodes()) plans := b.AssignChannel(dmChannels, onlineNodes)
for i := range plans { for i := range plans {
plans[i].From = nodeID plans[i].From = nodeID
plans[i].ReplicaID = replica.ID plans[i].ReplicaID = replica.ID

View File

@ -62,11 +62,11 @@ func (suite *RowCountBasedBalancerTestSuite) SetupTest() {
store := meta.NewMetaStore(suite.kv) store := meta.NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator() idAllocator := RandomIncrementIDAllocator()
testMeta := meta.NewMeta(idAllocator, store) nodeManager := session.NewNodeManager()
testMeta := meta.NewMeta(idAllocator, store, nodeManager)
testTarget := meta.NewTargetManager(suite.broker, testMeta) testTarget := meta.NewTargetManager(suite.broker, testMeta)
distManager := meta.NewDistributionManager() distManager := meta.NewDistributionManager()
nodeManager := session.NewNodeManager()
suite.mockScheduler = task.NewMockScheduler(suite.T()) suite.mockScheduler = task.NewMockScheduler(suite.T())
suite.balancer = NewRowCountBasedBalancer(suite.mockScheduler, nodeManager, distManager, testMeta, testTarget) suite.balancer = NewRowCountBasedBalancer(suite.mockScheduler, nodeManager, distManager, testMeta, testTarget)
} }
@ -272,8 +272,10 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
for i := range c.nodes { for i := range c.nodes {
nodeInfo := session.NewNodeInfo(c.nodes[i], "127.0.0.1:0") nodeInfo := session.NewNodeInfo(c.nodes[i], "127.0.0.1:0")
nodeInfo.UpdateStats(session.WithSegmentCnt(c.segmentCnts[i])) nodeInfo.UpdateStats(session.WithSegmentCnt(c.segmentCnts[i]))
nodeInfo.UpdateStats(session.WithChannelCnt(len(c.distributionChannels[c.nodes[i]])))
nodeInfo.SetState(c.states[i]) nodeInfo.SetState(c.states[i])
suite.balancer.nodeManager.Add(nodeInfo) suite.balancer.nodeManager.Add(nodeInfo)
suite.balancer.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, c.nodes[i])
} }
segmentPlans, channelPlans := balancer.Balance() segmentPlans, channelPlans := balancer.Balance()
suite.ElementsMatch(c.expectChannelPlans, channelPlans) suite.ElementsMatch(c.expectChannelPlans, channelPlans)
@ -283,6 +285,111 @@ func (suite *RowCountBasedBalancerTestSuite) TestBalance() {
} }
func (suite *RowCountBasedBalancerTestSuite) TestBalanceOutboundNodes() {
cases := []struct {
name string
nodes []int64
notExistedNodes []int64
segmentCnts []int
states []session.State
shouldMock bool
distributions map[int64][]*meta.Segment
distributionChannels map[int64][]*meta.DmChannel
expectPlans []SegmentAssignPlan
expectChannelPlans []ChannelAssignPlan
}{
{
name: "balance outbound nodes",
nodes: []int64{1, 2, 3},
segmentCnts: []int{1, 2, 2},
states: []session.State{session.NodeStateNormal, session.NodeStateNormal, session.NodeStateNormal},
shouldMock: true,
distributions: map[int64][]*meta.Segment{
1: {{SegmentInfo: &datapb.SegmentInfo{ID: 1, CollectionID: 1, NumOfRows: 10}, Node: 1}},
2: {
{SegmentInfo: &datapb.SegmentInfo{ID: 2, CollectionID: 1, NumOfRows: 20}, Node: 2},
{SegmentInfo: &datapb.SegmentInfo{ID: 3, CollectionID: 1, NumOfRows: 30}, Node: 2},
},
3: {
{SegmentInfo: &datapb.SegmentInfo{ID: 4, CollectionID: 1, NumOfRows: 10}, Node: 3},
{SegmentInfo: &datapb.SegmentInfo{ID: 5, CollectionID: 1, NumOfRows: 10}, Node: 3},
},
},
distributionChannels: map[int64][]*meta.DmChannel{
2: {
{VchannelInfo: &datapb.VchannelInfo{CollectionID: 1, ChannelName: "v2"}, Node: 2},
},
3: {
{VchannelInfo: &datapb.VchannelInfo{CollectionID: 1, ChannelName: "v3"}, Node: 3},
},
},
expectPlans: []SegmentAssignPlan{
{Segment: &meta.Segment{SegmentInfo: &datapb.SegmentInfo{ID: 4, CollectionID: 1, NumOfRows: 10}, Node: 3}, From: 3, To: 1, ReplicaID: 1, Weight: weightHigh},
{Segment: &meta.Segment{SegmentInfo: &datapb.SegmentInfo{ID: 5, CollectionID: 1, NumOfRows: 10}, Node: 3}, From: 3, To: 1, ReplicaID: 1, Weight: weightHigh},
},
expectChannelPlans: []ChannelAssignPlan{
{Channel: &meta.DmChannel{VchannelInfo: &datapb.VchannelInfo{CollectionID: 1, ChannelName: "v3"}, Node: 3}, From: 3, To: 1, ReplicaID: 1, Weight: weightHigh},
},
},
}
suite.mockScheduler.Mock.On("GetNodeChannelDelta", mock.Anything).Return(0)
for _, c := range cases {
suite.Run(c.name, func() {
suite.SetupSuite()
defer suite.TearDownTest()
balancer := suite.balancer
collection := utils.CreateTestCollection(1, 1)
segments := []*datapb.SegmentBinlogs{
{
SegmentID: 1,
},
{
SegmentID: 2,
},
{
SegmentID: 3,
},
{
SegmentID: 4,
},
{
SegmentID: 5,
},
}
suite.broker.EXPECT().GetRecoveryInfo(mock.Anything, int64(1), int64(1)).Return(
nil, segments, nil)
balancer.targetMgr.UpdateCollectionNextTargetWithPartitions(int64(1), int64(1))
balancer.targetMgr.UpdateCollectionCurrentTarget(1, 1)
collection.LoadPercentage = 100
collection.Status = querypb.LoadStatus_Loaded
balancer.meta.CollectionManager.PutCollection(collection)
balancer.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, append(c.nodes, c.notExistedNodes...)))
for node, s := range c.distributions {
balancer.dist.SegmentDistManager.Update(node, s...)
}
for node, v := range c.distributionChannels {
balancer.dist.ChannelDistManager.Update(node, v...)
}
for i := range c.nodes {
nodeInfo := session.NewNodeInfo(c.nodes[i], "127.0.0.1:0")
nodeInfo.UpdateStats(session.WithSegmentCnt(c.segmentCnts[i]))
nodeInfo.UpdateStats(session.WithChannelCnt(len(c.distributionChannels[c.nodes[i]])))
nodeInfo.SetState(c.states[i])
suite.balancer.nodeManager.Add(nodeInfo)
}
// assign only nodes 1 and 2 to the default resource group, leaving node 3 outbound
err := balancer.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, 1)
suite.NoError(err)
err = balancer.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, 2)
suite.NoError(err)
segmentPlans, channelPlans := balancer.Balance()
suite.ElementsMatch(c.expectChannelPlans, channelPlans)
suite.ElementsMatch(c.expectPlans, segmentPlans)
})
}
}
func (suite *RowCountBasedBalancerTestSuite) TestBalanceOnLoadingCollection() { func (suite *RowCountBasedBalancerTestSuite) TestBalanceOnLoadingCollection() {
cases := []struct { cases := []struct {
name string name string

View File

@ -27,6 +27,7 @@ import (
"github.com/milvus-io/milvus/internal/querycoordv2/task" "github.com/milvus-io/milvus/internal/querycoordv2/task"
"github.com/milvus-io/milvus/internal/querycoordv2/utils" "github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/internal/util/typeutil" "github.com/milvus-io/milvus/internal/util/typeutil"
"github.com/samber/lo"
"go.uber.org/zap" "go.uber.org/zap"
) )
@ -134,7 +135,7 @@ func (c *ChannelChecker) getDmChannelDiff(targetMgr *meta.TargetManager,
func (c *ChannelChecker) getChannelDist(distMgr *meta.DistributionManager, replica *meta.Replica) []*meta.DmChannel { func (c *ChannelChecker) getChannelDist(distMgr *meta.DistributionManager, replica *meta.Replica) []*meta.DmChannel {
dist := make([]*meta.DmChannel, 0) dist := make([]*meta.DmChannel, 0)
for _, nodeID := range replica.Nodes.Collect() { for _, nodeID := range replica.GetNodes() {
dist = append(dist, distMgr.ChannelDistManager.GetByCollectionAndNode(replica.GetCollectionID(), nodeID)...) dist = append(dist, distMgr.ChannelDistManager.GetByCollectionAndNode(replica.GetCollectionID(), nodeID)...)
} }
return dist return dist
@ -170,7 +171,11 @@ func (c *ChannelChecker) findRepeatedChannels(distMgr *meta.DistributionManager,
} }
func (c *ChannelChecker) createChannelLoadTask(ctx context.Context, channels []*meta.DmChannel, replica *meta.Replica) []task.Task { func (c *ChannelChecker) createChannelLoadTask(ctx context.Context, channels []*meta.DmChannel, replica *meta.Replica) []task.Task {
plans := c.balancer.AssignChannel(channels, replica.Replica.GetNodes()) outboundNodes := c.meta.ResourceManager.CheckOutboundNodes(replica)
availableNodes := lo.Filter(replica.Replica.GetNodes(), func(node int64, _ int) bool {
return !outboundNodes.Contain(node)
})
plans := c.balancer.AssignChannel(channels, availableNodes)
for i := range plans { for i := range plans {
plans[i].ReplicaID = replica.GetID() plans[i].ReplicaID = replica.GetID()
} }
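
The same filter shows up again in the segment checker below: replica nodes that are outbound are excluded before the balancer assigns work. A tiny standalone example of the lo.Filter pattern used here (node IDs made up):

// keep only replica nodes that are not outbound
outbound := typeutil.NewUniqueSet(3)
available := lo.Filter([]int64{1, 2, 3}, func(node int64, _ int) bool {
	return !outbound.Contain(node)
})
// available == []int64{1, 2}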

View File

@ -28,6 +28,7 @@ import (
"github.com/milvus-io/milvus/internal/querycoordv2/balance" "github.com/milvus-io/milvus/internal/querycoordv2/balance"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params" . "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/task" "github.com/milvus-io/milvus/internal/querycoordv2/task"
"github.com/milvus-io/milvus/internal/querycoordv2/utils" "github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/internal/util/etcd" "github.com/milvus-io/milvus/internal/util/etcd"
@ -39,6 +40,8 @@ type ChannelCheckerTestSuite struct {
checker *ChannelChecker checker *ChannelChecker
meta *meta.Meta meta *meta.Meta
broker *meta.MockBroker broker *meta.MockBroker
nodeMgr *session.NodeManager
} }
func (suite *ChannelCheckerTestSuite) SetupSuite() { func (suite *ChannelCheckerTestSuite) SetupSuite() {
@ -62,7 +65,8 @@ func (suite *ChannelCheckerTestSuite) SetupTest() {
// meta // meta
store := meta.NewMetaStore(suite.kv) store := meta.NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator() idAllocator := RandomIncrementIDAllocator()
suite.meta = meta.NewMeta(idAllocator, store) suite.nodeMgr = session.NewNodeManager()
suite.meta = meta.NewMeta(idAllocator, store, suite.nodeMgr)
suite.broker = meta.NewMockBroker(suite.T()) suite.broker = meta.NewMockBroker(suite.T())
targetManager := meta.NewTargetManager(suite.broker, suite.meta) targetManager := meta.NewTargetManager(suite.broker, suite.meta)
@ -98,6 +102,8 @@ func (suite *ChannelCheckerTestSuite) TestLoadChannel() {
checker := suite.checker checker := suite.checker
checker.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 1)) checker.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 1))
checker.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, []int64{1})) checker.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, []int64{1}))
suite.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
checker.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, 1)
channels := []*datapb.VchannelInfo{ channels := []*datapb.VchannelInfo{
{ {

View File

@ -29,6 +29,7 @@ import (
"github.com/milvus-io/milvus/internal/querycoordv2/task" "github.com/milvus-io/milvus/internal/querycoordv2/task"
"github.com/milvus-io/milvus/internal/querycoordv2/utils" "github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/internal/util/typeutil" "github.com/milvus-io/milvus/internal/util/typeutil"
"github.com/samber/lo"
"go.uber.org/zap" "go.uber.org/zap"
) )
@ -144,7 +145,7 @@ func (c *SegmentChecker) getStreamingSegmentDiff(targetMgr *meta.TargetManager,
func (c *SegmentChecker) getStreamingSegmentsDist(distMgr *meta.DistributionManager, replica *meta.Replica) map[int64]*meta.Segment { func (c *SegmentChecker) getStreamingSegmentsDist(distMgr *meta.DistributionManager, replica *meta.Replica) map[int64]*meta.Segment {
segments := make(map[int64]*meta.Segment, 0) segments := make(map[int64]*meta.Segment, 0)
for _, node := range replica.Nodes.Collect() { for _, node := range replica.GetNodes() {
segmentsOnNodes := distMgr.LeaderViewManager.GetGrowingSegmentDistByCollectionAndNode(replica.CollectionID, node) segmentsOnNodes := distMgr.LeaderViewManager.GetGrowingSegmentDistByCollectionAndNode(replica.CollectionID, node)
for k, v := range segmentsOnNodes { for k, v := range segmentsOnNodes {
segments[k] = v segments[k] = v
@ -196,7 +197,7 @@ func (c *SegmentChecker) getHistoricalSegmentDiff(targetMgr *meta.TargetManager,
func (c *SegmentChecker) getHistoricalSegmentsDist(distMgr *meta.DistributionManager, replica *meta.Replica) []*meta.Segment { func (c *SegmentChecker) getHistoricalSegmentsDist(distMgr *meta.DistributionManager, replica *meta.Replica) []*meta.Segment {
ret := make([]*meta.Segment, 0) ret := make([]*meta.Segment, 0)
for _, node := range replica.Nodes.Collect() { for _, node := range replica.GetNodes() {
ret = append(ret, distMgr.SegmentDistManager.GetByCollectionAndNode(replica.CollectionID, node)...) ret = append(ret, distMgr.SegmentDistManager.GetByCollectionAndNode(replica.CollectionID, node)...)
} }
return ret return ret
@ -266,7 +267,11 @@ func (c *SegmentChecker) createSegmentLoadTasks(ctx context.Context, segments []
} }
packedSegments = append(packedSegments, &meta.Segment{SegmentInfo: s}) packedSegments = append(packedSegments, &meta.Segment{SegmentInfo: s})
} }
plans := c.balancer.AssignSegment(packedSegments, replica.Replica.GetNodes()) outboundNodes := c.meta.ResourceManager.CheckOutboundNodes(replica)
availableNodes := lo.Filter(replica.Replica.GetNodes(), func(node int64, _ int) bool {
return !outboundNodes.Contain(node)
})
plans := c.balancer.AssignSegment(packedSegments, availableNodes)
for i := range plans { for i := range plans {
plans[i].ReplicaID = replica.GetID() plans[i].ReplicaID = replica.GetID()
} }

View File

@ -30,6 +30,7 @@ import (
"github.com/milvus-io/milvus/internal/querycoordv2/balance" "github.com/milvus-io/milvus/internal/querycoordv2/balance"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params" . "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/task" "github.com/milvus-io/milvus/internal/querycoordv2/task"
"github.com/milvus-io/milvus/internal/querycoordv2/utils" "github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/internal/util/etcd" "github.com/milvus-io/milvus/internal/util/etcd"
@ -41,6 +42,7 @@ type SegmentCheckerTestSuite struct {
checker *SegmentChecker checker *SegmentChecker
meta *meta.Meta meta *meta.Meta
broker *meta.MockBroker broker *meta.MockBroker
nodeMgr *session.NodeManager
} }
func (suite *SegmentCheckerTestSuite) SetupSuite() { func (suite *SegmentCheckerTestSuite) SetupSuite() {
@ -64,7 +66,8 @@ func (suite *SegmentCheckerTestSuite) SetupTest() {
// meta // meta
store := meta.NewMetaStore(suite.kv) store := meta.NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator() idAllocator := RandomIncrementIDAllocator()
suite.meta = meta.NewMeta(idAllocator, store) suite.nodeMgr = session.NewNodeManager()
suite.meta = meta.NewMeta(idAllocator, store, suite.nodeMgr)
distManager := meta.NewDistributionManager() distManager := meta.NewDistributionManager()
suite.broker = meta.NewMockBroker(suite.T()) suite.broker = meta.NewMockBroker(suite.T())
targetManager := meta.NewTargetManager(suite.broker, suite.meta) targetManager := meta.NewTargetManager(suite.broker, suite.meta)
@ -100,6 +103,10 @@ func (suite *SegmentCheckerTestSuite) TestLoadSegments() {
// set meta // set meta
checker.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 1)) checker.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 1))
checker.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, []int64{1, 2})) checker.meta.ReplicaManager.Put(utils.CreateTestReplica(1, 1, []int64{1, 2}))
suite.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
suite.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
checker.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, 1)
checker.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, 2)
// set target // set target
segments := []*datapb.SegmentBinlogs{ segments := []*datapb.SegmentBinlogs{

View File

@ -65,7 +65,7 @@ func (suite *DistControllerTestSuite) SetupTest() {
// meta // meta
store := meta.NewMetaStore(suite.kv) store := meta.NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator() idAllocator := RandomIncrementIDAllocator()
suite.meta = meta.NewMeta(idAllocator, store) suite.meta = meta.NewMeta(idAllocator, store, session.NewNodeManager())
suite.mockCluster = session.NewMockCluster(suite.T()) suite.mockCluster = session.NewMockCluster(suite.T())
nodeManager := session.NewNodeManager() nodeManager := session.NewNodeManager()

View File

@ -48,7 +48,7 @@ import (
func (s *Server) checkAnyReplicaAvailable(collectionID int64) bool { func (s *Server) checkAnyReplicaAvailable(collectionID int64) bool {
for _, replica := range s.meta.ReplicaManager.GetByCollection(collectionID) { for _, replica := range s.meta.ReplicaManager.GetByCollection(collectionID) {
isAvailable := true isAvailable := true
for node := range replica.Nodes { for _, node := range replica.GetNodes() {
if s.nodeMgr.Get(node) == nil { if s.nodeMgr.Get(node) == nil {
isAvailable = false isAvailable = false
break break
@ -94,7 +94,11 @@ func (s *Server) balanceSegments(ctx context.Context, req *querypb.LoadBalanceRe
srcNode := req.GetSourceNodeIDs()[0] srcNode := req.GetSourceNodeIDs()[0]
dstNodeSet := typeutil.NewUniqueSet(req.GetDstNodeIDs()...) dstNodeSet := typeutil.NewUniqueSet(req.GetDstNodeIDs()...)
if dstNodeSet.Len() == 0 { if dstNodeSet.Len() == 0 {
dstNodeSet.Insert(replica.GetNodes()...) outboundNodes := s.meta.ResourceManager.CheckOutboundNodes(replica)
availableNodes := lo.Filter(replica.Replica.GetNodes(), func(node int64, _ int) bool {
return !outboundNodes.Contain(node)
})
dstNodeSet.Insert(availableNodes...)
} }
dstNodeSet.Remove(srcNode) dstNodeSet.Remove(srcNode)
@ -302,7 +306,13 @@ func (s *Server) tryGetNodesMetrics(ctx context.Context, req *milvuspb.GetMetric
} }
func (s *Server) fillReplicaInfo(replica *meta.Replica, withShardNodes bool) (*milvuspb.ReplicaInfo, error) { func (s *Server) fillReplicaInfo(replica *meta.Replica, withShardNodes bool) (*milvuspb.ReplicaInfo, error) {
info := utils.Replica2ReplicaInfo(replica.Replica) info := &milvuspb.ReplicaInfo{
ReplicaID: replica.GetID(),
CollectionID: replica.GetCollectionID(),
NodeIds: replica.GetNodes(),
ResourceGroupName: replica.GetResourceGroup(),
NumOutboundNode: s.meta.GetOutgoingNodeNumByReplica(replica),
}
channels := s.targetMgr.GetDmChannelsByCollection(replica.GetCollectionID(), meta.CurrentTarget) channels := s.targetMgr.GetDmChannelsByCollection(replica.GetCollectionID(), meta.CurrentTarget)
if len(channels) == 0 { if len(channels) == 0 {
@ -335,7 +345,7 @@ func (s *Server) fillReplicaInfo(replica *meta.Replica, withShardNodes bool) (*m
} }
if withShardNodes { if withShardNodes {
shardNodes := lo.FilterMap(segments, func(segment *meta.Segment, _ int) (int64, bool) { shardNodes := lo.FilterMap(segments, func(segment *meta.Segment, _ int) (int64, bool) {
if replica.Nodes.Contain(segment.Node) { if replica.Contains(segment.Node) {
return segment.Node, true return segment.Node, true
} }
return 0, false return 0, false

View File

@ -197,10 +197,11 @@ func (job *LoadCollectionJob) Execute() error {
} }
// Create replicas // Create replicas
replicas, err := utils.SpawnReplicas(job.meta.ReplicaManager, replicas, err := utils.SpawnReplicasWithRG(job.meta,
job.nodeMgr,
req.GetCollectionID(), req.GetCollectionID(),
req.GetReplicaNumber()) req.GetResourceGroups(),
req.GetReplicaNumber(),
)
if err != nil { if err != nil {
msg := "failed to spawn replica for collection" msg := "failed to spawn replica for collection"
log.Error(msg, zap.Error(err)) log.Error(msg, zap.Error(err))
@ -209,7 +210,8 @@ func (job *LoadCollectionJob) Execute() error {
for _, replica := range replicas { for _, replica := range replicas {
log.Info("replica created", log.Info("replica created",
zap.Int64("replicaID", replica.GetID()), zap.Int64("replicaID", replica.GetID()),
zap.Int64s("nodes", replica.GetNodes())) zap.Int64s("nodes", replica.GetNodes()),
zap.String("resourceGroup", replica.GetResourceGroup()))
} }
// Fetch channels and segments from DataCoord // Fetch channels and segments from DataCoord
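
SpawnReplicasWithRG is not shown in this diff; conceptually it spreads the requested replica count across the given resource groups (falling back to the default group when none is specified) and fails with ErrNodeNotEnough when a group cannot host its share, as the tests below exercise. A rough sketch under those assumptions, with made-up helper names:

// rough sketch only; not the utils.SpawnReplicasWithRG implementation
func spreadReplicas(rgs []string, replicaNumber int32) map[string]int32 {
	if len(rgs) == 0 {
		rgs = []string{meta.DefaultResourceGroupName}
	}
	perRG := make(map[string]int32, len(rgs))
	for i := int32(0); i < replicaNumber; i++ {
		perRG[rgs[int(i)%len(rgs)]]++ // round-robin the replicas over the groups
	}
	return perRG
}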
@ -411,10 +413,11 @@ func (job *LoadPartitionJob) Execute() error {
} }
// Create replicas // Create replicas
replicas, err := utils.SpawnReplicas(job.meta.ReplicaManager, replicas, err := utils.SpawnReplicasWithRG(job.meta,
job.nodeMgr,
req.GetCollectionID(), req.GetCollectionID(),
req.GetReplicaNumber()) req.GetResourceGroups(),
req.GetReplicaNumber(),
)
if err != nil { if err != nil {
msg := "failed to spawn replica for collection" msg := "failed to spawn replica for collection"
log.Error(msg, zap.Error(err)) log.Error(msg, zap.Error(err))
@ -423,7 +426,8 @@ func (job *LoadPartitionJob) Execute() error {
for _, replica := range replicas { for _, replica := range replicas {
log.Info("replica created", log.Info("replica created",
zap.Int64("replicaID", replica.GetID()), zap.Int64("replicaID", replica.GetID()),
zap.Int64s("nodes", replica.GetNodes())) zap.Int64s("nodes", replica.GetNodes()),
zap.String("resourceGroup", replica.GetResourceGroup()))
} }
// It's safe here to call UpdateCollectionNextTargetWithPartitions, as the collection not existing // It's safe here to call UpdateCollectionNextTargetWithPartitions, as the collection not existing

View File

@ -131,19 +131,29 @@ func (suite *JobSuite) SetupTest() {
suite.store = meta.NewMetaStore(suite.kv) suite.store = meta.NewMetaStore(suite.kv)
suite.dist = meta.NewDistributionManager() suite.dist = meta.NewDistributionManager()
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store) suite.nodeMgr = session.NewNodeManager()
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store, suite.nodeMgr)
suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta) suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta)
suite.targetObserver = observers.NewTargetObserver(suite.meta, suite.targetObserver = observers.NewTargetObserver(suite.meta,
suite.targetMgr, suite.targetMgr,
suite.dist, suite.dist,
suite.broker, suite.broker,
) )
suite.nodeMgr = session.NewNodeManager()
suite.nodeMgr.Add(&session.NodeInfo{})
suite.scheduler = NewScheduler() suite.scheduler = NewScheduler()
suite.scheduler.Start(context.Background()) suite.scheduler.Start(context.Background())
meta.GlobalFailedLoadCache = meta.NewFailedLoadCache() meta.GlobalFailedLoadCache = meta.NewFailedLoadCache()
suite.nodeMgr.Add(session.NewNodeInfo(1000, "localhost"))
suite.nodeMgr.Add(session.NewNodeInfo(2000, "localhost"))
suite.nodeMgr.Add(session.NewNodeInfo(3000, "localhost"))
err = suite.meta.AssignNode(meta.DefaultResourceGroupName, 1000)
suite.NoError(err)
err = suite.meta.AssignNode(meta.DefaultResourceGroupName, 2000)
suite.NoError(err)
err = suite.meta.AssignNode(meta.DefaultResourceGroupName, 3000)
suite.NoError(err)
} }
func (suite *JobSuite) TearDownTest() { func (suite *JobSuite) TearDownTest() {
@ -265,6 +275,48 @@ func (suite *JobSuite) TestLoadCollection() {
err := job.Wait() err := job.Wait()
suite.ErrorIs(err, ErrLoadParameterMismatched) suite.ErrorIs(err, ErrLoadParameterMismatched)
} }
suite.meta.ResourceManager.AddResourceGroup("rg1")
suite.meta.ResourceManager.AddResourceGroup("rg2")
suite.meta.ResourceManager.AddResourceGroup("rg3")
// Load with 3 replicas on 1 resource group; rg1 has no nodes, so this should fail
req := &querypb.LoadCollectionRequest{
CollectionID: 1001,
ReplicaNumber: 3,
ResourceGroups: []string{"rg1"},
}
job := NewLoadCollectionJob(
ctx,
req,
suite.dist,
suite.meta,
suite.targetMgr,
suite.broker,
suite.nodeMgr,
)
suite.scheduler.Add(job)
err := job.Wait()
suite.ErrorContains(err, meta.ErrNodeNotEnough.Error())
// Load with 3 replicas on 3 resource groups; the groups are empty, so this should also fail
req = &querypb.LoadCollectionRequest{
CollectionID: 1002,
ReplicaNumber: 3,
ResourceGroups: []string{"rg1", "rg2", "rg3"},
}
job = NewLoadCollectionJob(
ctx,
req,
suite.dist,
suite.meta,
suite.targetMgr,
suite.broker,
suite.nodeMgr,
)
suite.scheduler.Add(job)
err = job.Wait()
suite.ErrorContains(err, meta.ErrNodeNotEnough.Error())
} }
func (suite *JobSuite) TestLoadCollectionWithReplicas() { func (suite *JobSuite) TestLoadCollectionWithReplicas() {
@ -278,7 +330,7 @@ func (suite *JobSuite) TestLoadCollectionWithReplicas() {
// Load with 3 replica // Load with 3 replica
req := &querypb.LoadCollectionRequest{ req := &querypb.LoadCollectionRequest{
CollectionID: collection, CollectionID: collection,
ReplicaNumber: 3, ReplicaNumber: 5,
} }
job := NewLoadCollectionJob( job := NewLoadCollectionJob(
ctx, ctx,
@ -482,6 +534,50 @@ func (suite *JobSuite) TestLoadPartition() {
err := job.Wait() err := job.Wait()
suite.ErrorIs(err, ErrLoadParameterMismatched) suite.ErrorIs(err, ErrLoadParameterMismatched)
} }
suite.meta.ResourceManager.AddResourceGroup("rg1")
suite.meta.ResourceManager.AddResourceGroup("rg2")
suite.meta.ResourceManager.AddResourceGroup("rg3")
// load 3 replicas into 1 rg: passes the resource-group check but fails because rg1 has no nodes
req := &querypb.LoadPartitionsRequest{
CollectionID: 100,
PartitionIDs: []int64{1001},
ReplicaNumber: 3,
ResourceGroups: []string{"rg1"},
}
job := NewLoadPartitionJob(
ctx,
req,
suite.dist,
suite.meta,
suite.targetMgr,
suite.broker,
suite.nodeMgr,
)
suite.scheduler.Add(job)
err := job.Wait()
suite.Contains(err.Error(), meta.ErrNodeNotEnough.Error())
// load 3 replicas into 3 rgs: passes the resource-group check but fails because the groups have no nodes
req = &querypb.LoadPartitionsRequest{
CollectionID: 102,
PartitionIDs: []int64{1001},
ReplicaNumber: 3,
ResourceGroups: []string{"rg1", "rg2", "rg3"},
}
job = NewLoadPartitionJob(
ctx,
req,
suite.dist,
suite.meta,
suite.targetMgr,
suite.broker,
suite.nodeMgr,
)
suite.scheduler.Add(job)
err = job.Wait()
suite.Contains(err.Error(), meta.ErrNodeNotEnough.Error())
} }
func (suite *JobSuite) TestLoadPartitionWithReplicas() { func (suite *JobSuite) TestLoadPartitionWithReplicas() {
@ -496,7 +592,7 @@ func (suite *JobSuite) TestLoadPartitionWithReplicas() {
req := &querypb.LoadPartitionsRequest{ req := &querypb.LoadPartitionsRequest{
CollectionID: collection, CollectionID: collection,
PartitionIDs: suite.partitions[collection], PartitionIDs: suite.partitions[collection],
ReplicaNumber: 3, ReplicaNumber: 5,
} }
job := NewLoadPartitionJob( job := NewLoadPartitionJob(
ctx, ctx,
@ -707,7 +803,16 @@ func (suite *JobSuite) TestReleasePartition() {
func (suite *JobSuite) TestLoadCollectionStoreFailed() { func (suite *JobSuite) TestLoadCollectionStoreFailed() {
// Store collection failed // Store collection failed
store := meta.NewMockStore(suite.T()) store := meta.NewMockStore(suite.T())
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), store) suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), store, suite.nodeMgr)
store.EXPECT().SaveResourceGroup(mock.Anything, mock.Anything).Return(nil)
err := suite.meta.AssignNode(meta.DefaultResourceGroupName, 1000)
suite.NoError(err)
err = suite.meta.AssignNode(meta.DefaultResourceGroupName, 2000)
suite.NoError(err)
err = suite.meta.AssignNode(meta.DefaultResourceGroupName, 3000)
suite.NoError(err)
for _, collection := range suite.collections { for _, collection := range suite.collections {
if suite.loadTypes[collection] != querypb.LoadType_LoadCollection { if suite.loadTypes[collection] != querypb.LoadType_LoadCollection {
continue continue
@ -743,8 +848,17 @@ func (suite *JobSuite) TestLoadCollectionStoreFailed() {
func (suite *JobSuite) TestLoadPartitionStoreFailed() { func (suite *JobSuite) TestLoadPartitionStoreFailed() {
// Store partition failed // Store partition failed
store := meta.NewMockStore(suite.T()) store := meta.NewMockStore(suite.T())
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), store) suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), store, suite.nodeMgr)
err := errors.New("failed to store collection")
store.EXPECT().SaveResourceGroup(mock.Anything, mock.Anything).Return(nil)
err := suite.meta.AssignNode(meta.DefaultResourceGroupName, 1000)
suite.NoError(err)
err = suite.meta.AssignNode(meta.DefaultResourceGroupName, 2000)
suite.NoError(err)
err = suite.meta.AssignNode(meta.DefaultResourceGroupName, 3000)
suite.NoError(err)
err = errors.New("failed to store collection")
for _, collection := range suite.collections { for _, collection := range suite.collections {
if suite.loadTypes[collection] != querypb.LoadType_LoadPartition { if suite.loadTypes[collection] != querypb.LoadType_LoadPartition {
continue continue
@ -775,7 +889,7 @@ func (suite *JobSuite) TestLoadPartitionStoreFailed() {
func (suite *JobSuite) TestLoadCreateReplicaFailed() { func (suite *JobSuite) TestLoadCreateReplicaFailed() {
// Store replica failed // Store replica failed
suite.meta = meta.NewMeta(ErrorIDAllocator(), suite.store) suite.meta = meta.NewMeta(ErrorIDAllocator(), suite.store, session.NewNodeManager())
for _, collection := range suite.collections { for _, collection := range suite.collections {
req := &querypb.LoadCollectionRequest{ req := &querypb.LoadCollectionRequest{
CollectionID: collection, CollectionID: collection,

View File

@ -91,7 +91,7 @@ func (m *ChannelDistManager) GetShardLeader(replica *Replica, shard string) (int
m.rwmutex.RLock() m.rwmutex.RLock()
defer m.rwmutex.RUnlock() defer m.rwmutex.RUnlock()
for node := range replica.Nodes { for _, node := range replica.GetNodes() {
channels := m.channels[node] channels := m.channels[node]
for _, dmc := range channels { for _, dmc := range channels {
if dmc.ChannelName == shard { if dmc.ChannelName == shard {
@ -108,7 +108,7 @@ func (m *ChannelDistManager) GetShardLeadersByReplica(replica *Replica) map[stri
defer m.rwmutex.RUnlock() defer m.rwmutex.RUnlock()
ret := make(map[string]int64) ret := make(map[string]int64)
for node := range replica.Nodes { for _, node := range replica.GetNodes() {
channels := m.channels[node] channels := m.channels[node]
for _, dmc := range channels { for _, dmc := range channels {
if dmc.GetCollectionID() == replica.GetCollectionID() { if dmc.GetCollectionID() == replica.GetCollectionID() {

View File

@ -100,18 +100,18 @@ func (suite *ChannelDistManagerSuite) TestGetBy() {
func (suite *ChannelDistManagerSuite) TestGetShardLeader() { func (suite *ChannelDistManagerSuite) TestGetShardLeader() {
replicas := []*Replica{ replicas := []*Replica{
{ NewReplica(
Replica: &querypb.Replica{ &querypb.Replica{
CollectionID: suite.collection, CollectionID: suite.collection,
}, },
Nodes: typeutil.NewUniqueSet(suite.nodes[0], suite.nodes[2]), typeutil.NewUniqueSet(suite.nodes[0], suite.nodes[2]),
}, ),
{ NewReplica(
Replica: &querypb.Replica{ &querypb.Replica{
CollectionID: suite.collection, CollectionID: suite.collection,
}, },
Nodes: typeutil.NewUniqueSet(suite.nodes[1]), typeutil.NewUniqueSet(suite.nodes[1]),
}, ),
} }
// Test on replica 0 // Test on replica 0

View File

@ -16,17 +16,22 @@
package meta package meta
import "github.com/milvus-io/milvus/internal/querycoordv2/session"
type Meta struct { type Meta struct {
*CollectionManager *CollectionManager
*ReplicaManager *ReplicaManager
*ResourceManager
} }
func NewMeta( func NewMeta(
idAllocator func() (int64, error), idAllocator func() (int64, error),
store Store, store Store,
nodeMgr *session.NodeManager,
) *Meta { ) *Meta {
return &Meta{ return &Meta{
NewCollectionManager(store), NewCollectionManager(store),
NewReplicaManager(idAllocator, store), NewReplicaManager(idAllocator, store),
NewResourceManager(store, nodeMgr),
} }
} }
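
As a usage note (not part of the diff): a minimal sketch of how the widened constructor is wired up, assuming an already-initialized etcd-backed kv as in the test suites below. The extra NodeManager is what the embedded ResourceManager uses to check node liveness, and since the managers are embedded, their methods are promoted onto Meta.

package example // hypothetical package, for illustration only

import (
	"github.com/milvus-io/milvus/internal/kv"
	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
	"github.com/milvus-io/milvus/internal/querycoordv2/session"
)

// newMeta builds a Meta that also exposes resource-group operations.
func newMeta(metaKV kv.MetaKv, idAllocator func() (int64, error)) (*meta.Meta, error) {
	nodeMgr := session.NewNodeManager()
	m := meta.NewMeta(idAllocator, meta.NewMetaStore(metaKV), nodeMgr)
	// ResourceManager is embedded, so its methods are promoted onto Meta.
	err := m.AddResourceGroup("rg1")
	return m, err
}
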

View File

@ -155,6 +155,51 @@ func (_c *MockStore_GetReplicas_Call) Return(_a0 []*querypb.Replica, _a1 error)
return _c return _c
} }
// GetResourceGroups provides a mock function with given fields:
func (_m *MockStore) GetResourceGroups() ([]*querypb.ResourceGroup, error) {
ret := _m.Called()
var r0 []*querypb.ResourceGroup
if rf, ok := ret.Get(0).(func() []*querypb.ResourceGroup); ok {
r0 = rf()
} else {
if ret.Get(0) != nil {
r0 = ret.Get(0).([]*querypb.ResourceGroup)
}
}
var r1 error
if rf, ok := ret.Get(1).(func() error); ok {
r1 = rf()
} else {
r1 = ret.Error(1)
}
return r0, r1
}
// MockStore_GetResourceGroups_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetResourceGroups'
type MockStore_GetResourceGroups_Call struct {
*mock.Call
}
// GetResourceGroups is a helper method to define mock.On call
func (_e *MockStore_Expecter) GetResourceGroups() *MockStore_GetResourceGroups_Call {
return &MockStore_GetResourceGroups_Call{Call: _e.mock.On("GetResourceGroups")}
}
func (_c *MockStore_GetResourceGroups_Call) Run(run func()) *MockStore_GetResourceGroups_Call {
_c.Call.Run(func(args mock.Arguments) {
run()
})
return _c
}
func (_c *MockStore_GetResourceGroups_Call) Return(_a0 []*querypb.ResourceGroup, _a1 error) *MockStore_GetResourceGroups_Call {
_c.Call.Return(_a0, _a1)
return _c
}
// ReleaseCollection provides a mock function with given fields: id // ReleaseCollection provides a mock function with given fields: id
func (_m *MockStore) ReleaseCollection(id int64) error { func (_m *MockStore) ReleaseCollection(id int64) error {
ret := _m.Called(id) ret := _m.Called(id)
@ -319,6 +364,43 @@ func (_c *MockStore_ReleaseReplicas_Call) Return(_a0 error) *MockStore_ReleaseRe
return _c return _c
} }
// RemoveResourceGroup provides a mock function with given fields: rgName
func (_m *MockStore) RemoveResourceGroup(rgName string) error {
ret := _m.Called(rgName)
var r0 error
if rf, ok := ret.Get(0).(func(string) error); ok {
r0 = rf(rgName)
} else {
r0 = ret.Error(0)
}
return r0
}
// MockStore_RemoveResourceGroup_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'RemoveResourceGroup'
type MockStore_RemoveResourceGroup_Call struct {
*mock.Call
}
// RemoveResourceGroup is a helper method to define mock.On call
// - rgName string
func (_e *MockStore_Expecter) RemoveResourceGroup(rgName interface{}) *MockStore_RemoveResourceGroup_Call {
return &MockStore_RemoveResourceGroup_Call{Call: _e.mock.On("RemoveResourceGroup", rgName)}
}
func (_c *MockStore_RemoveResourceGroup_Call) Run(run func(rgName string)) *MockStore_RemoveResourceGroup_Call {
_c.Call.Run(func(args mock.Arguments) {
run(args[0].(string))
})
return _c
}
func (_c *MockStore_RemoveResourceGroup_Call) Return(_a0 error) *MockStore_RemoveResourceGroup_Call {
_c.Call.Return(_a0)
return _c
}
// SaveCollection provides a mock function with given fields: info // SaveCollection provides a mock function with given fields: info
func (_m *MockStore) SaveCollection(info *querypb.CollectionLoadInfo) error { func (_m *MockStore) SaveCollection(info *querypb.CollectionLoadInfo) error {
ret := _m.Called(info) ret := _m.Called(info)
@ -443,6 +525,56 @@ func (_c *MockStore_SaveReplica_Call) Return(_a0 error) *MockStore_SaveReplica_C
return _c return _c
} }
// SaveResourceGroup provides a mock function with given fields: rgs
func (_m *MockStore) SaveResourceGroup(rgs ...*querypb.ResourceGroup) error {
_va := make([]interface{}, len(rgs))
for _i := range rgs {
_va[_i] = rgs[_i]
}
var _ca []interface{}
_ca = append(_ca, _va...)
ret := _m.Called(_ca...)
var r0 error
if rf, ok := ret.Get(0).(func(...*querypb.ResourceGroup) error); ok {
r0 = rf(rgs...)
} else {
r0 = ret.Error(0)
}
return r0
}
// MockStore_SaveResourceGroup_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'SaveResourceGroup'
type MockStore_SaveResourceGroup_Call struct {
*mock.Call
}
// SaveResourceGroup is a helper method to define mock.On call
// - rgs ...*querypb.ResourceGroup
func (_e *MockStore_Expecter) SaveResourceGroup(rgs ...interface{}) *MockStore_SaveResourceGroup_Call {
return &MockStore_SaveResourceGroup_Call{Call: _e.mock.On("SaveResourceGroup",
append([]interface{}{}, rgs...)...)}
}
func (_c *MockStore_SaveResourceGroup_Call) Run(run func(rgs ...*querypb.ResourceGroup)) *MockStore_SaveResourceGroup_Call {
_c.Call.Run(func(args mock.Arguments) {
variadicArgs := make([]*querypb.ResourceGroup, len(args)-0)
for i, a := range args[0:] {
if a != nil {
variadicArgs[i] = a.(*querypb.ResourceGroup)
}
}
run(variadicArgs...)
})
return _c
}
func (_c *MockStore_SaveResourceGroup_Call) Return(_a0 error) *MockStore_SaveResourceGroup_Call {
_c.Call.Return(_a0)
return _c
}
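
A hedged sketch (test name and values are illustrative) of how the regenerated mock is consumed, in the same mockery/testify style as the suites above:

package example // hypothetical test sketch, for illustration only

import (
	"testing"

	"github.com/milvus-io/milvus/internal/proto/querypb"
	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
	"github.com/stretchr/testify/mock"
)

func TestSaveResourceGroupMock(t *testing.T) {
	store := meta.NewMockStore(t)
	// accept any resource-group argument and report success
	store.EXPECT().SaveResourceGroup(mock.Anything).Return(nil)

	err := store.SaveResourceGroup(&querypb.ResourceGroup{Name: "rg1", Capacity: 1, Nodes: []int64{1}})
	if err != nil {
		t.Fatal(err)
	}
}
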
type mockConstructorTestingTNewMockStore interface { type mockConstructorTestingTNewMockStore interface {
mock.TestingT mock.TestingT
Cleanup(func()) Cleanup(func())

View File

@ -30,23 +30,66 @@ import (
type Replica struct { type Replica struct {
*querypb.Replica *querypb.Replica
Nodes UniqueSet // a helper field for manipulating replica's Nodes slice field nodes UniqueSet // a helper field for manipulating replica's Nodes slice field
rwmutex sync.RWMutex
}
func NewReplica(replica *querypb.Replica, nodes UniqueSet) *Replica {
return &Replica{
Replica: replica,
nodes: nodes,
}
} }
func (replica *Replica) AddNode(nodes ...int64) { func (replica *Replica) AddNode(nodes ...int64) {
replica.Nodes.Insert(nodes...) replica.rwmutex.Lock()
replica.Replica.Nodes = replica.Nodes.Collect() defer replica.rwmutex.Unlock()
replica.nodes.Insert(nodes...)
replica.Replica.Nodes = replica.nodes.Collect()
}
func (replica *Replica) GetNodes() []int64 {
if replica == nil {
return nil
}
replica.rwmutex.RLock()
defer replica.rwmutex.RUnlock()
return replica.nodes.Collect()
}
func (replica *Replica) Len() int {
if replica == nil {
return 0
}
replica.rwmutex.RLock()
defer replica.rwmutex.RUnlock()
return replica.nodes.Len()
}
func (replica *Replica) Contains(node int64) bool {
if replica == nil {
return false
}
replica.rwmutex.RLock()
defer replica.rwmutex.RUnlock()
return replica.nodes.Contain(node)
} }
func (replica *Replica) RemoveNode(nodes ...int64) { func (replica *Replica) RemoveNode(nodes ...int64) {
replica.Nodes.Remove(nodes...) replica.rwmutex.Lock()
replica.Replica.Nodes = replica.Nodes.Collect() defer replica.rwmutex.Unlock()
replica.nodes.Remove(nodes...)
replica.Replica.Nodes = replica.nodes.Collect()
} }
func (replica *Replica) Clone() *Replica { func (replica *Replica) Clone() *Replica {
replica.rwmutex.RLock()
defer replica.rwmutex.RUnlock()
return &Replica{ return &Replica{
Replica: proto.Clone(replica.Replica).(*querypb.Replica), Replica: proto.Clone(replica.Replica).(*querypb.Replica),
Nodes: NewUniqueSet(replica.Replica.Nodes...), nodes: NewUniqueSet(replica.Replica.Nodes...),
} }
} }
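
For context, a small sketch (IDs made up) of the accessor-based usage this change enables: callers go through AddNode/GetNodes/Contains/Len instead of touching the node set directly, so the new rwmutex can serialize concurrent updates.

package example // hypothetical package, for illustration only

import (
	"fmt"

	"github.com/milvus-io/milvus/internal/proto/querypb"
	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
	"github.com/milvus-io/milvus/internal/util/typeutil"
)

func replicaAccessors() {
	replica := meta.NewReplica(
		&querypb.Replica{ID: 1, CollectionID: 100, ResourceGroup: meta.DefaultResourceGroupName},
		typeutil.NewUniqueSet(1, 2),
	)
	replica.AddNode(3)               // keeps the set and the proto Nodes slice in sync
	fmt.Println(replica.Len())       // 3
	fmt.Println(replica.Contains(2)) // true
	fmt.Println(replica.GetNodes())  // the node IDs (order not guaranteed)
}
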
@ -75,10 +118,14 @@ func (m *ReplicaManager) Recover(collections []int64) error {
collectionSet := typeutil.NewUniqueSet(collections...) collectionSet := typeutil.NewUniqueSet(collections...)
for _, replica := range replicas { for _, replica := range replicas {
if len(replica.GetResourceGroup()) == 0 {
replica.ResourceGroup = DefaultResourceGroupName
}
if collectionSet.Contain(replica.GetCollectionID()) { if collectionSet.Contain(replica.GetCollectionID()) {
m.replicas[replica.GetID()] = &Replica{ m.replicas[replica.GetID()] = &Replica{
Replica: replica, Replica: replica,
Nodes: NewUniqueSet(replica.GetNodes()...), nodes: NewUniqueSet(replica.GetNodes()...),
} }
log.Info("recover replica", log.Info("recover replica",
zap.Int64("collectionID", replica.GetCollectionID()), zap.Int64("collectionID", replica.GetCollectionID()),
@ -109,13 +156,13 @@ func (m *ReplicaManager) Get(id UniqueID) *Replica {
// Spawn spawns replicas of the given number, for given collection, // Spawn spawns replicas of the given number, for given collection,
// this doesn't store these replicas and assign nodes to them. // this doesn't store these replicas and assign nodes to them.
func (m *ReplicaManager) Spawn(collection int64, replicaNumber int32) ([]*Replica, error) { func (m *ReplicaManager) Spawn(collection int64, replicaNumber int32, rgName string) ([]*Replica, error) {
var ( var (
replicas = make([]*Replica, replicaNumber) replicas = make([]*Replica, replicaNumber)
err error err error
) )
for i := range replicas { for i := range replicas {
replicas[i], err = m.spawn(collection) replicas[i], err = m.spawn(collection, rgName)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -130,7 +177,7 @@ func (m *ReplicaManager) Put(replicas ...*Replica) error {
return m.put(replicas...) return m.put(replicas...)
} }
func (m *ReplicaManager) spawn(collectionID UniqueID) (*Replica, error) { func (m *ReplicaManager) spawn(collectionID UniqueID, rgName string) (*Replica, error) {
id, err := m.idAllocator() id, err := m.idAllocator()
if err != nil { if err != nil {
return nil, err return nil, err
@ -139,8 +186,9 @@ func (m *ReplicaManager) spawn(collectionID UniqueID) (*Replica, error) {
Replica: &querypb.Replica{ Replica: &querypb.Replica{
ID: id, ID: id,
CollectionID: collectionID, CollectionID: collectionID,
ResourceGroup: rgName,
}, },
Nodes: make(UniqueSet), nodes: make(UniqueSet),
}, nil }, nil
} }
@ -192,7 +240,7 @@ func (m *ReplicaManager) GetByCollectionAndNode(collectionID, nodeID UniqueID) *
defer m.rwmutex.RUnlock() defer m.rwmutex.RUnlock()
for _, replica := range m.replicas { for _, replica := range m.replicas {
if replica.CollectionID == collectionID && replica.Nodes.Contain(nodeID) { if replica.CollectionID == collectionID && replica.nodes.Contain(nodeID) {
return replica return replica
} }
} }
@ -200,6 +248,34 @@ func (m *ReplicaManager) GetByCollectionAndNode(collectionID, nodeID UniqueID) *
return nil return nil
} }
func (m *ReplicaManager) GetByCollectionAndRG(collectionID int64, rgName string) []*Replica {
m.rwmutex.RLock()
defer m.rwmutex.RUnlock()
ret := make([]*Replica, 0)
for _, replica := range m.replicas {
if replica.GetCollectionID() == collectionID && replica.GetResourceGroup() == rgName {
ret = append(ret, replica)
}
}
return ret
}
func (m *ReplicaManager) GetByResourceGroup(rgName string) []*Replica {
m.rwmutex.RLock()
defer m.rwmutex.RUnlock()
ret := make([]*Replica, 0)
for _, replica := range m.replicas {
if replica.GetResourceGroup() == rgName {
ret = append(ret, replica)
}
}
return ret
}
func (m *ReplicaManager) AddNode(replicaID UniqueID, nodes ...UniqueID) error { func (m *ReplicaManager) AddNode(replicaID UniqueID, nodes ...UniqueID) error {
m.rwmutex.Lock() m.rwmutex.Lock()
defer m.rwmutex.Unlock() defer m.rwmutex.Unlock()
@ -227,3 +303,17 @@ func (m *ReplicaManager) RemoveNode(replicaID UniqueID, nodes ...UniqueID) error
replica.RemoveNode(nodes...) replica.RemoveNode(nodes...)
return m.put(replica) return m.put(replica)
} }
func (m *ReplicaManager) GetResourceGroupByCollection(collection UniqueID) typeutil.Set[string] {
m.rwmutex.Lock()
defer m.rwmutex.Unlock()
ret := typeutil.NewSet[string]()
for _, r := range m.replicas {
if r.GetCollectionID() == collection {
ret.Insert(r.GetResourceGroup())
}
}
return ret
}
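
An illustrative in-package helper (not part of the commit) showing how the new resource-group-aware lookups compose for a single collection:

package meta

import "github.com/milvus-io/milvus/internal/util/typeutil"

// replicaRGQueries is a sketch: collection is any loaded collection ID.
func replicaRGQueries(mgr *ReplicaManager, collection int64) ([]*Replica, []*Replica, typeutil.Set[string]) {
	inRG := mgr.GetByResourceGroup(DefaultResourceGroupName)                 // every replica placed in the rg
	inBoth := mgr.GetByCollectionAndRG(collection, DefaultResourceGroupName) // restricted to one collection
	rgNames := mgr.GetResourceGroupByCollection(collection)                  // rg names the collection spans
	return inRG, inBoth, rgNames
}
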

View File

@ -76,14 +76,14 @@ func (suite *ReplicaManagerSuite) TestSpawn() {
mgr := suite.mgr mgr := suite.mgr
for i, collection := range suite.collections { for i, collection := range suite.collections {
replicas, err := mgr.Spawn(collection, suite.replicaNumbers[i]) replicas, err := mgr.Spawn(collection, suite.replicaNumbers[i], DefaultResourceGroupName)
suite.NoError(err) suite.NoError(err)
suite.Len(replicas, int(suite.replicaNumbers[i])) suite.Len(replicas, int(suite.replicaNumbers[i]))
} }
mgr.idAllocator = ErrorIDAllocator() mgr.idAllocator = ErrorIDAllocator()
for i, collection := range suite.collections { for i, collection := range suite.collections {
_, err := mgr.Spawn(collection, suite.replicaNumbers[i]) _, err := mgr.Spawn(collection, suite.replicaNumbers[i], DefaultResourceGroupName)
suite.Error(err) suite.Error(err)
} }
} }
@ -98,8 +98,8 @@ func (suite *ReplicaManagerSuite) TestGet() {
for _, replica := range replicas { for _, replica := range replicas {
suite.Equal(collection, replica.GetCollectionID()) suite.Equal(collection, replica.GetCollectionID())
suite.Equal(replica, mgr.Get(replica.GetID())) suite.Equal(replica, mgr.Get(replica.GetID()))
suite.Equal(replica.Replica.Nodes, replica.Nodes.Collect()) suite.Equal(replica.Replica.GetNodes(), replica.GetNodes())
replicaNodes[replica.GetID()] = replica.Replica.Nodes replicaNodes[replica.GetID()] = replica.Replica.GetNodes()
nodes = append(nodes, replica.Replica.Nodes...) nodes = append(nodes, replica.Replica.Nodes...)
} }
suite.Len(nodes, int(suite.replicaNumbers[i])) suite.Len(nodes, int(suite.replicaNumbers[i]))
@ -137,9 +137,9 @@ func (suite *ReplicaManagerSuite) TestRecover() {
suite.NotNil(replica) suite.NotNil(replica)
suite.EqualValues(1000, replica.CollectionID) suite.EqualValues(1000, replica.CollectionID)
suite.EqualValues([]int64{1, 2, 3}, replica.Replica.Nodes) suite.EqualValues([]int64{1, 2, 3}, replica.Replica.Nodes)
suite.Len(replica.Nodes, len(replica.Replica.GetNodes())) suite.Len(replica.GetNodes(), len(replica.Replica.GetNodes()))
for _, node := range replica.Replica.GetNodes() { for _, node := range replica.Replica.GetNodes() {
suite.True(replica.Nodes.Contain(node)) suite.True(replica.Contains(node))
} }
} }
@ -175,7 +175,7 @@ func (suite *ReplicaManagerSuite) TestNodeManipulate() {
suite.NoError(err) suite.NoError(err)
replica = mgr.GetByCollectionAndNode(collection, newNode) replica = mgr.GetByCollectionAndNode(collection, newNode)
suite.Contains(replica.Nodes, newNode) suite.Contains(replica.GetNodes(), newNode)
suite.Contains(replica.Replica.GetNodes(), newNode) suite.Contains(replica.Replica.GetNodes(), newNode)
err = mgr.RemoveNode(replica.GetID(), firstNode) err = mgr.RemoveNode(replica.GetID(), firstNode)
@ -192,7 +192,7 @@ func (suite *ReplicaManagerSuite) TestNodeManipulate() {
suite.Nil(replica) suite.Nil(replica)
replica = mgr.GetByCollectionAndNode(collection, newNode) replica = mgr.GetByCollectionAndNode(collection, newNode)
suite.Contains(replica.Nodes, newNode) suite.Contains(replica.GetNodes(), newNode)
suite.Contains(replica.Replica.GetNodes(), newNode) suite.Contains(replica.Replica.GetNodes(), newNode)
} }
} }
@ -201,7 +201,7 @@ func (suite *ReplicaManagerSuite) spawnAndPutAll() {
mgr := suite.mgr mgr := suite.mgr
for i, collection := range suite.collections { for i, collection := range suite.collections {
replicas, err := mgr.Spawn(collection, suite.replicaNumbers[i]) replicas, err := mgr.Spawn(collection, suite.replicaNumbers[i], DefaultResourceGroupName)
suite.NoError(err) suite.NoError(err)
suite.Len(replicas, int(suite.replicaNumbers[i])) suite.Len(replicas, int(suite.replicaNumbers[i]))
for j, replica := range replicas { for j, replica := range replicas {
@ -212,6 +212,27 @@ func (suite *ReplicaManagerSuite) spawnAndPutAll() {
} }
} }
func (suite *ReplicaManagerSuite) TestResourceGroup() {
mgr := NewReplicaManager(suite.idAllocator, suite.store)
replica1, err := mgr.spawn(int64(1000), DefaultResourceGroupName)
replica1.AddNode(1)
suite.NoError(err)
mgr.Put(replica1)
replica2, err := mgr.spawn(int64(2000), DefaultResourceGroupName)
replica2.AddNode(1)
suite.NoError(err)
mgr.Put(replica2)
replicas := mgr.GetByResourceGroup(DefaultResourceGroupName)
suite.Len(replicas, 2)
replicas = mgr.GetByCollectionAndRG(int64(1000), DefaultResourceGroupName)
suite.Len(replicas, 1)
rgNames := mgr.GetResourceGroupByCollection(int64(1000))
suite.Len(rgNames, 1)
suite.True(rgNames.Contain(DefaultResourceGroupName))
}
func (suite *ReplicaManagerSuite) clearMemory() { func (suite *ReplicaManagerSuite) clearMemory() {
suite.mgr.replicas = make(map[int64]*Replica) suite.mgr.replicas = make(map[int64]*Replica)
} }

View File

@ -0,0 +1,632 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package meta
import (
"errors"
"sync"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/util/typeutil"
. "github.com/milvus-io/milvus/internal/util/typeutil"
"github.com/samber/lo"
"go.uber.org/zap"
)
var (
ErrNodeAlreadyAssign = errors.New("node already assigned to another resource group")
ErrRGIsFull = errors.New("resource group is full")
ErrRGIsEmpty = errors.New("resource group is empty")
ErrRGNotExist = errors.New("resource group doesn't exist")
ErrRGAlreadyExist = errors.New("resource group already exist")
ErrRGAssignNodeFailed = errors.New("failed to assign node to resource group")
ErrRGUnAssignNodeFailed = errors.New("failed to unassign node from resource group")
ErrSaveResourceGroupToStore = errors.New("failed to save resource group to store")
ErrRemoveResourceGroupFromStore = errors.New("failed to remove resource group from store")
ErrRecoverResourceGroupToStore = errors.New("failed to recover resource group to store")
ErrNodeNotAssignToRG = errors.New("node hasn't been assigned to any resource group")
ErrRGNameIsEmpty = errors.New("resource group name cannot be empty")
ErrDeleteDefaultRG = errors.New("delete default rg is not permitted")
ErrDeleteNonEmptyRG = errors.New("delete non-empty rg is not permitted")
ErrNodeNotExist = errors.New("node does not exist")
ErrNodeStopped = errors.New("node has been stopped")
ErrRGLimit = errors.New("resource group num reaches the limit of 1024")
ErrNodeNotEnough = errors.New("not enough nodes")
)
var DefaultResourceGroupName = "__default_resource_group"
type ResourceGroup struct {
nodes UniqueSet
capacity int
}
func NewResourceGroup(capacity int) *ResourceGroup {
rg := &ResourceGroup{
nodes: typeutil.NewUniqueSet(),
capacity: capacity,
}
return rg
}
// assignNode adds a node to the resource group and grows the group's capacity accordingly
func (rg *ResourceGroup) assignNode(id int64) error {
if rg.containsNode(id) {
return ErrNodeAlreadyAssign
}
rg.nodes.Insert(id)
rg.capacity++
return nil
}
// unassignNode removes a node from the resource group and shrinks the group's capacity accordingly
func (rg *ResourceGroup) unassignNode(id int64) error {
if !rg.containsNode(id) {
// removing a non-existent node should be tolerated
return nil
}
rg.nodes.Remove(id)
rg.capacity--
return nil
}
func (rg *ResourceGroup) handleNodeUp(id int64) error {
if rg.LackOfNodes() == 0 {
return ErrRGIsFull
}
if rg.containsNode(id) {
return ErrNodeAlreadyAssign
}
rg.nodes.Insert(id)
return nil
}
func (rg *ResourceGroup) handleNodeDown(id int64) error {
if !rg.containsNode(id) {
// removing a non-existent node should be tolerated
return nil
}
rg.nodes.Remove(id)
return nil
}
func (rg *ResourceGroup) LackOfNodes() int {
return rg.capacity - len(rg.nodes)
}
func (rg *ResourceGroup) containsNode(id int64) bool {
return rg.nodes.Contain(id)
}
func (rg *ResourceGroup) GetNodes() []int64 {
return rg.nodes.Collect()
}
func (rg *ResourceGroup) GetCapacity() int {
return rg.capacity
}
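
To make the capacity bookkeeping above concrete, an illustrative in-package sketch (values made up): assignNode/unassignNode resize the group, handleNodeUp/handleNodeDown only fill or drain it within the fixed capacity, and LackOfNodes reports the gap.

package meta

import "fmt"

// capacitySketch is not part of the commit; it only illustrates the semantics.
func capacitySketch() {
	rg := NewResourceGroup(0)
	_ = rg.assignNode(1)          // capacity 1, members {1}
	_ = rg.assignNode(2)          // capacity 2, members {1, 2}
	_ = rg.handleNodeDown(2)      // capacity stays 2, members {1}
	fmt.Println(rg.LackOfNodes()) // prints 1: one slot left to refill on node up / auto-recover
}
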
type ResourceManager struct {
groups map[string]*ResourceGroup
store Store
nodeMgr *session.NodeManager
rwmutex sync.RWMutex
}
func NewResourceManager(store Store, nodeMgr *session.NodeManager) *ResourceManager {
groupMap := make(map[string]*ResourceGroup)
groupMap[DefaultResourceGroupName] = NewResourceGroup(1000000)
return &ResourceManager{
groups: groupMap,
store: store,
nodeMgr: nodeMgr,
}
}
func (rm *ResourceManager) AddResourceGroup(rgName string) error {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
if len(rgName) == 0 {
return ErrRGNameIsEmpty
}
if rm.groups[rgName] != nil {
return ErrRGAlreadyExist
}
if len(rm.groups) >= 1024 {
return ErrRGLimit
}
err := rm.store.SaveResourceGroup(&querypb.ResourceGroup{
Name: rgName,
Capacity: 0,
})
if err != nil {
log.Info("failed to add resource group",
zap.String("rgName", rgName),
zap.Error(err),
)
return err
}
rm.groups[rgName] = NewResourceGroup(0)
log.Info("add resource group",
zap.String("rgName", rgName),
)
return nil
}
func (rm *ResourceManager) RemoveResourceGroup(rgName string) error {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
if rgName == DefaultResourceGroupName {
return ErrDeleteDefaultRG
}
if rm.groups[rgName] == nil {
// deleting a non-existent rg should be tolerated
return nil
}
if rm.groups[rgName].GetCapacity() != 0 {
return ErrDeleteNonEmptyRG
}
err := rm.store.RemoveResourceGroup(rgName)
if err != nil {
log.Info("failed to remove resource group",
zap.String("rgName", rgName),
zap.Error(err),
)
return err
}
delete(rm.groups, rgName)
log.Info("remove resource group",
zap.String("rgName", rgName),
)
return nil
}
func (rm *ResourceManager) AssignNode(rgName string, node int64) error {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
return rm.assignNode(rgName, node)
}
func (rm *ResourceManager) assignNode(rgName string, node int64) error {
if rm.groups[rgName] == nil {
return ErrRGNotExist
}
if rm.nodeMgr.Get(node) == nil {
return ErrNodeNotExist
}
if ok, _ := rm.nodeMgr.IsStoppingNode(node); ok {
return ErrNodeStopped
}
rm.checkRGNodeStatus(rgName)
if rm.checkNodeAssigned(node) {
return ErrNodeAlreadyAssign
}
newNodes := rm.groups[rgName].GetNodes()
newNodes = append(newNodes, node)
err := rm.store.SaveResourceGroup(&querypb.ResourceGroup{
Name: rgName,
Capacity: int32(rm.groups[rgName].GetCapacity()) + 1,
Nodes: newNodes,
})
if err != nil {
log.Info("failed to add node to resource group",
zap.String("rgName", rgName),
zap.Int64("node", node),
zap.Error(err),
)
return err
}
err = rm.groups[rgName].assignNode(node)
if err != nil {
return err
}
log.Info("add node to resource group",
zap.String("rgName", rgName),
zap.Int64("node", node),
)
return nil
}
func (rm *ResourceManager) checkNodeAssigned(node int64) bool {
for _, group := range rm.groups {
if group.containsNode(node) {
return true
}
}
return false
}
func (rm *ResourceManager) UnassignNode(rgName string, node int64) error {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
return rm.unassignNode(rgName, node)
}
func (rm *ResourceManager) unassignNode(rgName string, node int64) error {
if rm.groups[rgName] == nil {
return ErrRGNotExist
}
if rm.nodeMgr.Get(node) == nil {
// removing a non-existent node should be tolerated
return nil
}
newNodes := make([]int64, 0)
for nid := range rm.groups[rgName].nodes {
if nid != node {
newNodes = append(newNodes, nid)
}
}
err := rm.store.SaveResourceGroup(&querypb.ResourceGroup{
Name: rgName,
Capacity: int32(rm.groups[rgName].GetCapacity()) - 1,
Nodes: newNodes,
})
if err != nil {
log.Info("remove node from resource group",
zap.String("rgName", rgName),
zap.Int64("node", node),
zap.Error(err),
)
return err
}
rm.checkRGNodeStatus(rgName)
err = rm.groups[rgName].unassignNode(node)
if err != nil {
return err
}
log.Info("remove node from resource group",
zap.String("rgName", rgName),
zap.Int64("node", node),
)
return nil
}
func (rm *ResourceManager) GetNodes(rgName string) ([]int64, error) {
rm.rwmutex.RLock()
defer rm.rwmutex.RUnlock()
if rm.groups[rgName] == nil {
return nil, ErrRGNotExist
}
rm.checkRGNodeStatus(rgName)
return rm.groups[rgName].GetNodes(), nil
}
// CheckOutboundNodes returns all nodes in the replica that fall outside the replica's resource group
func (rm *ResourceManager) CheckOutboundNodes(replica *Replica) typeutil.UniqueSet {
rm.rwmutex.RLock()
defer rm.rwmutex.RUnlock()
if rm.groups[replica.GetResourceGroup()] == nil {
return typeutil.NewUniqueSet()
}
rg := rm.groups[replica.GetResourceGroup()]
ret := typeutil.NewUniqueSet()
for _, node := range replica.GetNodes() {
if !rg.containsNode(node) {
ret.Insert(node)
}
}
return ret
}
// GetOutgoingNodeNumByReplica returns, for each other resource group, how many of this replica's nodes currently sit in it
func (rm *ResourceManager) GetOutgoingNodeNumByReplica(replica *Replica) map[string]int32 {
rm.rwmutex.RLock()
defer rm.rwmutex.RUnlock()
if rm.groups[replica.GetResourceGroup()] == nil {
return nil
}
rg := rm.groups[replica.GetResourceGroup()]
ret := make(map[string]int32)
for _, node := range replica.GetNodes() {
if !rg.containsNode(node) {
rgName, err := rm.findResourceGroupByNode(node)
if err == nil {
ret[rgName]++
}
}
}
return ret
}
func (rm *ResourceManager) ContainsNode(rgName string, node int64) bool {
rm.rwmutex.RLock()
defer rm.rwmutex.RUnlock()
if rm.groups[rgName] == nil {
return false
}
rm.checkRGNodeStatus(rgName)
return rm.groups[rgName].containsNode(node)
}
func (rm *ResourceManager) ContainResourceGroup(rgName string) bool {
rm.rwmutex.RLock()
defer rm.rwmutex.RUnlock()
return rm.groups[rgName] != nil
}
func (rm *ResourceManager) GetResourceGroup(rgName string) (*ResourceGroup, error) {
rm.rwmutex.RLock()
defer rm.rwmutex.RUnlock()
if rm.groups[rgName] == nil {
return nil, ErrRGNotExist
}
rm.checkRGNodeStatus(rgName)
return rm.groups[rgName], nil
}
func (rm *ResourceManager) ListResourceGroups() []string {
rm.rwmutex.RLock()
defer rm.rwmutex.RUnlock()
return lo.Keys(rm.groups)
}
func (rm *ResourceManager) FindResourceGroupByNode(node int64) (string, error) {
rm.rwmutex.RLock()
defer rm.rwmutex.RUnlock()
return rm.findResourceGroupByNode(node)
}
func (rm *ResourceManager) findResourceGroupByNode(node int64) (string, error) {
for name, group := range rm.groups {
if group.containsNode(node) {
return name, nil
}
}
return "", ErrNodeNotAssignToRG
}
func (rm *ResourceManager) HandleNodeUp(node int64) (string, error) {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
if rm.nodeMgr.Get(node) == nil {
return "", ErrNodeNotExist
}
if ok, _ := rm.nodeMgr.IsStoppingNode(node); ok {
return "", ErrNodeStopped
}
// if the node is already assigned to a rg, keep it there
rgName, err := rm.findResourceGroupByNode(node)
if err == nil {
log.Info("HandleNodeUp: node already assign to resource group",
zap.String("rgName", rgName),
zap.Int64("node", node),
)
return rgName, nil
}
// add new node to default rg
rm.groups[DefaultResourceGroupName].handleNodeUp(node)
log.Info("HandleNodeUp: assign node to default resource group",
zap.String("rgName", DefaultResourceGroupName),
zap.Int64("node", node),
)
return DefaultResourceGroupName, nil
}
func (rm *ResourceManager) HandleNodeDown(node int64) (string, error) {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
if rm.nodeMgr.Get(node) == nil {
return "", ErrNodeNotExist
}
rgName, err := rm.findResourceGroupByNode(node)
if err == nil {
log.Info("HandleNodeDown: remove node from resource group",
zap.String("rgName", rgName),
zap.Int64("node", node),
)
return rgName, rm.groups[rgName].handleNodeDown(node)
}
return "", ErrNodeNotAssignToRG
}
func (rm *ResourceManager) TransferNode(from, to string) error {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
if rm.groups[from] == nil || rm.groups[to] == nil {
return ErrRGNotExist
}
if len(rm.groups[from].nodes) == 0 {
return ErrRGIsEmpty
}
rm.checkRGNodeStatus(from)
rm.checkRGNodeStatus(to)
// TODO: a better way to choose the node with the least balancing cost
node := rm.groups[from].GetNodes()[0]
if err := rm.transferNodeInStore(from, to, node); err != nil {
return err
}
err := rm.groups[from].unassignNode(node)
if err != nil {
// interrupt transfer, unreachable logic path
return err
}
err = rm.groups[to].assignNode(node)
if err != nil {
// interrupt transfer, unreachable logic path
return err
}
return nil
}
func (rm *ResourceManager) transferNodeInStore(from string, to string, node int64) error {
fromNodeList := make([]int64, 0)
for nid := range rm.groups[from].nodes {
if nid != node {
fromNodeList = append(fromNodeList, nid)
}
}
toNodeList := rm.groups[to].GetNodes()
toNodeList = append(toNodeList, node)
fromRG := &querypb.ResourceGroup{
Name: from,
Capacity: int32(rm.groups[from].GetCapacity()) - 1,
Nodes: fromNodeList,
}
toRG := &querypb.ResourceGroup{
Name: to,
Capacity: int32(rm.groups[to].GetCapacity()) + 1,
Nodes: toNodeList,
}
return rm.store.SaveResourceGroup(fromRG, toRG)
}
// AutoRecoverResourceGroup refills the rg from the default rg and returns the number of nodes the rg lacked before recovery
func (rm *ResourceManager) AutoRecoverResourceGroup(rgName string) (int, error) {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
if rm.groups[rgName] == nil {
return 0, ErrRGNotExist
}
rm.checkRGNodeStatus(rgName)
lackNodesNum := rm.groups[rgName].LackOfNodes()
nodesInDefault := rm.groups[DefaultResourceGroupName].GetNodes()
for i := 0; i < len(nodesInDefault) && i < lackNodesNum; i++ {
// TODO: a better way to choose the node with the least balancing cost
node := nodesInDefault[i]
err := rm.unassignNode(DefaultResourceGroupName, node)
if err != nil {
// interrupt transfer, unreachable logic path
return i + 1, err
}
err = rm.groups[rgName].handleNodeUp(node)
if err != nil {
// roll back, unreachable logic path
rm.assignNode(DefaultResourceGroupName, node)
}
}
return lackNodesNum, nil
}
func (rm *ResourceManager) Recover() error {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
rgs, err := rm.store.GetResourceGroups()
if err != nil {
return ErrRecoverResourceGroupToStore
}
for _, rg := range rgs {
rm.groups[rg.GetName()] = NewResourceGroup(0)
for _, node := range rg.GetNodes() {
rm.groups[rg.GetName()].assignNode(node)
}
rm.checkRGNodeStatus(rg.GetName())
log.Info("Recover resource group",
zap.String("rgName", rg.GetName()),
zap.Int64s("nodes", rg.GetNodes()),
zap.Int32("capacity", rg.GetCapacity()),
)
}
return nil
}
// every operation that involves node access should check node status first
func (rm *ResourceManager) checkRGNodeStatus(rgName string) {
for _, node := range rm.groups[rgName].GetNodes() {
if rm.nodeMgr.Get(node) == nil {
log.Info("found node down, remove it",
zap.String("rgName", rgName),
zap.Int64("nodeID", node),
)
rm.groups[rgName].handleNodeDown(node)
}
}
}
// CheckLackOfNode returns the number of nodes the rg still lacks
func (rm *ResourceManager) CheckLackOfNode(rgName string) int {
rm.rwmutex.Lock()
defer rm.rwmutex.Unlock()
if rm.groups[rgName] == nil {
return 0
}
rm.checkRGNodeStatus(rgName)
return rm.groups[rgName].LackOfNodes()
}
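
A hedged end-to-end sketch of the intended flow (node IDs and the "rg1" name are illustrative; it assumes the nodes are already registered in the NodeManager and that the default group holds at least one node for the transfer):

package example // hypothetical package, for illustration only

import (
	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
	"github.com/milvus-io/milvus/internal/querycoordv2/session"
)

func resourceGroupFlow(store meta.Store, nodeMgr *session.NodeManager) error {
	rm := meta.NewResourceManager(store, nodeMgr)

	// declare a user-defined group and move a registered node into it
	if err := rm.AddResourceGroup("rg1"); err != nil {
		return err
	}
	if err := rm.AssignNode("rg1", 1); err != nil { // node 1 must already be known to nodeMgr
		return err
	}

	// borrow one more node from the default group
	if err := rm.TransferNode(meta.DefaultResourceGroupName, "rg1"); err != nil {
		return err
	}

	// after node failures, top the group back up from the default group
	if rm.CheckLackOfNode("rg1") > 0 {
		if _, err := rm.AutoRecoverResourceGroup("rg1"); err != nil {
			return err
		}
	}
	return nil
}
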

View File

@ -0,0 +1,294 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package meta
import (
"testing"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/proto/querypb"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/util/etcd"
"github.com/milvus-io/milvus/internal/util/typeutil"
"github.com/stretchr/testify/suite"
)
type ResourceManagerSuite struct {
suite.Suite
kv *etcdkv.EtcdKV
manager *ResourceManager
}
func (suite *ResourceManagerSuite) SetupSuite() {
Params.Init()
}
func (suite *ResourceManagerSuite) SetupTest() {
config := GenerateEtcdConfig()
cli, err := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
suite.Require().NoError(err)
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
store := NewMetaStore(suite.kv)
suite.manager = NewResourceManager(store, session.NewNodeManager())
}
func (suite *ResourceManagerSuite) TestManipulateResourceGroup() {
// test add rg
err := suite.manager.AddResourceGroup("rg1")
suite.NoError(err)
suite.True(suite.manager.ContainResourceGroup("rg1"))
suite.Len(suite.manager.ListResourceGroups(), 2)
// test add duplicate rg
err = suite.manager.AddResourceGroup("rg1")
suite.ErrorIs(err, ErrRGAlreadyExist)
// test delete rg
err = suite.manager.RemoveResourceGroup("rg1")
suite.NoError(err)
// test delete rg which doesn't exist
err = suite.manager.RemoveResourceGroup("rg1")
suite.NoError(err)
// test delete default rg
err = suite.manager.RemoveResourceGroup(DefaultResourceGroupName)
suite.ErrorIs(err, ErrDeleteDefaultRG)
}
func (suite *ResourceManagerSuite) TestManipulateNode() {
suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
err := suite.manager.AddResourceGroup("rg1")
suite.NoError(err)
// test add node to rg
err = suite.manager.AssignNode("rg1", 1)
suite.NoError(err)
// test add non-exist node to rg
err = suite.manager.AssignNode("rg1", 2)
suite.ErrorIs(err, ErrNodeNotExist)
// test add node to non-exist rg
err = suite.manager.AssignNode("rg2", 1)
suite.ErrorIs(err, ErrRGNotExist)
// test remove node from rg
err = suite.manager.UnassignNode("rg1", 1)
suite.NoError(err)
// test remove non-exist node from rg
err = suite.manager.UnassignNode("rg1", 2)
suite.NoError(err)
// test remove node from non-exist rg
err = suite.manager.UnassignNode("rg2", 1)
suite.ErrorIs(err, ErrRGNotExist)
// add node which already assign to rg to another rg
err = suite.manager.AddResourceGroup("rg2")
suite.NoError(err)
err = suite.manager.AssignNode("rg1", 1)
suite.NoError(err)
err = suite.manager.AssignNode("rg2", 1)
suite.ErrorIs(err, ErrNodeAlreadyAssign)
// transfer node between rgs
err = suite.manager.TransferNode("rg1", "rg2")
suite.NoError(err)
// transfer meet non exist rg
err = suite.manager.TransferNode("rgggg", "rg2")
suite.ErrorIs(err, ErrRGNotExist)
}
func (suite *ResourceManagerSuite) TestHandleNodeUp() {
suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(100, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(101, "localhost"))
err := suite.manager.AddResourceGroup("rg1")
suite.NoError(err)
suite.manager.AssignNode("rg1", 1)
suite.manager.AssignNode("rg1", 2)
suite.manager.AssignNode("rg1", 3)
// test that the query node id does not change; expect it to be assigned back to its original rg
rg, err := suite.manager.GetResourceGroup("rg1")
suite.NoError(err)
suite.Equal(rg.GetCapacity(), 3)
suite.Equal(len(rg.GetNodes()), 3)
suite.manager.HandleNodeUp(1)
suite.Equal(rg.GetCapacity(), 3)
suite.Equal(len(rg.GetNodes()), 3)
suite.manager.HandleNodeDown(2)
rg, err = suite.manager.GetResourceGroup("rg1")
suite.NoError(err)
suite.Equal(rg.GetCapacity(), 3)
suite.Equal(len(rg.GetNodes()), 2)
suite.NoError(err)
defaultRG, err := suite.manager.GetResourceGroup(DefaultResourceGroupName)
suite.NoError(err)
oldNodesNum := len(defaultRG.GetNodes())
suite.manager.HandleNodeUp(101)
rg, err = suite.manager.GetResourceGroup("rg1")
suite.NoError(err)
suite.Equal(rg.GetCapacity(), 3)
suite.Equal(len(rg.GetNodes()), 2)
suite.False(suite.manager.ContainsNode("rg1", 101))
nodes, err := suite.manager.GetNodes(DefaultResourceGroupName)
suite.NoError(err)
suite.Equal(len(nodes), oldNodesNum+1)
}
func (suite *ResourceManagerSuite) TestRecover() {
suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
err := suite.manager.AddResourceGroup("rg")
suite.NoError(err)
suite.manager.AssignNode("rg", 1)
suite.manager.AssignNode("rg", 2)
suite.manager.AssignNode("rg", 3)
suite.manager.UnassignNode("rg", 3)
// clear the resource manager in a hacky way to simulate a restart before Recover
delete(suite.manager.groups, "rg")
delete(suite.manager.groups, DefaultResourceGroupName)
suite.manager.Recover()
rg, err := suite.manager.GetResourceGroup("rg")
suite.NoError(err)
suite.Equal(2, rg.GetCapacity())
suite.True(suite.manager.ContainsNode("rg", 1))
suite.True(suite.manager.ContainsNode("rg", 2))
suite.False(suite.manager.ContainsNode("rg", 3))
}
func (suite *ResourceManagerSuite) TestCheckOutboundNodes() {
suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
err := suite.manager.AddResourceGroup("rg")
suite.NoError(err)
suite.manager.AssignNode("rg", 1)
suite.manager.AssignNode("rg", 2)
suite.manager.AssignNode("rg", 3)
replica := NewReplica(
&querypb.Replica{
ID: 1,
CollectionID: 1,
Nodes: []int64{1, 2, 3, 4},
ResourceGroup: "rg",
},
typeutil.NewUniqueSet(1, 2, 3, 4),
)
outboundNodes := suite.manager.CheckOutboundNodes(replica)
suite.Len(outboundNodes, 1)
suite.True(outboundNodes.Contain(4))
}
func (suite *ResourceManagerSuite) TestCheckResourceGroup() {
suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
err := suite.manager.AddResourceGroup("rg")
suite.NoError(err)
suite.manager.AssignNode("rg", 1)
suite.manager.AssignNode("rg", 2)
suite.manager.AssignNode("rg", 3)
suite.manager.HandleNodeDown(1)
lackNodes := suite.manager.CheckLackOfNode("rg")
suite.Equal(lackNodes, 1)
suite.manager.nodeMgr.Remove(2)
suite.manager.checkRGNodeStatus("rg")
lackNodes = suite.manager.CheckLackOfNode("rg")
suite.Equal(lackNodes, 2)
rg, err := suite.manager.FindResourceGroupByNode(3)
suite.NoError(err)
suite.Equal(rg, "rg")
}
func (suite *ResourceManagerSuite) TestGetOutboundNode() {
suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
suite.manager.AddResourceGroup("rg")
suite.manager.AddResourceGroup("rg1")
suite.manager.AssignNode("rg", 1)
suite.manager.AssignNode("rg", 2)
suite.manager.AssignNode("rg1", 3)
replica := NewReplica(
&querypb.Replica{
ID: 1,
CollectionID: 100,
ResourceGroup: "rg",
Nodes: []int64{1, 2, 3},
},
typeutil.NewUniqueSet(1, 2, 3),
)
outgoingNodes := suite.manager.GetOutgoingNodeNumByReplica(replica)
suite.NotNil(outgoingNodes)
suite.Len(outgoingNodes, 1)
suite.NotNil(outgoingNodes["rg1"])
suite.Equal(outgoingNodes["rg1"], int32(1))
}
func (suite *ResourceManagerSuite) TestAutoRecover() {
suite.manager.nodeMgr.Add(session.NewNodeInfo(1, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(2, "localhost"))
suite.manager.nodeMgr.Add(session.NewNodeInfo(3, "localhost"))
err := suite.manager.AddResourceGroup("rg")
suite.NoError(err)
suite.manager.AssignNode(DefaultResourceGroupName, 1)
suite.manager.AssignNode(DefaultResourceGroupName, 2)
suite.manager.AssignNode("rg", 3)
suite.manager.HandleNodeDown(3)
lackNodes := suite.manager.CheckLackOfNode("rg")
suite.Equal(lackNodes, 1)
suite.manager.AutoRecoverResourceGroup("rg")
lackNodes = suite.manager.CheckLackOfNode("rg")
suite.Equal(lackNodes, 0)
}
func (suite *ResourceManagerSuite) TearDownSuite() {
suite.kv.Close()
}
func TestResourceManager(t *testing.T) {
suite.Run(t, new(ResourceManagerSuite))
}

View File

@ -150,7 +150,7 @@ func (m *SegmentDistManager) GetByShardWithReplica(shard string, replica *Replic
ret := make([]*Segment, 0) ret := make([]*Segment, 0)
for nodeID, segments := range m.segments { for nodeID, segments := range m.segments {
if !replica.Nodes.Contain(nodeID) { if !replica.Contains(nodeID) {
continue continue
} }
for _, segment := range segments { for _, segment := range segments {

View File

@ -28,7 +28,6 @@ import (
"github.com/milvus-io/milvus/internal/kv" "github.com/milvus-io/milvus/internal/kv"
"github.com/milvus-io/milvus/internal/metastore" "github.com/milvus-io/milvus/internal/metastore"
"github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/util"
) )
var ( var (
@ -41,6 +40,7 @@ const (
ReplicaPrefix = "querycoord-replica" ReplicaPrefix = "querycoord-replica"
CollectionMetaPrefixV1 = "queryCoord-collectionMeta" CollectionMetaPrefixV1 = "queryCoord-collectionMeta"
ReplicaMetaPrefixV1 = "queryCoord-ReplicaMeta" ReplicaMetaPrefixV1 = "queryCoord-ReplicaMeta"
ResourceGroupPrefix = "queryCoord-ResourceGroup"
) )
type WatchStoreChan = clientv3.WatchChan type WatchStoreChan = clientv3.WatchChan
@ -91,6 +91,26 @@ func (s metaStore) SaveReplica(replica *querypb.Replica) error {
return s.cli.Save(key, string(value)) return s.cli.Save(key, string(value))
} }
func (s metaStore) SaveResourceGroup(rgs ...*querypb.ResourceGroup) error {
ret := make(map[string]string)
for _, rg := range rgs {
key := encodeResourceGroupKey(rg.GetName())
value, err := proto.Marshal(rg)
if err != nil {
return err
}
ret[key] = string(value)
}
return s.cli.MultiSave(ret)
}
func (s metaStore) RemoveResourceGroup(rgName string) error {
key := encodeResourceGroupKey(rgName)
return s.cli.Remove(key)
}
func (s metaStore) GetCollections() ([]*querypb.CollectionLoadInfo, error) { func (s metaStore) GetCollections() ([]*querypb.CollectionLoadInfo, error) {
_, values, err := s.cli.LoadWithPrefix(CollectionLoadInfoPrefix) _, values, err := s.cli.LoadWithPrefix(CollectionLoadInfoPrefix)
if err != nil { if err != nil {
@ -171,6 +191,25 @@ func (s metaStore) getReplicasFromV1() ([]*querypb.Replica, error) {
return ret, nil return ret, nil
} }
func (s metaStore) GetResourceGroups() ([]*querypb.ResourceGroup, error) {
_, rgs, err := s.cli.LoadWithPrefix(ResourceGroupPrefix)
if err != nil {
return nil, err
}
ret := make([]*querypb.ResourceGroup, 0, len(rgs))
for _, value := range rgs {
rg := &querypb.ResourceGroup{}
err := proto.Unmarshal([]byte(value), rg)
if err != nil {
return nil, err
}
ret = append(ret, rg)
}
return ret, nil
}
func (s metaStore) ReleaseCollection(id int64) error { func (s metaStore) ReleaseCollection(id int64) error {
k := encodeCollectionLoadInfoKey(id) k := encodeCollectionLoadInfoKey(id)
return s.cli.Remove(k) return s.cli.Remove(k)
@ -209,6 +248,6 @@ func encodeCollectionReplicaKey(collection int64) string {
return fmt.Sprintf("%s/%d", ReplicaPrefix, collection) return fmt.Sprintf("%s/%d", ReplicaPrefix, collection)
} }
func encodeHandoffEventKey(collection, partition, segment int64) string { func encodeResourceGroupKey(rgName string) string {
return fmt.Sprintf("%s/%d/%d/%d", util.HandoffSegmentPrefix, collection, partition, segment) return fmt.Sprintf("%s/%s", ResourceGroupPrefix, rgName)
} }
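
For reference, an illustrative in-package helper (not part of the commit): resource groups are batch-written in one MultiSave under "queryCoord-ResourceGroup/<name>" and read back with the prefix scan.

package meta

import "github.com/milvus-io/milvus/internal/proto/querypb"

func saveAndReloadRGs(s Store) ([]*querypb.ResourceGroup, error) {
	err := s.SaveResourceGroup(
		&querypb.ResourceGroup{Name: "rg1", Capacity: 1, Nodes: []int64{1}},
		&querypb.ResourceGroup{Name: "rg2", Capacity: 0, Nodes: []int64{}},
	)
	if err != nil {
		return nil, err
	}
	return s.GetResourceGroups()
}
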

View File

@ -17,22 +17,151 @@
package meta package meta
import ( import (
"sort"
"testing" "testing"
"github.com/milvus-io/milvus/internal/kv"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/proto/querypb"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/util/etcd"
"github.com/stretchr/testify/suite" "github.com/stretchr/testify/suite"
) )
type StoreTestSuite struct { type StoreTestSuite struct {
suite.Suite suite.Suite
kv kv.MetaKv
store metaStore store metaStore
} }
func (suite *StoreTestSuite) SetupTest() { func (suite *StoreTestSuite) SetupSuite() {
//kv := memkv.NewMemoryKV() Params.Init()
//suite.store = NewMetaStore(kv)
} }
func (suite *StoreTestSuite) TearDownTest() {} func (suite *StoreTestSuite) SetupTest() {
config := GenerateEtcdConfig()
cli, err := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
suite.Require().NoError(err)
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
suite.store = NewMetaStore(suite.kv)
}
func (suite *StoreTestSuite) TearDownTest() {
if suite.kv != nil {
suite.kv.Close()
}
}
func (suite *StoreTestSuite) TestCollection() {
suite.store.SaveCollection(&querypb.CollectionLoadInfo{
CollectionID: 1,
})
suite.store.SaveCollection(&querypb.CollectionLoadInfo{
CollectionID: 2,
})
suite.store.SaveCollection(&querypb.CollectionLoadInfo{
CollectionID: 3,
})
suite.store.ReleaseCollection(1)
suite.store.ReleaseCollection(2)
collections, err := suite.store.GetCollections()
suite.NoError(err)
suite.Len(collections, 1)
}
func (suite *StoreTestSuite) TestPartition() {
suite.store.SavePartition(&querypb.PartitionLoadInfo{
PartitionID: 1,
})
suite.store.SavePartition(&querypb.PartitionLoadInfo{
PartitionID: 2,
})
suite.store.SavePartition(&querypb.PartitionLoadInfo{
PartitionID: 3,
})
suite.store.ReleasePartition(1)
suite.store.ReleasePartition(2)
partitions, err := suite.store.GetPartitions()
suite.NoError(err)
suite.Len(partitions, 1)
}
func (suite *StoreTestSuite) TestReplica() {
suite.store.SaveReplica(&querypb.Replica{
CollectionID: 1,
ID: 1,
})
suite.store.SaveReplica(&querypb.Replica{
CollectionID: 1,
ID: 2,
})
suite.store.SaveReplica(&querypb.Replica{
CollectionID: 1,
ID: 3,
})
suite.store.ReleaseReplica(1, 1)
suite.store.ReleaseReplica(1, 2)
replicas, err := suite.store.GetReplicas()
suite.NoError(err)
suite.Len(replicas, 1)
}
func (suite *StoreTestSuite) TestResourceGroup() {
suite.store.SaveResourceGroup(&querypb.ResourceGroup{
Name: "rg1",
Capacity: 3,
Nodes: []int64{1, 2, 3},
})
suite.store.SaveResourceGroup(&querypb.ResourceGroup{
Name: "rg2",
Capacity: 3,
Nodes: []int64{4, 5},
})
suite.store.SaveResourceGroup(&querypb.ResourceGroup{
Name: "rg3",
Capacity: 0,
Nodes: []int64{},
})
suite.store.RemoveResourceGroup("rg3")
groups, err := suite.store.GetResourceGroups()
suite.NoError(err)
suite.Len(groups, 2)
sort.Slice(groups, func(i, j int) bool {
return groups[i].GetName() < groups[j].GetName()
})
suite.Equal("rg1", groups[0].GetName())
suite.Equal(int32(3), groups[0].GetCapacity())
suite.Equal([]int64{1, 2, 3}, groups[0].GetNodes())
suite.Equal("rg2", groups[1].GetName())
suite.Equal(int32(3), groups[1].GetCapacity())
suite.Equal([]int64{4, 5}, groups[1].GetNodes())
}
func (suite *StoreTestSuite) TestLoadRelease() { func (suite *StoreTestSuite) TestLoadRelease() {
// TODO(sunby): add ut // TODO(sunby): add ut

View File

@ -27,6 +27,7 @@ import (
"github.com/milvus-io/milvus/internal/proto/datapb" "github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/proto/querypb"
. "github.com/milvus-io/milvus/internal/querycoordv2/params" . "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/util/etcd" "github.com/milvus-io/milvus/internal/util/etcd"
"github.com/milvus-io/milvus/internal/util/typeutil" "github.com/milvus-io/milvus/internal/util/typeutil"
) )
@ -101,7 +102,7 @@ func (suite *TargetManagerSuite) SetupTest() {
// meta // meta
store := NewMetaStore(suite.kv) store := NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator() idAllocator := RandomIncrementIDAllocator()
suite.meta = NewMeta(idAllocator, store) suite.meta = NewMeta(idAllocator, store, session.NewNodeManager())
suite.broker = NewMockBroker(suite.T()) suite.broker = NewMockBroker(suite.T())
suite.mgr = NewTargetManager(suite.broker, suite.meta) suite.mgr = NewTargetManager(suite.broker, suite.meta)

View File

@ -32,6 +32,7 @@ import (
"github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params" . "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/util/etcd" "github.com/milvus-io/milvus/internal/util/etcd"
"github.com/milvus-io/milvus/internal/util/paramtable" "github.com/milvus-io/milvus/internal/util/paramtable"
) )
@ -178,7 +179,7 @@ func (suite *CollectionObserverSuite) SetupTest() {
// Dependencies // Dependencies
suite.dist = meta.NewDistributionManager() suite.dist = meta.NewDistributionManager()
suite.meta = meta.NewMeta(suite.idAllocator, suite.store) suite.meta = meta.NewMeta(suite.idAllocator, suite.store, session.NewNodeManager())
suite.broker = meta.NewMockBroker(suite.T()) suite.broker = meta.NewMockBroker(suite.T())
suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta) suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta)
suite.targetObserver = NewTargetObserver(suite.meta, suite.targetObserver = NewTargetObserver(suite.meta,
@ -323,7 +324,7 @@ func (suite *CollectionObserverSuite) loadAll() {
func (suite *CollectionObserverSuite) load(collection int64) { func (suite *CollectionObserverSuite) load(collection int64) {
// Mock meta data // Mock meta data
replicas, err := suite.meta.ReplicaManager.Spawn(collection, suite.replicaNumber[collection]) replicas, err := suite.meta.ReplicaManager.Spawn(collection, suite.replicaNumber[collection], meta.DefaultResourceGroupName)
suite.NoError(err) suite.NoError(err)
for _, replica := range replicas { for _, replica := range replicas {
replica.AddNode(suite.nodes...) replica.AddNode(suite.nodes...)

View File

@ -67,7 +67,7 @@ func (suite *LeaderObserverTestSuite) SetupTest() {
// meta // meta
store := meta.NewMetaStore(suite.kv) store := meta.NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator() idAllocator := RandomIncrementIDAllocator()
suite.meta = meta.NewMeta(idAllocator, store) suite.meta = meta.NewMeta(idAllocator, store, session.NewNodeManager())
suite.broker = meta.NewMockBroker(suite.T()) suite.broker = meta.NewMockBroker(suite.T())
suite.mockCluster = session.NewMockCluster(suite.T()) suite.mockCluster = session.NewMockCluster(suite.T())

View File

@ -0,0 +1,112 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package observers
import (
"context"
"sync"
"time"
"go.uber.org/zap"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
"github.com/milvus-io/milvus/internal/querycoordv2/params"
)
// ReplicaObserver checks replicas, finds outbound nodes, and removes them from a replica once all of their segments/channels have been moved away
type ReplicaObserver struct {
c chan struct{}
wg sync.WaitGroup
meta *meta.Meta
distMgr *meta.DistributionManager
stopOnce sync.Once
}
func NewReplicaObserver(meta *meta.Meta, distMgr *meta.DistributionManager) *ReplicaObserver {
return &ReplicaObserver{
c: make(chan struct{}),
meta: meta,
distMgr: distMgr,
}
}
func (ob *ReplicaObserver) Start(ctx context.Context) {
ob.wg.Add(1)
go ob.schedule(ctx)
}
func (ob *ReplicaObserver) Stop() {
ob.stopOnce.Do(func() {
close(ob.c)
ob.wg.Wait()
})
}
func (ob *ReplicaObserver) schedule(ctx context.Context) {
defer ob.wg.Done()
log.Info("Start check replica loop")
ticker := time.NewTicker(params.Params.QueryCoordCfg.CheckNodeInReplicaInterval.GetAsDuration(time.Second))
for {
select {
case <-ctx.Done():
log.Info("Close replica observer due to context canceled")
return
case <-ob.c:
log.Info("Close replica observer")
return
case <-ticker.C:
ob.checkNodesInReplica()
}
}
}
func (ob *ReplicaObserver) checkNodesInReplica() {
collections := ob.meta.GetAll()
for _, collectionID := range collections {
replicas := ob.meta.ReplicaManager.GetByCollection(collectionID)
for _, replica := range replicas {
outboundNodes := ob.meta.ResourceManager.CheckOutboundNodes(replica)
if len(outboundNodes) > 0 {
log.RatedInfo(10, "found outbound nodes in replica",
zap.Int64("collectionID", replica.GetCollectionID()),
zap.Int64("replicaID", replica.GetCollectionID()),
zap.Int64s("allOutboundNodes", outboundNodes.Collect()),
)
for node := range outboundNodes {
channels := ob.distMgr.ChannelDistManager.GetByCollectionAndNode(collectionID, node)
segments := ob.distMgr.SegmentDistManager.GetByCollectionAndNode(collectionID, node)
if len(channels) == 0 && len(segments) == 0 {
replica.RemoveNode(node)
log.Info("all segment/channel has been removed from outbound node, remove it from replica",
zap.Int64("collectionID", replica.GetCollectionID()),
zap.Int64("replicaID", replica.GetCollectionID()),
zap.Int64("removedNodes", node),
zap.Int64s("availableNodes", replica.GetNodes()),
)
}
}
}
}
}
}
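For orientation, a minimal wiring sketch (not part of this commit) of how the observer above is started and shut down. It assumes imports of context, the observers package, and the meta package, with m and dist built the same way server.go does later in this diff:

// startReplicaObserver is a hypothetical helper; it only forwards to the
// constructor and Start/Stop shown above.
func startReplicaObserver(ctx context.Context, m *meta.Meta, dist *meta.DistributionManager) *observers.ReplicaObserver {
    ob := observers.NewReplicaObserver(m, dist)
    ob.Start(ctx) // runs checkNodesInReplica every CheckNodeInReplicaInterval seconds
    return ob     // the caller should defer ob.Stop(); Stop is guarded by stopOnce
}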

View File

@ -0,0 +1,134 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package observers
import (
"context"
"testing"
"time"
etcdkv "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/internal/util/etcd"
"github.com/milvus-io/milvus/internal/util/paramtable"
"github.com/stretchr/testify/suite"
)
type ReplicaObserverSuite struct {
suite.Suite
kv *etcdkv.EtcdKV
// dependencies
meta *meta.Meta
distMgr *meta.DistributionManager
observer *ReplicaObserver
collectionID int64
partitionID int64
}
func (suite *ReplicaObserverSuite) SetupSuite() {
paramtable.Init()
paramtable.Get().Save(Params.QueryCoordCfg.CheckNodeInReplicaInterval.Key, "1")
}
func (suite *ReplicaObserverSuite) SetupTest() {
var err error
config := GenerateEtcdConfig()
cli, err := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
suite.Require().NoError(err)
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
// meta
store := meta.NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator()
suite.meta = meta.NewMeta(idAllocator, store, session.NewNodeManager())
suite.distMgr = meta.NewDistributionManager()
suite.observer = NewReplicaObserver(suite.meta, suite.distMgr)
suite.observer.Start(context.TODO())
suite.collectionID = int64(1000)
suite.partitionID = int64(100)
suite.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, 1)
err = suite.meta.CollectionManager.PutCollection(utils.CreateTestCollection(suite.collectionID, 1))
suite.NoError(err)
replicas, err := suite.meta.ReplicaManager.Spawn(suite.collectionID, 1, meta.DefaultResourceGroupName)
suite.NoError(err)
err = suite.meta.ReplicaManager.Put(replicas...)
suite.NoError(err)
}
func (suite *ReplicaObserverSuite) TestCheckNodesInReplica() {
replicas := suite.meta.ReplicaManager.GetByCollection(suite.collectionID)
suite.distMgr.ChannelDistManager.Update(1, utils.CreateTestChannel(suite.collectionID, 2, 1, "test-insert-channel1"))
suite.distMgr.SegmentDistManager.Update(1, utils.CreateTestSegment(suite.collectionID, suite.partitionID, 1, 100, 1, "test-insert-channel1"))
replicas[0].AddNode(1)
suite.distMgr.ChannelDistManager.Update(100, utils.CreateTestChannel(suite.collectionID, 100, 1, "test-insert-channel2"))
suite.distMgr.SegmentDistManager.Update(100, utils.CreateTestSegment(suite.collectionID, suite.partitionID, 2, 100, 1, "test-insert-channel2"))
replicas[0].AddNode(100)
suite.Eventually(func() bool {
// node 100 should be kept
replicas := suite.meta.ReplicaManager.GetByCollection(suite.collectionID)
for _, node := range replicas[0].GetNodes() {
if node == 100 {
return true
}
}
return false
}, 6*time.Second, 2*time.Second)
suite.Len(replicas[0].GetNodes(), 2)
suite.distMgr.ChannelDistManager.Update(100)
suite.distMgr.SegmentDistManager.Update(100)
suite.Eventually(func() bool {
// node 100 should be removed
replicas := suite.meta.ReplicaManager.GetByCollection(suite.collectionID)
for _, node := range replicas[0].GetNodes() {
if node == 100 {
return false
}
}
return true
}, 5*time.Second, 1*time.Second)
suite.Len(replicas[0].GetNodes(), 1)
suite.Equal([]int64{1}, replicas[0].GetNodes())
}
func (suite *ReplicaObserverSuite) TearDownSuite() {
suite.kv.Close()
suite.observer.Stop()
}
func TestReplicaObserver(t *testing.T) {
suite.Run(t, new(ReplicaObserverSuite))
}

View File

@ -0,0 +1,107 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package observers
import (
"context"
"sync"
"time"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
"github.com/milvus-io/milvus/internal/querycoordv2/params"
"go.uber.org/zap"
)
// ResourceObserver checks whether a resource group lacks nodes and, if auto-recover is enabled, transfers nodes from the default resource group to make up the shortfall
type ResourceObserver struct {
c chan struct{}
wg sync.WaitGroup
meta *meta.Meta
stopOnce sync.Once
}
func NewResourceObserver(meta *meta.Meta) *ResourceObserver {
return &ResourceObserver{
c: make(chan struct{}),
meta: meta,
}
}
func (ob *ResourceObserver) Start(ctx context.Context) {
ob.wg.Add(1)
go ob.schedule(ctx)
}
func (ob *ResourceObserver) Stop() {
ob.stopOnce.Do(func() {
close(ob.c)
ob.wg.Wait()
})
}
func (ob *ResourceObserver) schedule(ctx context.Context) {
defer ob.wg.Done()
log.Info("Start check resource group loop")
ticker := time.NewTicker(params.Params.QueryCoordCfg.CheckResourceGroupInterval.GetAsDuration(time.Second))
for {
select {
case <-ctx.Done():
log.Info("Close resource group observer due to context canceled")
return
case <-ob.c:
log.Info("Close resource group observer")
return
case <-ticker.C:
ob.checkResourceGroup()
}
}
}
func (ob *ResourceObserver) checkResourceGroup() {
manager := ob.meta.ResourceManager
rgNames := manager.ListResourceGroups()
enableRGAutoRecover := params.Params.QueryCoordCfg.EnableRGAutoRecover.GetAsBool()
for _, rgName := range rgNames {
if rgName == meta.DefaultResourceGroupName {
continue
}
lackNodeNum := manager.CheckLackOfNode(rgName)
if lackNodeNum > 0 {
log.Info("found resource group lack of nodes",
zap.String("rgName", rgName),
zap.Int("lackNodeNum", lackNodeNum),
)
if enableRGAutoRecover {
usedNodeNum, err := manager.AutoRecoverResourceGroup(rgName)
if err != nil {
log.Warn("failed to recover resource group",
zap.String("rgName", rgName),
zap.Int("lackNodeNum", lackNodeNum-usedNodeNum),
zap.Error(err),
)
}
}
}
}
}
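For reference, a hedged configuration sketch (not part of this commit) showing how the check interval and the auto-recover switch used above can be set through paramtable before the observer starts; the key names come from the QueryCoordCfg fields referenced in this file and its test, everything else is illustrative:

// Hypothetical setup: shorten the check interval, enable auto-recover, then
// start the observer (m is an assumed *meta.Meta, ctx an existing context.Context).
paramtable.Init()
paramtable.Get().Save(params.Params.QueryCoordCfg.CheckResourceGroupInterval.Key, "3")
paramtable.Get().Save(params.Params.QueryCoordCfg.EnableRGAutoRecover.Key, "true")

ob := observers.NewResourceObserver(m)
ob.Start(ctx)
defer ob.Stop()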

View File

@ -0,0 +1,111 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package observers
import (
"context"
"testing"
"time"
etcdKV "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/util/etcd"
"github.com/milvus-io/milvus/internal/util/paramtable"
"github.com/stretchr/testify/suite"
)
type ResourceObserverSuite struct {
suite.Suite
kv *etcdKV.EtcdKV
// dependencies
meta *meta.Meta
observer *ResourceObserver
nodeMgr *session.NodeManager
collectionID int64
partitionID int64
}
func (suite *ResourceObserverSuite) SetupSuite() {
paramtable.Init()
paramtable.Get().Save(Params.QueryCoordCfg.CheckResourceGroupInterval.Key, "3")
}
func (suite *ResourceObserverSuite) SetupTest() {
var err error
config := GenerateEtcdConfig()
cli, err := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
suite.Require().NoError(err)
suite.kv = etcdKV.NewEtcdKV(cli, config.MetaRootPath.GetValue())
// meta
store := meta.NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator()
suite.nodeMgr = session.NewNodeManager()
suite.meta = meta.NewMeta(idAllocator, store, suite.nodeMgr)
suite.observer = NewResourceObserver(suite.meta)
suite.observer.Start(context.TODO())
for i := 1; i < 10; i++ {
suite.nodeMgr.Add(session.NewNodeInfo(int64(i), "localhost"))
suite.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, int64(i))
}
}
func (suite *ResourceObserverSuite) TestCheckNodesInReplica() {
suite.meta.ResourceManager.AddResourceGroup("rg")
suite.nodeMgr.Add(session.NewNodeInfo(int64(100), "localhost"))
suite.nodeMgr.Add(session.NewNodeInfo(int64(101), "localhost"))
suite.nodeMgr.Add(session.NewNodeInfo(int64(102), "localhost"))
suite.meta.ResourceManager.AssignNode("rg", 100)
suite.meta.ResourceManager.AssignNode("rg", 101)
suite.meta.ResourceManager.AssignNode("rg", 102)
suite.meta.ResourceManager.HandleNodeDown(100)
suite.meta.ResourceManager.HandleNodeDown(101)
// before auto recover rg
suite.Eventually(func() bool {
lackNodesNum := suite.meta.ResourceManager.CheckLackOfNode("rg")
return lackNodesNum == 2
}, 5*time.Second, 1*time.Second)
// after auto recover rg
suite.Eventually(func() bool {
lackNodesNum := suite.meta.ResourceManager.CheckLackOfNode("rg")
return lackNodesNum == 0
}, 5*time.Second, 1*time.Second)
}
func (suite *ResourceObserverSuite) TearDownSuite() {
suite.kv.Close()
suite.observer.Stop()
}
func TestResourceObserver(t *testing.T) {
suite.Run(t, new(ResourceObserverSuite))
}

View File

@ -29,6 +29,7 @@ import (
"github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params" . "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/querycoordv2/utils" "github.com/milvus-io/milvus/internal/querycoordv2/utils"
"github.com/milvus-io/milvus/internal/util/etcd" "github.com/milvus-io/milvus/internal/util/etcd"
"github.com/milvus-io/milvus/internal/util/paramtable" "github.com/milvus-io/milvus/internal/util/paramtable"
@ -74,7 +75,7 @@ func (suite *TargetObserverSuite) SetupTest() {
// meta // meta
store := meta.NewMetaStore(suite.kv) store := meta.NewMetaStore(suite.kv)
idAllocator := RandomIncrementIDAllocator() idAllocator := RandomIncrementIDAllocator()
suite.meta = meta.NewMeta(idAllocator, store) suite.meta = meta.NewMeta(idAllocator, store, session.NewNodeManager())
suite.broker = meta.NewMockBroker(suite.T()) suite.broker = meta.NewMockBroker(suite.T())
suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta) suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta)
@ -86,7 +87,7 @@ func (suite *TargetObserverSuite) SetupTest() {
err = suite.meta.CollectionManager.PutCollection(utils.CreateTestCollection(suite.collectionID, 1)) err = suite.meta.CollectionManager.PutCollection(utils.CreateTestCollection(suite.collectionID, 1))
suite.NoError(err) suite.NoError(err)
replicas, err := suite.meta.ReplicaManager.Spawn(suite.collectionID, 1) replicas, err := suite.meta.ReplicaManager.Spawn(suite.collectionID, 1, meta.DefaultResourceGroupName)
suite.NoError(err) suite.NoError(err)
replicas[0].AddNode(2) replicas[0].AddNode(2)
err = suite.meta.ReplicaManager.Put(replicas...) err = suite.meta.ReplicaManager.Put(replicas...)
@ -212,6 +213,6 @@ func (suite *TargetObserverSuite) TearDownSuite() {
suite.observer.Stop() suite.observer.Stop()
} }
func TestTargetManager(t *testing.T) { func TestTargetObserver(t *testing.T) {
suite.Run(t, new(TargetObserverSuite)) suite.Run(t, new(TargetObserverSuite))
} }

View File

@ -103,6 +103,8 @@ type Server struct {
collectionObserver *observers.CollectionObserver collectionObserver *observers.CollectionObserver
leaderObserver *observers.LeaderObserver leaderObserver *observers.LeaderObserver
targetObserver *observers.TargetObserver targetObserver *observers.TargetObserver
replicaObserver *observers.ReplicaObserver
resourceObserver *observers.ResourceObserver
balancer balance.Balance balancer balance.Balance
@ -177,13 +179,13 @@ func (s *Server) Init() error {
s.metricsCacheManager = metricsinfo.NewMetricsCacheManager() s.metricsCacheManager = metricsinfo.NewMetricsCacheManager()
// Init meta // Init meta
s.nodeMgr = session.NewNodeManager()
err = s.initMeta() err = s.initMeta()
if err != nil { if err != nil {
return err return err
} }
// Init session // Init session
log.Info("init session") log.Info("init session")
s.nodeMgr = session.NewNodeManager()
s.cluster = session.NewCluster(s.nodeMgr, s.queryNodeCreator) s.cluster = session.NewCluster(s.nodeMgr, s.queryNodeCreator)
// Init schedulers // Init schedulers
@ -244,7 +246,7 @@ func (s *Server) initMeta() error {
log.Info("init meta") log.Info("init meta")
s.store = meta.NewMetaStore(s.kv) s.store = meta.NewMetaStore(s.kv)
s.meta = meta.NewMeta(s.idAllocator, s.store) s.meta = meta.NewMeta(s.idAllocator, s.store, s.nodeMgr)
log.Info("recover meta...") log.Info("recover meta...")
err := s.meta.CollectionManager.Recover() err := s.meta.CollectionManager.Recover()
@ -262,6 +264,12 @@ func (s *Server) initMeta() error {
return err return err
} }
err = s.meta.ResourceManager.Recover()
if err != nil {
log.Error("failed to recover resource groups")
return err
}
s.dist = &meta.DistributionManager{ s.dist = &meta.DistributionManager{
SegmentDistManager: meta.NewSegmentDistManager(), SegmentDistManager: meta.NewSegmentDistManager(),
ChannelDistManager: meta.NewChannelDistManager(), ChannelDistManager: meta.NewChannelDistManager(),
@ -297,6 +305,13 @@ func (s *Server) initObserver() {
s.targetMgr, s.targetMgr,
s.targetObserver, s.targetObserver,
) )
s.replicaObserver = observers.NewReplicaObserver(
s.meta,
s.dist,
)
s.resourceObserver = observers.NewResourceObserver(s.meta)
} }
func (s *Server) afterStart() { func (s *Server) afterStart() {
@ -360,6 +375,8 @@ func (s *Server) startServerLoop() {
s.collectionObserver.Start(s.ctx) s.collectionObserver.Start(s.ctx)
s.leaderObserver.Start(s.ctx) s.leaderObserver.Start(s.ctx)
s.targetObserver.Start(s.ctx) s.targetObserver.Start(s.ctx)
s.replicaObserver.Start(s.ctx)
s.resourceObserver.Start(s.ctx)
} }
func (s *Server) Stop() error { func (s *Server) Stop() error {
@ -403,6 +420,12 @@ func (s *Server) Stop() error {
if s.targetObserver != nil { if s.targetObserver != nil {
s.targetObserver.Stop() s.targetObserver.Stop()
} }
if s.replicaObserver != nil {
s.replicaObserver.Stop()
}
if s.resourceObserver != nil {
s.resourceObserver.Stop()
}
s.wg.Wait() s.wg.Wait()
log.Info("QueryCoord stop successfully") log.Info("QueryCoord stop successfully")
@ -580,17 +603,33 @@ func (s *Server) handleNodeUp(node int64) {
s.taskScheduler.AddExecutor(node) s.taskScheduler.AddExecutor(node)
s.distController.StartDistInstance(s.ctx, node) s.distController.StartDistInstance(s.ctx, node)
// need to assign the node to a resource group and a replica
rgName, err := s.meta.ResourceManager.HandleNodeUp(node)
if err != nil {
log.Warn("HandleNodeUp: failed to assign node to resource group",
zap.Error(err),
)
return
}
log.Info("HandleNodeUp: assign node to resource group",
zap.String("resourceGroup", rgName),
)
for _, collection := range s.meta.CollectionManager.GetAll() { for _, collection := range s.meta.CollectionManager.GetAll() {
log := log.With(zap.Int64("collectionID", collection)) log := log.With(zap.Int64("collectionID", collection))
replica := s.meta.ReplicaManager.GetByCollectionAndNode(collection, node) replica := s.meta.ReplicaManager.GetByCollectionAndNode(collection, node)
if replica == nil { if replica == nil {
replicas := s.meta.ReplicaManager.GetByCollection(collection) replicas := s.meta.ReplicaManager.GetByCollectionAndRG(collection, rgName)
if len(replicas) == 0 {
continue
}
sort.Slice(replicas, func(i, j int) bool { sort.Slice(replicas, func(i, j int) bool {
return replicas[i].Nodes.Len() < replicas[j].Nodes.Len() return replicas[i].Len() < replicas[j].Len()
}) })
replica := replicas[0] replica := replicas[0]
// TODO(yah01): this may fail, need a component to check whether a node is assigned // TODO(yah01): this may fail, need a component to check whether a node is assigned
err := s.meta.ReplicaManager.AddNode(replica.GetID(), node) err = s.meta.ReplicaManager.AddNode(replica.GetID(), node)
if err != nil { if err != nil {
log.Warn("failed to assign node to replicas", log.Warn("failed to assign node to replicas",
zap.Int64("replicaID", replica.GetID()), zap.Int64("replicaID", replica.GetID()),
@ -608,20 +647,6 @@ func (s *Server) handleNodeDown(node int64) {
s.taskScheduler.RemoveExecutor(node) s.taskScheduler.RemoveExecutor(node)
s.distController.Remove(node) s.distController.Remove(node)
// Refresh the targets, to avoid consuming messages too early from channel
// FIXME(yah01): the leads to miss data, the segments flushed between the two check points
// are missed, it will recover for a while.
channels := s.dist.ChannelDistManager.GetByNode(node)
for _, channel := range channels {
_, err := s.targetObserver.UpdateNextTarget(channel.GetCollectionID())
if err != nil {
msg := "failed to update next targets for collection"
log.Error(msg,
zap.Error(err))
continue
}
}
// Clear dist // Clear dist
s.dist.LeaderViewManager.Update(node) s.dist.LeaderViewManager.Update(node)
s.dist.ChannelDistManager.Update(node) s.dist.ChannelDistManager.Update(node)
@ -647,6 +672,19 @@ func (s *Server) handleNodeDown(node int64) {
// Clear tasks // Clear tasks
s.taskScheduler.RemoveByNode(node) s.taskScheduler.RemoveByNode(node)
rgName, err := s.meta.ResourceManager.HandleNodeDown(node)
if err != nil {
log.Warn("HandleNodeDown: failed to remove node from resource group",
zap.String("resourceGroup", rgName),
zap.Error(err),
)
return
}
log.Info("HandleNodeDown: remove node from resource group",
zap.String("resourceGroup", rgName),
)
} }
// checkReplicas checks whether replica contains offline node, and remove those nodes // checkReplicas checks whether replica contains offline node, and remove those nodes
@ -657,7 +695,7 @@ func (s *Server) checkReplicas() {
for _, replica := range replicas { for _, replica := range replicas {
replica := replica.Clone() replica := replica.Clone()
toRemove := make([]int64, 0) toRemove := make([]int64, 0)
for node := range replica.Nodes { for _, node := range replica.GetNodes() {
if s.nodeMgr.Get(node) == nil { if s.nodeMgr.Get(node) == nil {
toRemove = append(toRemove, node) toRemove = append(toRemove, node)
} }

View File

@ -110,6 +110,7 @@ func (suite *ServerSuite) SetupTest() {
suite.Require().NoError(err) suite.Require().NoError(err)
ok := suite.waitNodeUp(suite.nodes[i], 5*time.Second) ok := suite.waitNodeUp(suite.nodes[i], 5*time.Second)
suite.Require().True(ok) suite.Require().True(ok)
suite.server.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, suite.nodes[i].ID)
} }
suite.loadAll() suite.loadAll()
@ -184,7 +185,6 @@ func (suite *ServerSuite) TestNodeUp() {
} }
return true return true
}, 5*time.Second, time.Second) }, 5*time.Second, time.Second)
} }
func (suite *ServerSuite) TestNodeUpdate() { func (suite *ServerSuite) TestNodeUpdate() {

View File

@ -45,6 +45,16 @@ import (
var ( var (
successStatus = utils.WrapStatus(commonpb.ErrorCode_Success, "") successStatus = utils.WrapStatus(commonpb.ErrorCode_Success, "")
ErrCreateResourceGroupFailed = errors.New("failed to create resource group")
ErrDropResourceGroupFailed = errors.New("failed to drop resource group")
ErrAddNodeToRGFailed = errors.New("failed to add node to resource group")
ErrRemoveNodeFromRGFailed = errors.New("failed to remove node from resource group")
ErrTransferNodeFailed = errors.New("failed to transfer node between resource group")
ErrTransferReplicaFailed = errors.New("failed to transfer replica between resource group")
ErrListResourceGroupsFailed = errors.New("failed to list resource group")
ErrDescribeResourceGroupFailed = errors.New("failed to describe resource group")
ErrLoadUseWrongRG = errors.New("load operation should use collection's resource group")
) )
func (s *Server) ShowCollections(ctx context.Context, req *querypb.ShowCollectionsRequest) (*querypb.ShowCollectionsResponse, error) { func (s *Server) ShowCollections(ctx context.Context, req *querypb.ShowCollectionsRequest) (*querypb.ShowCollectionsResponse, error) {
@ -218,6 +228,13 @@ func (s *Server) LoadCollection(ctx context.Context, req *querypb.LoadCollection
return s.refreshCollection(ctx, req.GetCollectionID()) return s.refreshCollection(ctx, req.GetCollectionID())
} }
if err := s.checkResourceGroup(req.GetCollectionID(), req.GetResourceGroups()); err != nil {
msg := "failed to load collection"
log.Warn(msg, zap.Error(err))
metrics.QueryCoordLoadCount.WithLabelValues(metrics.FailLabel).Inc()
return utils.WrapStatus(commonpb.ErrorCode_IllegalArgument, msg, err), nil
}
loadJob := job.NewLoadCollectionJob(ctx, loadJob := job.NewLoadCollectionJob(ctx,
req, req,
s.dist, s.dist,
@ -282,6 +299,8 @@ func (s *Server) ReleaseCollection(ctx context.Context, req *querypb.ReleaseColl
func (s *Server) LoadPartitions(ctx context.Context, req *querypb.LoadPartitionsRequest) (*commonpb.Status, error) { func (s *Server) LoadPartitions(ctx context.Context, req *querypb.LoadPartitionsRequest) (*commonpb.Status, error) {
log := log.Ctx(ctx).With( log := log.Ctx(ctx).With(
zap.Int64("collectionID", req.GetCollectionID()), zap.Int64("collectionID", req.GetCollectionID()),
zap.Int32("replicaNumber", req.GetReplicaNumber()),
zap.Strings("resourceGroups", req.GetResourceGroups()),
) )
log.Info("received load partitions request", log.Info("received load partitions request",
@ -300,6 +319,14 @@ func (s *Server) LoadPartitions(ctx context.Context, req *querypb.LoadPartitions
// If refresh mode is ON. // If refresh mode is ON.
if req.GetRefresh() { if req.GetRefresh() {
return s.refreshPartitions(ctx, req.GetCollectionID(), req.GetPartitionIDs()) return s.refreshPartitions(ctx, req.GetCollectionID(), req.GetPartitionIDs())
}
if err := s.checkResourceGroup(req.GetCollectionID(), req.GetResourceGroups()); err != nil {
msg := "failed to load partitions"
log.Warn(msg, zap.Error(err))
metrics.QueryCoordLoadCount.WithLabelValues(metrics.FailLabel).Inc()
return utils.WrapStatus(commonpb.ErrorCode_IllegalArgument, msg, err), nil
} }
loadJob := job.NewLoadPartitionJob(ctx, loadJob := job.NewLoadPartitionJob(ctx,
@ -323,6 +350,19 @@ func (s *Server) LoadPartitions(ctx context.Context, req *querypb.LoadPartitions
return successStatus, nil return successStatus, nil
} }
func (s *Server) checkResourceGroup(collectionID int64, resourceGroups []string) error {
if len(resourceGroups) != 0 {
collectionUsedRG := s.meta.ReplicaManager.GetResourceGroupByCollection(collectionID)
for _, rgName := range resourceGroups {
if !collectionUsedRG.Contain(rgName) {
return ErrLoadUseWrongRG
}
}
}
return nil
}
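To illustrate the rule enforced here — a load request may only name resource groups that the collection's existing replicas already occupy — a hedged client-side sketch follows; the field names mirror the tests later in this diff, while the collection ID and group names are made up:

// Hypothetical load request pinning collection 100 to two pre-created groups.
// checkResourceGroup rejects it if the collection already has replicas living
// in a different set of resource groups.
req := &querypb.LoadCollectionRequest{
    CollectionID:   100,
    ReplicaNumber:  2,
    ResourceGroups: []string{"rg1", "rg2"},
}
status, err := s.LoadCollection(ctx, req)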
func (s *Server) ReleasePartitions(ctx context.Context, req *querypb.ReleasePartitionsRequest) (*commonpb.Status, error) { func (s *Server) ReleasePartitions(ctx context.Context, req *querypb.ReleasePartitionsRequest) (*commonpb.Status, error) {
log := log.Ctx(ctx).With( log := log.Ctx(ctx).With(
zap.Int64("collectionID", req.GetCollectionID()), zap.Int64("collectionID", req.GetCollectionID()),
@ -637,7 +677,7 @@ func (s *Server) LoadBalance(ctx context.Context, req *querypb.LoadBalanceReques
fmt.Sprintf("can't balance, because the source node[%d] is invalid", srcNode), err), nil fmt.Sprintf("can't balance, because the source node[%d] is invalid", srcNode), err), nil
} }
for _, dstNode := range req.GetDstNodeIDs() { for _, dstNode := range req.GetDstNodeIDs() {
if !replica.Nodes.Contain(dstNode) { if !replica.Contains(dstNode) {
msg := "destination nodes have to be in the same replica of source node" msg := "destination nodes have to be in the same replica of source node"
log.Warn(msg) log.Warn(msg)
return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, msg), nil return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, msg), nil
@ -924,3 +964,204 @@ func (s *Server) CheckHealth(ctx context.Context, req *milvuspb.CheckHealthReque
return &milvuspb.CheckHealthResponse{IsHealthy: true, Reasons: errReasons}, nil return &milvuspb.CheckHealthResponse{IsHealthy: true, Reasons: errReasons}, nil
} }
func (s *Server) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error) {
log := log.Ctx(ctx).With(
zap.String("rgName", req.GetResourceGroup()),
)
log.Info("create resource group request received")
if s.status.Load() != commonpb.StateCode_Healthy {
log.Warn(ErrCreateResourceGroupFailed.Error(), zap.Error(ErrNotHealthy))
return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrCreateResourceGroupFailed.Error(), ErrNotHealthy), nil
}
err := s.meta.ResourceManager.AddResourceGroup(req.GetResourceGroup())
if err != nil {
log.Warn(ErrCreateResourceGroupFailed.Error(), zap.Error(err))
return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrCreateResourceGroupFailed.Error(), err), nil
}
return successStatus, nil
}
func (s *Server) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error) {
log := log.Ctx(ctx).With(
zap.String("rgName", req.GetResourceGroup()),
)
log.Info("drop resource group request received")
if s.status.Load() != commonpb.StateCode_Healthy {
log.Warn(ErrDropResourceGroupFailed.Error(), zap.Error(ErrNotHealthy))
return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrDropResourceGroupFailed.Error(), ErrNotHealthy), nil
}
err := s.meta.ResourceManager.RemoveResourceGroup(req.GetResourceGroup())
if err != nil {
log.Warn(ErrDropResourceGroupFailed.Error(), zap.Error(err))
return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrDropResourceGroupFailed.Error(), err), nil
}
return successStatus, nil
}
func (s *Server) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error) {
log := log.Ctx(ctx).With(
zap.String("source", req.GetSourceResourceGroup()),
zap.String("target", req.GetTargetResourceGroup()),
)
log.Info("transfer node between resource group request received")
if s.status.Load() != commonpb.StateCode_Healthy {
log.Warn(ErrTransferNodeFailed.Error(), zap.Error(ErrNotHealthy))
return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrTransferNodeFailed.Error(), ErrNotHealthy), nil
}
if ok := s.meta.ResourceManager.ContainResourceGroup(req.GetSourceResourceGroup()); !ok {
return utils.WrapStatus(commonpb.ErrorCode_IllegalArgument,
fmt.Sprintf("the source resource group[%s] doesn't exist", req.GetTargetResourceGroup()), meta.ErrRGNotExist), nil
}
if ok := s.meta.ResourceManager.ContainResourceGroup(req.GetTargetResourceGroup()); !ok {
return utils.WrapStatus(commonpb.ErrorCode_IllegalArgument,
fmt.Sprintf("the target resource group[%s] doesn't exist", req.GetTargetResourceGroup()), meta.ErrRGNotExist), nil
}
err := s.meta.ResourceManager.TransferNode(req.GetSourceResourceGroup(), req.GetTargetResourceGroup())
if err != nil {
log.Warn(ErrTransferNodeFailed.Error(), zap.Error(err))
return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrTransferNodeFailed.Error(), err), nil
}
return successStatus, nil
}
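A minimal request sketch for this handler (not part of the commit), using the same fields the service tests below exercise; the target group name is a placeholder:

// Hypothetical TransferNode call: move one node from the default resource
// group into "rg1"; both groups must already exist, as validated above.
status, err := s.TransferNode(ctx, &milvuspb.TransferNodeRequest{
    SourceResourceGroup: meta.DefaultResourceGroupName,
    TargetResourceGroup: "rg1",
})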
func (s *Server) TransferReplica(ctx context.Context, req *querypb.TransferReplicaRequest) (*commonpb.Status, error) {
log := log.Ctx(ctx).With(
zap.String("source", req.GetSourceResourceGroup()),
zap.String("target", req.GetTargetResourceGroup()),
zap.Int64("collectionID", req.GetCollectionID()),
)
log.Info("transfer replica request received")
if s.status.Load() != commonpb.StateCode_Healthy {
log.Warn(ErrTransferReplicaFailed.Error(), zap.Error(ErrNotHealthy))
return utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrTransferReplicaFailed.Error(), ErrNotHealthy), nil
}
if ok := s.meta.ResourceManager.ContainResourceGroup(req.GetSourceResourceGroup()); !ok {
return utils.WrapStatus(commonpb.ErrorCode_IllegalArgument,
fmt.Sprintf("the source resource group[%s] doesn't exist", req.GetSourceResourceGroup()), meta.ErrRGNotExist), nil
}
if ok := s.meta.ResourceManager.ContainResourceGroup(req.GetTargetResourceGroup()); !ok {
return utils.WrapStatus(commonpb.ErrorCode_IllegalArgument,
fmt.Sprintf("the target resource group[%s] doesn't exist", req.GetTargetResourceGroup()), meta.ErrRGNotExist), nil
}
// for now, we don't support transferring replicas of the same collection to the same resource group
replicas := s.meta.ReplicaManager.GetByCollectionAndRG(req.GetCollectionID(), req.GetSourceResourceGroup())
if len(replicas) < int(req.GetNumReplica()) {
return utils.WrapStatus(commonpb.ErrorCode_IllegalArgument,
fmt.Sprintf("found [%d] replicas of collection[%d] in source resource group[%s]",
len(replicas), req.GetCollectionID(), req.GetSourceResourceGroup())), nil
}
err := s.transferReplica(req.GetTargetResourceGroup(), replicas[:req.GetNumReplica()])
if err != nil {
return utils.WrapStatus(commonpb.ErrorCode_IllegalArgument, ErrTransferReplicaFailed.Error(), err), nil
}
return successStatus, nil
}
func (s *Server) transferReplica(targetRG string, replicas []*meta.Replica) error {
ret := make([]*meta.Replica, 0)
for _, replica := range replicas {
newReplica := replica.Clone()
newReplica.ResourceGroup = targetRG
ret = append(ret, newReplica)
}
err := utils.AssignNodesToReplicas(s.meta, targetRG, ret...)
if err != nil {
return err
}
return s.meta.ReplicaManager.Put(ret...)
}
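And the matching hypothetical sketch for replica transfer, with fields taken from the tests below; the target group must hold enough nodes for the node re-assignment done in transferReplica above:

// Hypothetical TransferReplica call: move 2 replicas of collection 1 from the
// default resource group into "rg3".
status, err := s.TransferReplica(ctx, &querypb.TransferReplicaRequest{
    SourceResourceGroup: meta.DefaultResourceGroupName,
    TargetResourceGroup: "rg3",
    CollectionID:        1,
    NumReplica:          2,
})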
func (s *Server) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error) {
log := log.Ctx(ctx)
log.Info("list resource group request received")
resp := &milvuspb.ListResourceGroupsResponse{
Status: successStatus,
}
if s.status.Load() != commonpb.StateCode_Healthy {
log.Warn(ErrListResourceGroupsFailed.Error(), zap.Error(ErrNotHealthy))
resp.Status = utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrListResourceGroupsFailed.Error(), ErrNotHealthy)
return resp, nil
}
resp.ResourceGroups = s.meta.ResourceManager.ListResourceGroups()
return resp, nil
}
func (s *Server) DescribeResourceGroup(ctx context.Context, req *querypb.DescribeResourceGroupRequest) (*querypb.DescribeResourceGroupResponse, error) {
log := log.Ctx(ctx).With(
zap.String("rgName", req.GetResourceGroup()),
)
log.Info("describe resource group request received")
resp := &querypb.DescribeResourceGroupResponse{
Status: successStatus,
}
if s.status.Load() != commonpb.StateCode_Healthy {
log.Warn(ErrDescribeResourceGroupFailed.Error(), zap.Error(ErrNotHealthy))
resp.Status = utils.WrapStatus(commonpb.ErrorCode_UnexpectedError, ErrDescribeResourceGroupFailed.Error(), ErrNotHealthy)
return resp, nil
}
rg, err := s.meta.ResourceManager.GetResourceGroup(req.GetResourceGroup())
if err != nil {
resp.Status = utils.WrapStatus(commonpb.ErrorCode_IllegalArgument, ErrDescribeResourceGroupFailed.Error(), err)
return resp, nil
}
loadedReplicas := make(map[int64]int32)
outgoingNodes := make(map[int64]int32)
replicasInRG := s.meta.GetByResourceGroup(req.GetResourceGroup())
for _, replica := range replicasInRG {
loadedReplicas[replica.GetCollectionID()]++
for _, node := range replica.GetNodes() {
if !s.meta.ContainsNode(replica.GetResourceGroup(), node) {
outgoingNodes[replica.GetCollectionID()]++
}
}
}
incomingNodes := make(map[int64]int32)
collections := s.meta.GetAll()
for _, collection := range collections {
replicas := s.meta.GetByCollection(collection)
for _, replica := range replicas {
if replica.GetResourceGroup() == req.GetResourceGroup() {
continue
}
for _, node := range replica.GetNodes() {
if s.meta.ContainsNode(req.GetResourceGroup(), node) {
incomingNodes[collection]++
}
}
}
}
resp.ResourceGroup = &querypb.ResourceGroupInfo{
Name: req.GetResourceGroup(),
Capacity: int32(rg.GetCapacity()),
NumAvailableNode: int32(len(rg.GetNodes())),
NumLoadedReplica: loadedReplicas,
NumOutgoingNode: outgoingNodes,
NumIncomingNode: incomingNodes,
}
return resp, nil
}
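A hedged read-side sketch of the response assembled above; the accessors follow the assertions in the service test later in this diff, and the group name is illustrative:

// Hypothetical inspection of a DescribeResourceGroup response.
resp, err := s.DescribeResourceGroup(ctx, &querypb.DescribeResourceGroupRequest{
    ResourceGroup: "rg11",
})
if err == nil && resp.Status.ErrorCode == commonpb.ErrorCode_Success {
    rg := resp.GetResourceGroup()
    _ = rg.GetCapacity()         // assigned node capacity of the group
    _ = rg.GetNumAvailableNode() // nodes currently held by the group
    _ = rg.GetNumLoadedReplica() // per-collection replica count in this group
    _ = rg.GetNumOutgoingNode()  // per-collection count of outside nodes used by this group's replicas
    _ = rg.GetNumIncomingNode()  // per-collection count of this group's nodes used by other groups' replicas
}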

View File

@ -128,7 +128,8 @@ func (suite *ServiceSuite) SetupTest() {
suite.store = meta.NewMetaStore(suite.kv) suite.store = meta.NewMetaStore(suite.kv)
suite.dist = meta.NewDistributionManager() suite.dist = meta.NewDistributionManager()
suite.meta = meta.NewMeta(params.RandomIncrementIDAllocator(), suite.store) suite.nodeMgr = session.NewNodeManager()
suite.meta = meta.NewMeta(params.RandomIncrementIDAllocator(), suite.store, suite.nodeMgr)
suite.broker = meta.NewMockBroker(suite.T()) suite.broker = meta.NewMockBroker(suite.T())
suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta) suite.targetMgr = meta.NewTargetManager(suite.broker, suite.meta)
suite.targetObserver = observers.NewTargetObserver( suite.targetObserver = observers.NewTargetObserver(
@ -137,9 +138,10 @@ func (suite *ServiceSuite) SetupTest() {
suite.dist, suite.dist,
suite.broker, suite.broker,
) )
suite.nodeMgr = session.NewNodeManager()
for _, node := range suite.nodes { for _, node := range suite.nodes {
suite.nodeMgr.Add(session.NewNodeInfo(node, "localhost")) suite.nodeMgr.Add(session.NewNodeInfo(node, "localhost"))
err := suite.meta.ResourceManager.AssignNode(meta.DefaultResourceGroupName, node)
suite.NoError(err)
} }
suite.cluster = session.NewMockCluster(suite.T()) suite.cluster = session.NewMockCluster(suite.T())
suite.jobScheduler = job.NewScheduler() suite.jobScheduler = job.NewScheduler()
@ -334,6 +336,260 @@ func (suite *ServiceSuite) TestLoadCollection() {
suite.Contains(resp.Reason, ErrNotHealthy.Error()) suite.Contains(resp.Reason, ErrNotHealthy.Error())
} }
func (suite *ServiceSuite) TestResourceGroup() {
ctx := context.Background()
server := suite.server
createRG := &milvuspb.CreateResourceGroupRequest{
ResourceGroup: "rg1",
}
resp, err := server.CreateResourceGroup(ctx, createRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_Success, resp.ErrorCode)
resp, err = server.CreateResourceGroup(ctx, createRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_UnexpectedError, resp.ErrorCode)
suite.Contains(resp.Reason, ErrCreateResourceGroupFailed.Error())
suite.Contains(resp.Reason, meta.ErrRGAlreadyExist.Error())
listRG := &milvuspb.ListResourceGroupsRequest{}
resp1, err := server.ListResourceGroups(ctx, listRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_Success, resp1.Status.ErrorCode)
suite.Len(resp1.ResourceGroups, 2)
server.nodeMgr.Add(session.NewNodeInfo(1011, "localhost"))
server.nodeMgr.Add(session.NewNodeInfo(1012, "localhost"))
server.nodeMgr.Add(session.NewNodeInfo(1013, "localhost"))
server.nodeMgr.Add(session.NewNodeInfo(1014, "localhost"))
server.meta.ResourceManager.AddResourceGroup("rg11")
server.meta.ResourceManager.AssignNode("rg11", 1011)
server.meta.ResourceManager.AssignNode("rg11", 1012)
server.meta.ResourceManager.AddResourceGroup("rg12")
server.meta.ResourceManager.AssignNode("rg12", 1013)
server.meta.ResourceManager.AssignNode("rg12", 1014)
server.meta.CollectionManager.PutCollection(utils.CreateTestCollection(1, 1))
server.meta.CollectionManager.PutCollection(utils.CreateTestCollection(2, 1))
server.meta.ReplicaManager.Put(meta.NewReplica(&querypb.Replica{
ID: 1,
CollectionID: 1,
Nodes: []int64{1011, 1013},
ResourceGroup: "rg11"},
typeutil.NewUniqueSet(1011, 1013)),
)
server.meta.ReplicaManager.Put(meta.NewReplica(&querypb.Replica{
ID: 2,
CollectionID: 2,
Nodes: []int64{1012, 1014},
ResourceGroup: "rg12"},
typeutil.NewUniqueSet(1012, 1014)),
)
describeRG := &querypb.DescribeResourceGroupRequest{
ResourceGroup: "rg11",
}
resp2, err := server.DescribeResourceGroup(ctx, describeRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_Success, resp2.Status.ErrorCode)
suite.Equal("rg11", resp2.GetResourceGroup().GetName())
suite.Equal(int32(2), resp2.GetResourceGroup().GetCapacity())
suite.Equal(int32(2), resp2.GetResourceGroup().GetNumAvailableNode())
suite.Equal(map[int64]int32{1: 1}, resp2.GetResourceGroup().GetNumLoadedReplica())
suite.Equal(map[int64]int32{2: 1}, resp2.GetResourceGroup().GetNumIncomingNode())
suite.Equal(map[int64]int32{1: 1}, resp2.GetResourceGroup().GetNumOutgoingNode())
dropRG := &milvuspb.DropResourceGroupRequest{
ResourceGroup: "rg1",
}
resp3, err := server.DropResourceGroup(ctx, dropRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_Success, resp3.ErrorCode)
resp4, err := server.ListResourceGroups(ctx, listRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_Success, resp4.Status.ErrorCode)
suite.Len(resp4.GetResourceGroups(), 3)
}
func (suite *ServiceSuite) TestResourceGroupFailed() {
ctx := context.Background()
server := suite.server
// illegal argument
describeRG := &querypb.DescribeResourceGroupRequest{
ResourceGroup: "rfffff",
}
resp, err := server.DescribeResourceGroup(ctx, describeRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_IllegalArgument, resp.Status.ErrorCode)
// server unhealthy
server.status.Store(commonpb.StateCode_Abnormal)
createRG := &milvuspb.CreateResourceGroupRequest{
ResourceGroup: "rg1",
}
resp1, err := server.CreateResourceGroup(ctx, createRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_UnexpectedError, resp1.ErrorCode)
listRG := &milvuspb.ListResourceGroupsRequest{}
resp2, err := server.ListResourceGroups(ctx, listRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_UnexpectedError, resp2.Status.ErrorCode)
describeRG = &querypb.DescribeResourceGroupRequest{
ResourceGroup: "rg1",
}
resp3, err := server.DescribeResourceGroup(ctx, describeRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_UnexpectedError, resp3.Status.ErrorCode)
dropRG := &milvuspb.DropResourceGroupRequest{
ResourceGroup: "rg1",
}
resp4, err := server.DropResourceGroup(ctx, dropRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_UnexpectedError, resp4.ErrorCode)
resp5, err := server.ListResourceGroups(ctx, listRG)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_UnexpectedError, resp5.Status.ErrorCode)
}
func (suite *ServiceSuite) TestTransferNode() {
ctx := context.Background()
server := suite.server
err := server.meta.ResourceManager.AddResourceGroup("rg1")
suite.NoError(err)
err = server.meta.ResourceManager.AddResourceGroup("rg2")
suite.NoError(err)
// test transfer node
resp, err := server.TransferNode(ctx, &milvuspb.TransferNodeRequest{
SourceResourceGroup: meta.DefaultResourceGroupName,
TargetResourceGroup: "rg1",
})
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_Success, resp.ErrorCode)
nodes, err := server.meta.ResourceManager.GetNodes("rg1")
suite.NoError(err)
suite.Len(nodes, 1)
// test transfer node meet non-exist source rg
resp, err = server.TransferNode(ctx, &milvuspb.TransferNodeRequest{
SourceResourceGroup: "rgggg",
TargetResourceGroup: meta.DefaultResourceGroupName,
})
suite.NoError(err)
suite.Contains(resp.Reason, meta.ErrRGNotExist.Error())
suite.Equal(commonpb.ErrorCode_IllegalArgument, resp.ErrorCode)
// test transfer node meet non-exist target rg
resp, err = server.TransferNode(ctx, &milvuspb.TransferNodeRequest{
SourceResourceGroup: meta.DefaultResourceGroupName,
TargetResourceGroup: "rgggg",
})
suite.NoError(err)
suite.Contains(resp.Reason, meta.ErrRGNotExist.Error())
suite.Equal(commonpb.ErrorCode_IllegalArgument, resp.ErrorCode)
// server unhealthy
server.status.Store(commonpb.StateCode_Abnormal)
resp, err = server.TransferNode(ctx, &milvuspb.TransferNodeRequest{
SourceResourceGroup: meta.DefaultResourceGroupName,
TargetResourceGroup: "rg1",
})
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_UnexpectedError, resp.ErrorCode)
}
func (suite *ServiceSuite) TestTransferReplica() {
ctx := context.Background()
server := suite.server
err := server.meta.ResourceManager.AddResourceGroup("rg1")
suite.NoError(err)
err = server.meta.ResourceManager.AddResourceGroup("rg2")
suite.NoError(err)
err = server.meta.ResourceManager.AddResourceGroup("rg3")
suite.NoError(err)
resp, err := suite.server.TransferReplica(ctx, &querypb.TransferReplicaRequest{
SourceResourceGroup: meta.DefaultResourceGroupName,
TargetResourceGroup: "rg1",
CollectionID: 1,
NumReplica: 2,
})
suite.NoError(err)
suite.Contains(resp.Reason, "found [0] replicas of collection[1] in source resource group")
resp, err = suite.server.TransferReplica(ctx, &querypb.TransferReplicaRequest{
SourceResourceGroup: "rgg",
TargetResourceGroup: meta.DefaultResourceGroupName,
CollectionID: 1,
NumReplica: 2,
})
suite.NoError(err)
suite.Equal(resp.ErrorCode, commonpb.ErrorCode_IllegalArgument)
resp, err = suite.server.TransferReplica(ctx, &querypb.TransferReplicaRequest{
SourceResourceGroup: meta.DefaultResourceGroupName,
TargetResourceGroup: "rgg",
CollectionID: 1,
NumReplica: 2,
})
suite.NoError(err)
suite.Equal(resp.ErrorCode, commonpb.ErrorCode_IllegalArgument)
suite.server.meta.Put(meta.NewReplica(&querypb.Replica{
CollectionID: 1,
ID: 111,
ResourceGroup: meta.DefaultResourceGroupName,
}, typeutil.NewUniqueSet(1)))
suite.server.meta.Put(meta.NewReplica(&querypb.Replica{
CollectionID: 1,
ID: 222,
ResourceGroup: meta.DefaultResourceGroupName,
}, typeutil.NewUniqueSet(2)))
suite.server.nodeMgr.Add(session.NewNodeInfo(1001, "localhost"))
suite.server.nodeMgr.Add(session.NewNodeInfo(1002, "localhost"))
suite.server.nodeMgr.Add(session.NewNodeInfo(1003, "localhost"))
suite.server.nodeMgr.Add(session.NewNodeInfo(1004, "localhost"))
suite.server.meta.AssignNode("rg1", 1001)
suite.server.meta.AssignNode("rg2", 1002)
suite.server.meta.AssignNode("rg3", 1003)
suite.server.meta.AssignNode("rg3", 1004)
resp, err = suite.server.TransferReplica(ctx, &querypb.TransferReplicaRequest{
SourceResourceGroup: meta.DefaultResourceGroupName,
TargetResourceGroup: "rg3",
CollectionID: 1,
NumReplica: 2,
})
suite.NoError(err)
suite.Equal(resp.ErrorCode, commonpb.ErrorCode_Success)
suite.Len(suite.server.meta.GetByResourceGroup("rg3"), 2)
// server unhealthy
server.status.Store(commonpb.StateCode_Abnormal)
resp, err = suite.server.TransferReplica(ctx, &querypb.TransferReplicaRequest{
SourceResourceGroup: meta.DefaultResourceGroupName,
TargetResourceGroup: "rg3",
CollectionID: 1,
NumReplica: 2,
})
suite.NoError(err)
suite.Equal(resp.ErrorCode, commonpb.ErrorCode_UnexpectedError)
}
func (suite *ServiceSuite) TestLoadCollectionFailed() { func (suite *ServiceSuite) TestLoadCollectionFailed() {
suite.loadAll() suite.loadAll()
ctx := context.Background() ctx := context.Background()
@ -365,6 +621,19 @@ func (suite *ServiceSuite) TestLoadCollectionFailed() {
suite.Equal(commonpb.ErrorCode_IllegalArgument, resp.ErrorCode) suite.Equal(commonpb.ErrorCode_IllegalArgument, resp.ErrorCode)
suite.Contains(resp.Reason, job.ErrLoadParameterMismatched.Error()) suite.Contains(resp.Reason, job.ErrLoadParameterMismatched.Error())
} }
// Test load with wrong rg num
for _, collection := range suite.collections {
req := &querypb.LoadCollectionRequest{
CollectionID: collection,
ReplicaNumber: suite.replicaNumber[collection] + 1,
ResourceGroups: []string{"rg1", "rg2"},
}
resp, err := server.LoadCollection(ctx, req)
suite.NoError(err)
suite.Equal(commonpb.ErrorCode_IllegalArgument, resp.ErrorCode)
suite.Contains(resp.Reason, ErrLoadUseWrongRG.Error())
}
} }
func (suite *ServiceSuite) TestLoadPartition() { func (suite *ServiceSuite) TestLoadPartition() {
@ -756,8 +1025,9 @@ func (suite *ServiceSuite) TestLoadBalance() {
// Test get balance first segment // Test get balance first segment
for _, collection := range suite.collections { for _, collection := range suite.collections {
replicas := suite.meta.ReplicaManager.GetByCollection(collection) replicas := suite.meta.ReplicaManager.GetByCollection(collection)
srcNode := replicas[0].GetNodes()[0] nodes := replicas[0].GetNodes()
dstNode := replicas[0].GetNodes()[1] srcNode := nodes[0]
dstNode := nodes[1]
suite.updateCollectionStatus(collection, querypb.LoadStatus_Loaded) suite.updateCollectionStatus(collection, querypb.LoadStatus_Loaded)
suite.updateSegmentDist(collection, srcNode) suite.updateSegmentDist(collection, srcNode)
segments := suite.getAllSegments(collection) segments := suite.getAllSegments(collection)
@ -883,8 +1153,9 @@ func (suite *ServiceSuite) TestLoadBalanceFailed() {
// Test load balance with not fully loaded // Test load balance with not fully loaded
for _, collection := range suite.collections { for _, collection := range suite.collections {
replicas := suite.meta.ReplicaManager.GetByCollection(collection) replicas := suite.meta.ReplicaManager.GetByCollection(collection)
srcNode := replicas[0].GetNodes()[0] nodes := replicas[0].GetNodes()
dstNode := replicas[0].GetNodes()[1] srcNode := nodes[0]
dstNode := nodes[1]
suite.updateCollectionStatus(collection, querypb.LoadStatus_Loading) suite.updateCollectionStatus(collection, querypb.LoadStatus_Loading)
segments := suite.getAllSegments(collection) segments := suite.getAllSegments(collection)
req := &querypb.LoadBalanceRequest{ req := &querypb.LoadBalanceRequest{
@ -926,8 +1197,9 @@ func (suite *ServiceSuite) TestLoadBalanceFailed() {
// Test balance task failed // Test balance task failed
for _, collection := range suite.collections { for _, collection := range suite.collections {
replicas := suite.meta.ReplicaManager.GetByCollection(collection) replicas := suite.meta.ReplicaManager.GetByCollection(collection)
srcNode := replicas[0].GetNodes()[0] nodes := replicas[0].GetNodes()
dstNode := replicas[0].GetNodes()[1] srcNode := nodes[0]
dstNode := nodes[1]
suite.updateCollectionStatus(collection, querypb.LoadStatus_Loaded) suite.updateCollectionStatus(collection, querypb.LoadStatus_Loaded)
suite.updateSegmentDist(collection, srcNode) suite.updateSegmentDist(collection, srcNode)
segments := suite.getAllSegments(collection) segments := suite.getAllSegments(collection)
@ -1171,6 +1443,11 @@ func (suite *ServiceSuite) TestGetShardLeadersFailed() {
suite.Equal(commonpb.ErrorCode_NoReplicaAvailable, resp.Status.ErrorCode) suite.Equal(commonpb.ErrorCode_NoReplicaAvailable, resp.Status.ErrorCode)
// Segment not fully loaded // Segment not fully loaded
for _, node := range suite.nodes {
suite.dist.SegmentDistManager.Update(node)
suite.dist.ChannelDistManager.Update(node)
suite.dist.LeaderViewManager.Update(node)
}
suite.updateChannelDistWithoutSegment(collection) suite.updateChannelDistWithoutSegment(collection)
suite.fetchHeartbeats(time.Now()) suite.fetchHeartbeats(time.Now())
resp, err = server.GetShardLeaders(ctx, req) resp, err = server.GetShardLeaders(ctx, req)

View File

@ -130,7 +130,7 @@ func (suite *TaskSuite) SetupTest() {
suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue()) suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue())
suite.store = meta.NewMetaStore(suite.kv) suite.store = meta.NewMetaStore(suite.kv)
suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store) suite.meta = meta.NewMeta(RandomIncrementIDAllocator(), suite.store, session.NewNodeManager())
suite.dist = meta.NewDistributionManager() suite.dist = meta.NewDistributionManager()
suite.broker = meta.NewMockBroker(suite.T()) suite.broker = meta.NewMockBroker(suite.T())
suite.target = meta.NewTargetManager(suite.broker, suite.meta) suite.target = meta.NewTargetManager(suite.broker, suite.meta)
@ -1260,14 +1260,14 @@ func (suite *TaskSuite) newScheduler() *taskScheduler {
} }
func createReplica(collection int64, nodes ...int64) *meta.Replica { func createReplica(collection int64, nodes ...int64) *meta.Replica {
return &meta.Replica{ return meta.NewReplica(
Replica: &querypb.Replica{ &querypb.Replica{
ID: rand.Int63()/2 + 1, ID: rand.Int63()/2 + 1,
CollectionID: collection, CollectionID: collection,
Nodes: nodes, Nodes: nodes,
}, },
Nodes: typeutil.NewUniqueSet(nodes...), typeutil.NewUniqueSet(nodes...),
} )
} }
func TestTask(t *testing.T) { func TestTask(t *testing.T) {

View File

@ -18,12 +18,22 @@ package utils
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"math/rand" "math/rand"
"github.com/milvus-io/milvus/internal/log"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
"github.com/milvus-io/milvus/internal/querycoordv2/session" "github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/samber/lo" "github.com/samber/lo"
"go.uber.org/zap"
)
var (
ErrGetNodesFromRG = errors.New("failed to get node from rg")
ErrNoReplicaFound = errors.New("no replica found during assign nodes")
ErrReplicasInconsistent = errors.New("all replicas should belong to same collection during assign nodes")
ErrUseWrongNumRG = errors.New("resource group num can only be 0, 1, or the same as replica number")
) )
func GetReplicaNodesInfo(replicaMgr *meta.ReplicaManager, nodeMgr *session.NodeManager, replicaID int64) []*session.NodeInfo { func GetReplicaNodesInfo(replicaMgr *meta.ReplicaManager, nodeMgr *session.NodeManager, replicaID int64) []*session.NodeInfo {
@ -32,8 +42,8 @@ func GetReplicaNodesInfo(replicaMgr *meta.ReplicaManager, nodeMgr *session.NodeM
return nil return nil
} }
nodes := make([]*session.NodeInfo, 0, len(replica.Nodes)) nodes := make([]*session.NodeInfo, 0, len(replica.GetNodes()))
for node := range replica.Nodes { for _, node := range replica.GetNodes() {
nodes = append(nodes, nodeMgr.Get(node)) nodes = append(nodes, nodeMgr.Get(node))
} }
return nodes return nodes
@ -64,7 +74,7 @@ func GroupNodesByReplica(replicaMgr *meta.ReplicaManager, collectionID int64, no
replicas := replicaMgr.GetByCollection(collectionID) replicas := replicaMgr.GetByCollection(collectionID)
for _, replica := range replicas { for _, replica := range replicas {
for _, node := range nodes { for _, node := range nodes {
if replica.Nodes.Contain(node) { if replica.Contains(node) {
ret[replica.ID] = append(ret[replica.ID], node) ret[replica.ID] = append(ret[replica.ID], node)
} }
} }
@ -90,7 +100,7 @@ func GroupSegmentsByReplica(replicaMgr *meta.ReplicaManager, collectionID int64,
replicas := replicaMgr.GetByCollection(collectionID) replicas := replicaMgr.GetByCollection(collectionID)
for _, replica := range replicas { for _, replica := range replicas {
for _, segment := range segments { for _, segment := range segments {
if replica.Nodes.Contain(segment.Node) { if replica.Contains(segment.Node) {
ret[replica.ID] = append(ret[replica.ID], segment) ret[replica.ID] = append(ret[replica.ID], segment)
} }
} }
@ -101,24 +111,92 @@ func GroupSegmentsByReplica(replicaMgr *meta.ReplicaManager, collectionID int64,
// AssignNodesToReplicas assigns nodes to the given replicas, // AssignNodesToReplicas assigns nodes to the given replicas,
// all given replicas must be the same collection, // all given replicas must be the same collection,
// the given replicas have to be not in ReplicaManager // the given replicas have to be not in ReplicaManager
func AssignNodesToReplicas(nodeMgr *session.NodeManager, replicas ...*meta.Replica) { func AssignNodesToReplicas(m *meta.Meta, rgName string, replicas ...*meta.Replica) error {
replicaNumber := len(replicas) replicaIDs := lo.Map(replicas, func(r *meta.Replica, _ int) int64 { return r.GetID() })
nodes := nodeMgr.GetAll() log := log.With(zap.Int64("collectionID", replicas[0].GetCollectionID()),
rand.Shuffle(len(nodes), func(i, j int) { zap.Int64s("replicas", replicaIDs),
nodes[i], nodes[j] = nodes[j], nodes[i] zap.String("rgName", rgName),
)
if len(replicaIDs) == 0 {
return nil
}
nodeGroup, err := m.ResourceManager.GetNodes(rgName)
if err != nil {
log.Error("failed to get nodes", zap.Error(err))
return err
}
if len(nodeGroup) < len(replicaIDs) {
log.Error(meta.ErrNodeNotEnough.Error())
return meta.ErrNodeNotEnough
}
rand.Shuffle(len(nodeGroup), func(i, j int) {
nodeGroup[i], nodeGroup[j] = nodeGroup[j], nodeGroup[i]
}) })
for i, node := range nodes { log.Info("assign nodes to replicas",
replicas[i%replicaNumber].AddNode(node.ID()) zap.Int64s("nodes", nodeGroup),
)
for i, node := range nodeGroup {
replicas[i%len(replicas)].AddNode(node)
} }
return nil
} }
// SpawnReplicas spawns replicas for given collection, assign nodes to them, and save them // SpawnAllReplicasInRG spawns replicas for the given collection in the given resource group, assigns nodes to them, and saves them
func SpawnReplicas(replicaMgr *meta.ReplicaManager, nodeMgr *session.NodeManager, collection int64, replicaNumber int32) ([]*meta.Replica, error) { func SpawnAllReplicasInRG(m *meta.Meta, collection int64, replicaNumber int32, rgName string) ([]*meta.Replica, error) {
replicas, err := replicaMgr.Spawn(collection, replicaNumber) replicas, err := m.ReplicaManager.Spawn(collection, replicaNumber, rgName)
if err != nil { if err != nil {
return nil, err return nil, err
} }
AssignNodesToReplicas(nodeMgr, replicas...) err = AssignNodesToReplicas(m, rgName, replicas...)
return replicas, replicaMgr.Put(replicas...) if err != nil {
return nil, err
}
return replicas, m.ReplicaManager.Put(replicas...)
}
func checkResourceGroup(collectionID int64, replicaNumber int32, resourceGroups []string) error {
if len(resourceGroups) != 0 && len(resourceGroups) != 1 && len(resourceGroups) != int(replicaNumber) {
return ErrUseWrongNumRG
}
return nil
}
func SpawnReplicasWithRG(m *meta.Meta, collection int64, resourceGroups []string, replicaNumber int32) ([]*meta.Replica, error) {
if err := checkResourceGroup(collection, replicaNumber, resourceGroups); err != nil {
return nil, err
}
if len(resourceGroups) == 0 {
return SpawnAllReplicasInRG(m, collection, replicaNumber, meta.DefaultResourceGroupName)
}
if len(resourceGroups) == 1 {
return SpawnAllReplicasInRG(m, collection, replicaNumber, resourceGroups[0])
}
replicaSet := make([]*meta.Replica, 0)
for _, rgName := range resourceGroups {
if !m.ResourceManager.ContainResourceGroup(rgName) {
return nil, meta.ErrRGNotExist
}
replicas, err := m.ReplicaManager.Spawn(collection, 1, rgName)
if err != nil {
return nil, err
}
err = AssignNodesToReplicas(m, rgName, replicas...)
if err != nil {
return nil, err
}
replicaSet = append(replicaSet, replicas...)
}
return replicaSet, m.ReplicaManager.Put(replicaSet...)
} }
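A short hedged sketch of the three spawning paths above, against an assumed *meta.Meta m with the groups already created; the table-driven test in the next file exercises the same behaviour:

// 1) no resource groups: all replicas are spawned in the default group
replicas, err := SpawnReplicasWithRG(m, 1000, nil, 3)
// 2) a single group: all 3 replicas are spawned inside "rg1"
replicas, err = SpawnReplicasWithRG(m, 1000, []string{"rg1"}, 3)
// 3) one group per replica: the group count must equal the replica number,
//    otherwise ErrUseWrongNumRG is returned before anything is spawned
replicas, err = SpawnReplicasWithRG(m, 1000, []string{"rg1", "rg2", "rg3"}, 3)
_, _ = replicas, err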

View File

@ -0,0 +1,110 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package utils
import (
"testing"
etcdKV "github.com/milvus-io/milvus/internal/kv/etcd"
"github.com/milvus-io/milvus/internal/querycoordv2/meta"
. "github.com/milvus-io/milvus/internal/querycoordv2/params"
"github.com/milvus-io/milvus/internal/querycoordv2/session"
"github.com/milvus-io/milvus/internal/util/etcd"
)
func TestSpawnReplicasWithRG(t *testing.T) {
Params.Init()
config := GenerateEtcdConfig()
cli, _ := etcd.GetEtcdClient(
config.UseEmbedEtcd.GetAsBool(),
config.EtcdUseSSL.GetAsBool(),
config.Endpoints.GetAsStrings(),
config.EtcdTLSCert.GetValue(),
config.EtcdTLSKey.GetValue(),
config.EtcdTLSCACert.GetValue(),
config.EtcdTLSMinVersion.GetValue())
kv := etcdKV.NewEtcdKV(cli, config.MetaRootPath.GetValue())
store := meta.NewMetaStore(kv)
nodeMgr := session.NewNodeManager()
m := meta.NewMeta(RandomIncrementIDAllocator(), store, nodeMgr)
m.ResourceManager.AddResourceGroup("rg1")
m.ResourceManager.AddResourceGroup("rg2")
m.ResourceManager.AddResourceGroup("rg3")
for i := 1; i < 10; i++ {
nodeMgr.Add(session.NewNodeInfo(int64(i), "localhost"))
if i%3 == 0 {
m.ResourceManager.AssignNode("rg1", int64(i))
}
if i%3 == 1 {
m.ResourceManager.AssignNode("rg2", int64(i))
}
if i%3 == 2 {
m.ResourceManager.AssignNode("rg3", int64(i))
}
}
type args struct {
m *meta.Meta
collection int64
resourceGroups []string
replicaNumber int32
}
tests := []struct {
name string
args args
wantReplicaNum int
wantErr bool
}{
{
name: "test 3 replica on 1 rg",
args: args{m, 1000, []string{"rg1"}, 3},
wantReplicaNum: 3,
wantErr: false,
},
{
name: "test 3 replica on 2 rg",
args: args{m, 1000, []string{"rg1", "rg2"}, 3},
wantReplicaNum: 0,
wantErr: true,
},
{
name: "test 3 replica on 3 rg",
args: args{m, 1000, []string{"rg1", "rg2", "rg3"}, 3},
wantReplicaNum: 3,
wantErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got, err := SpawnReplicasWithRG(tt.args.m, tt.args.collection, tt.args.resourceGroups, tt.args.replicaNumber)
if (err != nil) != tt.wantErr {
t.Errorf("SpawnReplicasWithRG() error = %v, wantErr %v", err, tt.wantErr)
return
}
if len(got) != tt.wantReplicaNum {
t.Errorf("SpawnReplicasWithRG() = %v, want %d replicas", got, tt.args.replicaNumber)
}
})
}
}

View File

@ -52,14 +52,15 @@ func CreateTestChannel(collection, node, version int64, channel string) *meta.Dm
}

func CreateTestReplica(id, collectionID int64, nodes []int64) *meta.Replica {
	return meta.NewReplica(
		&querypb.Replica{
			ID:            id,
			CollectionID:  collectionID,
			Nodes:         nodes,
			ResourceGroup: meta.DefaultResourceGroupName,
		},
		typeutil.NewUniqueSet(nodes...),
	)
}

func CreateTestCollection(collection int64, replica int32) *meta.Collection {

View File

@ -20,7 +20,6 @@ import (
"fmt" "fmt"
"github.com/milvus-io/milvus-proto/go-api/commonpb" "github.com/milvus-io/milvus-proto/go-api/commonpb"
"github.com/milvus-io/milvus-proto/go-api/milvuspb"
"github.com/milvus-io/milvus/internal/proto/datapb" "github.com/milvus-io/milvus/internal/proto/datapb"
"github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/proto/querypb"
"github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/meta"
@ -148,11 +147,3 @@ func MergeDmChannelInfo(infos []*datapb.VchannelInfo) *meta.DmChannel {
	return dmChannel
}
func Replica2ReplicaInfo(replica *querypb.Replica) *milvuspb.ReplicaInfo {
return &milvuspb.ReplicaInfo{
ReplicaID: replica.GetID(),
CollectionID: replica.GetCollectionID(),
NodeIds: replica.GetNodes(),
}
}

View File

@ -1304,6 +1304,13 @@ type ProxyComponent interface {
	// RenameCollection renames a collection from the old name to the new name.
	RenameCollection(ctx context.Context, req *milvuspb.RenameCollectionRequest) (*commonpb.Status, error)
CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error)
DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error)
TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error)
TransferReplica(ctx context.Context, req *milvuspb.TransferReplicaRequest) (*commonpb.Status, error)
ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error)
DescribeResourceGroup(ctx context.Context, req *milvuspb.DescribeResourceGroupRequest) (*milvuspb.DescribeResourceGroupResponse, error)
}

// QueryNode is the interface `querynode` package implements
@ -1376,6 +1383,13 @@ type QueryCoord interface {
	GetShardLeaders(ctx context.Context, req *querypb.GetShardLeadersRequest) (*querypb.GetShardLeadersResponse, error)
	CheckHealth(ctx context.Context, req *milvuspb.CheckHealthRequest) (*milvuspb.CheckHealthResponse, error)
CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest) (*commonpb.Status, error)
DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest) (*commonpb.Status, error)
TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest) (*commonpb.Status, error)
TransferReplica(ctx context.Context, req *querypb.TransferReplicaRequest) (*commonpb.Status, error)
ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest) (*milvuspb.ListResourceGroupsResponse, error)
DescribeResourceGroup(ctx context.Context, req *querypb.DescribeResourceGroupRequest) (*querypb.DescribeResourceGroupResponse, error)
}

// QueryCoordComponent is used by grpc server of QueryCoord
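To illustrate the call pattern of the new resource-group methods on QueryCoord, here is a minimal sketch. The helper name ensureResourceGroup, the types/meta package qualifiers, and the request field names are assumptions made for illustration (the field names follow the usual proto-to-Go naming of the request types above); Status error codes are deliberately not inspected.

// ensureResourceGroup creates a resource group and moves numNode nodes into it
// from the default group. Hypothetical helper for illustration only.
func ensureResourceGroup(ctx context.Context, qc types.QueryCoord, name string, numNode int32) error {
	// Status error codes are not inspected here to keep the sketch short.
	if _, err := qc.CreateResourceGroup(ctx, &milvuspb.CreateResourceGroupRequest{
		ResourceGroup: name,
	}); err != nil {
		return err
	}
	_, err := qc.TransferNode(ctx, &milvuspb.TransferNodeRequest{
		SourceResourceGroup: meta.DefaultResourceGroupName,
		TargetResourceGroup: name,
		NumNode:             numNode,
	})
	return err
}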

View File

@ -101,3 +101,27 @@ func (m *GrpcQueryCoordClient) GetReplicas(ctx context.Context, in *milvuspb.Get
func (m *GrpcQueryCoordClient) GetShardLeaders(ctx context.Context, in *querypb.GetShardLeadersRequest, opts ...grpc.CallOption) (*querypb.GetShardLeadersResponse, error) {
	return &querypb.GetShardLeadersResponse{}, m.Err
}
func (m *GrpcQueryCoordClient) CreateResourceGroup(ctx context.Context, req *milvuspb.CreateResourceGroupRequest, opts ...grpc.CallOption) (*commonpb.Status, error) {
return &commonpb.Status{}, m.Err
}
func (m *GrpcQueryCoordClient) DropResourceGroup(ctx context.Context, req *milvuspb.DropResourceGroupRequest, opts ...grpc.CallOption) (*commonpb.Status, error) {
return &commonpb.Status{}, m.Err
}
func (m *GrpcQueryCoordClient) TransferNode(ctx context.Context, req *milvuspb.TransferNodeRequest, opts ...grpc.CallOption) (*commonpb.Status, error) {
return &commonpb.Status{}, m.Err
}
func (m *GrpcQueryCoordClient) TransferReplica(ctx context.Context, req *querypb.TransferReplicaRequest, opts ...grpc.CallOption) (*commonpb.Status, error) {
return &commonpb.Status{}, m.Err
}
func (m *GrpcQueryCoordClient) ListResourceGroups(ctx context.Context, req *milvuspb.ListResourceGroupsRequest, opts ...grpc.CallOption) (*milvuspb.ListResourceGroupsResponse, error) {
return &milvuspb.ListResourceGroupsResponse{}, m.Err
}
func (m *GrpcQueryCoordClient) DescribeResourceGroup(ctx context.Context, req *querypb.DescribeResourceGroupRequest, opts ...grpc.CallOption) (*querypb.DescribeResourceGroupResponse, error) {
return &querypb.DescribeResourceGroupResponse{}, m.Err
}
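A brief, hedged sketch of how these stubs might be used in a caller's unit test. It assumes the test lives in the same package as the mock and that testify is available, as elsewhere in the repository; the test name and error message are illustrative.

import (
	"context"
	"errors"
	"testing"

	"github.com/milvus-io/milvus-proto/go-api/milvuspb"
	"github.com/stretchr/testify/assert"
)

func TestTransferNodeReturnsMockError(t *testing.T) {
	// The mock echoes whatever error is set on Err, so failure paths of
	// callers can be exercised without a live QueryCoord.
	client := &GrpcQueryCoordClient{Err: errors.New("mock transfer failure")}
	_, err := client.TransferNode(context.Background(), &milvuspb.TransferNodeRequest{})
	assert.Error(t, err)
}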

View File

@ -890,6 +890,9 @@ type queryCoordConfig struct {
	NextTargetSurviveTime ParamItem `refreshable:"true"`
	UpdateNextTargetInterval ParamItem `refreshable:"false"`
CheckNodeInReplicaInterval ParamItem `refreshable:"false"`
CheckResourceGroupInterval ParamItem `refreshable:"false"`
EnableRGAutoRecover ParamItem `refreshable:"true"`
}

func (p *queryCoordConfig) init(base *BaseTable) {
@ -1040,6 +1043,30 @@ func (p *queryCoordConfig) init(base *BaseTable) {
		PanicIfEmpty: true,
	}
	p.UpdateNextTargetInterval.Init(base.mgr)
p.CheckNodeInReplicaInterval = ParamItem{
Key: "queryCoord.checkNodeInReplicaInterval",
Version: "2.2.3",
DefaultValue: "60",
PanicIfEmpty: true,
}
p.CheckNodeInReplicaInterval.Init(base.mgr)
p.CheckResourceGroupInterval = ParamItem{
Key: "queryCoord.checkResourceGroupInterval",
Version: "2.2.3",
DefaultValue: "30",
PanicIfEmpty: true,
}
p.CheckResourceGroupInterval.Init(base.mgr)
p.EnableRGAutoRecover = ParamItem{
Key: "queryCoord.enableRGAutoRecover",
Version: "2.2.3",
DefaultValue: "true",
PanicIfEmpty: true,
}
p.EnableRGAutoRecover.Init(base.mgr)
}

// /////////////////////////////////////////////////////////////////////////////
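A hedged sketch of how a consumer (for example a background checker in querycoordv2) might read the new items. The function name, the global Params handle, and the assumption that the intervals are in seconds are illustrative, not taken from this change.

// runResourceGroupChecker is illustrative only: it shows the paramtable access
// pattern for the new items, not the actual checker added by this commit.
func runResourceGroupChecker(ctx context.Context, check func()) {
	// Interval unit assumed to be seconds (DefaultValue "30" above).
	interval := time.Duration(Params.QueryCoordCfg.CheckResourceGroupInterval.GetAsInt()) * time.Second
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			// Auto-recovery can be toggled at runtime (refreshable:"true").
			if Params.QueryCoordCfg.EnableRGAutoRecover.GetAsBool() {
				check()
			}
		}
	}
}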

View File

@ -243,6 +243,28 @@ func TestComponentParam(t *testing.T) {
		Params := params.QueryCoordCfg
		assert.Equal(t, Params.EnableActiveStandby.GetAsBool(), false)
		t.Logf("queryCoord EnableActiveStandby = %t", Params.EnableActiveStandby.GetAsBool())
params.Save("queryCoord.NextTargetSurviveTime", "100")
NextTargetSurviveTime := Params.NextTargetSurviveTime
assert.Equal(t, int64(100), NextTargetSurviveTime.GetAsInt64())
params.Save("queryCoord.UpdateNextTargetInterval", "100")
UpdateNextTargetInterval := Params.UpdateNextTargetInterval
assert.Equal(t, int64(100), UpdateNextTargetInterval.GetAsInt64())
params.Save("queryCoord.checkNodeInReplicaInterval", "100")
checkNodeInReplicaInterval := Params.CheckNodeInReplicaInterval
assert.Equal(t, 100, checkNodeInReplicaInterval.GetAsInt())
params.Save("queryCoord.checkResourceGroupInterval", "10")
checkResourceGroupInterval := Params.CheckResourceGroupInterval
assert.Equal(t, 10, checkResourceGroupInterval.GetAsInt())
enableResourceGroupAutoRecover := Params.EnableRGAutoRecover
assert.Equal(t, true, enableResourceGroupAutoRecover.GetAsBool())
params.Save("queryCoord.enableRGAutoRecover", "false")
enableResourceGroupAutoRecover = Params.EnableRGAutoRecover
assert.Equal(t, false, enableResourceGroupAutoRecover.GetAsBool())
	})

	t.Run("test queryNodeConfig", func(t *testing.T) {