sunby 189ac881f3 Fix bugs (#5676)
* Remove redundant session startup

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Register datanode after start success

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* fix meta snapshot

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* fix datanode message stream channel

Signed-off-by: yangxuan <xuan.yang@zilliz.com>

* Fix bugs when dropping empty collection

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Fix bug of getting pchan statistics from task scheduler

Signed-off-by: dragondriver <jiquan.long@zilliz.com>

* Fix i/dist/dataservice test code

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>

* Fix epoch lifetime not applied

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>

* fix datanode flowgraph dd node

Signed-off-by: yangxuan <xuan.yang@zilliz.com>

* Fix handle datanode timetick bug

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Remove repack function of dml stream

Signed-off-by: dragondriver <jiquan.long@zilliz.com>

* fix proxynode

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* Apply extended seal policy

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>

* add check for time tick

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* fix check

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* Fix the repack function of dml stream

Signed-off-by: dragondriver <jiquan.long@zilliz.com>

* Fix the bug when sending statistics of pchan

Signed-off-by: dragondriver <jiquan.long@zilliz.com>

* Fix the repack function when creating dml stream

Signed-off-by: dragondriver <jiquan.long@zilliz.com>

* fix bugs

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* fix describe collection

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* Fix bug when sending timestamp statistics

Signed-off-by: dragondriver <jiquan.long@zilliz.com>

* fix data node

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* Add length check before flush request

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>

* add log for data node

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* Fix SaveBinlog bugs

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Add more log in datanode

Signed-off-by: yangxuan <xuan.yang@zilliz.com>

* Put SegmentState.Flushing as the last one in enum to fit the client

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Fix params in GetInsertBinlogPaths

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Rename policy

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Remove unused ddl functions and fields

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>

* Remove pchan when drop collection

Signed-off-by: dragondriver <jiquan.long@zilliz.com>

* Add balanced assignment policy

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* fix master ut

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* Add lock in session manager

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>

* add log for debug

Signed-off-by: yefu.chen <yefu.chen@zilliz.com>

* Fix some logic bug and typo

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>

* Fix recover bugs

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Get collection scheme of a specific timestamp

Signed-off-by: yangxuan <xuan.yang@zilliz.com>

* Change CheckPoint to SegmentInfo in VchannelInfo

Signed-off-by: sunby <bingyi.sun@zilliz.com>

* Recover Unflushed segment numOfRows

Signed-off-by: yangxuan <xuan.yang@zilliz.com>

* Fix dataservice unit tests

Signed-off-by: sunby <bingyi.sun@zilliz.com>

Co-authored-by: yefu.chen <yefu.chen@zilliz.com>
Co-authored-by: yangxuan <xuan.yang@zilliz.com>
Co-authored-by: dragondriver <jiquan.long@zilliz.com>
Co-authored-by: Congqi Xia <congqi.xia@zilliz.com>
2021-06-15 16:06:11 +08:00

// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.
package dataservice

import (
	"fmt"
	"sync"

	"go.uber.org/zap"
	"golang.org/x/net/context"

	"github.com/milvus-io/milvus/internal/log"
	"github.com/milvus-io/milvus/internal/proto/commonpb"
	"github.com/milvus-io/milvus/internal/proto/datapb"
)
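
// cluster manages the set of registered data nodes, the channels they
// watch, and the pluggable policies that decide how channels are
// (re)assigned on startup, registration, unregistration, and assignment.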
type cluster struct {
	mu               sync.RWMutex
	ctx              context.Context
	dataManager      *clusterNodeManager
	sessionManager   sessionManager
	posProvider      positionProvider
	startupPolicy    clusterStartupPolicy
	registerPolicy   dataNodeRegisterPolicy
	unregisterPolicy dataNodeUnregisterPolicy
	assignPolicy     channelAssignPolicy
}
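
// clusterOption overrides one of the cluster's default policies; the
// with* helpers below construct the available options.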
type clusterOption struct {
	apply func(c *cluster)
}

func withStartupPolicy(p clusterStartupPolicy) clusterOption {
	return clusterOption{
		apply: func(c *cluster) { c.startupPolicy = p },
	}
}

func withRegisterPolicy(p dataNodeRegisterPolicy) clusterOption {
	return clusterOption{
		apply: func(c *cluster) { c.registerPolicy = p },
	}
}

func withUnregisterPolicy(p dataNodeUnregisterPolicy) clusterOption {
	return clusterOption{
		apply: func(c *cluster) { c.unregisterPolicy = p },
	}
}

func withAssignPolicy(p channelAssignPolicy) clusterOption {
	return clusterOption{
		apply: func(c *cluster) { c.assignPolicy = p },
	}
}
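
// defaultStartupPolicy and the other default* constructors below supply
// the policies newCluster uses when no clusterOption overrides them.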
func defaultStartupPolicy() clusterStartupPolicy {
	return newWatchRestartsStartupPolicy()
}

func defaultRegisterPolicy() dataNodeRegisterPolicy {
	return newEmptyRegisterPolicy()
}

func defaultUnregisterPolicy() dataNodeUnregisterPolicy {
	return newEmptyUnregisterPolicy()
}

func defaultAssignPolicy() channelAssignPolicy {
	return newBalancedAssignPolicy()
}
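
// newCluster creates a cluster with the given dependencies, falling back
// to the default policies unless overridden, e.g.:
//
//	c := newCluster(ctx, dataManager, sessionManager, posProvider,
//		withAssignPolicy(newBalancedAssignPolicy()))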
func newCluster(ctx context.Context, dataManager *clusterNodeManager, sessionManager sessionManager, posProvider positionProvider, opts ...clusterOption) *cluster {
	c := &cluster{
		ctx:              ctx,
		sessionManager:   sessionManager,
		dataManager:      dataManager,
		posProvider:      posProvider,
		startupPolicy:    defaultStartupPolicy(),
		registerPolicy:   defaultRegisterPolicy(),
		unregisterPolicy: defaultUnregisterPolicy(),
		assignPolicy:     defaultAssignPolicy(),
	}
	for _, opt := range opts {
		opt.apply(c)
	}
	return c
}
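
// startup syncs the cluster view with the data nodes discovered at boot,
// applies the startup policy to the resulting delta, and watches any
// channels the policy (re)assigned.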
func (c *cluster) startup(dataNodes []*datapb.DataNodeInfo) error {
	deltaChange := c.dataManager.updateCluster(dataNodes)
	nodes := c.dataManager.getDataNodes(false)
	rets := c.startupPolicy.apply(nodes, deltaChange)
	c.dataManager.updateDataNodes(rets)
	rets = c.watch(rets)
	c.dataManager.updateDataNodes(rets)
	return nil
}
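
// watch asks each node to watch its channels that are still in the
// Uncomplete state, marking them Complete once the node accepts the
// WatchDmChannels request, and returns the updated node infos.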
func (c *cluster) watch(nodes []*datapb.DataNodeInfo) []*datapb.DataNodeInfo {
	for _, n := range nodes {
		logMsg := fmt.Sprintf("Begin to watch channels for node %s:", n.Address)
		uncompletes := make([]vchannel, 0, len(n.Channels))
		for _, ch := range n.Channels {
			if ch.State == datapb.ChannelWatchState_Uncomplete {
				if len(uncompletes) == 0 {
					logMsg += ch.Name
				} else {
					logMsg += "," + ch.Name
				}
				uncompletes = append(uncompletes, vchannel{
					CollectionID: ch.CollectionID,
					DmlChannel:   ch.Name,
				})
			}
		}
		if len(uncompletes) == 0 {
			continue
		}
		log.Debug(logMsg)
		vchanInfos, err := c.posProvider.GetVChanPositions(uncompletes)
		if err != nil {
			log.Warn("get vchannel position failed", zap.Error(err))
			continue
		}
		cli, err := c.sessionManager.getOrCreateSession(n.Address)
		if err != nil {
			log.Warn("get session failed", zap.String("addr", n.Address), zap.Error(err))
			continue
		}
		req := &datapb.WatchDmChannelsRequest{
			Base: &commonpb.MsgBase{
				SourceID: Params.NodeID,
			},
			Vchannels: vchanInfos,
		}
		resp, err := cli.WatchDmChannels(c.ctx, req)
		if err != nil {
			log.Warn("watch dm channel failed", zap.String("addr", n.Address), zap.Error(err))
			continue
		}
		// err is nil on this path, so log the reason carried in the
		// response status rather than a nil error.
		if resp.ErrorCode != commonpb.ErrorCode_Success {
			log.Warn("watch channels failed", zap.String("address", n.Address), zap.String("reason", resp.Reason))
			continue
		}
		for _, ch := range n.Channels {
			if ch.State == datapb.ChannelWatchState_Uncomplete {
				ch.State = datapb.ChannelWatchState_Complete
			}
		}
	}
	return nodes
}
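
// register adds a data node to the cluster, lets the register policy
// redistribute channels across the current nodes, and watches whatever
// it assigned.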
func (c *cluster) register(n *datapb.DataNodeInfo) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.dataManager.register(n)
	cNodes := c.dataManager.getDataNodes(true)
	rets := c.registerPolicy.apply(cNodes, n)
	c.dataManager.updateDataNodes(rets)
	rets = c.watch(rets)
	c.dataManager.updateDataNodes(rets)
}
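
// unregister removes a data node, releases its cached session, and lets
// the unregister policy reassign the channels it was watching.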
func (c *cluster) unregister(n *datapb.DataNodeInfo) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.sessionManager.releaseSession(n.Address)
	c.dataManager.unregister(n)
	cNodes := c.dataManager.getDataNodes(true)
	rets := c.unregisterPolicy.apply(cNodes, n)
	c.dataManager.updateDataNodes(rets)
	rets = c.watch(rets)
	c.dataManager.updateDataNodes(rets)
}
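
// watchIfNeeded routes the collection's channel to a data node via the
// assign policy and makes sure the channel is being watched.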
func (c *cluster) watchIfNeeded(channel string, collectionID UniqueID) {
	c.mu.Lock()
	defer c.mu.Unlock()
	cNodes := c.dataManager.getDataNodes(true)
	rets := c.assignPolicy.apply(cNodes, channel, collectionID)
	c.dataManager.updateDataNodes(rets)
	rets = c.watch(rets)
	c.dataManager.updateDataNodes(rets)
}
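
// flush groups segments by insert channel and collection, then asks the
// data node watching each channel to flush the matching segments.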
func (c *cluster) flush(segments []*datapb.SegmentInfo) {
	c.mu.Lock()
	defer c.mu.Unlock()
	m := make(map[string]map[UniqueID][]UniqueID) // channel -> collectionID -> segmentIDs
	for _, seg := range segments {
		if _, ok := m[seg.InsertChannel]; !ok {
			m[seg.InsertChannel] = make(map[UniqueID][]UniqueID)
		}
		m[seg.InsertChannel][seg.CollectionID] = append(m[seg.InsertChannel][seg.CollectionID], seg.ID)
	}

	dataNodes := c.dataManager.getDataNodes(true)
	channel2Node := make(map[string]string)
	for _, node := range dataNodes {
		for _, chstatus := range node.Channels {
			channel2Node[chstatus.Name] = node.Address
		}
	}

	for ch, coll2seg := range m {
		node, ok := channel2Node[ch]
		if !ok {
			continue
		}
		cli, err := c.sessionManager.getOrCreateSession(node)
		if err != nil {
			log.Warn("get session failed", zap.String("addr", node), zap.Error(err))
			continue
		}
		for coll, segs := range coll2seg {
			req := &datapb.FlushSegmentsRequest{
				Base: &commonpb.MsgBase{
					MsgType:  commonpb.MsgType_Flush,
					SourceID: Params.NodeID,
				},
				CollectionID: coll,
				SegmentIDs:   segs,
			}
			resp, err := cli.FlushSegments(c.ctx, req)
			if err != nil {
				log.Warn("flush segment failed", zap.String("addr", node), zap.Error(err))
				continue
			}
			// err is nil on this path; log the reason from the response status.
			if resp.ErrorCode != commonpb.ErrorCode_Success {
				log.Warn("flush segment failed", zap.String("dataNode", node), zap.String("reason", resp.Reason))
				continue
			}
			log.Debug("flush segments succeeded", zap.Any("segmentIDs", segs))
		}
	}
}
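
// releaseSessions closes every cached data node session.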
func (c *cluster) releaseSessions() {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.sessionManager.release()
}