mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
issue: #41435 After introducing the caching layer's lazy loading and eviction mechanisms, most parts of a segment won't be loaded into memory or disk immediately, even if the segment is marked as LOADED. This means physical resource usage may be very low. However, we still need to reserve enough resources for the segments marked as LOADED. Thus, the logic of resource usage estimation during segment loading, which is based on physical resource usage only for now, should be changed. To address this issue, we introduced the concept of logical resource usage in this patch. This can be thought of as the base reserved resource for each LOADED segment. A segment’s logical resource usage is derived from its final evictable and inevictable resource usage and calculated as follows: ``` SLR = SFPIER + evitable_cache_ratio * SFPER ``` it is also equal to ``` SLR = (SFPIER + SFPER) - (1.0 - evitable_cache_ratio) * SFPER ``` `SLR`: The logical resource usage of a segment. `SFPIER`: The final physical inevictable resource usage of a segment. `SFPER`: The final physical evictable resource usage of a segment. `evitable_cache_ratio`: The ratio of a segment's evictable resources that can be cached locally. The higher the ratio, the more physical memory is reserved for evictable memory. When loading a segment, two types of resource usage are taken into account. First is the estimated maximum physical resource usage: ``` PPR = HPR + CPR + SMPR - SFPER ``` `PPR`: The predicted physical resource usage after the current segment is allowed to load. `HPR`: The physical resource usage obtained from hardware information. `CPR`: The total physical resource usage of segments that have been committed but not yet loaded. When one new segment is allowed to load, `CPR' = CPR + (SMR - SER)`. When one of the committed segments is loaded, `CPR' = CPR - (SMR - SER)`. `SMPR`: The maximum physical resource usage of the current segment. `SFPER`: The final physical evictable resource usage of the current segment. 
Second is the estimated logical resource usage; this check is only valid when eviction is enabled: ``` PLR = LLR + CLR + SLR ``` `PLR`: The predicted logical resource usage after the current segment is allowed to load. `LLR`: The total logical resource usage of all loaded segments. When a new segment is loaded, `LLR` should be updated to `LLR' = LLR + SLR`. `CLR`: The total logical resource usage of segments that have been committed but not yet loaded. When one new segment is allowed to load, `CLR' = CLR + SLR`. When one of the committed segments is loaded, `CLR' = CLR - SLR`. `SLR`: The logical resource usage of the current segment. Only when `PPR < PRL && PLR < PRL` (`PRL`: Physical resource limit of the querynode) is the segment allowed to be loaded. --------- Signed-off-by: Shawn Wang <shawn.wang@zilliz.com>
253 lines
7.9 KiB
Go
253 lines
7.9 KiB
Go
// Licensed to the LF AI & Data foundation under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package querynodev2
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"sync/atomic"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/cockroachdb/errors"
|
|
"github.com/spf13/viper"
|
|
"github.com/stretchr/testify/mock"
|
|
"github.com/stretchr/testify/suite"
|
|
clientv3 "go.etcd.io/etcd/client/v3"
|
|
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/commonpb"
|
|
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
|
|
"github.com/milvus-io/milvus/internal/mocks/util/mock_segcore"
|
|
"github.com/milvus-io/milvus/internal/mocks/util/searchutil/mock_optimizers"
|
|
"github.com/milvus-io/milvus/internal/querynodev2/segments"
|
|
"github.com/milvus-io/milvus/internal/storage"
|
|
"github.com/milvus-io/milvus/internal/util/dependency"
|
|
"github.com/milvus-io/milvus/pkg/v2/objectstorage"
|
|
"github.com/milvus-io/milvus/pkg/v2/proto/datapb"
|
|
"github.com/milvus-io/milvus/pkg/v2/proto/querypb"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/etcd"
|
|
"github.com/milvus-io/milvus/pkg/v2/util/paramtable"
|
|
)
|
|
|
|
// QueryNodeSuite is the testify suite that exercises QueryNode lifecycle
// (Init/Start/Register/Stop) and its query-hook config handling.
// SetupTest builds a fresh node, mock factory, and etcd client per test.
type QueryNodeSuite struct {
	suite.Suite
	// data
	address string // address reported by the node via SetAddress/GetAddress

	// dependency
	params              *paramtable.ComponentParam // global param table, fetched in SetupTest
	node                *QueryNode                 // node under test, rebuilt per test
	etcd                *clientv3.Client           // etcd client built from EtcdCfg
	chunkManagerFactory *storage.ChunkManagerFactory // local chunk manager rooted at /tmp/milvus_test

	// mock
	factory *dependency.MockFactory // mocked dependency factory injected into the node
}
|
|
|
|
// SetupSuite runs once before all tests; it fixes the address the node
// reports in the SetAddress/GetAddress round-trip checked by TestBasic.
func (suite *QueryNodeSuite) SetupSuite() {
	suite.address = "test-address"
}
|
|
|
|
func (suite *QueryNodeSuite) SetupTest() {
|
|
var err error
|
|
paramtable.Init()
|
|
suite.params = paramtable.Get()
|
|
suite.params.Save(suite.params.CommonCfg.GCEnabled.Key, "false")
|
|
|
|
// mock factory
|
|
suite.factory = dependency.NewMockFactory(suite.T())
|
|
suite.chunkManagerFactory = storage.NewChunkManagerFactory("local", objectstorage.RootPath("/tmp/milvus_test"))
|
|
// new node
|
|
suite.node = NewQueryNode(context.Background(), suite.factory)
|
|
// init etcd
|
|
suite.etcd, err = etcd.GetEtcdClient(
|
|
suite.params.EtcdCfg.UseEmbedEtcd.GetAsBool(),
|
|
suite.params.EtcdCfg.EtcdUseSSL.GetAsBool(),
|
|
suite.params.EtcdCfg.Endpoints.GetAsStrings(),
|
|
suite.params.EtcdCfg.EtcdTLSCert.GetValue(),
|
|
suite.params.EtcdCfg.EtcdTLSKey.GetValue(),
|
|
suite.params.EtcdCfg.EtcdTLSCACert.GetValue(),
|
|
suite.params.EtcdCfg.EtcdTLSMinVersion.GetValue())
|
|
suite.NoError(err)
|
|
}
|
|
|
|
func (suite *QueryNodeSuite) TearDownTest() {
|
|
suite.etcd.Close()
|
|
os.RemoveAll("/tmp/milvus-test")
|
|
}
|
|
|
|
func (suite *QueryNodeSuite) TestBasic() {
|
|
// mock expect
|
|
suite.factory.EXPECT().Init(mock.Anything).Return()
|
|
suite.factory.EXPECT().NewPersistentStorageChunkManager(mock.Anything).Return(suite.chunkManagerFactory.NewPersistentStorageChunkManager(context.Background()))
|
|
|
|
var err error
|
|
suite.node.SetEtcdClient(suite.etcd)
|
|
err = suite.node.Init()
|
|
suite.NoError(err)
|
|
|
|
// node should be unhealthy before node start
|
|
suite.False(suite.node.lifetime.GetState() == commonpb.StateCode_Healthy)
|
|
|
|
// start node
|
|
err = suite.node.Start()
|
|
suite.NoError(err)
|
|
|
|
// node should be healthy after node start
|
|
suite.True(suite.node.lifetime.GetState() == commonpb.StateCode_Healthy)
|
|
|
|
// register node to etcd
|
|
suite.node.session.TriggerKill = false
|
|
err = suite.node.Register()
|
|
suite.NoError(err)
|
|
|
|
// set and get address
|
|
suite.node.SetAddress(suite.address)
|
|
address := suite.node.GetAddress()
|
|
suite.Equal(suite.address, address)
|
|
|
|
// close node
|
|
err = suite.node.Stop()
|
|
suite.NoError(err)
|
|
|
|
// node should be unhealthy after node stop
|
|
suite.False(suite.node.lifetime.GetState() == commonpb.StateCode_Healthy)
|
|
}
|
|
|
|
func (suite *QueryNodeSuite) TestInit_RemoteChunkManagerFailed() {
|
|
var err error
|
|
suite.node.SetEtcdClient(suite.etcd)
|
|
|
|
// init remote chunk manager failed
|
|
suite.factory.EXPECT().Init(mock.Anything).Return()
|
|
suite.factory.EXPECT().NewPersistentStorageChunkManager(mock.Anything).Return(nil, errors.New("mock error"))
|
|
err = suite.node.Init()
|
|
suite.Error(err)
|
|
}
|
|
|
|
func (suite *QueryNodeSuite) TestInit_VactorChunkManagerFailed() {
|
|
var err error
|
|
suite.node.SetEtcdClient(suite.etcd)
|
|
|
|
// init vactor chunk manager failed
|
|
suite.factory.EXPECT().Init(mock.Anything).Return()
|
|
suite.factory.EXPECT().NewPersistentStorageChunkManager(mock.Anything).Return(nil, errors.New("mock error")).Once()
|
|
err = suite.node.Init()
|
|
suite.Error(err)
|
|
}
|
|
|
|
// TestInit_QueryHook exercises the query-hook config watcher: after Init,
// changes written to milvus.yaml trigger InitTuningConfig / Init /
// DeleteTuningConfig on the registered hook.
// NOTE(review): this test mutates ../../configs/milvus.yaml on disk and
// relies on the watcher observing the change within 20s; it also leaves
// autoIndex.params.tuning cleared at the end rather than restoring the
// original config — presumably acceptable for CI, verify locally.
func (suite *QueryNodeSuite) TestInit_QueryHook() {
	// mock expect
	suite.factory.EXPECT().Init(mock.Anything).Return()
	suite.factory.EXPECT().NewPersistentStorageChunkManager(mock.Anything).Return(suite.chunkManagerFactory.NewPersistentStorageChunkManager(context.Background()))

	var err error
	suite.node.SetEtcdClient(suite.etcd)
	err = suite.node.Init()
	suite.NoError(err)

	mockHook := mock_optimizers.NewMockQueryHook(suite.T())
	suite.node.queryHook = mockHook
	suite.node.handleQueryHookEvent()

	yamlWriter := viper.New()
	yamlWriter.SetConfigFile("../../configs/milvus.yaml")
	yamlWriter.ReadInConfig()
	// x1/x2/x3 are flags flipped by the hook callbacks below; assert they
	// start at zero before any config change is written.
	var x1, x2, x3 int32
	suite.Equal(atomic.LoadInt32(&x1), int32(0))
	suite.Equal(atomic.LoadInt32(&x2), int32(0))
	suite.Equal(atomic.LoadInt32(&x3), int32(0))

	// tuning-config creation/update should call InitTuningConfig
	mockHook.EXPECT().InitTuningConfig(mock.Anything).Run(func(params map[string]string) {
		atomic.StoreInt32(&x1, 6)
	}).Return(nil)

	// create tuning conf
	yamlWriter.Set("autoIndex.params.tuning.1238", "xxxx")
	yamlWriter.WriteConfig()
	suite.Eventually(func() bool {
		return atomic.LoadInt32(&x1) == int32(6)
	}, 20*time.Second, time.Second)

	// search-param change should call Init on the hook
	mockHook.EXPECT().Init(mock.Anything).Run(func(params string) {
		atomic.StoreInt32(&x2, 5)
	}).Return(nil)
	yamlWriter.Set("autoIndex.params.search", "aaaa")
	yamlWriter.WriteConfig()
	suite.Eventually(func() bool {
		return atomic.LoadInt32(&x2) == int32(5)
	}, 20*time.Second, time.Second)
	// clear search params again
	yamlWriter.Set("autoIndex.params.search", "")
	yamlWriter.WriteConfig()

	// reset x1 so the tuning-config *update* below is observable via the
	// same flag as the creation above
	atomic.StoreInt32(&x1, 0)
	suite.Equal(atomic.LoadInt32(&x1), int32(0))
	// update tuning conf
	yamlWriter.Set("autoIndex.params.tuning.1238", "yyyy")
	yamlWriter.WriteConfig()
	suite.Eventually(func() bool {
		return atomic.LoadInt32(&x1) == int32(6)
	}, 20*time.Second, time.Second)

	// tuning-config removal should call DeleteTuningConfig
	mockHook.EXPECT().DeleteTuningConfig(mock.Anything).Run(func(params string) {
		atomic.StoreInt32(&x3, 7)
	}).Return(nil)

	// delete tuning conf
	yamlWriter.Set("autoIndex.params.tuning", "")
	yamlWriter.WriteConfig()
	suite.Eventually(func() bool {
		return atomic.LoadInt32(&x3) == int32(7)
	}, 20*time.Second, time.Second)
}
|
|
|
|
func (suite *QueryNodeSuite) TestStop() {
|
|
paramtable.Get().Save(paramtable.Get().QueryNodeCfg.GracefulStopTimeout.Key, "2")
|
|
|
|
suite.node.manager = segments.NewManager()
|
|
|
|
schema := mock_segcore.GenTestCollectionSchema("test_stop", schemapb.DataType_Int64, true)
|
|
collection, err := segments.NewCollection(1, schema, nil, &querypb.LoadMetaInfo{
|
|
LoadType: querypb.LoadType_LoadCollection,
|
|
})
|
|
suite.Require().NoError(err)
|
|
segment, err := segments.NewSegment(
|
|
context.Background(),
|
|
collection,
|
|
suite.node.manager.Segment,
|
|
segments.SegmentTypeSealed,
|
|
1,
|
|
&querypb.SegmentLoadInfo{
|
|
SegmentID: 100,
|
|
PartitionID: 10,
|
|
CollectionID: 1,
|
|
Level: datapb.SegmentLevel_Legacy,
|
|
InsertChannel: fmt.Sprintf("by-dev-rootcoord-dml_0_%dv0", 1),
|
|
},
|
|
)
|
|
suite.NoError(err)
|
|
suite.node.manager.Segment.Put(context.Background(), segments.SegmentTypeSealed, segment)
|
|
err = suite.node.Stop()
|
|
suite.NoError(err)
|
|
suite.True(suite.node.manager.Segment.Empty())
|
|
}
|
|
|
|
// TestQueryNode is the `go test` entry point that runs the whole suite.
func TestQueryNode(t *testing.T) {
	suite.Run(t, new(QueryNodeSuite))
}
|