From ae6d6f91e6aed4ac253769ea4497f0eb5e481663 Mon Sep 17 00:00:00 2001 From: Gao Date: Mon, 8 Jul 2024 19:52:24 +0800 Subject: [PATCH] enhance: change autoindex default metric type (#34261) issue: #34304 cosine is more widely used in float vectors, and cosine and hamming distance are 'metrics' which have good geometric properties Signed-off-by: chasingegg --- internal/datacoord/index_meta.go | 21 ++++++++- internal/datacoord/index_meta_test.go | 44 ++++++++++++++++++- internal/proto/index_coord.proto | 1 + internal/proxy/task_index.go | 26 ++++++----- internal/proxy/task_index_test.go | 7 +++ pkg/common/common.go | 1 + pkg/util/indexparamcheck/constraints.go | 4 +- pkg/util/indexparamcheck/hnsw_checker_test.go | 8 ++-- pkg/util/paramtable/autoindex_param.go | 4 +- tests/python_client/testcases/test_index.py | 7 +-- 10 files changed, 98 insertions(+), 25 deletions(-) diff --git a/internal/datacoord/index_meta.go b/internal/datacoord/index_meta.go index f257f472b6..ba280769e8 100644 --- a/internal/datacoord/index_meta.go +++ b/internal/datacoord/index_meta.go @@ -187,11 +187,15 @@ func checkParams(fieldIndex *model.Index, req *indexpb.CreateIndexRequest) bool return false } + useAutoIndex := false userIndexParamsWithoutMmapKey := make([]*commonpb.KeyValuePair, 0) for _, param := range fieldIndex.UserIndexParams { if param.Key == common.MmapEnabledKey { continue } + if param.Key == common.IndexTypeKey && param.Value == common.AutoIndexName { + useAutoIndex = true + } userIndexParamsWithoutMmapKey = append(userIndexParamsWithoutMmapKey, param) } @@ -200,9 +204,24 @@ func checkParams(fieldIndex *model.Index, req *indexpb.CreateIndexRequest) bool } for _, param1 := range userIndexParamsWithoutMmapKey { exist := false - for _, param2 := range req.GetUserIndexParams() { + for i, param2 := range req.GetUserIndexParams() { if param2.Key == param1.Key && param2.Value == param1.Value { exist = true + } else if param1.Key == common.MetricTypeKey && param2.Key == param1.Key && useAutoIndex && !req.GetUserAutoindexMetricTypeSpecified() { + // when users use autoindex, metric type is the only thing they can specify + // if they do not specify metric type, will use autoindex default metric type + // when autoindex default config upgraded, remain the old metric type at the very first time for compatibility + // warn! replace request metric type + log.Warn("user not specify autoindex metric type, autoindex config has changed, use old metric for compatibility", + zap.String("old metric type", param1.Value), zap.String("new metric type", param2.Value)) + req.GetUserIndexParams()[i].Value = param1.Value + for j, param := range req.GetIndexParams() { + if param.Key == common.MetricTypeKey { + req.GetIndexParams()[j].Value = param1.Value + break + } + } + exist = true } } if !exist { diff --git a/internal/datacoord/index_meta_test.go b/internal/datacoord/index_meta_test.go index 196071abd9..3797733d85 100644 --- a/internal/datacoord/index_meta_test.go +++ b/internal/datacoord/index_meta_test.go @@ -95,6 +95,20 @@ func TestMeta_CanCreateIndex(t *testing.T) { Key: common.IndexTypeKey, Value: "FLAT", }, + { + Key: common.MetricTypeKey, + Value: "L2", + }, + } + userIndexParams = []*commonpb.KeyValuePair{ + { + Key: common.IndexTypeKey, + Value: common.AutoIndexName, + }, + { + Key: common.MetricTypeKey, + Value: "L2", + }, } ) @@ -114,7 +128,7 @@ func TestMeta_CanCreateIndex(t *testing.T) { IndexParams: indexParams, Timestamp: 0, IsAutoIndex: false, - UserIndexParams: indexParams, + UserIndexParams: userIndexParams, } t.Run("can create index", func(t *testing.T) { @@ -132,7 +146,7 @@ func TestMeta_CanCreateIndex(t *testing.T) { TypeParams: typeParams, IndexParams: indexParams, IsAutoIndex: false, - UserIndexParams: indexParams, + UserIndexParams: userIndexParams, } err = m.CreateIndex(index) @@ -166,6 +180,32 @@ func TestMeta_CanCreateIndex(t *testing.T) { assert.Error(t, err) assert.Equal(t, int64(0), tmpIndexID) + req.IndexParams = []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "FLAT"}, {Key: common.MetricTypeKey, Value: "COSINE"}} + req.UserIndexParams = req.IndexParams + tmpIndexID, err = m.CanCreateIndex(req) + assert.Error(t, err) + assert.Equal(t, int64(0), tmpIndexID) + + // when we use autoindex, it is possible autoindex changes default metric type + // if user does not specify metric type, we should follow the very first autoindex config + req.IndexParams = []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "FLAT"}, {Key: common.MetricTypeKey, Value: "COSINE"}} + req.UserIndexParams = []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "AUTOINDEX"}, {Key: common.MetricTypeKey, Value: "COSINE"}} + req.UserAutoindexMetricTypeSpecified = false + tmpIndexID, err = m.CanCreateIndex(req) + assert.NoError(t, err) + assert.Equal(t, indexID, tmpIndexID) + // req should follow the meta + assert.Equal(t, "L2", req.GetUserIndexParams()[1].Value) + assert.Equal(t, "L2", req.GetIndexParams()[1].Value) + + // if autoindex specify metric type, so the index param change is from user, return error + req.IndexParams = []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "FLAT"}, {Key: common.MetricTypeKey, Value: "COSINE"}} + req.UserIndexParams = []*commonpb.KeyValuePair{{Key: common.IndexTypeKey, Value: "AUTOINDEX"}, {Key: common.MetricTypeKey, Value: "COSINE"}} + req.UserAutoindexMetricTypeSpecified = true + tmpIndexID, err = m.CanCreateIndex(req) + assert.Error(t, err) + assert.Equal(t, int64(0), tmpIndexID) + req.IndexParams = indexParams req.UserIndexParams = indexParams req.FieldID++ diff --git a/internal/proto/index_coord.proto b/internal/proto/index_coord.proto index dc789d3054..0c331f8bba 100644 --- a/internal/proto/index_coord.proto +++ b/internal/proto/index_coord.proto @@ -163,6 +163,7 @@ message CreateIndexRequest { uint64 timestamp = 6; bool is_auto_index = 7; repeated common.KeyValuePair user_index_params = 8; + bool user_autoindex_metric_type_specified = 9; } message AlterIndexRequest { diff --git a/internal/proxy/task_index.go b/internal/proxy/task_index.go index cd56750f73..7eb5284496 100644 --- a/internal/proxy/task_index.go +++ b/internal/proxy/task_index.go @@ -49,7 +49,7 @@ const ( GetIndexStateTaskName = "GetIndexStateTask" GetIndexBuildProgressTaskName = "GetIndexBuildProgressTask" - AutoIndexName = "AUTOINDEX" + AutoIndexName = common.AutoIndexName DimKey = common.DimKey IsSparseKey = common.IsSparseKey ) @@ -70,8 +70,9 @@ type createIndexTask struct { newTypeParams []*commonpb.KeyValuePair newExtraParams []*commonpb.KeyValuePair - collectionID UniqueID - fieldSchema *schemapb.FieldSchema + collectionID UniqueID + fieldSchema *schemapb.FieldSchema + userAutoIndexMetricTypeSpecified bool } func (cit *createIndexTask) TraceCtx() context.Context { @@ -198,6 +199,7 @@ func (cit *createIndexTask) parseIndexParams() error { if metricTypeExist { // make the users' metric type first class citizen. indexParamsMap[common.MetricTypeKey] = metricType + cit.userAutoIndexMetricTypeSpecified = true } } else { // behavior change after 2.2.9, adapt autoindex logic here. useAutoIndex := func(autoIndexConfig map[string]string) { @@ -235,6 +237,7 @@ func (cit *createIndexTask) parseIndexParams() error { useAutoIndex(autoIndexConfig) // make the users' metric type first class citizen. indexParamsMap[common.MetricTypeKey] = metricType + cit.userAutoIndexMetricTypeSpecified = true } return nil @@ -451,14 +454,15 @@ func (cit *createIndexTask) Execute(ctx context.Context) error { var err error req := &indexpb.CreateIndexRequest{ - CollectionID: cit.collectionID, - FieldID: cit.fieldSchema.GetFieldID(), - IndexName: cit.req.GetIndexName(), - TypeParams: cit.newTypeParams, - IndexParams: cit.newIndexParams, - IsAutoIndex: cit.isAutoIndex, - UserIndexParams: cit.newExtraParams, - Timestamp: cit.BeginTs(), + CollectionID: cit.collectionID, + FieldID: cit.fieldSchema.GetFieldID(), + IndexName: cit.req.GetIndexName(), + TypeParams: cit.newTypeParams, + IndexParams: cit.newIndexParams, + IsAutoIndex: cit.isAutoIndex, + UserIndexParams: cit.newExtraParams, + Timestamp: cit.BeginTs(), + UserAutoindexMetricTypeSpecified: cit.userAutoIndexMetricTypeSpecified, } cit.result, err = cit.datacoord.CreateIndex(ctx, req) if err != nil { diff --git a/internal/proxy/task_index_test.go b/internal/proxy/task_index_test.go index f5e9840453..9976ffa8fb 100644 --- a/internal/proxy/task_index_test.go +++ b/internal/proxy/task_index_test.go @@ -1006,6 +1006,7 @@ func Test_parseIndexParams_AutoIndex_WithType(t *testing.T) { } err := task.parseIndexParams() assert.NoError(t, err) + assert.True(t, task.userAutoIndexMetricTypeSpecified) assert.ElementsMatch(t, []*commonpb.KeyValuePair{ {Key: common.IndexTypeKey, Value: "HNSW"}, {Key: common.MetricTypeKey, Value: "L2"}, @@ -1026,6 +1027,7 @@ func Test_parseIndexParams_AutoIndex_WithType(t *testing.T) { } err := task.parseIndexParams() assert.NoError(t, err) + assert.True(t, task.userAutoIndexMetricTypeSpecified) assert.ElementsMatch(t, []*commonpb.KeyValuePair{ {Key: common.IndexTypeKey, Value: "SPARSE_INVERTED_INDEX"}, {Key: common.MetricTypeKey, Value: "IP"}, @@ -1044,6 +1046,7 @@ func Test_parseIndexParams_AutoIndex_WithType(t *testing.T) { } err := task.parseIndexParams() assert.NoError(t, err) + assert.True(t, task.userAutoIndexMetricTypeSpecified) assert.ElementsMatch(t, []*commonpb.KeyValuePair{ {Key: common.IndexTypeKey, Value: "BIN_IVF_FLAT"}, {Key: common.MetricTypeKey, Value: "JACCARD"}, @@ -1093,6 +1096,7 @@ func Test_parseIndexParams_AutoIndex(t *testing.T) { } err := task.parseIndexParams() assert.NoError(t, err) + assert.False(t, task.userAutoIndexMetricTypeSpecified) assert.ElementsMatch(t, []*commonpb.KeyValuePair{ {Key: common.IndexTypeKey, Value: AutoIndexName}, {Key: common.MetricTypeKey, Value: autoIndexConfigBinary[common.MetricTypeKey]}, @@ -1108,6 +1112,7 @@ func Test_parseIndexParams_AutoIndex(t *testing.T) { } err := task.parseIndexParams() assert.NoError(t, err) + assert.False(t, task.userAutoIndexMetricTypeSpecified) assert.ElementsMatch(t, []*commonpb.KeyValuePair{ {Key: common.IndexTypeKey, Value: AutoIndexName}, {Key: common.MetricTypeKey, Value: autoIndexConfigSparse[common.MetricTypeKey]}, @@ -1123,6 +1128,7 @@ func Test_parseIndexParams_AutoIndex(t *testing.T) { } err := task.parseIndexParams() assert.NoError(t, err) + assert.False(t, task.userAutoIndexMetricTypeSpecified) assert.ElementsMatch(t, []*commonpb.KeyValuePair{ {Key: common.IndexTypeKey, Value: AutoIndexName}, {Key: common.MetricTypeKey, Value: autoIndexConfig[common.MetricTypeKey]}, @@ -1140,6 +1146,7 @@ func Test_parseIndexParams_AutoIndex(t *testing.T) { } err := task.parseIndexParams() assert.NoError(t, err) + assert.True(t, task.userAutoIndexMetricTypeSpecified) assert.ElementsMatch(t, []*commonpb.KeyValuePair{ {Key: common.IndexTypeKey, Value: AutoIndexName}, {Key: common.MetricTypeKey, Value: "L2"}, diff --git a/pkg/common/common.go b/pkg/common/common.go index 64513ca39a..b0fbe73043 100644 --- a/pkg/common/common.go +++ b/pkg/common/common.go @@ -121,6 +121,7 @@ const ( BitmapCardinalityLimitKey = "bitmap_cardinality_limit" IsSparseKey = "is_sparse" + AutoIndexName = "AUTOINDEX" ) // Collection properties key diff --git a/pkg/util/indexparamcheck/constraints.go b/pkg/util/indexparamcheck/constraints.go index 2228105b92..55ea516666 100644 --- a/pkg/util/indexparamcheck/constraints.go +++ b/pkg/util/indexparamcheck/constraints.go @@ -64,7 +64,7 @@ var ( ) const ( - FloatVectorDefaultMetricType = metric.IP + FloatVectorDefaultMetricType = metric.COSINE SparseFloatVectorDefaultMetricType = metric.IP - BinaryVectorDefaultMetricType = metric.JACCARD + BinaryVectorDefaultMetricType = metric.HAMMING ) diff --git a/pkg/util/indexparamcheck/hnsw_checker_test.go b/pkg/util/indexparamcheck/hnsw_checker_test.go index 1b19099e93..b911812540 100644 --- a/pkg/util/indexparamcheck/hnsw_checker_test.go +++ b/pkg/util/indexparamcheck/hnsw_checker_test.go @@ -180,15 +180,15 @@ func Test_hnswChecker_SetDefaultMetricType(t *testing.T) { }{ { dType: schemapb.DataType_FloatVector, - metricType: metric.IP, + metricType: metric.COSINE, }, { dType: schemapb.DataType_Float16Vector, - metricType: metric.IP, + metricType: metric.COSINE, }, { dType: schemapb.DataType_BFloat16Vector, - metricType: metric.IP, + metricType: metric.COSINE, }, { dType: schemapb.DataType_SparseFloatVector, @@ -196,7 +196,7 @@ func Test_hnswChecker_SetDefaultMetricType(t *testing.T) { }, { dType: schemapb.DataType_BinaryVector, - metricType: metric.JACCARD, + metricType: metric.HAMMING, }, } diff --git a/pkg/util/paramtable/autoindex_param.go b/pkg/util/paramtable/autoindex_param.go index 36ef52e2ae..0607e6d30b 100644 --- a/pkg/util/paramtable/autoindex_param.go +++ b/pkg/util/paramtable/autoindex_param.go @@ -70,7 +70,7 @@ func (p *autoIndexConfig) init(base *BaseTable) { p.IndexParams = ParamItem{ Key: "autoIndex.params.build", Version: "2.2.0", - DefaultValue: `{"M": 18,"efConstruction": 240,"index_type": "HNSW", "metric_type": "IP"}`, + DefaultValue: `{"M": 18,"efConstruction": 240,"index_type": "HNSW", "metric_type": "COSINE"}`, Export: true, } p.IndexParams.Init(base.mgr) @@ -86,7 +86,7 @@ func (p *autoIndexConfig) init(base *BaseTable) { p.BinaryIndexParams = ParamItem{ Key: "autoIndex.params.binary.build", Version: "2.4.5", - DefaultValue: `{"nlist": 1024, "index_type": "BIN_IVF_FLAT", "metric_type": "JACCARD"}`, + DefaultValue: `{"nlist": 1024, "index_type": "BIN_IVF_FLAT", "metric_type": "HAMMING"}`, Export: true, } p.BinaryIndexParams.Init(base.mgr) diff --git a/tests/python_client/testcases/test_index.py b/tests/python_client/testcases/test_index.py index db18f0833d..948f115bd1 100644 --- a/tests/python_client/testcases/test_index.py +++ b/tests/python_client/testcases/test_index.py @@ -22,7 +22,8 @@ prefix = "index" default_schema = cf.gen_default_collection_schema() default_field_name = ct.default_float_vec_field_name default_index_params = ct.default_index -default_autoindex_params = {"index_type": "AUTOINDEX", "metric_type": "IP"} +default_autoindex_params = {"index_type": "AUTOINDEX", "metric_type": "COSINE"} +default_sparse_autoindex_params = {"index_type": "AUTOINDEX", "metric_type": "IP"} # copied from pymilvus uid = "test_index" @@ -2118,7 +2119,7 @@ class TestAutoIndex(TestcaseBase): """ collection_w = self.init_collection_general(prefix, is_binary=True, is_index=False)[0] collection_w.create_index(binary_field_name, {}) - assert collection_w.index()[0].params == {'index_type': 'AUTOINDEX', 'metric_type': 'JACCARD'} + assert collection_w.index()[0].params == {'index_type': 'AUTOINDEX', 'metric_type': 'HAMMING'} @pytest.mark.tags(CaseLabel.L1) def test_create_autoindex_on_all_vector_type(self): @@ -2141,7 +2142,7 @@ class TestAutoIndex(TestcaseBase): collection_w.index(index_name="bf16")[0].params.items()) collection_w.create_index("sparse", index_name="sparse") - assert all(item in default_autoindex_params.items() for item in + assert all(item in default_sparse_autoindex_params.items() for item in collection_w.index(index_name="sparse")[0].params.items())