From d50b365375ece81782930ef4e8dc1d24e6c7df09 Mon Sep 17 00:00:00 2001 From: cqy123456 <39671710+cqy123456@users.noreply.github.com> Date: Wed, 3 Sep 2025 17:19:53 +0800 Subject: [PATCH] enhance: add autoindex config for deduplication case (#44186) Signed-off-by: cqy123456 --- internal/proxy/task_index.go | 35 +++++++++++++----- internal/util/indexparamcheck/constraints.go | 1 + internal/util/indexparamcheck/utils.go | 1 + pkg/util/paramtable/autoindex_param.go | 39 +++++++++++++++----- 4 files changed, 57 insertions(+), 19 deletions(-) diff --git a/internal/proxy/task_index.go b/internal/proxy/task_index.go index 3fe8d283a5..c1fd1baef4 100644 --- a/internal/proxy/task_index.go +++ b/internal/proxy/task_index.go @@ -270,9 +270,20 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error { indexParamsMap[k] = v } } else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) { - // override binary vector index params by autoindex - for k, v := range Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() { - indexParamsMap[k] = v + if metricTypeExist && funcutil.SliceContain(indexparamcheck.DeduplicateMetrics, metricType) { + if !Params.AutoIndexConfig.EnableDeduplicateIndex.GetAsBool() { + log.Ctx(ctx).Warn("Deduplicate index is not enabled, but metric type is deduplicate.") + return merr.WrapErrParameterInvalidMsg("Deduplicate index is not enabled, but metric type is deduplicate.") + } + // override binary vector index params by autoindex deduplicate params + for k, v := range Params.AutoIndexConfig.DeduplicateIndexParams.GetAsJSONMap() { + indexParamsMap[k] = v + } + } else { + // override binary vector index params by autoindex + for k, v := range Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() { + indexParamsMap[k] = v + } } } else if typeutil.IsIntVectorType(cit.fieldSchema.DataType) { // override int vector index params by autoindex @@ -295,6 +306,7 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error { } 
log.Ctx(ctx).Info("AutoIndex triggered", fields...) } + metricType, metricTypeExist := indexParamsMap[common.MetricTypeKey] handle := func(numberParams int, autoIndexConfig map[string]string) error { // empty case. @@ -306,8 +318,6 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error { return nil } - metricType, metricTypeExist := indexParamsMap[common.MetricTypeKey] - if len(indexParamsMap) > numberParams+1 { return errors.New("only metric type can be passed when use AutoIndex") } @@ -337,10 +347,17 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error { (typeutil.IsArrayOfVectorType(cit.fieldSchema.DataType) && typeutil.IsSparseFloatVectorType(cit.fieldSchema.ElementType)) { // override sparse float vector index params by autoindex config = Params.AutoIndexConfig.SparseIndexParams.GetAsJSONMap() - } else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) || - (typeutil.IsArrayOfVectorType(cit.fieldSchema.DataType) && typeutil.IsBinaryVectorType(cit.fieldSchema.ElementType)) { - // override binary vector index params by autoindex - config = Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() + } else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) { + if metricTypeExist && funcutil.SliceContain(indexparamcheck.DeduplicateMetrics, metricType) { + if !Params.AutoIndexConfig.EnableDeduplicateIndex.GetAsBool() { + log.Ctx(ctx).Warn("Deduplicate index is not enabled, but metric type is deduplicate.") + return merr.WrapErrParameterInvalidMsg("Deduplicate index is not enabled, but metric type is deduplicate.") + } + config = Params.AutoIndexConfig.DeduplicateIndexParams.GetAsJSONMap() + } else { + // override binary vector index params by autoindex + config = Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() + } } else if typeutil.IsIntVectorType(cit.fieldSchema.DataType) || (typeutil.IsArrayOfVectorType(cit.fieldSchema.DataType) && typeutil.IsIntVectorType(cit.fieldSchema.ElementType)) { // override int 
vector index params by autoindex diff --git a/internal/util/indexparamcheck/constraints.go b/internal/util/indexparamcheck/constraints.go index d9696cb2fc..c55693b0d2 100644 --- a/internal/util/indexparamcheck/constraints.go +++ b/internal/util/indexparamcheck/constraints.go @@ -69,6 +69,7 @@ var ( supportDimPerSubQuantizer = []int{32, 28, 24, 20, 16, 12, 10, 8, 6, 4, 3, 2, 1} // const supportSubQuantizer = []int{96, 64, 56, 48, 40, 32, 28, 24, 20, 16, 12, 8, 4, 3, 2, 1} // const SparseMetrics = []string{metric.IP, metric.BM25} // const + DeduplicateMetrics = []string{metric.MHJACCARD} // const ) const ( diff --git a/internal/util/indexparamcheck/utils.go b/internal/util/indexparamcheck/utils.go index db9735b16d..c3baf1babc 100644 --- a/internal/util/indexparamcheck/utils.go +++ b/internal/util/indexparamcheck/utils.go @@ -93,6 +93,7 @@ func CheckAutoIndexConfig() { autoIndexCfg := &paramtable.Get().AutoIndexConfig CheckAutoIndexHelper(autoIndexCfg.IndexParams.Key, autoIndexCfg.IndexParams.GetAsJSONMap(), schemapb.DataType_FloatVector) CheckAutoIndexHelper(autoIndexCfg.BinaryIndexParams.Key, autoIndexCfg.BinaryIndexParams.GetAsJSONMap(), schemapb.DataType_BinaryVector) + CheckAutoIndexHelper(autoIndexCfg.BinaryIndexParams.Key, autoIndexCfg.DeduplicateIndexParams.GetAsJSONMap(), schemapb.DataType_BinaryVector) CheckAutoIndexHelper(autoIndexCfg.SparseIndexParams.Key, autoIndexCfg.SparseIndexParams.GetAsJSONMap(), schemapb.DataType_SparseFloatVector) } diff --git a/pkg/util/paramtable/autoindex_param.go b/pkg/util/paramtable/autoindex_param.go index 02a1a90e0a..cbccd72037 100644 --- a/pkg/util/paramtable/autoindex_param.go +++ b/pkg/util/paramtable/autoindex_param.go @@ -35,16 +35,18 @@ type AutoIndexConfig struct { EnableOptimize ParamItem `refreshable:"true"` EnableResultLimitCheck ParamItem `refreshable:"true"` - IndexParams ParamItem `refreshable:"true"` - SparseIndexParams ParamItem `refreshable:"true"` - BinaryIndexParams ParamItem `refreshable:"true"` - 
PrepareParams ParamItem `refreshable:"true"` - LoadAdaptParams ParamItem `refreshable:"true"` - ExtraParams ParamItem `refreshable:"true"` - IndexType ParamItem `refreshable:"true"` - AutoIndexTypeName ParamItem `refreshable:"true"` - AutoIndexSearchConfig ParamItem `refreshable:"true"` - AutoIndexTuningConfig ParamGroup `refreshable:"true"` + IndexParams ParamItem `refreshable:"true"` + SparseIndexParams ParamItem `refreshable:"true"` + BinaryIndexParams ParamItem `refreshable:"true"` + DeduplicateIndexParams ParamItem `refreshable:"true"` + EnableDeduplicateIndex ParamItem `refreshable:"true"` + PrepareParams ParamItem `refreshable:"true"` + LoadAdaptParams ParamItem `refreshable:"true"` + ExtraParams ParamItem `refreshable:"true"` + IndexType ParamItem `refreshable:"true"` + AutoIndexTypeName ParamItem `refreshable:"true"` + AutoIndexSearchConfig ParamItem `refreshable:"true"` + AutoIndexTuningConfig ParamGroup `refreshable:"true"` ScalarAutoIndexEnable ParamItem `refreshable:"true"` ScalarAutoIndexParams ParamItem `refreshable:"true"` @@ -108,6 +110,23 @@ func (p *AutoIndexConfig) init(base *BaseTable) { } p.BinaryIndexParams.Init(base.mgr) + p.DeduplicateIndexParams = ParamItem{ + Key: "autoIndex.params.deduplicate.build", + Version: "2.5.18", + DefaultValue: `{"index_type": "MINHASH_LSH", "metric_type": "MHJACCARD"}`, + Formatter: GetBuildParamFormatter(BinaryVectorDefaultMetricType, "autoIndex.params.deduplicate.build"), + Export: true, + } + p.DeduplicateIndexParams.Init(base.mgr) + + p.EnableDeduplicateIndex = ParamItem{ + Key: "autoIndex.params.deduplicate.enable", + Version: "2.5.18", + DefaultValue: "false", + PanicIfEmpty: false, + } + p.EnableDeduplicateIndex.Init(base.mgr) + p.PrepareParams = ParamItem{ Key: "autoIndex.params.prepare", Version: "2.3.2",