diff --git a/client/entity/common.go b/client/entity/common.go index f14c47f627..d81919e64a 100644 --- a/client/entity/common.go +++ b/client/entity/common.go @@ -32,6 +32,7 @@ const ( SUBSTRUCTURE MetricType = "SUBSTRUCTURE" SUPERSTRUCTURE MetricType = "SUPERSTRUCTURE" BM25 MetricType = "BM25" + MHJACCARD MetricType = "MHJACCARD" ) // CompactionState enum type for compaction state diff --git a/internal/core/src/common/Utils.h b/internal/core/src/common/Utils.h index abe5dfa54e..bb91d1aa19 100644 --- a/internal/core/src/common/Utils.h +++ b/internal/core/src/common/Utils.h @@ -178,7 +178,8 @@ inline bool PositivelyRelated(const knowhere::MetricType& metric_type) { return IsMetricType(metric_type, knowhere::metric::IP) || IsMetricType(metric_type, knowhere::metric::COSINE) || - IsMetricType(metric_type, knowhere::metric::BM25); + IsMetricType(metric_type, knowhere::metric::BM25) || + IsMetricType(metric_type, knowhere::metric::MHJACCARD); } inline std::string diff --git a/internal/proxy/task_index.go b/internal/proxy/task_index.go index e52d8a06b8..bdaee5abc6 100644 --- a/internal/proxy/task_index.go +++ b/internal/proxy/task_index.go @@ -272,9 +272,20 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error { indexParamsMap[k] = v } } else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) { - // override binary vector index params by autoindex - for k, v := range Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() { - indexParamsMap[k] = v + if metricTypeExist && funcutil.SliceContain(indexparamcheck.DeduplicateMetrics, metricType) { + if !Params.AutoIndexConfig.EnableDeduplicateIndex.GetAsBool() { + log.Ctx(ctx).Warn("Deduplicate index is not enabled, but metric type is deduplicate.") + return merr.WrapErrParameterInvalidMsg("Deduplicate index is not enabled, but metric type is deduplicate.") + } + // override binary vector index params by autoindex deduplicate params + for k, v := range 
Params.AutoIndexConfig.DeduplicateIndexParams.GetAsJSONMap() { + indexParamsMap[k] = v + } + } else { + // override binary vector index params by autoindex + for k, v := range Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() { + indexParamsMap[k] = v + } } } @@ -292,6 +303,7 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error { } log.Ctx(ctx).Info("AutoIndex triggered", fields...) } + metricType, metricTypeExist := indexParamsMap[common.MetricTypeKey] handle := func(numberParams int, autoIndexConfig map[string]string) error { // empty case. @@ -303,8 +315,6 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error { return nil } - metricType, metricTypeExist := indexParamsMap[common.MetricTypeKey] - if len(indexParamsMap) > numberParams+1 { return errors.New("only metric type can be passed when use AutoIndex") } @@ -333,8 +343,16 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error { // override sparse float vector index params by autoindex config = Params.AutoIndexConfig.SparseIndexParams.GetAsJSONMap() } else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) { - // override binary vector index params by autoindex - config = Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() + if metricTypeExist && funcutil.SliceContain(indexparamcheck.DeduplicateMetrics, metricType) { + if !Params.AutoIndexConfig.EnableDeduplicateIndex.GetAsBool() { + log.Ctx(ctx).Warn("Deduplicate index is not enabled, but metric type is deduplicate.") + return merr.WrapErrParameterInvalidMsg("Deduplicate index is not enabled, but metric type is deduplicate.") + } + config = Params.AutoIndexConfig.DeduplicateIndexParams.GetAsJSONMap() + } else { + // override binary vector index params by autoindex + config = Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() + } } if !exist { if err := handle(0, config); err != nil { diff --git a/internal/proxy/util.go b/internal/proxy/util.go index b53a54fca8..3820aed08f 
100644 --- a/internal/proxy/util.go +++ b/internal/proxy/util.go @@ -710,7 +710,7 @@ func validateMetricType(dataType schemapb.DataType, metricTypeStrRaw string) err if typeutil.IsFloatVectorType(dataType) { return nil } - case metric.JACCARD, metric.HAMMING, metric.SUBSTRUCTURE, metric.SUPERSTRUCTURE: + case metric.JACCARD, metric.HAMMING, metric.SUBSTRUCTURE, metric.SUPERSTRUCTURE, metric.MHJACCARD: if dataType == schemapb.DataType_BinaryVector { return nil } diff --git a/internal/util/indexparamcheck/constraints.go b/internal/util/indexparamcheck/constraints.go index cf0863d7e1..9a9659ec01 100644 --- a/internal/util/indexparamcheck/constraints.go +++ b/internal/util/indexparamcheck/constraints.go @@ -52,8 +52,8 @@ const ( ) var ( - FloatVectorMetrics = []string{metric.L2, metric.IP, metric.COSINE} // const - BinaryVectorMetrics = []string{metric.HAMMING, metric.JACCARD, metric.SUBSTRUCTURE, metric.SUPERSTRUCTURE} // const + FloatVectorMetrics = []string{metric.L2, metric.IP, metric.COSINE} // const + BinaryVectorMetrics = []string{metric.HAMMING, metric.JACCARD, metric.SUBSTRUCTURE, metric.SUPERSTRUCTURE, metric.MHJACCARD} // const ) // BinIDMapMetrics is a set of all metric types supported for binary vector. 
@@ -66,6 +66,7 @@ var ( supportDimPerSubQuantizer = []int{32, 28, 24, 20, 16, 12, 10, 8, 6, 4, 3, 2, 1} // const supportSubQuantizer = []int{96, 64, 56, 48, 40, 32, 28, 24, 20, 16, 12, 8, 4, 3, 2, 1} // const SparseMetrics = []string{metric.IP, metric.BM25} // const + DeduplicateMetrics = []string{metric.MHJACCARD} // const ) const ( diff --git a/internal/util/indexparamcheck/utils.go b/internal/util/indexparamcheck/utils.go index 2717436e51..80335aae9c 100644 --- a/internal/util/indexparamcheck/utils.go +++ b/internal/util/indexparamcheck/utils.go @@ -93,6 +93,7 @@ func CheckAutoIndexConfig() { autoIndexCfg := &paramtable.Get().AutoIndexConfig CheckAutoIndexHelper(autoIndexCfg.IndexParams.Key, autoIndexCfg.IndexParams.GetAsJSONMap(), schemapb.DataType_FloatVector) CheckAutoIndexHelper(autoIndexCfg.BinaryIndexParams.Key, autoIndexCfg.BinaryIndexParams.GetAsJSONMap(), schemapb.DataType_BinaryVector) + CheckAutoIndexHelper(autoIndexCfg.BinaryIndexParams.Key, autoIndexCfg.DeduplicateIndexParams.GetAsJSONMap(), schemapb.DataType_BinaryVector) CheckAutoIndexHelper(autoIndexCfg.SparseIndexParams.Key, autoIndexCfg.SparseIndexParams.GetAsJSONMap(), schemapb.DataType_SparseFloatVector) } diff --git a/pkg/util/metric/metric_type.go b/pkg/util/metric/metric_type.go index b2b952e436..3b764d24c2 100644 --- a/pkg/util/metric/metric_type.go +++ b/pkg/util/metric/metric_type.go @@ -31,6 +31,9 @@ const ( // JACCARD represents jaccard distance JACCARD MetricType = "JACCARD" + // MHJACCARD represents jaccard distance of minhash vector + MHJACCARD MetricType = "MHJACCARD" + // SUBSTRUCTURE represents substructure distance SUBSTRUCTURE MetricType = "SUBSTRUCTURE" diff --git a/pkg/util/metric/similarity_corelation.go b/pkg/util/metric/similarity_corelation.go index f34a5e3141..0ba41d6483 100644 --- a/pkg/util/metric/similarity_corelation.go +++ b/pkg/util/metric/similarity_corelation.go @@ -21,5 +21,5 @@ import "strings" // PositivelyRelated return if metricType are "ip" or "IP" func
PositivelyRelated(metricType string) bool { mUpper := strings.ToUpper(metricType) - return mUpper == strings.ToUpper(IP) || mUpper == strings.ToUpper(COSINE) || mUpper == strings.ToUpper(BM25) + return mUpper == strings.ToUpper(IP) || mUpper == strings.ToUpper(COSINE) || mUpper == strings.ToUpper(BM25) || mUpper == strings.ToUpper(MHJACCARD) } diff --git a/pkg/util/paramtable/autoindex_param.go b/pkg/util/paramtable/autoindex_param.go index 971ed1552d..933a20acf3 100644 --- a/pkg/util/paramtable/autoindex_param.go +++ b/pkg/util/paramtable/autoindex_param.go @@ -35,16 +35,18 @@ type AutoIndexConfig struct { EnableOptimize ParamItem `refreshable:"true"` EnableResultLimitCheck ParamItem `refreshable:"true"` - IndexParams ParamItem `refreshable:"true"` - SparseIndexParams ParamItem `refreshable:"true"` - BinaryIndexParams ParamItem `refreshable:"true"` - PrepareParams ParamItem `refreshable:"true"` - LoadAdaptParams ParamItem `refreshable:"true"` - ExtraParams ParamItem `refreshable:"true"` - IndexType ParamItem `refreshable:"true"` - AutoIndexTypeName ParamItem `refreshable:"true"` - AutoIndexSearchConfig ParamItem `refreshable:"true"` - AutoIndexTuningConfig ParamGroup `refreshable:"true"` + IndexParams ParamItem `refreshable:"true"` + SparseIndexParams ParamItem `refreshable:"true"` + BinaryIndexParams ParamItem `refreshable:"true"` + DeduplicateIndexParams ParamItem `refreshable:"true"` + EnableDeduplicateIndex ParamItem `refreshable:"true"` + PrepareParams ParamItem `refreshable:"true"` + LoadAdaptParams ParamItem `refreshable:"true"` + ExtraParams ParamItem `refreshable:"true"` + IndexType ParamItem `refreshable:"true"` + AutoIndexTypeName ParamItem `refreshable:"true"` + AutoIndexSearchConfig ParamItem `refreshable:"true"` + AutoIndexTuningConfig ParamGroup `refreshable:"true"` ScalarAutoIndexEnable ParamItem `refreshable:"true"` ScalarAutoIndexParams ParamItem `refreshable:"true"` @@ -108,6 +110,23 @@ func (p *AutoIndexConfig) init(base *BaseTable) { } 
p.BinaryIndexParams.Init(base.mgr) + p.DeduplicateIndexParams = ParamItem{ + Key: "autoIndex.params.deduplicate.build", + Version: "2.5.18", + DefaultValue: `{"index_type": "MINHASH_LSH", "metric_type": "MHJACCARD"}`, + Formatter: GetBuildParamFormatter(BinaryVectorDefaultMetricType, "autoIndex.params.deduplicate.build"), + Export: true, + } + p.DeduplicateIndexParams.Init(base.mgr) + + p.EnableDeduplicateIndex = ParamItem{ + Key: "autoIndex.params.deduplicate.enable", + Version: "2.5.18", + DefaultValue: "false", + PanicIfEmpty: false, + } + p.EnableDeduplicateIndex.Init(base.mgr) + p.PrepareParams = ParamItem{ Key: "autoIndex.params.prepare", Version: "2.3.2",