enhance: [2.5] minhash support and add autoindex config (#44015)

master pr: https://github.com/milvus-io/milvus/pull/44186

Signed-off-by: cqy123456 <qianya.cheng@zilliz.com>
This commit is contained in:
cqy123456 2025-09-03 17:39:54 +08:00 committed by GitHub
parent d658b6f50a
commit c17ce3cf90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 66 additions and 22 deletions

View File

@ -32,6 +32,7 @@ const (
SUBSTRUCTURE MetricType = "SUBSTRUCTURE" SUBSTRUCTURE MetricType = "SUBSTRUCTURE"
SUPERSTRUCTURE MetricType = "SUPERSTRUCTURE" SUPERSTRUCTURE MetricType = "SUPERSTRUCTURE"
BM25 MetricType = "BM25" BM25 MetricType = "BM25"
MHJACCARD MetricType = "MHJACCARD"
) )
// CompactionState enum type for compaction state // CompactionState enum type for compaction state

View File

@ -178,7 +178,8 @@ inline bool
PositivelyRelated(const knowhere::MetricType& metric_type) { PositivelyRelated(const knowhere::MetricType& metric_type) {
return IsMetricType(metric_type, knowhere::metric::IP) || return IsMetricType(metric_type, knowhere::metric::IP) ||
IsMetricType(metric_type, knowhere::metric::COSINE) || IsMetricType(metric_type, knowhere::metric::COSINE) ||
IsMetricType(metric_type, knowhere::metric::BM25); IsMetricType(metric_type, knowhere::metric::BM25) ||
IsMetricType(metric_type, knowhere::metric::MHJACCARD);
} }
inline std::string inline std::string

View File

@ -272,11 +272,22 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error {
indexParamsMap[k] = v indexParamsMap[k] = v
} }
} else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) { } else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) {
if metricTypeExist && funcutil.SliceContain(indexparamcheck.DeduplicateMetrics, metricType) {
if !Params.AutoIndexConfig.EnableDeduplicateIndex.GetAsBool() {
log.Ctx(ctx).Warn("Deduplicate index is not enabled, but metric type is deduplicate.")
return merr.WrapErrParameterInvalidMsg("Deduplicate index is not enabled, but metric type is deduplicate.")
}
// override binary vector index params by autoindex deduplicate params
for k, v := range Params.AutoIndexConfig.DeduplicateIndexParams.GetAsJSONMap() {
indexParamsMap[k] = v
}
} else {
// override binary vector index params by autoindex // override binary vector index params by autoindex
for k, v := range Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() { for k, v := range Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() {
indexParamsMap[k] = v indexParamsMap[k] = v
} }
} }
}
if metricTypeExist { if metricTypeExist {
// make the users' metric type first class citizen. // make the users' metric type first class citizen.
@ -292,6 +303,7 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error {
} }
log.Ctx(ctx).Info("AutoIndex triggered", fields...) log.Ctx(ctx).Info("AutoIndex triggered", fields...)
} }
metricType, metricTypeExist := indexParamsMap[common.MetricTypeKey]
handle := func(numberParams int, autoIndexConfig map[string]string) error { handle := func(numberParams int, autoIndexConfig map[string]string) error {
// empty case. // empty case.
@ -303,8 +315,6 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error {
return nil return nil
} }
metricType, metricTypeExist := indexParamsMap[common.MetricTypeKey]
if len(indexParamsMap) > numberParams+1 { if len(indexParamsMap) > numberParams+1 {
return errors.New("only metric type can be passed when use AutoIndex") return errors.New("only metric type can be passed when use AutoIndex")
} }
@ -333,9 +343,17 @@ func (cit *createIndexTask) parseIndexParams(ctx context.Context) error {
// override sparse float vector index params by autoindex // override sparse float vector index params by autoindex
config = Params.AutoIndexConfig.SparseIndexParams.GetAsJSONMap() config = Params.AutoIndexConfig.SparseIndexParams.GetAsJSONMap()
} else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) { } else if typeutil.IsBinaryVectorType(cit.fieldSchema.DataType) {
if metricTypeExist && funcutil.SliceContain(indexparamcheck.DeduplicateMetrics, metricType) {
if !Params.AutoIndexConfig.EnableDeduplicateIndex.GetAsBool() {
log.Ctx(ctx).Warn("Deduplicate index is not enabled, but metric type is deduplicate.")
return merr.WrapErrParameterInvalidMsg("Deduplicate index is not enabled, but metric type is deduplicate.")
}
config = Params.AutoIndexConfig.DeduplicateIndexParams.GetAsJSONMap()
} else {
// override binary vector index params by autoindex // override binary vector index params by autoindex
config = Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap() config = Params.AutoIndexConfig.BinaryIndexParams.GetAsJSONMap()
} }
}
if !exist { if !exist {
if err := handle(0, config); err != nil { if err := handle(0, config); err != nil {
return err return err

View File

@ -710,7 +710,7 @@ func validateMetricType(dataType schemapb.DataType, metricTypeStrRaw string) err
if typeutil.IsFloatVectorType(dataType) { if typeutil.IsFloatVectorType(dataType) {
return nil return nil
} }
case metric.JACCARD, metric.HAMMING, metric.SUBSTRUCTURE, metric.SUPERSTRUCTURE: case metric.JACCARD, metric.HAMMING, metric.SUBSTRUCTURE, metric.SUPERSTRUCTURE, metric.MHJACCARD:
if dataType == schemapb.DataType_BinaryVector { if dataType == schemapb.DataType_BinaryVector {
return nil return nil
} }

View File

@ -53,7 +53,7 @@ const (
var ( var (
FloatVectorMetrics = []string{metric.L2, metric.IP, metric.COSINE} // const FloatVectorMetrics = []string{metric.L2, metric.IP, metric.COSINE} // const
BinaryVectorMetrics = []string{metric.HAMMING, metric.JACCARD, metric.SUBSTRUCTURE, metric.SUPERSTRUCTURE} // const BinaryVectorMetrics = []string{metric.HAMMING, metric.JACCARD, metric.SUBSTRUCTURE, metric.SUPERSTRUCTURE, metric.MHJACCARD} // const
) )
// BinIDMapMetrics is a set of all metric types supported for binary vector. // BinIDMapMetrics is a set of all metric types supported for binary vector.
@ -66,6 +66,7 @@ var (
supportDimPerSubQuantizer = []int{32, 28, 24, 20, 16, 12, 10, 8, 6, 4, 3, 2, 1} // const supportDimPerSubQuantizer = []int{32, 28, 24, 20, 16, 12, 10, 8, 6, 4, 3, 2, 1} // const
supportSubQuantizer = []int{96, 64, 56, 48, 40, 32, 28, 24, 20, 16, 12, 8, 4, 3, 2, 1} // const supportSubQuantizer = []int{96, 64, 56, 48, 40, 32, 28, 24, 20, 16, 12, 8, 4, 3, 2, 1} // const
SparseMetrics = []string{metric.IP, metric.BM25} // const SparseMetrics = []string{metric.IP, metric.BM25} // const
DeduplicateMetrics = []string{metric.MHJACCARD} // const
) )
const ( const (

View File

@ -93,6 +93,7 @@ func CheckAutoIndexConfig() {
autoIndexCfg := &paramtable.Get().AutoIndexConfig autoIndexCfg := &paramtable.Get().AutoIndexConfig
CheckAutoIndexHelper(autoIndexCfg.IndexParams.Key, autoIndexCfg.IndexParams.GetAsJSONMap(), schemapb.DataType_FloatVector) CheckAutoIndexHelper(autoIndexCfg.IndexParams.Key, autoIndexCfg.IndexParams.GetAsJSONMap(), schemapb.DataType_FloatVector)
CheckAutoIndexHelper(autoIndexCfg.BinaryIndexParams.Key, autoIndexCfg.BinaryIndexParams.GetAsJSONMap(), schemapb.DataType_BinaryVector) CheckAutoIndexHelper(autoIndexCfg.BinaryIndexParams.Key, autoIndexCfg.BinaryIndexParams.GetAsJSONMap(), schemapb.DataType_BinaryVector)
CheckAutoIndexHelper(autoIndexCfg.BinaryIndexParams.Key, autoIndexCfg.DeduplicateIndexParams.GetAsJSONMap(), schemapb.DataType_BinaryVector)
CheckAutoIndexHelper(autoIndexCfg.SparseIndexParams.Key, autoIndexCfg.SparseIndexParams.GetAsJSONMap(), schemapb.DataType_SparseFloatVector) CheckAutoIndexHelper(autoIndexCfg.SparseIndexParams.Key, autoIndexCfg.SparseIndexParams.GetAsJSONMap(), schemapb.DataType_SparseFloatVector)
} }

View File

@ -31,6 +31,9 @@ const (
// JACCARD represents jaccard distance // JACCARD represents jaccard distance
JACCARD MetricType = "JACCARD" JACCARD MetricType = "JACCARD"
// MHJACCARD represents jaccard distance of minhash vector
MHJACCARD MetricType = "MHJACCARD"
// SUBSTRUCTURE represents substructure distance // SUBSTRUCTURE represents substructure distance
SUBSTRUCTURE MetricType = "SUBSTRUCTURE" SUBSTRUCTURE MetricType = "SUBSTRUCTURE"

View File

@ -21,5 +21,5 @@ import "strings"
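// PositivelyRelated reports whether metricType matches, case-insensitively, one of the metric types compared in the return expression below. // PositivelyRelated reports whether metricType matches, case-insensitively, one of the metric types compared in the return expression below.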
func PositivelyRelated(metricType string) bool { func PositivelyRelated(metricType string) bool {
mUpper := strings.ToUpper(metricType) mUpper := strings.ToUpper(metricType)
return mUpper == strings.ToUpper(IP) || mUpper == strings.ToUpper(COSINE) || mUpper == strings.ToUpper(BM25) return mUpper == strings.ToUpper(IP) || mUpper == strings.ToUpper(COSINE) || mUpper == strings.ToUpper(BM25) || mUpper == strings.ToUpper(MHJACCARD)
} }

View File

@ -38,6 +38,8 @@ type AutoIndexConfig struct {
IndexParams ParamItem `refreshable:"true"` IndexParams ParamItem `refreshable:"true"`
SparseIndexParams ParamItem `refreshable:"true"` SparseIndexParams ParamItem `refreshable:"true"`
BinaryIndexParams ParamItem `refreshable:"true"` BinaryIndexParams ParamItem `refreshable:"true"`
DeduplicateIndexParams ParamItem `refreshable:"true"`
EnableDeduplicateIndex ParamItem `refreshable:"true"`
PrepareParams ParamItem `refreshable:"true"` PrepareParams ParamItem `refreshable:"true"`
LoadAdaptParams ParamItem `refreshable:"true"` LoadAdaptParams ParamItem `refreshable:"true"`
ExtraParams ParamItem `refreshable:"true"` ExtraParams ParamItem `refreshable:"true"`
@ -108,6 +110,23 @@ func (p *AutoIndexConfig) init(base *BaseTable) {
} }
p.BinaryIndexParams.Init(base.mgr) p.BinaryIndexParams.Init(base.mgr)
p.DeduplicateIndexParams = ParamItem{
Key: "autoIndex.params.deduplicate.build",
Version: "2.5.18",
DefaultValue: `{"index_type": "MINHASH_LSH", "metric_type": "MHJACCARD"}`,
Formatter: GetBuildParamFormatter(BinaryVectorDefaultMetricType, "autoIndex.params.deduplicate.build"),
Export: true,
}
p.DeduplicateIndexParams.Init(base.mgr)
p.EnableDeduplicateIndex = ParamItem{
Key: "autoIndex.params.deduplicate.enable",
Version: "2.5.18",
DefaultValue: "false",
PanicIfEmpty: false,
}
p.EnableDeduplicateIndex.Init(base.mgr)
p.PrepareParams = ParamItem{ p.PrepareParams = ParamItem{
Key: "autoIndex.params.prepare", Key: "autoIndex.params.prepare",
Version: "2.3.2", Version: "2.3.2",