milvus/internal/util/indexparamcheck/ngram_index_checker.go
Spade A 26ec841feb
feat: optimize Like query with n-gram (#41803)
Ref #42053

This is the first PR for optimizing `LIKE` queries with an n-gram inverted
index. Currently, only the VARCHAR data type and InnerMatch LIKE (`%xxx%`)
queries are supported.


How to use it:
```
milvus_client = MilvusClient("http://localhost:19530")
schema = milvus_client.create_schema()
...
schema.add_field("content_ngram", DataType.VARCHAR, max_length=10000)
...
index_params = milvus_client.prepare_index_params()
index_params.add_index(field_name="content_ngram", index_type="NGRAM", index_name="ngram_index", min_gram=2, max_gram=3)
milvus_client.create_collection(COLLECTION_NAME, ...)
```

min_gram and max_gram control how the documents are tokenized. For
example, with min_gram=2 and max_gram=4, each document is tokenized
into 2-grams, 3-grams and 4-grams.

---------

Signed-off-by: SpadeA <tangchenjie1210@gmail.com>
Signed-off-by: SpadeA-Tang <tangchenjie1210@gmail.com>
2025-07-01 10:08:44 +08:00

61 lines
1.8 KiB
Go

package indexparamcheck
import (
"fmt"
"strconv"
"github.com/milvus-io/milvus-proto/go-api/v2/schemapb"
"github.com/milvus-io/milvus/pkg/v2/util/merr"
"github.com/milvus-io/milvus/pkg/v2/util/typeutil"
)
// Index parameter keys read by NgramIndexChecker.CheckTrain.
const (
// MinGramKey is the params key holding the smallest n-gram length.
MinGramKey = "min_gram"
// MaxGramKey is the params key holding the largest n-gram length.
MaxGramKey = "max_gram"
)
// NgramIndexChecker validates the field type and the min_gram/max_gram
// parameters supplied when creating an n-gram index.
type NgramIndexChecker struct {
// Embedded so CheckTrain can delegate to the base checker once the
// n-gram specific validation has passed.
scalarIndexChecker
}
// newNgramIndexChecker returns a pointer to a zero-value NgramIndexChecker.
func newNgramIndexChecker() *NgramIndexChecker {
	return new(NgramIndexChecker)
}
// CheckTrain validates the parameters for building an n-gram index.
//
// The checks run in this order, and the first failure is returned as a
// parameter-invalid error:
//  1. dataType must be schemapb.DataType_VarChar.
//  2. params must contain both MinGramKey and MaxGramKey.
//  3. The min_gram value must parse as an integer, then the max_gram value.
//  4. Both values must be positive and min_gram must not exceed max_gram.
//
// When every check passes, the result of the embedded
// scalarIndexChecker.CheckTrain is returned.
func (c *NgramIndexChecker) CheckTrain(dataType schemapb.DataType, params map[string]string) error {
	// todo(SpadeA): we may support it for json in the future
	if dataType != schemapb.DataType_VarChar {
		return merr.WrapErrParameterInvalidMsg("Ngram index can only be created on VARCHAR field")
	}

	rawMin, hasMin := params[MinGramKey]
	rawMax, hasMax := params[MaxGramKey]
	if !(hasMin && hasMax) {
		return merr.WrapErrParameterInvalidMsg("Ngram index must specify both min_gram and max_gram")
	}

	// min_gram is parsed first so that its error wins when both are malformed.
	lower, parseErr := strconv.Atoi(rawMin)
	if parseErr != nil {
		return merr.WrapErrParameterInvalidMsg("min_gram for Ngram index must be an integer, got: %s", rawMin)
	}
	upper, parseErr := strconv.Atoi(rawMax)
	if parseErr != nil {
		return merr.WrapErrParameterInvalidMsg("max_gram for Ngram index must be an integer, got: %s", rawMax)
	}

	rangeOK := lower > 0 && upper > 0 && lower <= upper
	if !rangeOK {
		return merr.WrapErrParameterInvalidMsg("invalid min_gram or max_gram value for Ngram index, min_gram: %d, max_gram: %d", lower, upper)
	}

	return c.scalarIndexChecker.CheckTrain(dataType, params)
}
// CheckValidDataType reports whether an n-gram index may be created on field.
// It returns nil when typeutil.IsStringType accepts the field's data type and
// an error otherwise. The indexType argument is not consulted.
//
// NOTE(review): CheckTrain only accepts schemapb.DataType_VarChar; whether
// IsStringType admits additional string-like types is not visible here, so
// confirm the two checks are meant to differ.
func (c *NgramIndexChecker) CheckValidDataType(indexType IndexType, field *schemapb.FieldSchema) error {
	if typeutil.IsStringType(field.GetDataType()) {
		return nil
	}
	return fmt.Errorf("ngram index can only be created on VARCHAR field")
}