test: add phrase match test in go client (#39782)

/kind improvement

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
zhuwenxing 2025-03-26 15:32:21 +08:00 committed by GitHub
parent 8788e591cd
commit 0d2d2c9064
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 189 additions and 7 deletions

View File

@ -211,7 +211,12 @@ func GenText(lang string) string {
object = chineseObjects[rand.Intn(len(chineseObjects))]
return fmt.Sprintf("%s%s%s", topic, verb, object)
default:
return "Unsupported language"
// Fallback to en for unsupported languages
log.Warn("Unsupported language, fallback to English", zap.String("language", lang))
topic = englishTopics[rand.Intn(len(englishTopics))]
verb = englishVerbs[rand.Intn(len(englishVerbs))]
object = englishObjects[rand.Intn(len(englishObjects))]
return fmt.Sprintf("%s %s %s", topic, verb, object)
}
}

View File

@ -51,6 +51,7 @@ type GenDataOption struct {
elementType entity.FieldType
fieldName string
textLang string
texts []string
textEmptyPercent int
}
@ -99,6 +100,11 @@ func (opt *GenDataOption) TWithTextLang(lang string) *GenDataOption {
return opt
}
// TWithTextData stores caller-supplied texts on the option and returns the
// same option so calls can be chained. The slice is kept as-is (not copied).
// In the VarChar text branch of GenColumnData, these texts overwrite the
// leading generated values, one per index, up to the shorter of the two lengths.
func (opt *GenDataOption) TWithTextData(texts []string) *GenDataOption {
opt.texts = texts
return opt
}
func (opt *GenDataOption) TWithTextEmptyPercent(percent int) *GenDataOption {
opt.textEmptyPercent = percent
return opt
@ -362,11 +368,9 @@ func GenColumnData(nb int, fieldType entity.FieldType, option GenDataOption) col
case "zh", "chinese":
lang = "zh"
default:
// Fallback to sequential numbers for unsupported languages
for i := start; i < start+nb; i++ {
varcharValues = append(varcharValues, strconv.Itoa(i))
}
return column.NewColumnVarChar(fieldName, varcharValues)
// Fallback to en for unsupported languages
log.Warn("Unsupported language, fallback to English", zap.String("language", option.textLang))
lang = "en"
}
// Generate text data with empty values based on textEmptyPercent
@ -383,6 +387,12 @@ func GenColumnData(nb int, fieldType entity.FieldType, option GenDataOption) col
varcharValues = append(varcharValues, strconv.Itoa(i))
}
}
if len(option.texts) > 0 {
// Replace part of varcharValues with texts from option
for i := 0; i < len(option.texts) && i < len(varcharValues); i++ {
varcharValues[i] = option.texts[i]
}
}
return column.NewColumnVarChar(fieldName, varcharValues)
case entity.FieldTypeArray:

View File

@ -360,7 +360,7 @@ type FieldsFullTextSearch struct{}
func (cf FieldsFullTextSearch) GenFields(option GenFieldsOption) []*entity.Field {
pkField := entity.NewField().WithName(GetFieldNameByFieldType(entity.FieldTypeInt64)).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true)
textField := entity.NewField().WithName(common.DefaultTextFieldName).WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey).WithEnableAnalyzer(true).WithAnalyzerParams(option.AnalyzerParams)
textField := entity.NewField().WithName(common.DefaultTextFieldName).WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey).WithEnableAnalyzer(true).WithAnalyzerParams(option.AnalyzerParams).WithEnableMatch(true)
sparseVecField := entity.NewField().WithName(common.DefaultTextSparseVecFieldName).WithDataType(entity.FieldTypeSparseVector)
if option.AutoID {
pkField.WithIsAutoID(option.AutoID)

View File

@ -0,0 +1,167 @@
package testcases
import (
"fmt"
"math"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/milvus-io/milvus/client/v2/entity"
"github.com/milvus-io/milvus/client/v2/index"
"github.com/milvus-io/milvus/client/v2/milvusclient"
"github.com/milvus-io/milvus/tests/go_client/common"
hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper"
)
// TestPhraseMatchDefault checks that an exact phrase_match filter (slop=0)
// finds at least the row whose text was explicitly written into the collection.
func TestPhraseMatchDefault(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Step 1: create the collection with a "standard" tokenizer on the text
	// field and a BM25 function feeding the sparse vector field.
	fieldOpts := hp.TNewFieldsOption().TWithAnalyzerParams(map[string]any{"tokenizer": "standard"})
	bm25Fn := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
	schemaOpts := hp.TNewSchemaOption().TWithFunction(bm25Fn)
	createParams := hp.NewCreateCollectionParams(hp.FullTextSearch)
	collPrep, collSchema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldOpts, schemaOpts)

	// Step 2: insert generated rows, planting the phrase we will search for.
	phrase := common.GenText(common.DefaultTextLang)
	dataOpts := hp.TNewDataOption().
		TWithTextLang(common.DefaultTextLang).
		TWithTextData([]string{phrase})
	insertParams := hp.NewInsertParams(collSchema)
	collPrep.InsertData(ctx, t, mc, insertParams, dataOpts)
	collPrep.FlushData(ctx, t, mc, collSchema.CollectionName)

	// Step 3: index the sparse vector field, then load the collection.
	sparseIndexes := map[string]index.Index{
		common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1),
	}
	idxParams := hp.TNewIndexParams(collSchema).TWithFieldIndex(sparseIndexes)
	collPrep.CreateIndex(ctx, t, mc, idxParams)
	loadParams := hp.NewLoadParams(collSchema.CollectionName)
	collPrep.Load(ctx, t, mc, loadParams)

	// Step 4: query with an exact phrase match (slop=0).
	filter := fmt.Sprintf("phrase_match(%s, \"%s\", 0)", common.DefaultTextFieldName, phrase)
	queryOpt := milvusclient.NewQueryOption(collSchema.CollectionName).WithFilter(filter)
	res, err := mc.Query(ctx, queryOpt)
	common.CheckErr(t, err, true)

	// The remaining rows are randomly generated and may also match, so only a
	// lower bound is asserted: the planted phrase must produce at least one hit.
	require.GreaterOrEqual(t, res.ResultCount, 1)
}
// TestPhraseMatchWithSlop runs phrase_match against the same planted phrase
// with a range of slop values, from 0 up to the maximum uint32, and expects
// every query to return at least one row.
func TestPhraseMatchWithSlop(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create -> insert -> flush -> index -> load
	analyzerParams := map[string]any{"tokenizer": "standard"}
	fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams)
	function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
	schemaOption := hp.TNewSchemaOption().TWithFunction(function)
	prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)

	// Insert generated rows and plant the exact query phrase among them, so
	// that at least one row is known to contain the phrase verbatim.
	query := common.GenText(common.DefaultTextLang)
	insertOption := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang).TWithTextData([]string{query})
	prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
	prepare.FlushData(ctx, t, mc, schema.CollectionName)

	indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
	prepare.CreateIndex(ctx, t, mc, indexparams)
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// Slop values under test. slop is uint32 rather than int: math.MaxUint32
	// does not fit in int on 32-bit platforms and would fail to compile there.
	testCases := []struct {
		name string
		slop uint32
	}{
		{"ExactMatch", 0},                 // exact phrase only
		{"SmallSlop", 1},                  // slop of 1
		{"MediumSlop", 2},                 // slop of 2
		{"LargeSlop", 3},                  // slop of 3
		{"VeryLargeSlop", math.MaxUint32}, // largest uint32 slop value
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			expr := fmt.Sprintf("phrase_match(%s, \"%s\", %d)", common.DefaultTextFieldName, query, tc.slop)
			queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr))
			common.CheckErr(t, err, true)
			// The verbatim phrase was inserted, so every slop value is
			// expected to return at least that row.
			require.GreaterOrEqual(t, queryRes.ResultCount, 1)
		})
	}
}
// TestPhraseMatchWithDiffLang exercises phrase_match for several
// language/tokenizer pairings. Each subtest builds its own collection, plants
// a phrase generated in that language, and expects the query to find it.
func TestPhraseMatchWithDiffLang(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// One entry per language/tokenizer combination under test.
	scenarios := []struct {
		label     string
		lang      string
		tokenizer string
		slop      int
	}{
		{label: "English_Standard", lang: common.English, tokenizer: "standard", slop: 3},
		{label: "Chinese_Jieba", lang: common.Chinese, tokenizer: "jieba", slop: 3},
	}

	for i := range scenarios {
		sc := scenarios[i]
		t.Run(sc.label, func(t *testing.T) {
			// Create a collection whose text field uses this scenario's tokenizer.
			fieldOpts := hp.TNewFieldsOption().TWithAnalyzerParams(map[string]any{"tokenizer": sc.tokenizer})
			bm25Fn := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
			schemaOpts := hp.TNewSchemaOption().TWithFunction(bm25Fn)
			createParams := hp.NewCreateCollectionParams(hp.FullTextSearch)
			collPrep, collSchema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldOpts, schemaOpts)

			// Insert rows in the scenario's language, planting the search phrase.
			phrase := common.GenText(sc.lang)
			dataOpts := hp.TNewDataOption().
				TWithTextLang(sc.lang).
				TWithTextData([]string{phrase})
			insertParams := hp.NewInsertParams(collSchema)
			collPrep.InsertData(ctx, t, mc, insertParams, dataOpts)
			collPrep.FlushData(ctx, t, mc, collSchema.CollectionName)

			// Index the sparse vector field and load the collection.
			sparseIndexes := map[string]index.Index{
				common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1),
			}
			idxParams := hp.TNewIndexParams(collSchema).TWithFieldIndex(sparseIndexes)
			collPrep.CreateIndex(ctx, t, mc, idxParams)
			loadParams := hp.NewLoadParams(collSchema.CollectionName)
			collPrep.Load(ctx, t, mc, loadParams)

			// The planted phrase must be found with the scenario's slop.
			filter := fmt.Sprintf("phrase_match(%s, \"%s\", %d)", common.DefaultTextFieldName, phrase, sc.slop)
			queryOpt := milvusclient.NewQueryOption(collSchema.CollectionName).WithFilter(filter)
			res, err := mc.Query(ctx, queryOpt)
			common.CheckErr(t, err, true)
			require.GreaterOrEqual(t, res.ResultCount, 1)
		})
	}
}
// TestPhraseMatchWithEmptyData inserts rows with the text-empty percentage set
// to 100 and checks that a phrase_match query over them returns no rows.
func TestPhraseMatchWithEmptyData(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Create the collection with a "standard" tokenizer and a BM25 function.
	fieldOpts := hp.TNewFieldsOption().TWithAnalyzerParams(map[string]any{"tokenizer": "standard"})
	bm25Fn := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
	schemaOpts := hp.TNewSchemaOption().TWithFunction(bm25Fn)
	createParams := hp.NewCreateCollectionParams(hp.FullTextSearch)
	collPrep, collSchema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldOpts, schemaOpts)

	// Insert rows, asking the data generator for 100% empty text values.
	dataOpts := hp.TNewDataOption().
		TWithTextLang(common.DefaultTextLang).
		TWithTextEmptyPercent(100)
	insertParams := hp.NewInsertParams(collSchema)
	collPrep.InsertData(ctx, t, mc, insertParams, dataOpts)
	collPrep.FlushData(ctx, t, mc, collSchema.CollectionName)

	// Index the sparse vector field and load the collection.
	sparseIndexes := map[string]index.Index{
		common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1),
	}
	idxParams := hp.TNewIndexParams(collSchema).TWithFieldIndex(sparseIndexes)
	collPrep.CreateIndex(ctx, t, mc, idxParams)
	loadParams := hp.NewLoadParams(collSchema.CollectionName)
	collPrep.Load(ctx, t, mc, loadParams)

	// A freshly generated phrase was never inserted, so nothing should match.
	phrase := common.GenText(common.DefaultTextLang)
	filter := fmt.Sprintf("phrase_match(%s, \"%s\", 0)", common.DefaultTextFieldName, phrase)
	queryOpt := milvusclient.NewQueryOption(collSchema.CollectionName).WithFilter(filter)
	res, err := mc.Query(ctx, queryOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, 0, res.ResultCount)
}