From 0d2d2c90648769412abe09bc27e1a4a2ce5369cd Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Wed, 26 Mar 2025 15:32:21 +0800 Subject: [PATCH] test: add phrase match test in go client (#39782) /kind improvement --------- Signed-off-by: zhuwenxing --- tests/go_client/common/utils.go | 7 +- .../go_client/testcases/helper/data_helper.go | 20 ++- .../testcases/helper/field_helper.go | 2 +- .../go_client/testcases/phrase_match_test.go | 167 ++++++++++++++++++ 4 files changed, 189 insertions(+), 7 deletions(-) create mode 100644 tests/go_client/testcases/phrase_match_test.go diff --git a/tests/go_client/common/utils.go b/tests/go_client/common/utils.go index df6957e5cf..7021c2c554 100644 --- a/tests/go_client/common/utils.go +++ b/tests/go_client/common/utils.go @@ -211,7 +211,12 @@ func GenText(lang string) string { object = chineseObjects[rand.Intn(len(chineseObjects))] return fmt.Sprintf("%s%s%s", topic, verb, object) default: - return "Unsupported language" + // Fallback to en for unsupported languages + log.Warn("Unsupported language, fallback to English", zap.String("language", lang)) + topic = englishTopics[rand.Intn(len(englishTopics))] + verb = englishVerbs[rand.Intn(len(englishVerbs))] + object = englishObjects[rand.Intn(len(englishObjects))] + return fmt.Sprintf("%s %s %s", topic, verb, object) } } diff --git a/tests/go_client/testcases/helper/data_helper.go b/tests/go_client/testcases/helper/data_helper.go index 0d9ffe4deb..b1c5637149 100644 --- a/tests/go_client/testcases/helper/data_helper.go +++ b/tests/go_client/testcases/helper/data_helper.go @@ -51,6 +51,7 @@ type GenDataOption struct { elementType entity.FieldType fieldName string textLang string + texts []string textEmptyPercent int } @@ -99,6 +100,11 @@ func (opt *GenDataOption) TWithTextLang(lang string) *GenDataOption { return opt } +func (opt *GenDataOption) TWithTextData(texts []string) *GenDataOption { + opt.texts = texts + return opt +} + func (opt *GenDataOption) TWithTextEmptyPercent(percent 
int) *GenDataOption { opt.textEmptyPercent = percent return opt @@ -362,11 +368,9 @@ func GenColumnData(nb int, fieldType entity.FieldType, option GenDataOption) col case "zh", "chinese": lang = "zh" default: - // Fallback to sequential numbers for unsupported languages - for i := start; i < start+nb; i++ { - varcharValues = append(varcharValues, strconv.Itoa(i)) - } - return column.NewColumnVarChar(fieldName, varcharValues) + // Fallback to en for unsupported languages + log.Warn("Unsupported language, fallback to English", zap.String("language", option.textLang)) + lang = "en" } // Generate text data with empty values based on textEmptyPercent @@ -383,6 +387,12 @@ func GenColumnData(nb int, fieldType entity.FieldType, option GenDataOption) col varcharValues = append(varcharValues, strconv.Itoa(i)) } } + if len(option.texts) > 0 { + // Replace part of varcharValues with texts from option + for i := 0; i < len(option.texts) && i < len(varcharValues); i++ { + varcharValues[i] = option.texts[i] + } + } return column.NewColumnVarChar(fieldName, varcharValues) case entity.FieldTypeArray: diff --git a/tests/go_client/testcases/helper/field_helper.go b/tests/go_client/testcases/helper/field_helper.go index e29cb62ac2..9153ffc08f 100644 --- a/tests/go_client/testcases/helper/field_helper.go +++ b/tests/go_client/testcases/helper/field_helper.go @@ -360,7 +360,7 @@ type FieldsFullTextSearch struct{} func (cf FieldsFullTextSearch) GenFields(option GenFieldsOption) []*entity.Field { pkField := entity.NewField().WithName(GetFieldNameByFieldType(entity.FieldTypeInt64)).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true) - textField := entity.NewField().WithName(common.DefaultTextFieldName).WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey).WithEnableAnalyzer(true).WithAnalyzerParams(option.AnalyzerParams) + textField := 
entity.NewField().WithName(common.DefaultTextFieldName).WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey).WithEnableAnalyzer(true).WithAnalyzerParams(option.AnalyzerParams).WithEnableMatch(true) sparseVecField := entity.NewField().WithName(common.DefaultTextSparseVecFieldName).WithDataType(entity.FieldTypeSparseVector) if option.AutoID { pkField.WithIsAutoID(option.AutoID) diff --git a/tests/go_client/testcases/phrase_match_test.go b/tests/go_client/testcases/phrase_match_test.go new file mode 100644 index 0000000000..708799cb22 --- /dev/null +++ b/tests/go_client/testcases/phrase_match_test.go @@ -0,0 +1,167 @@ +package testcases + +import ( + "fmt" + "math" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/milvus-io/milvus/client/v2/entity" + "github.com/milvus-io/milvus/client/v2/index" + "github.com/milvus-io/milvus/client/v2/milvusclient" + "github.com/milvus-io/milvus/tests/go_client/common" + hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper" +) + +// TestPhraseMatchDefault tests basic phrase match functionality with slop=0 +func TestPhraseMatchDefault(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create -> insert -> flush -> index -> load + analyzerParams := map[string]any{"tokenizer": "standard"} + fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams) + function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption) + query := common.GenText(common.DefaultTextLang) + insertOption := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang).TWithTextData([]string{query}) + prepare.InsertData(ctx, t, mc, 
hp.NewInsertParams(schema), insertOption) + prepare.FlushData(ctx, t, mc, schema.CollectionName) + + indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)}) + prepare.CreateIndex(ctx, t, mc, indexparams) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // Test exact phrase match (slop=0) + + expr := fmt.Sprintf("phrase_match(%s, \"%s\", 0)", common.DefaultTextFieldName, query) + queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr)) + common.CheckErr(t, err, true) + // Results may vary as we're using auto-generated data, but the count should be >= 1, since the query text itself has been inserted + require.GreaterOrEqual(t, queryRes.ResultCount, 1) +} + +// TestPhraseMatchWithSlop tests phrase match with different slop values +func TestPhraseMatchWithSlop(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create -> insert -> flush -> index -> load + analyzerParams := map[string]any{"tokenizer": "standard"} + fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams) + function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption) + + // Insert generated text data that includes the query phrase itself + query := common.GenText(common.DefaultTextLang) + insertOption := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang).TWithTextData([]string{query}) + prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption) + prepare.FlushData(ctx, t, mc, schema.CollectionName) + + indexparams := 
hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)}) + prepare.CreateIndex(ctx, t, mc, indexparams) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // Test cases with different slop values + testCases := []struct { + name string + slop int + }{ + {"ExactMatch", 0}, // Matches only exact phrase + {"SmallSlop", 1}, // Matches phrases with 1 word between + {"MediumSlop", 2}, // Matches phrases with 2 words between + {"LargeSlop", 3}, // Matches phrases with up to 3 words between + {"VeryLargeSlop", math.MaxUint32}, // Matches phrases with up to max u32 words between + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + expr := fmt.Sprintf("phrase_match(%s, \"%s\", %d)", common.DefaultTextFieldName, query, tc.slop) + queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr)) + common.CheckErr(t, err, true) + require.GreaterOrEqual(t, queryRes.ResultCount, 1) + }) + } +} + +// TestPhraseMatchWithDiffLang tests phrase match with different languages +func TestPhraseMatchWithDiffLang(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // Test cases for different languages and analyzers + testCases := []struct { + name string + language string + analyzer string + slop int + }{ + { + name: "English_Standard", + language: common.English, + analyzer: "standard", + slop: 3, + }, + { + name: "Chinese_Jieba", + language: common.Chinese, + analyzer: "jieba", + slop: 3, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + analyzerParams := map[string]any{"tokenizer": tc.analyzer} + fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams) + function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName) + schemaOption := 
hp.TNewSchemaOption().TWithFunction(function) + prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption) + query := common.GenText(tc.language) + insertOption := hp.TNewDataOption().TWithTextLang(tc.language).TWithTextData([]string{query}) + prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption) + prepare.FlushData(ctx, t, mc, schema.CollectionName) + + indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)}) + prepare.CreateIndex(ctx, t, mc, indexparams) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + expr := fmt.Sprintf("phrase_match(%s, \"%s\", %d)", common.DefaultTextFieldName, query, tc.slop) + queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr)) + common.CheckErr(t, err, true) + require.GreaterOrEqual(t, queryRes.ResultCount, 1) + }) + } +} + +// TestPhraseMatchWithEmptyData tests phrase match with empty data +func TestPhraseMatchWithEmptyData(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create -> insert -> flush -> index -> load + analyzerParams := map[string]any{"tokenizer": "standard"} + fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams) + function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption) + + insertOption := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang).TWithTextEmptyPercent(100) + prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption) + prepare.FlushData(ctx, t, mc, schema.CollectionName) + 
+ indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)}) + prepare.CreateIndex(ctx, t, mc, indexparams) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // Test phrase match with empty data + query := common.GenText(common.DefaultTextLang) + expr := fmt.Sprintf("phrase_match(%s, \"%s\", 0)", common.DefaultTextFieldName, query) + queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr)) + common.CheckErr(t, err, true) + require.Equal(t, 0, queryRes.ResultCount) +}