mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
test: add phrase match test in go client (#39782)
/kind improvement --------- Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
parent
8788e591cd
commit
0d2d2c9064
@ -211,7 +211,12 @@ func GenText(lang string) string {
|
||||
object = chineseObjects[rand.Intn(len(chineseObjects))]
|
||||
return fmt.Sprintf("%s%s%s", topic, verb, object)
|
||||
default:
|
||||
return "Unsupported language"
|
||||
// Fallback to en for unsupported languages
|
||||
log.Warn("Unsupported language, fallback to English", zap.String("language", lang))
|
||||
topic = englishTopics[rand.Intn(len(englishTopics))]
|
||||
verb = englishVerbs[rand.Intn(len(englishVerbs))]
|
||||
object = englishObjects[rand.Intn(len(englishObjects))]
|
||||
return fmt.Sprintf("%s %s %s", topic, verb, object)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -51,6 +51,7 @@ type GenDataOption struct {
|
||||
elementType entity.FieldType
|
||||
fieldName string
|
||||
textLang string
|
||||
texts []string
|
||||
textEmptyPercent int
|
||||
}
|
||||
|
||||
@ -99,6 +100,11 @@ func (opt *GenDataOption) TWithTextLang(lang string) *GenDataOption {
|
||||
return opt
|
||||
}
|
||||
|
||||
func (opt *GenDataOption) TWithTextData(texts []string) *GenDataOption {
|
||||
opt.texts = texts
|
||||
return opt
|
||||
}
|
||||
|
||||
func (opt *GenDataOption) TWithTextEmptyPercent(percent int) *GenDataOption {
|
||||
opt.textEmptyPercent = percent
|
||||
return opt
|
||||
@ -362,11 +368,9 @@ func GenColumnData(nb int, fieldType entity.FieldType, option GenDataOption) col
|
||||
case "zh", "chinese":
|
||||
lang = "zh"
|
||||
default:
|
||||
// Fallback to sequential numbers for unsupported languages
|
||||
for i := start; i < start+nb; i++ {
|
||||
varcharValues = append(varcharValues, strconv.Itoa(i))
|
||||
}
|
||||
return column.NewColumnVarChar(fieldName, varcharValues)
|
||||
// Fallback to en for unsupported languages
|
||||
log.Warn("Unsupported language, fallback to English", zap.String("language", option.textLang))
|
||||
lang = "en"
|
||||
}
|
||||
|
||||
// Generate text data with empty values based on textEmptyPercent
|
||||
@ -383,6 +387,12 @@ func GenColumnData(nb int, fieldType entity.FieldType, option GenDataOption) col
|
||||
varcharValues = append(varcharValues, strconv.Itoa(i))
|
||||
}
|
||||
}
|
||||
if len(option.texts) > 0 {
|
||||
// Replace part of varcharValues with texts from option
|
||||
for i := 0; i < len(option.texts) && i < len(varcharValues); i++ {
|
||||
varcharValues[i] = option.texts[i]
|
||||
}
|
||||
}
|
||||
return column.NewColumnVarChar(fieldName, varcharValues)
|
||||
|
||||
case entity.FieldTypeArray:
|
||||
|
||||
@ -360,7 +360,7 @@ type FieldsFullTextSearch struct{}
|
||||
|
||||
func (cf FieldsFullTextSearch) GenFields(option GenFieldsOption) []*entity.Field {
|
||||
pkField := entity.NewField().WithName(GetFieldNameByFieldType(entity.FieldTypeInt64)).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true)
|
||||
textField := entity.NewField().WithName(common.DefaultTextFieldName).WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey).WithEnableAnalyzer(true).WithAnalyzerParams(option.AnalyzerParams)
|
||||
textField := entity.NewField().WithName(common.DefaultTextFieldName).WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey).WithEnableAnalyzer(true).WithAnalyzerParams(option.AnalyzerParams).WithEnableMatch(true)
|
||||
sparseVecField := entity.NewField().WithName(common.DefaultTextSparseVecFieldName).WithDataType(entity.FieldTypeSparseVector)
|
||||
if option.AutoID {
|
||||
pkField.WithIsAutoID(option.AutoID)
|
||||
|
||||
167
tests/go_client/testcases/phrase_match_test.go
Normal file
167
tests/go_client/testcases/phrase_match_test.go
Normal file
@ -0,0 +1,167 @@
|
||||
package testcases
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/milvus-io/milvus/client/v2/entity"
|
||||
"github.com/milvus-io/milvus/client/v2/index"
|
||||
"github.com/milvus-io/milvus/client/v2/milvusclient"
|
||||
"github.com/milvus-io/milvus/tests/go_client/common"
|
||||
hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper"
|
||||
)
|
||||
|
||||
// TestPhraseMatchDefault tests basic phrase match functionality with slop=0
|
||||
func TestPhraseMatchDefault(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create -> insert -> flush -> index -> load
|
||||
analyzerParams := map[string]any{"tokenizer": "standard"}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
query := common.GenText(common.DefaultTextLang)
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang).TWithTextData([]string{query})
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// Test exact phrase match (slop=0)
|
||||
|
||||
expr := fmt.Sprintf("phrase_match(%s, \"%s\", 0)", common.DefaultTextFieldName, query)
|
||||
queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr))
|
||||
common.CheckErr(t, err, true)
|
||||
// Results may vary as we're using auto-generated data, but it should >= 1, since query text has been inserted
|
||||
require.GreaterOrEqual(t, queryRes.ResultCount, 1)
|
||||
}
|
||||
|
||||
// TestPhraseMatchWithSlop tests phrase match with different slop values
|
||||
func TestPhraseMatchWithSlop(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create -> insert -> flush -> index -> load
|
||||
analyzerParams := map[string]any{"tokenizer": "standard"}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
|
||||
// Insert test data with varying distances between words
|
||||
query := common.GenText(common.DefaultTextLang)
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang).TWithTextData([]string{query})
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// Test cases with different slop values
|
||||
testCases := []struct {
|
||||
name string
|
||||
slop int
|
||||
}{
|
||||
{"ExactMatch", 0}, // Matches only exact phrase
|
||||
{"SmallSlop", 1}, // Matches phrases with 1 word between
|
||||
{"MediumSlop", 2}, // Matches phrases with 2 words between
|
||||
{"LargeSlop", 3}, // Matches phrases with up to 3 words between
|
||||
{"VeryLargeSlop", math.MaxUint32}, // Matches phrases with up to max u32 words between
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
expr := fmt.Sprintf("phrase_match(%s, \"%s\", %d)", common.DefaultTextFieldName, query, tc.slop)
|
||||
queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr))
|
||||
common.CheckErr(t, err, true)
|
||||
require.GreaterOrEqual(t, queryRes.ResultCount, 1)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestPhraseMatchWithDiffLang tests phrase match with different languages
|
||||
func TestPhraseMatchWithDiffLang(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// Test cases for different languages and analyzers
|
||||
testCases := []struct {
|
||||
name string
|
||||
language string
|
||||
analyzer string
|
||||
slop int
|
||||
}{
|
||||
{
|
||||
name: "English_Standard",
|
||||
language: common.English,
|
||||
analyzer: "standard",
|
||||
slop: 3,
|
||||
},
|
||||
{
|
||||
name: "Chinese_Jieba",
|
||||
language: common.Chinese,
|
||||
analyzer: "jieba",
|
||||
slop: 3,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
analyzerParams := map[string]any{"tokenizer": tc.analyzer}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
query := common.GenText(tc.language)
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(tc.language).TWithTextData([]string{query})
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
expr := fmt.Sprintf("phrase_match(%s, \"%s\", %d)", common.DefaultTextFieldName, query, tc.slop)
|
||||
queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr))
|
||||
common.CheckErr(t, err, true)
|
||||
require.GreaterOrEqual(t, queryRes.ResultCount, 1)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestPhraseMatchWithEmptyData tests phrase match with empty data
|
||||
func TestPhraseMatchWithEmptyData(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create -> insert -> flush -> index -> load
|
||||
analyzerParams := map[string]any{"tokenizer": "standard"}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang).TWithTextEmptyPercent(100)
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// Test phrase match with empty data
|
||||
query := common.GenText(common.DefaultTextLang)
|
||||
expr := fmt.Sprintf("phrase_match(%s, \"%s\", 0)", common.DefaultTextFieldName, query)
|
||||
queryRes, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter(expr))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, 0, queryRes.ResultCount)
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user