mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
test: add go-sdk cases for full text search (#39570)
/kind improvement --------- Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com> Signed-off-by: zhuwenxing <wxzhuyeah@gmail.com>
This commit is contained in:
parent
1f14053c70
commit
ee87e4d0b6
@ -1 +1,191 @@
|
||||
## go_client
|
||||
# Milvus Go Client Test Framework
|
||||
|
||||
## Overview
|
||||
This is a comprehensive test framework for the Milvus Go Client, designed to validate various functionalities of the Milvus vector database client. The framework provides a structured approach to writing tests with reusable components and helper functions.
|
||||
|
||||
## Framework Architecture
|
||||
|
||||
### Directory Structure
|
||||
```
|
||||
/go_client/
|
||||
├── testcases/ # Main test cases
|
||||
│ ├── helper/ # Helper functions and utilities
|
||||
│ │ ├── helper.go
|
||||
│ │ ├── data_helper.go
|
||||
│ │ └── collection_helper.go
|
||||
│ ├── search_test.go # Search functionality tests
|
||||
│ ├── index_test.go # Index management tests
|
||||
│ └── ...
|
||||
├── common/ # Common utilities and constants
|
||||
└── base/ # Base infrastructure code
|
||||
```
|
||||
|
||||
### Key Components
|
||||
- **Collection Preparation**: Utilities for creating and managing collections
|
||||
- **Data Generation**: Tools for generating test data
|
||||
- **Helper Functions**: Common operations and validations
|
||||
- **Test Cases**: Organized by functionality
|
||||
|
||||
## Writing Test Cases
|
||||
|
||||
### Basic Test Structure
|
||||
```go
|
||||
func TestYourFeature(t *testing.T) {
|
||||
// 1. Setup context and client
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := createDefaultMilvusClient(ctx, t)
|
||||
|
||||
// 2. Prepare collection
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(
|
||||
ctx, t, mc,
|
||||
hp.NewCreateCollectionParams(hp.Int64Vec),
|
||||
hp.TNewFieldsOption(),
|
||||
hp.TNewSchemaOption(),
|
||||
)
|
||||
|
||||
// 3. Insert test data
|
||||
prepare.InsertData(ctx, t, mc,
|
||||
hp.NewInsertParams(schema),
|
||||
hp.TNewDataOption(),
|
||||
)
|
||||
|
||||
// 4. Execute test operations
|
||||
// ... your test logic here ...
|
||||
|
||||
// 5. Validate results
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, expected, actual)
|
||||
}
|
||||
```
|
||||
|
||||
### Using Custom Parameters
|
||||
|
||||
1. **Collection Creation Parameters**
|
||||
```go
|
||||
fieldsOption := hp.TNewFieldsOption().
|
||||
TWithEnableAnalyzer(true).
|
||||
TWithAnalyzerParams(map[string]any{
|
||||
"tokenizer": "standard",
|
||||
})
|
||||
|
||||
schemaOption := hp.TNewSchemaOption().
|
||||
TWithEnableDynamicField(true).
|
||||
TWithDescription("Custom schema").
|
||||
TWithAutoID(false)
|
||||
```
|
||||
|
||||
2. **Data Insertion Options**
|
||||
```go
|
||||
insertOption := hp.TNewDataOption().
|
||||
TWithNb(1000). // Number of records
|
||||
TWithDim(128). // Vector dimension
|
||||
TWithStart(100). // Starting ID
|
||||
TWithMaxLen(256). // Maximum length
|
||||
TWithTextLang("en") // Text language
|
||||
```
|
||||
|
||||
3. **Index Parameters**
|
||||
```go
|
||||
indexParams := hp.TNewIndexParams(schema).
|
||||
TWithFieldIndex(map[string]index.Index{
|
||||
common.DefaultVectorFieldName: index.NewIVFSQIndex(
|
||||
&index.IVFSQConfig{
|
||||
MetricType: entity.L2,
|
||||
NList: 128,
|
||||
},
|
||||
),
|
||||
})
|
||||
```
|
||||
|
||||
4. **Search Parameters**
|
||||
```go
|
||||
searchOpt := milvusclient.NewSearchOption(schema.CollectionName, 100, vectors).
|
||||
WithOffset(0).
|
||||
WithLimit(100).
|
||||
WithConsistencyLevel(entity.ClStrong).
|
||||
WithFilter("int64 >= 100").
|
||||
WithOutputFields([]string{"*"}).
|
||||
WithSearchParams(map[string]any{
|
||||
"nprobe": 16,
|
||||
"ef": 64,
|
||||
})
|
||||
```
|
||||
|
||||
## Adding New Parameters
|
||||
|
||||
1. **Define New Option Type**
|
||||
```go
|
||||
// In helper/data_helper.go
|
||||
type YourNewOption struct {
|
||||
newParam1 string
|
||||
newParam2 int
|
||||
}
|
||||
```
|
||||
|
||||
2. **Add Constructor**
|
||||
```go
|
||||
func TNewYourOption() *YourNewOption {
|
||||
return &YourNewOption{
|
||||
newParam1: "default",
|
||||
newParam2: 0,
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
3. **Add Parameter Methods**
|
||||
```go
|
||||
func (opt *YourNewOption) TWithNewParam1(value string) *YourNewOption {
|
||||
opt.newParam1 = value
|
||||
return opt
|
||||
}
|
||||
|
||||
func (opt *YourNewOption) TWithNewParam2(value int) *YourNewOption {
|
||||
opt.newParam2 = value
|
||||
return opt
|
||||
}
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Test Organization**
|
||||
- Group related tests in the same file
|
||||
- Use clear and descriptive test names
|
||||
- Add comments explaining test purpose
|
||||
|
||||
2. **Data Generation**
|
||||
- Use helper functions for generating test data
|
||||
- Ensure data is appropriate for the test case
|
||||
- Clean up test data after use
|
||||
|
||||
3. **Error Handling**
|
||||
- Use `common.CheckErr` for consistent error checking
|
||||
- Test both success and failure scenarios
|
||||
- Validate error messages when appropriate
|
||||
|
||||
4. **Performance Considerations**
|
||||
- Use appropriate timeouts
|
||||
- Clean up resources after tests
|
||||
- Consider test execution time
|
||||
|
||||
## Running Tests
|
||||
|
||||
```bash
|
||||
# Run all tests
|
||||
go test ./testcases/...
|
||||
|
||||
# Run specific test
|
||||
go test -run TestYourFeature ./testcases/
|
||||
|
||||
# Run with verbose output
|
||||
go test -v ./testcases/...
|
||||
```
|
||||
|
||||
## Contributing
|
||||
1. Follow the existing code structure
|
||||
2. Add comprehensive test cases
|
||||
3. Document new parameters and options
|
||||
4. Update this README for significant changes
|
||||
5. Ensure code quality standards:
|
||||
- Run `golangci-lint run` to check for style mistakes
|
||||
- Use `gofmt -w your/code/path` to format your code before submitting
|
||||
- CI will verify both golint and go format compliance
|
||||
@ -4,33 +4,35 @@ import "github.com/milvus-io/milvus/client/v2/index"
|
||||
|
||||
// const default field names
|
||||
const (
	// Scalar field names.
	DefaultInt8FieldName    = "int8"
	DefaultInt16FieldName   = "int16"
	DefaultInt32FieldName   = "int32"
	DefaultInt64FieldName   = "int64"
	DefaultBoolFieldName    = "bool"
	DefaultFloatFieldName   = "float"
	DefaultDoubleFieldName  = "double"
	DefaultTextFieldName    = "text" // VarChar input field of the full text search collection
	DefaultVarcharFieldName = "varchar"
	DefaultJSONFieldName    = "json"
	DefaultArrayFieldName   = "array"

	// Vector field names.
	DefaultFloatVecFieldName      = "floatVec"
	DefaultBinaryVecFieldName     = "binaryVec"
	DefaultFloat16VecFieldName    = "fp16Vec"
	DefaultBFloat16VecFieldName   = "bf16Vec"
	DefaultTextSparseVecFieldName = "textSparseVec" // sparse vector output field of the BM25 function
	DefaultSparseVecFieldName     = "sparseVec"

	// Dynamic field names.
	DefaultDynamicNumberField = "dynamicNumber"
	DefaultDynamicStringField = "dynamicString"
	DefaultDynamicBoolField   = "dynamicBool"
	DefaultDynamicListField   = "dynamicList"

	// Array field names, one per element type.
	DefaultBoolArrayField    = "boolArray"
	DefaultInt8ArrayField    = "int8Array"
	DefaultInt16ArrayField   = "int16Array"
	DefaultInt32ArrayField   = "int32Array"
	DefaultInt64ArrayField   = "int64Array"
	DefaultFloatArrayField   = "floatArray"
	DefaultDoubleArrayField  = "doubleArray"
	DefaultVarcharArrayField = "varcharArray"
)
|
||||
|
||||
// const for test cases
|
||||
@ -86,3 +88,8 @@ const (
|
||||
DatabaseForceDenyReading = "database.force.deny.reading"
|
||||
DatabaseDiskQuotaMb = "database.diskQuota.mb"
|
||||
)
|
||||
|
||||
// DefaultTextLang is the language code the default full text search test
// uses for both generated text data and generated queries.
const DefaultTextLang = "en"
|
||||
|
||||
@ -153,3 +153,59 @@ var InvalidExpressions = []InvalidExprStruct{
|
||||
{Expr: fmt.Sprintf("%s[-1] > %d", DefaultInt8ArrayField, TestCapacity), ErrNil: false, ErrMsg: "cannot parse expression"}, // array[-1] >
|
||||
{Expr: fmt.Sprintf("%s[-1] > 1", DefaultJSONFieldName), ErrNil: false, ErrMsg: "invalid expression"}, // json[-1] >
|
||||
}
|
||||
|
||||
// Language constants for text generation.
const (
	English = "en"
	Chinese = "zh"
)

// GenText returns one randomly assembled "topic verb object" sentence in the
// requested language.
//
// lang may be the short code (English, Chinese) or the long name ("english",
// "chinese"); the long names are accepted so that the same language value can
// be used for data generation and query generation. English parts are joined
// with single spaces, Chinese parts are concatenated without separators.
// Any other value yields the literal string "Unsupported language".
func GenText(lang string) string {
	// pick returns one random element of words.
	pick := func(words []string) string {
		return words[rand.Intn(len(words))]
	}

	// The word lists are built inside the selected branch only, so a call
	// never allocates the lists of the language it does not use.
	switch lang {
	case English, "english":
		topics := []string{
			"information retrieval", "data mining", "machine learning",
			"natural language processing", "text analysis", "search engines",
			"document indexing", "query processing", "relevance ranking",
			"semantic search",
		}
		verbs := []string{
			"is", "focuses on", "deals with", "involves", "combines",
			"utilizes", "improves", "enables", "enhances", "supports",
		}
		objects := []string{
			"large datasets", "text documents", "user queries", "search results",
			"information needs", "relevance scores", "ranking algorithms",
			"index structures", "query expansion", "document collections",
		}
		// Arguments are evaluated left to right: topic, verb, object.
		return fmt.Sprintf("%s %s %s", pick(topics), pick(verbs), pick(objects))
	case Chinese, "chinese":
		topics := []string{
			"信息检索", "数据挖掘", "机器学习",
			"自然语言处理", "文本分析", "搜索引擎",
			"文档索引", "查询处理", "相关性排序",
			"语义搜索",
		}
		verbs := []string{
			"是", "专注于", "处理", "涉及", "结合",
			"利用", "改进", "实现", "提升", "支持",
		}
		objects := []string{
			"大规模数据集", "文本文档", "用户查询", "搜索结果",
			"信息需求", "相关性分数", "排序算法",
			"索引结构", "查询扩展", "文档集合",
		}
		return fmt.Sprintf("%s%s%s", pick(topics), pick(verbs), pick(objects))
	default:
		return "Unsupported language"
	}
}
|
||||
|
||||
283
tests/go_client/testcases/full_text_search_test.go
Normal file
283
tests/go_client/testcases/full_text_search_test.go
Normal file
@ -0,0 +1,283 @@
|
||||
package testcases
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/milvus-io/milvus/client/v2/entity"
|
||||
"github.com/milvus-io/milvus/client/v2/index"
|
||||
"github.com/milvus-io/milvus/client/v2/milvusclient"
|
||||
"github.com/milvus-io/milvus/tests/go_client/common"
|
||||
hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper"
|
||||
)
|
||||
|
||||
func TestFullTextSearchDefault(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := createDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create -> insert -> flush -> index -> load
|
||||
analyzerParams := map[string]any{"tokenizer": "standard"}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang)
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// search
|
||||
queries := hp.GenFullTextQuery(common.DefaultNq, common.DefaultTextLang)
|
||||
vectors := make([]entity.Vector, 0, len(queries))
|
||||
for _, query := range queries {
|
||||
vectors = append(vectors, entity.Text(query))
|
||||
}
|
||||
resSearch, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, common.DefaultLimit, vectors).WithConsistencyLevel(entity.ClStrong))
|
||||
common.CheckErr(t, err, true)
|
||||
common.CheckSearchResult(t, resSearch, common.DefaultNq, common.DefaultLimit)
|
||||
}
|
||||
|
||||
// TestSearchFullTextBase tests basic full text search functionality with different languages
|
||||
func TestSearchFullTextWithDiffLang(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := createDefaultMilvusClient(ctx, t)
|
||||
|
||||
// Test cases for different languages and analyzers
|
||||
testCases := []struct {
|
||||
name string
|
||||
language string
|
||||
analyzer string
|
||||
query string
|
||||
numRows int
|
||||
topK int
|
||||
}{
|
||||
{
|
||||
name: "English_Standard",
|
||||
language: "english",
|
||||
analyzer: "standard",
|
||||
query: "what is information retrieval and its applications?",
|
||||
numRows: 3000,
|
||||
topK: 10,
|
||||
},
|
||||
{
|
||||
name: "Chinese_Jieba",
|
||||
language: "chinese",
|
||||
analyzer: "jieba",
|
||||
query: "信息检索的应用",
|
||||
numRows: 3000,
|
||||
topK: 10,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
analyzerParams := map[string]any{"tokenizer": tc.analyzer}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(tc.language).TWithNb(tc.numRows)
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// search
|
||||
queries := []string{tc.query}
|
||||
vectors := make([]entity.Vector, 0, len(queries))
|
||||
for _, query := range queries {
|
||||
vectors = append(vectors, entity.Text(query))
|
||||
}
|
||||
resSearch, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, tc.topK, vectors).WithConsistencyLevel(entity.ClStrong))
|
||||
common.CheckErr(t, err, true)
|
||||
common.CheckSearchResult(t, resSearch, len(queries), tc.topK)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestSearchFullTextWithDynamicField tests full text search with dynamic field enabled
|
||||
func TestSearchFullTextWithDynamicField(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := createDefaultMilvusClient(ctx, t)
|
||||
// Test cases for different languages and analyzers
|
||||
testCases := []struct {
|
||||
name string
|
||||
language string
|
||||
analyzer string
|
||||
query string
|
||||
numRows int
|
||||
topK int
|
||||
}{
|
||||
{
|
||||
name: "English_Standard",
|
||||
language: "english",
|
||||
analyzer: "standard",
|
||||
query: "what is information retrieval and its applications?",
|
||||
numRows: 1000,
|
||||
topK: 5,
|
||||
},
|
||||
{
|
||||
name: "Chinese_Jieba",
|
||||
language: "chinese",
|
||||
analyzer: "jieba",
|
||||
query: "信息检索的应用",
|
||||
numRows: 1000,
|
||||
topK: 5,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
analyzerParams := map[string]any{"tokenizer": tc.analyzer}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function).TWithEnableDynamicField(true)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(tc.language).TWithNb(tc.numRows)
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// search
|
||||
queries := []string{tc.query}
|
||||
vectors := make([]entity.Vector, 0, len(queries))
|
||||
for _, query := range queries {
|
||||
vectors = append(vectors, entity.Text(query))
|
||||
}
|
||||
resSearch, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, tc.topK, vectors).WithConsistencyLevel(entity.ClStrong))
|
||||
common.CheckErr(t, err, true)
|
||||
common.CheckSearchResult(t, resSearch, len(queries), tc.topK)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestSearchFullTextWithPartitionKey tests full text search with partition key
|
||||
func TestSearchFullTextWithPartitionKey(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := createDefaultMilvusClient(ctx, t)
|
||||
|
||||
// Test cases for different languages and analyzers
|
||||
testCases := []struct {
|
||||
name string
|
||||
language string
|
||||
analyzer string
|
||||
query string
|
||||
numRows int
|
||||
topK int
|
||||
}{
|
||||
{
|
||||
name: "English_Standard",
|
||||
language: "english",
|
||||
analyzer: "standard",
|
||||
query: "what is information retrieval and its applications?",
|
||||
numRows: 1000,
|
||||
topK: 5,
|
||||
},
|
||||
{
|
||||
name: "Chinese_Jieba",
|
||||
language: "chinese",
|
||||
analyzer: "jieba",
|
||||
query: "信息检索的应用",
|
||||
numRows: 1000,
|
||||
topK: 5,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
analyzerParams := map[string]any{"tokenizer": tc.analyzer}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams).TWithIsPartitionKey(true)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(tc.language).TWithNb(tc.numRows)
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// search
|
||||
queries := []string{tc.query}
|
||||
vectors := make([]entity.Vector, 0, len(queries))
|
||||
for _, query := range queries {
|
||||
vectors = append(vectors, entity.Text(query))
|
||||
}
|
||||
resSearch, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, tc.topK, vectors).WithConsistencyLevel(entity.ClStrong))
|
||||
common.CheckErr(t, err, true)
|
||||
common.CheckSearchResult(t, resSearch, len(queries), tc.topK)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestSearchFullTextWithEmptyData tests full text search with empty data
|
||||
func TestSearchFullTextWithEmptyData(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := createDefaultMilvusClient(ctx, t)
|
||||
|
||||
// Test cases for different empty percent
|
||||
testCases := []struct {
|
||||
name string
|
||||
language string
|
||||
analyzer string
|
||||
query string
|
||||
numRows int
|
||||
topK int
|
||||
emptyPercent int
|
||||
}{
|
||||
{
|
||||
name: "English_Standard",
|
||||
language: "english",
|
||||
analyzer: "standard",
|
||||
query: "what is information retrieval and its applications?",
|
||||
numRows: 3000,
|
||||
topK: 5,
|
||||
emptyPercent: 50,
|
||||
},
|
||||
{
|
||||
name: "Chinese_Jieba",
|
||||
language: "chinese",
|
||||
analyzer: "jieba",
|
||||
query: "信息检索的应用",
|
||||
numRows: 3000,
|
||||
topK: 5,
|
||||
emptyPercent: 80,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
analyzerParams := map[string]any{"tokenizer": tc.analyzer}
|
||||
fieldsOption := hp.TNewFieldsOption().TWithAnalyzerParams(analyzerParams).TWithIsPartitionKey(true)
|
||||
function := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.FullTextSearch), fieldsOption, schemaOption)
|
||||
insertOption := hp.TNewDataOption().TWithTextLang(tc.language).TWithNb(tc.numRows).TWithTextEmptyPercent(tc.emptyPercent)
|
||||
prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), insertOption)
|
||||
prepare.FlushData(ctx, t, mc, schema.CollectionName)
|
||||
|
||||
indexparams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1)})
|
||||
prepare.CreateIndex(ctx, t, mc, indexparams)
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// search
|
||||
queries := []string{tc.query}
|
||||
vectors := make([]entity.Vector, 0, len(queries))
|
||||
for _, query := range queries {
|
||||
vectors = append(vectors, entity.Text(query))
|
||||
}
|
||||
resSearch, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, tc.topK, vectors).WithConsistencyLevel(entity.ClStrong))
|
||||
common.CheckErr(t, err, true)
|
||||
common.CheckSearchResult(t, resSearch, len(queries), tc.topK)
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -3,6 +3,8 @@ package helper
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"math/rand"
|
||||
"slices"
|
||||
"strconv"
|
||||
|
||||
"go.uber.org/zap"
|
||||
@ -38,14 +40,16 @@ func (opt *InsertParams) TWithIsRows(isRows bool) *InsertParams {
|
||||
|
||||
// GenColumnDataOption -- create column data --
|
||||
type GenDataOption struct {
|
||||
nb int
|
||||
start int
|
||||
dim int
|
||||
maxLen int
|
||||
sparseMaxLen int
|
||||
maxCapacity int
|
||||
elementType entity.FieldType
|
||||
fieldName string
|
||||
nb int
|
||||
start int
|
||||
dim int
|
||||
maxLen int
|
||||
sparseMaxLen int
|
||||
maxCapacity int
|
||||
elementType entity.FieldType
|
||||
fieldName string
|
||||
textLang string
|
||||
textEmptyPercent int
|
||||
}
|
||||
|
||||
func (opt *GenDataOption) TWithNb(nb int) *GenDataOption {
|
||||
@ -88,15 +92,28 @@ func (opt *GenDataOption) TWithElementType(eleType entity.FieldType) *GenDataOpt
|
||||
return opt
|
||||
}
|
||||
|
||||
func (opt *GenDataOption) TWithTextLang(lang string) *GenDataOption {
|
||||
opt.textLang = lang
|
||||
return opt
|
||||
}
|
||||
|
||||
func (opt *GenDataOption) TWithTextEmptyPercent(percent int) *GenDataOption {
|
||||
opt.textEmptyPercent = percent
|
||||
return opt
|
||||
}
|
||||
|
||||
func TNewDataOption() *GenDataOption {
|
||||
return &GenDataOption{
|
||||
nb: common.DefaultNb,
|
||||
start: 0,
|
||||
dim: common.DefaultDim,
|
||||
maxLen: common.TestMaxLen,
|
||||
sparseMaxLen: common.TestMaxLen,
|
||||
maxCapacity: common.TestCapacity,
|
||||
elementType: entity.FieldTypeNone,
|
||||
nb: common.DefaultNb,
|
||||
start: 0,
|
||||
dim: common.DefaultDim,
|
||||
maxLen: common.TestMaxLen,
|
||||
sparseMaxLen: common.TestMaxLen,
|
||||
maxCapacity: common.TestCapacity,
|
||||
elementType: entity.FieldTypeNone,
|
||||
fieldName: "",
|
||||
textLang: "",
|
||||
textEmptyPercent: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@ -310,8 +327,35 @@ func GenColumnData(nb int, fieldType entity.FieldType, option GenDataOption) col
|
||||
|
||||
case entity.FieldTypeVarChar:
|
||||
varcharValues := make([]string, 0, nb)
|
||||
for i := start; i < start+nb; i++ {
|
||||
varcharValues = append(varcharValues, strconv.Itoa(i))
|
||||
if option.textLang != "" {
|
||||
// Use language-specific text generation
|
||||
var lang string
|
||||
switch option.textLang {
|
||||
case "en", "english":
|
||||
lang = "en"
|
||||
case "zh", "chinese":
|
||||
lang = "zh"
|
||||
default:
|
||||
// Fallback to sequential numbers for unsupported languages
|
||||
for i := start; i < start+nb; i++ {
|
||||
varcharValues = append(varcharValues, strconv.Itoa(i))
|
||||
}
|
||||
return column.NewColumnVarChar(fieldName, varcharValues)
|
||||
}
|
||||
|
||||
// Generate text data with empty values based on textEmptyPercent
|
||||
for i := 0; i < nb; i++ {
|
||||
if rand.Float64()*100 < float64(option.textEmptyPercent) {
|
||||
varcharValues = append(varcharValues, "")
|
||||
} else {
|
||||
varcharValues = append(varcharValues, common.GenText(lang))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Default behavior: sequential numbers
|
||||
for i := start; i < start+nb; i++ {
|
||||
varcharValues = append(varcharValues, strconv.Itoa(i))
|
||||
}
|
||||
}
|
||||
return column.NewColumnVarChar(fieldName, varcharValues)
|
||||
|
||||
@ -449,6 +493,16 @@ func MergeColumnsToDynamic(nb int, columns []column.Column, columnName string) *
|
||||
return jsonColumn
|
||||
}
|
||||
|
||||
func GetBm25FunctionsOutputFields(schema *entity.Schema) []string {
|
||||
var outputFields []string
|
||||
for _, fn := range schema.Functions {
|
||||
if fn.Type == entity.FunctionTypeBM25 {
|
||||
outputFields = append(outputFields, fn.OutputFieldNames...)
|
||||
}
|
||||
}
|
||||
return outputFields
|
||||
}
|
||||
|
||||
func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]column.Column, []column.Column) {
|
||||
if nil == schema || schema.CollectionName == "" {
|
||||
log.Fatal("[GenColumnsBasedSchema] Nil Schema is not expected")
|
||||
@ -463,6 +517,12 @@ func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]colu
|
||||
if field.AutoID {
|
||||
continue
|
||||
}
|
||||
if slices.Contains(GetBm25FunctionsOutputFields(schema), field.Name) {
|
||||
continue
|
||||
}
|
||||
log.Info("GenColumnsBasedSchema", zap.Any("field", field))
|
||||
// set field name to option
|
||||
option.TWithFieldName(field.Name)
|
||||
columns = append(columns, GenColumnData(option.nb, field.DataType, *option))
|
||||
}
|
||||
if schema.EnableDynamicField {
|
||||
|
||||
@ -107,6 +107,7 @@ const (
|
||||
Int64MultiVec CollectionFieldsType = 6 // int64 + floatVec + binaryVec + fp16Vec + bf16vec
|
||||
AllFields CollectionFieldsType = 7 // all fields excepted sparse
|
||||
Int64VecAllScalar CollectionFieldsType = 8 // int64 + floatVec + all scalar fields
|
||||
FullTextSearch CollectionFieldsType = 9 // int64 + varchar + sparse vector + analyzer + function
|
||||
)
|
||||
|
||||
type GenFieldsOption struct {
|
||||
@ -116,6 +117,8 @@ type GenFieldsOption struct {
|
||||
MaxLength int64 // varchar len or array capacity
|
||||
MaxCapacity int64
|
||||
IsPartitionKey bool
|
||||
EnableAnalyzer bool
|
||||
AnalyzerParams map[string]any
|
||||
ElementType entity.FieldType
|
||||
}
|
||||
|
||||
@ -127,6 +130,8 @@ func TNewFieldsOption() *GenFieldsOption {
|
||||
MaxCapacity: common.TestCapacity,
|
||||
IsDynamic: false,
|
||||
IsPartitionKey: false,
|
||||
EnableAnalyzer: false,
|
||||
AnalyzerParams: make(map[string]any),
|
||||
ElementType: entity.FieldTypeNone,
|
||||
}
|
||||
}
|
||||
@ -166,6 +171,16 @@ func (opt *GenFieldsOption) TWithMaxCapacity(maxCapacity int64) *GenFieldsOption
|
||||
return opt
|
||||
}
|
||||
|
||||
func (opt *GenFieldsOption) TWithEnableAnalyzer(enableAnalyzer bool) *GenFieldsOption {
|
||||
opt.EnableAnalyzer = enableAnalyzer
|
||||
return opt
|
||||
}
|
||||
|
||||
func (opt *GenFieldsOption) TWithAnalyzerParams(analyzerParams map[string]any) *GenFieldsOption {
|
||||
opt.AnalyzerParams = analyzerParams
|
||||
return opt
|
||||
}
|
||||
|
||||
// factory
|
||||
type FieldsFactory struct{}
|
||||
|
||||
@ -341,6 +356,23 @@ func (cf FieldsInt64VecAllScalar) GenFields(option GenFieldsOption) []*entity.Fi
|
||||
return fields
|
||||
}
|
||||
|
||||
type FieldsFullTextSearch struct{}
|
||||
|
||||
func (cf FieldsFullTextSearch) GenFields(option GenFieldsOption) []*entity.Field {
|
||||
pkField := entity.NewField().WithName(GetFieldNameByFieldType(entity.FieldTypeInt64)).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true)
|
||||
textField := entity.NewField().WithName(common.DefaultTextFieldName).WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey).WithEnableAnalyzer(true).WithAnalyzerParams(option.AnalyzerParams)
|
||||
sparseVecField := entity.NewField().WithName(common.DefaultTextSparseVecFieldName).WithDataType(entity.FieldTypeSparseVector)
|
||||
if option.AutoID {
|
||||
pkField.WithIsAutoID(option.AutoID)
|
||||
}
|
||||
fields := []*entity.Field{
|
||||
pkField,
|
||||
textField,
|
||||
sparseVecField,
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFieldsType, option *GenFieldsOption) []*entity.Field {
|
||||
log.Info("GenFieldsForCollection", zap.Any("GenFieldsOption", option))
|
||||
switch collectionFieldsType {
|
||||
@ -360,6 +392,8 @@ func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFi
|
||||
return FieldsAllFields{}.GenFields(*option)
|
||||
case Int64VecAllScalar:
|
||||
return FieldsInt64VecAllScalar{}.GenFields(*option)
|
||||
case FullTextSearch:
|
||||
return FieldsFullTextSearch{}.GenFields(*option)
|
||||
default:
|
||||
return FieldsInt64Vec{}.GenFields(*option)
|
||||
}
|
||||
|
||||
14
tests/go_client/testcases/helper/function_helper.go
Normal file
14
tests/go_client/testcases/helper/function_helper.go
Normal file
@ -0,0 +1,14 @@
|
||||
package helper
|
||||
|
||||
import (
|
||||
"github.com/milvus-io/milvus/client/v2/entity"
|
||||
)
|
||||
|
||||
// TNewBM25Function creates a new BM25 function with the given input and output fields
|
||||
func TNewBM25Function(inputField, outputField string) *entity.Function {
|
||||
return entity.NewFunction().
|
||||
WithName(inputField + "_bm25_emb").
|
||||
WithInputFields(inputField).
|
||||
WithOutputFields(outputField).
|
||||
WithType(entity.FunctionTypeBM25)
|
||||
}
|
||||
@ -135,6 +135,8 @@ func (chainTask *CollectionPrepare) InsertData(ctx context.Context, t *testing.T
|
||||
if nil == ip.Schema || ip.Schema.CollectionName == "" {
|
||||
log.Fatal("[InsertData] Nil Schema is not expected")
|
||||
}
|
||||
// print option
|
||||
log.Info("GenDataOption", zap.Any("option", option))
|
||||
columns, dynamicColumns := GenColumnsBasedSchema(ip.Schema, option)
|
||||
insertOpt := clientv2.NewColumnBasedInsertOption(ip.Schema.CollectionName).WithColumns(columns...).WithColumns(dynamicColumns...)
|
||||
if ip.PartitionName != "" {
|
||||
|
||||
@ -55,6 +55,10 @@ var SupportBinIvfFlatMetricType = []entity.MetricType{
|
||||
entity.HAMMING,
|
||||
}
|
||||
|
||||
var SupportFullTextSearchMetricsType = []entity.MetricType{
|
||||
entity.BM25,
|
||||
}
|
||||
|
||||
var UnsupportedSparseVecMetricsType = []entity.MetricType{
|
||||
entity.L2,
|
||||
entity.COSINE,
|
||||
|
||||
@ -66,6 +66,14 @@ func GenSearchVectors(nq int, dim int, dataType entity.FieldType) []entity.Vecto
|
||||
return vectors
|
||||
}
|
||||
|
||||
func GenFullTextQuery(nq int, lang string) []string {
|
||||
queries := make([]string, 0, nq)
|
||||
for i := 0; i < nq; i++ {
|
||||
queries = append(queries, common.GenText(lang))
|
||||
}
|
||||
return queries
|
||||
}
|
||||
|
||||
func GenFp16OrBf16VectorsFromFloatVector(nq int, dim int, dataType entity.FieldType) []entity.Vector {
|
||||
vectors := make([]entity.Vector, 0, nq)
|
||||
switch dataType {
|
||||
|
||||
@ -12,6 +12,7 @@ type GenSchemaOption struct {
|
||||
AutoID bool
|
||||
Fields []*entity.Field
|
||||
EnableDynamicField bool
|
||||
Function *entity.Function
|
||||
}
|
||||
|
||||
func TNewSchemaOption() *GenSchemaOption {
|
||||
@ -43,6 +44,11 @@ func (opt *GenSchemaOption) TWithFields(fields []*entity.Field) *GenSchemaOption
|
||||
return opt
|
||||
}
|
||||
|
||||
func (opt *GenSchemaOption) TWithFunction(function *entity.Function) *GenSchemaOption {
|
||||
opt.Function = function
|
||||
return opt
|
||||
}
|
||||
|
||||
func GenSchema(option *GenSchemaOption) *entity.Schema {
|
||||
if len(option.Fields) == 0 {
|
||||
log.Fatal("Require at least a primary field and a vector field")
|
||||
@ -64,5 +70,8 @@ func GenSchema(option *GenSchemaOption) *entity.Schema {
|
||||
if option.EnableDynamicField {
|
||||
schema.WithDynamicFieldEnabled(option.EnableDynamicField)
|
||||
}
|
||||
if option.Function != nil {
|
||||
schema.WithFunction(option.Function)
|
||||
}
|
||||
return schema
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user