test: add go-sdk cases for full text search (#39570)

/kind improvement

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
Signed-off-by: zhuwenxing <wxzhuyeah@gmail.com>
This commit is contained in:
zhuwenxing 2025-02-10 10:32:45 +08:00 committed by GitHub
parent 1f14053c70
commit ee87e4d0b6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 712 additions and 45 deletions

View File

@ -1 +1,191 @@
## go_client
# Milvus Go Client Test Framework
## Overview
This is a comprehensive test framework for the Milvus Go Client, designed to validate various functionalities of the Milvus vector database client. The framework provides a structured approach to writing tests with reusable components and helper functions.
## Framework Architecture
### Directory Structure
```
/go_client/
├── testcases/ # Main test cases
│ ├── helper/ # Helper functions and utilities
│ │ ├── helper.go
│ │ ├── data_helper.go
│ │ └── collection_helper.go
│ ├── search_test.go # Search functionality tests
│ ├── index_test.go # Index management tests
│ └── ...
├── common/ # Common utilities and constants
└── base/ # Base infrastructure code
```
### Key Components
- **Collection Preparation**: Utilities for creating and managing collections
- **Data Generation**: Tools for generating test data
- **Helper Functions**: Common operations and validations
- **Test Cases**: Organized by functionality
## Writing Test Cases
### Basic Test Structure
```go
func TestYourFeature(t *testing.T) {
// 1. Setup context and client
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
mc := createDefaultMilvusClient(ctx, t)
// 2. Prepare collection
prepare, schema := hp.CollPrepare.CreateCollection(
ctx, t, mc,
hp.NewCreateCollectionParams(hp.Int64Vec),
hp.TNewFieldsOption(),
hp.TNewSchemaOption(),
)
// 3. Insert test data
prepare.InsertData(ctx, t, mc,
hp.NewInsertParams(schema),
hp.TNewDataOption(),
)
// 4. Execute test operations
// ... your test logic here ...
// 5. Validate results
require.NoError(t, err)
require.Equal(t, expected, actual)
}
```
### Using Custom Parameters
1. **Collection Creation Parameters**
```go
fieldsOption := hp.TNewFieldsOption().
TWithEnableAnalyzer(true).
TWithAnalyzerParams(map[string]any{
"tokenizer": "standard",
})
schemaOption := hp.TNewSchemaOption().
TWithEnableDynamicField(true).
TWithDescription("Custom schema").
TWithAutoID(false)
```
2. **Data Insertion Options**
```go
insertOption := hp.TNewDataOption().
TWithNb(1000). // Number of records
TWithDim(128). // Vector dimension
TWithStart(100). // Starting ID
TWithMaxLen(256). // Maximum length
TWithTextLang("en") // Text language
```
3. **Index Parameters**
```go
indexParams := hp.TNewIndexParams(schema).
TWithFieldIndex(map[string]index.Index{
common.DefaultVectorFieldName: index.NewIVFSQIndex(
&index.IVFSQConfig{
MetricType: entity.L2,
NList: 128,
},
),
})
```
4. **Search Parameters**
```go
searchOpt := client.NewSearchOption(schema.CollectionName, 100, vectors).
WithOffset(0).
WithLimit(100).
WithConsistencyLevel(entity.ClStrong).
WithFilter("int64 >= 100").
WithOutputFields([]string{"*"}).
WithSearchParams(map[string]any{
"nprobe": 16,
"ef": 64,
})
```
## Adding New Parameters
1. **Define New Option Type**
```go
// In helper/data_helper.go
type YourNewOption struct {
newParam1 string
newParam2 int
}
```
2. **Add Constructor**
```go
func TNewYourOption() *YourNewOption {
return &YourNewOption{
newParam1: "default",
newParam2: 0,
}
}
```
3. **Add Parameter Methods**
```go
func (opt *YourNewOption) TWithNewParam1(value string) *YourNewOption {
opt.newParam1 = value
return opt
}
func (opt *YourNewOption) TWithNewParam2(value int) *YourNewOption {
opt.newParam2 = value
return opt
}
```
## Best Practices
1. **Test Organization**
- Group related tests in the same file
- Use clear and descriptive test names
- Add comments explaining test purpose
2. **Data Generation**
- Use helper functions for generating test data
- Ensure data is appropriate for the test case
- Clean up test data after use
3. **Error Handling**
- Use `common.CheckErr` for consistent error checking
- Test both success and failure scenarios
- Validate error messages when appropriate
4. **Performance Considerations**
- Use appropriate timeouts
- Clean up resources after tests
- Consider test execution time
## Running Tests
```bash
# Run all tests
go test ./testcases/...
# Run specific test
go test -run TestYourFeature ./testcases/
# Run with verbose output
go test -v ./testcases/...
```
## Contributing
1. Follow the existing code structure
2. Add comprehensive test cases
3. Document new parameters and options
4. Update this README for significant changes
5. Ensure code quality standards:
- Run `golangci-lint run` to check for style mistakes
- Use `gofmt -w your/code/path` to format your code before submitting
- CI will verify both golint and go format compliance

View File

@ -4,33 +4,35 @@ import "github.com/milvus-io/milvus/client/v2/index"
// const default field name
const (
DefaultInt8FieldName = "int8"
DefaultInt16FieldName = "int16"
DefaultInt32FieldName = "int32"
DefaultInt64FieldName = "int64"
DefaultBoolFieldName = "bool"
DefaultFloatFieldName = "float"
DefaultDoubleFieldName = "double"
DefaultVarcharFieldName = "varchar"
DefaultJSONFieldName = "json"
DefaultArrayFieldName = "array"
DefaultFloatVecFieldName = "floatVec"
DefaultBinaryVecFieldName = "binaryVec"
DefaultFloat16VecFieldName = "fp16Vec"
DefaultBFloat16VecFieldName = "bf16Vec"
DefaultSparseVecFieldName = "sparseVec"
DefaultDynamicNumberField = "dynamicNumber"
DefaultDynamicStringField = "dynamicString"
DefaultDynamicBoolField = "dynamicBool"
DefaultDynamicListField = "dynamicList"
DefaultBoolArrayField = "boolArray"
DefaultInt8ArrayField = "int8Array"
DefaultInt16ArrayField = "int16Array"
DefaultInt32ArrayField = "int32Array"
DefaultInt64ArrayField = "int64Array"
DefaultFloatArrayField = "floatArray"
DefaultDoubleArrayField = "doubleArray"
DefaultVarcharArrayField = "varcharArray"
DefaultInt8FieldName = "int8"
DefaultInt16FieldName = "int16"
DefaultInt32FieldName = "int32"
DefaultInt64FieldName = "int64"
DefaultBoolFieldName = "bool"
DefaultFloatFieldName = "float"
DefaultDoubleFieldName = "double"
DefaultTextFieldName = "text"
DefaultVarcharFieldName = "varchar"
DefaultJSONFieldName = "json"
DefaultArrayFieldName = "array"
DefaultFloatVecFieldName = "floatVec"
DefaultBinaryVecFieldName = "binaryVec"
DefaultFloat16VecFieldName = "fp16Vec"
DefaultBFloat16VecFieldName = "bf16Vec"
DefaultTextSparseVecFieldName = "textSparseVec"
DefaultSparseVecFieldName = "sparseVec"
DefaultDynamicNumberField = "dynamicNumber"
DefaultDynamicStringField = "dynamicString"
DefaultDynamicBoolField = "dynamicBool"
DefaultDynamicListField = "dynamicList"
DefaultBoolArrayField = "boolArray"
DefaultInt8ArrayField = "int8Array"
DefaultInt16ArrayField = "int16Array"
DefaultInt32ArrayField = "int32Array"
DefaultInt64ArrayField = "int64Array"
DefaultFloatArrayField = "floatArray"
DefaultDoubleArrayField = "doubleArray"
DefaultVarcharArrayField = "varcharArray"
)
// const for test cases
@ -86,3 +88,8 @@ const (
DatabaseForceDenyReading = "database.force.deny.reading"
DatabaseDiskQuotaMb = "database.diskQuota.mb"
)
// const for full text search
const (
// DefaultTextLang is the language code ("en") that the default full text
// search test uses both for generated insert data and for generated queries.
DefaultTextLang = "en"
)

View File

@ -153,3 +153,59 @@ var InvalidExpressions = []InvalidExprStruct{
{Expr: fmt.Sprintf("%s[-1] > %d", DefaultInt8ArrayField, TestCapacity), ErrNil: false, ErrMsg: "cannot parse expression"}, // array[-1] >
{Expr: fmt.Sprintf("%s[-1] > 1", DefaultJSONFieldName), ErrNil: false, ErrMsg: "invalid expression"}, // json[-1] >
}
// Language codes accepted by GenText.
const (
	English = "en"
	Chinese = "zh"
)

// GenText returns one short pseudo-random "topic verb object" sentence in the
// requested language.
//
// lang must be English ("en") or Chinese ("zh"). English sentences join the
// three parts with single spaces; Chinese sentences concatenate them directly.
// Any other value yields the literal string "Unsupported language".
//
// Randomness comes from the package-level math/rand source, so the output is
// only reproducible if the caller controls that source.
func GenText(lang string) string {
	// choose picks one entry of words with a single rand.Intn call.
	choose := func(words []string) string {
		return words[rand.Intn(len(words))]
	}

	switch lang {
	case English:
		subjects := []string{
			"information retrieval", "data mining", "machine learning",
			"natural language processing", "text analysis", "search engines",
			"document indexing", "query processing", "relevance ranking",
			"semantic search",
		}
		actions := []string{
			"is", "focuses on", "deals with", "involves", "combines",
			"utilizes", "improves", "enables", "enhances", "supports",
		}
		targets := []string{
			"large datasets", "text documents", "user queries", "search results",
			"information needs", "relevance scores", "ranking algorithms",
			"index structures", "query expansion", "document collections",
		}
		// Arguments are evaluated left to right: topic, then verb, then object.
		return fmt.Sprintf("%s %s %s", choose(subjects), choose(actions), choose(targets))
	case Chinese:
		subjects := []string{
			"信息检索", "数据挖掘", "机器学习",
			"自然语言处理", "文本分析", "搜索引擎",
			"文档索引", "查询处理", "相关性排序",
			"语义搜索",
		}
		actions := []string{
			"是", "专注于", "处理", "涉及", "结合",
			"利用", "改进", "实现", "提升", "支持",
		}
		targets := []string{
			"大规模数据集", "文本文档", "用户查询", "搜索结果",
			"信息需求", "相关性分数", "排序算法",
			"索引结构", "查询扩展", "文档集合",
		}
		// No separator between the parts for Chinese text.
		return fmt.Sprintf("%s%s%s", choose(subjects), choose(actions), choose(targets))
	}

	return "Unsupported language"
}

View File

@ -0,0 +1,283 @@
package testcases
import (
"testing"
"time"
"github.com/milvus-io/milvus/client/v2/entity"
"github.com/milvus-io/milvus/client/v2/index"
"github.com/milvus-io/milvus/client/v2/milvusclient"
"github.com/milvus-io/milvus/tests/go_client/common"
hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper"
)
// TestFullTextSearchDefault covers the default full text search flow: it
// creates a FullTextSearch collection with a "standard" tokenizer and a BM25
// function, inserts default-language text, flushes, indexes the sparse output
// field, loads the collection, and then searches with generated text queries.
func TestFullTextSearchDefault(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := createDefaultMilvusClient(ctx, t)

	// create -> insert -> flush -> index -> load
	fieldsOpt := hp.TNewFieldsOption().TWithAnalyzerParams(map[string]any{"tokenizer": "standard"})
	bm25Fn := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
	schemaOpt := hp.TNewSchemaOption().TWithFunction(bm25Fn)
	createParams := hp.NewCreateCollectionParams(hp.FullTextSearch)
	prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldsOpt, schemaOpt)

	dataOpt := hp.TNewDataOption().TWithTextLang(common.DefaultTextLang)
	prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), dataOpt)
	prepare.FlushData(ctx, t, mc, schema.CollectionName)

	idxParams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{
		common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1),
	})
	prepare.CreateIndex(ctx, t, mc, idxParams)
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// search: every generated query string is sent as a raw text vector
	queries := hp.GenFullTextQuery(common.DefaultNq, common.DefaultTextLang)
	vectors := make([]entity.Vector, len(queries))
	for i := range queries {
		vectors[i] = entity.Text(queries[i])
	}
	searchOpt := milvusclient.NewSearchOption(schema.CollectionName, common.DefaultLimit, vectors).
		WithConsistencyLevel(entity.ClStrong)
	resSearch, err := mc.Search(ctx, searchOpt)
	common.CheckErr(t, err, true)
	common.CheckSearchResult(t, resSearch, common.DefaultNq, common.DefaultLimit)
}
// TestSearchFullTextWithDiffLang tests basic full text search functionality
// with different languages. Each sub-test creates its own collection using the
// case's tokenizer, inserts generated text in the case's language, builds the
// BM25 sparse index, loads the collection, and searches with a single query.
func TestSearchFullTextWithDiffLang(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := createDefaultMilvusClient(ctx, t)

	// langCase describes one language/tokenizer pairing under test.
	type langCase struct {
		name     string
		language string
		analyzer string
		query    string
		numRows  int
		topK     int
	}
	cases := []langCase{
		{name: "English_Standard", language: "english", analyzer: "standard", query: "what is information retrieval and its applications?", numRows: 3000, topK: 10},
		{name: "Chinese_Jieba", language: "chinese", analyzer: "jieba", query: "信息检索的应用", numRows: 3000, topK: 10},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// create -> insert -> flush -> index -> load
			fieldsOpt := hp.TNewFieldsOption().TWithAnalyzerParams(map[string]any{"tokenizer": tc.analyzer})
			bm25Fn := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
			schemaOpt := hp.TNewSchemaOption().TWithFunction(bm25Fn)
			createParams := hp.NewCreateCollectionParams(hp.FullTextSearch)
			prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldsOpt, schemaOpt)

			dataOpt := hp.TNewDataOption().TWithTextLang(tc.language).TWithNb(tc.numRows)
			prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), dataOpt)
			prepare.FlushData(ctx, t, mc, schema.CollectionName)

			idxParams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{
				common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1),
			})
			prepare.CreateIndex(ctx, t, mc, idxParams)
			prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

			// search with the case's single query text
			vectors := []entity.Vector{entity.Text(tc.query)}
			searchOpt := milvusclient.NewSearchOption(schema.CollectionName, tc.topK, vectors).
				WithConsistencyLevel(entity.ClStrong)
			resSearch, err := mc.Search(ctx, searchOpt)
			common.CheckErr(t, err, true)
			common.CheckSearchResult(t, resSearch, len(vectors), tc.topK)
		})
	}
}
// TestSearchFullTextWithDynamicField tests full text search on collections
// whose schema has the dynamic field enabled. Apart from that schema option,
// each sub-test follows the create -> insert -> flush -> index -> load ->
// search sequence with one query per language/tokenizer case.
func TestSearchFullTextWithDynamicField(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := createDefaultMilvusClient(ctx, t)

	// dynCase describes one language/tokenizer pairing under test.
	type dynCase struct {
		name     string
		language string
		analyzer string
		query    string
		numRows  int
		topK     int
	}
	cases := []dynCase{
		{name: "English_Standard", language: "english", analyzer: "standard", query: "what is information retrieval and its applications?", numRows: 1000, topK: 5},
		{name: "Chinese_Jieba", language: "chinese", analyzer: "jieba", query: "信息检索的应用", numRows: 1000, topK: 5},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// create (dynamic field on) -> insert -> flush -> index -> load
			fieldsOpt := hp.TNewFieldsOption().TWithAnalyzerParams(map[string]any{"tokenizer": tc.analyzer})
			bm25Fn := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
			schemaOpt := hp.TNewSchemaOption().TWithFunction(bm25Fn).TWithEnableDynamicField(true)
			createParams := hp.NewCreateCollectionParams(hp.FullTextSearch)
			prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldsOpt, schemaOpt)

			dataOpt := hp.TNewDataOption().TWithTextLang(tc.language).TWithNb(tc.numRows)
			prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), dataOpt)
			prepare.FlushData(ctx, t, mc, schema.CollectionName)

			idxParams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{
				common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1),
			})
			prepare.CreateIndex(ctx, t, mc, idxParams)
			prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

			// search with the case's single query text
			vectors := []entity.Vector{entity.Text(tc.query)}
			searchOpt := milvusclient.NewSearchOption(schema.CollectionName, tc.topK, vectors).
				WithConsistencyLevel(entity.ClStrong)
			resSearch, err := mc.Search(ctx, searchOpt)
			common.CheckErr(t, err, true)
			common.CheckSearchResult(t, resSearch, len(vectors), tc.topK)
		})
	}
}
// TestSearchFullTextWithPartitionKey tests full text search when the fields
// option marks the partition key (the FullTextSearch field set applies that
// flag to the text field). Each sub-test runs create -> insert -> flush ->
// index -> load -> search for one language/tokenizer case.
func TestSearchFullTextWithPartitionKey(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := createDefaultMilvusClient(ctx, t)

	// pkCase describes one language/tokenizer pairing under test.
	type pkCase struct {
		name     string
		language string
		analyzer string
		query    string
		numRows  int
		topK     int
	}
	cases := []pkCase{
		{name: "English_Standard", language: "english", analyzer: "standard", query: "what is information retrieval and its applications?", numRows: 1000, topK: 5},
		{name: "Chinese_Jieba", language: "chinese", analyzer: "jieba", query: "信息检索的应用", numRows: 1000, topK: 5},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// create (partition key on) -> insert -> flush -> index -> load
			fieldsOpt := hp.TNewFieldsOption().
				TWithAnalyzerParams(map[string]any{"tokenizer": tc.analyzer}).
				TWithIsPartitionKey(true)
			bm25Fn := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
			schemaOpt := hp.TNewSchemaOption().TWithFunction(bm25Fn)
			createParams := hp.NewCreateCollectionParams(hp.FullTextSearch)
			prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldsOpt, schemaOpt)

			dataOpt := hp.TNewDataOption().TWithTextLang(tc.language).TWithNb(tc.numRows)
			prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), dataOpt)
			prepare.FlushData(ctx, t, mc, schema.CollectionName)

			idxParams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{
				common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1),
			})
			prepare.CreateIndex(ctx, t, mc, idxParams)
			prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

			// search with the case's single query text
			vectors := []entity.Vector{entity.Text(tc.query)}
			searchOpt := milvusclient.NewSearchOption(schema.CollectionName, tc.topK, vectors).
				WithConsistencyLevel(entity.ClStrong)
			resSearch, err := mc.Search(ctx, searchOpt)
			common.CheckErr(t, err, true)
			common.CheckSearchResult(t, resSearch, len(vectors), tc.topK)
		})
	}
}
// TestSearchFullTextWithEmptyData tests full text search when a share of the
// inserted text rows is the empty string. emptyPercent is handed to the data
// generator, which replaces roughly that percentage of rows with "".
func TestSearchFullTextWithEmptyData(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := createDefaultMilvusClient(ctx, t)

	// emptyCase describes one language/tokenizer pairing and its empty-row ratio.
	type emptyCase struct {
		name         string
		language     string
		analyzer     string
		query        string
		numRows      int
		topK         int
		emptyPercent int
	}
	cases := []emptyCase{
		{name: "English_Standard", language: "english", analyzer: "standard", query: "what is information retrieval and its applications?", numRows: 3000, topK: 5, emptyPercent: 50},
		{name: "Chinese_Jieba", language: "chinese", analyzer: "jieba", query: "信息检索的应用", numRows: 3000, topK: 5, emptyPercent: 80},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// create -> insert -> flush -> index -> load
			// NOTE(review): the partition-key flag is also set here, same as in the
			// partition key test — confirm it is intended for the empty-data scenario.
			fieldsOpt := hp.TNewFieldsOption().
				TWithAnalyzerParams(map[string]any{"tokenizer": tc.analyzer}).
				TWithIsPartitionKey(true)
			bm25Fn := hp.TNewBM25Function(common.DefaultTextFieldName, common.DefaultTextSparseVecFieldName)
			schemaOpt := hp.TNewSchemaOption().TWithFunction(bm25Fn)
			createParams := hp.NewCreateCollectionParams(hp.FullTextSearch)
			prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldsOpt, schemaOpt)

			dataOpt := hp.TNewDataOption().
				TWithTextLang(tc.language).
				TWithNb(tc.numRows).
				TWithTextEmptyPercent(tc.emptyPercent)
			prepare.InsertData(ctx, t, mc, hp.NewInsertParams(schema), dataOpt)
			prepare.FlushData(ctx, t, mc, schema.CollectionName)

			idxParams := hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{
				common.DefaultTextSparseVecFieldName: index.NewSparseInvertedIndex(entity.BM25, 0.1),
			})
			prepare.CreateIndex(ctx, t, mc, idxParams)
			prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

			// search with the case's single query text
			vectors := []entity.Vector{entity.Text(tc.query)}
			searchOpt := milvusclient.NewSearchOption(schema.CollectionName, tc.topK, vectors).
				WithConsistencyLevel(entity.ClStrong)
			resSearch, err := mc.Search(ctx, searchOpt)
			common.CheckErr(t, err, true)
			common.CheckSearchResult(t, resSearch, len(vectors), tc.topK)
		})
	}
}

View File

@ -3,6 +3,8 @@ package helper
import (
"bytes"
"encoding/json"
"math/rand"
"slices"
"strconv"
"go.uber.org/zap"
@ -38,14 +40,16 @@ func (opt *InsertParams) TWithIsRows(isRows bool) *InsertParams {
// GenColumnDataOption -- create column data --
type GenDataOption struct {
nb int
start int
dim int
maxLen int
sparseMaxLen int
maxCapacity int
elementType entity.FieldType
fieldName string
nb int
start int
dim int
maxLen int
sparseMaxLen int
maxCapacity int
elementType entity.FieldType
fieldName string
textLang string
textEmptyPercent int
}
func (opt *GenDataOption) TWithNb(nb int) *GenDataOption {
@ -88,15 +92,28 @@ func (opt *GenDataOption) TWithElementType(eleType entity.FieldType) *GenDataOpt
return opt
}
// TWithTextLang sets the language used for generated VarChar text and returns
// the receiver so calls can be chained. In GenColumnData, "en"/"english" and
// "zh"/"chinese" switch VarChar generation to language-specific text; an empty
// or unrecognized value keeps the sequential-number strings.
func (opt *GenDataOption) TWithTextLang(lang string) *GenDataOption {
opt.textLang = lang
return opt
}
// TWithTextEmptyPercent sets the percentage of generated text rows that should
// be the empty string and returns the receiver so calls can be chained.
// GenColumnData only consults this value when a supported text language is set;
// each row is emptied independently when a random draw in [0,100) falls below
// percent, so the resulting ratio is approximate.
func (opt *GenDataOption) TWithTextEmptyPercent(percent int) *GenDataOption {
opt.textEmptyPercent = percent
return opt
}
func TNewDataOption() *GenDataOption {
return &GenDataOption{
nb: common.DefaultNb,
start: 0,
dim: common.DefaultDim,
maxLen: common.TestMaxLen,
sparseMaxLen: common.TestMaxLen,
maxCapacity: common.TestCapacity,
elementType: entity.FieldTypeNone,
nb: common.DefaultNb,
start: 0,
dim: common.DefaultDim,
maxLen: common.TestMaxLen,
sparseMaxLen: common.TestMaxLen,
maxCapacity: common.TestCapacity,
elementType: entity.FieldTypeNone,
fieldName: "",
textLang: "",
textEmptyPercent: 0,
}
}
@ -310,8 +327,35 @@ func GenColumnData(nb int, fieldType entity.FieldType, option GenDataOption) col
case entity.FieldTypeVarChar:
varcharValues := make([]string, 0, nb)
for i := start; i < start+nb; i++ {
varcharValues = append(varcharValues, strconv.Itoa(i))
if option.textLang != "" {
// Use language-specific text generation
var lang string
switch option.textLang {
case "en", "english":
lang = "en"
case "zh", "chinese":
lang = "zh"
default:
// Fallback to sequential numbers for unsupported languages
for i := start; i < start+nb; i++ {
varcharValues = append(varcharValues, strconv.Itoa(i))
}
return column.NewColumnVarChar(fieldName, varcharValues)
}
// Generate text data with empty values based on textEmptyPercent
for i := 0; i < nb; i++ {
if rand.Float64()*100 < float64(option.textEmptyPercent) {
varcharValues = append(varcharValues, "")
} else {
varcharValues = append(varcharValues, common.GenText(lang))
}
}
} else {
// Default behavior: sequential numbers
for i := start; i < start+nb; i++ {
varcharValues = append(varcharValues, strconv.Itoa(i))
}
}
return column.NewColumnVarChar(fieldName, varcharValues)
@ -449,6 +493,16 @@ func MergeColumnsToDynamic(nb int, columns []column.Column, columnName string) *
return jsonColumn
}
// GetBm25FunctionsOutputFields collects the output field names of every BM25
// function declared on schema, in declaration order. The result is nil when the
// schema has no BM25 function. schema must not be nil.
func GetBm25FunctionsOutputFields(schema *entity.Schema) []string {
	var names []string
	for _, function := range schema.Functions {
		// Only BM25 functions contribute output fields.
		if function.Type != entity.FunctionTypeBM25 {
			continue
		}
		names = append(names, function.OutputFieldNames...)
	}
	return names
}
func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]column.Column, []column.Column) {
if nil == schema || schema.CollectionName == "" {
log.Fatal("[GenColumnsBasedSchema] Nil Schema is not expected")
@ -463,6 +517,12 @@ func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]colu
if field.AutoID {
continue
}
if slices.Contains(GetBm25FunctionsOutputFields(schema), field.Name) {
continue
}
log.Info("GenColumnsBasedSchema", zap.Any("field", field))
// set field name to option
option.TWithFieldName(field.Name)
columns = append(columns, GenColumnData(option.nb, field.DataType, *option))
}
if schema.EnableDynamicField {

View File

@ -107,6 +107,7 @@ const (
Int64MultiVec CollectionFieldsType = 6 // int64 + floatVec + binaryVec + fp16Vec + bf16vec
AllFields CollectionFieldsType = 7 // all fields excepted sparse
Int64VecAllScalar CollectionFieldsType = 8 // int64 + floatVec + all scalar fields
FullTextSearch CollectionFieldsType = 9 // int64 + varchar + sparse vector + analyzer + function
)
type GenFieldsOption struct {
@ -116,6 +117,8 @@ type GenFieldsOption struct {
MaxLength int64 // varchar len or array capacity
MaxCapacity int64
IsPartitionKey bool
EnableAnalyzer bool
AnalyzerParams map[string]any
ElementType entity.FieldType
}
@ -127,6 +130,8 @@ func TNewFieldsOption() *GenFieldsOption {
MaxCapacity: common.TestCapacity,
IsDynamic: false,
IsPartitionKey: false,
EnableAnalyzer: false,
AnalyzerParams: make(map[string]any),
ElementType: entity.FieldTypeNone,
}
}
@ -166,6 +171,16 @@ func (opt *GenFieldsOption) TWithMaxCapacity(maxCapacity int64) *GenFieldsOption
return opt
}
// TWithEnableAnalyzer sets the EnableAnalyzer flag on the option and returns
// the receiver so calls can be chained.
// NOTE(review): FieldsFullTextSearch.GenFields always enables the analyzer on
// its text field and does not read this flag — confirm which generators use it.
func (opt *GenFieldsOption) TWithEnableAnalyzer(enableAnalyzer bool) *GenFieldsOption {
opt.EnableAnalyzer = enableAnalyzer
return opt
}
// TWithAnalyzerParams replaces the analyzer parameters (for example
// {"tokenizer": "standard"}) and returns the receiver so calls can be chained.
// The map is stored as given, not copied, so later changes by the caller are
// visible through the option.
func (opt *GenFieldsOption) TWithAnalyzerParams(analyzerParams map[string]any) *GenFieldsOption {
opt.AnalyzerParams = analyzerParams
return opt
}
// factory
type FieldsFactory struct{}
@ -341,6 +356,23 @@ func (cf FieldsInt64VecAllScalar) GenFields(option GenFieldsOption) []*entity.Fi
return fields
}
// FieldsFullTextSearch generates the field set for the FullTextSearch
// collection type.
type FieldsFullTextSearch struct{}

// GenFields returns three fields in this order: an int64 primary key, a VarChar
// text field, and a sparse vector field.
//
// From option it uses MaxLength, IsPartitionKey and AnalyzerParams for the text
// field, and AutoID for the primary key. The analyzer is always enabled on the
// text field, regardless of option.EnableAnalyzer.
func (cf FieldsFullTextSearch) GenFields(option GenFieldsOption) []*entity.Field {
	pkField := entity.NewField().
		WithName(GetFieldNameByFieldType(entity.FieldTypeInt64)).
		WithDataType(entity.FieldTypeInt64).
		WithIsPrimaryKey(true)

	textField := entity.NewField().
		WithName(common.DefaultTextFieldName).
		WithDataType(entity.FieldTypeVarChar).
		WithMaxLength(option.MaxLength).
		WithIsPartitionKey(option.IsPartitionKey).
		WithEnableAnalyzer(true).
		WithAnalyzerParams(option.AnalyzerParams)

	sparseVecField := entity.NewField().
		WithName(common.DefaultTextSparseVecFieldName).
		WithDataType(entity.FieldTypeSparseVector)

	// Auto ID is only switched on when requested; otherwise the field is left untouched.
	if option.AutoID {
		pkField.WithIsAutoID(option.AutoID)
	}

	return []*entity.Field{pkField, textField, sparseVecField}
}
func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFieldsType, option *GenFieldsOption) []*entity.Field {
log.Info("GenFieldsForCollection", zap.Any("GenFieldsOption", option))
switch collectionFieldsType {
@ -360,6 +392,8 @@ func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFi
return FieldsAllFields{}.GenFields(*option)
case Int64VecAllScalar:
return FieldsInt64VecAllScalar{}.GenFields(*option)
case FullTextSearch:
return FieldsFullTextSearch{}.GenFields(*option)
default:
return FieldsInt64Vec{}.GenFields(*option)
}

View File

@ -0,0 +1,14 @@
package helper
import (
"github.com/milvus-io/milvus/client/v2/entity"
)
// TNewBM25Function builds a BM25 function definition that reads inputField and
// writes outputField. The function is named inputField + "_bm25_emb".
func TNewBM25Function(inputField, outputField string) *entity.Function {
	functionName := inputField + "_bm25_emb"
	bm25 := entity.NewFunction().WithName(functionName).WithInputFields(inputField).WithOutputFields(outputField).WithType(entity.FunctionTypeBM25)
	return bm25
}

View File

@ -135,6 +135,8 @@ func (chainTask *CollectionPrepare) InsertData(ctx context.Context, t *testing.T
if nil == ip.Schema || ip.Schema.CollectionName == "" {
log.Fatal("[InsertData] Nil Schema is not expected")
}
// print option
log.Info("GenDataOption", zap.Any("option", option))
columns, dynamicColumns := GenColumnsBasedSchema(ip.Schema, option)
insertOpt := clientv2.NewColumnBasedInsertOption(ip.Schema.CollectionName).WithColumns(columns...).WithColumns(dynamicColumns...)
if ip.PartitionName != "" {

View File

@ -55,6 +55,10 @@ var SupportBinIvfFlatMetricType = []entity.MetricType{
entity.HAMMING,
}
// SupportFullTextSearchMetricsType lists the metric types treated as supported
// for full text search; it currently holds only entity.BM25.
var SupportFullTextSearchMetricsType = []entity.MetricType{
entity.BM25,
}
var UnsupportedSparseVecMetricsType = []entity.MetricType{
entity.L2,
entity.COSINE,

View File

@ -66,6 +66,14 @@ func GenSearchVectors(nq int, dim int, dataType entity.FieldType) []entity.Vecto
return vectors
}
// GenFullTextQuery returns nq query strings, each generated independently by
// common.GenText for the given language code. The result is an empty, non-nil
// slice when nq is 0.
func GenFullTextQuery(nq int, lang string) []string {
	texts := make([]string, 0, nq)
	// Keep appending until exactly nq queries have been produced.
	for len(texts) < nq {
		texts = append(texts, common.GenText(lang))
	}
	return texts
}
func GenFp16OrBf16VectorsFromFloatVector(nq int, dim int, dataType entity.FieldType) []entity.Vector {
vectors := make([]entity.Vector, 0, nq)
switch dataType {

View File

@ -12,6 +12,7 @@ type GenSchemaOption struct {
AutoID bool
Fields []*entity.Field
EnableDynamicField bool
Function *entity.Function
}
func TNewSchemaOption() *GenSchemaOption {
@ -43,6 +44,11 @@ func (opt *GenSchemaOption) TWithFields(fields []*entity.Field) *GenSchemaOption
return opt
}
// TWithFunction sets the function to attach to the generated schema and returns
// the receiver so calls can be chained. GenSchema adds it to the schema only
// when it is non-nil; a later call replaces the earlier function.
func (opt *GenSchemaOption) TWithFunction(function *entity.Function) *GenSchemaOption {
opt.Function = function
return opt
}
func GenSchema(option *GenSchemaOption) *entity.Schema {
if len(option.Fields) == 0 {
log.Fatal("Require at least a primary field and a vector field")
@ -64,5 +70,8 @@ func GenSchema(option *GenSchemaOption) *entity.Schema {
if option.EnableDynamicField {
schema.WithDynamicFieldEnabled(option.EnableDynamicField)
}
if option.Function != nil {
schema.WithFunction(option.Function)
}
return schema
}