mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 17:48:29 +08:00
test:add text embedding function testcases in go client (#43875)
/kind improvement --------- Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
parent
c102fa8b0b
commit
1e31ad345b
@ -4,7 +4,10 @@ import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
@ -539,6 +542,154 @@ func GetBm25FunctionsOutputFields(schema *entity.Schema) []string {
|
||||
return outputFields
|
||||
}
|
||||
|
||||
func GetTextEmbeddingFunctionsOutputFields(schema *entity.Schema) []string {
|
||||
var outputFields []string
|
||||
for _, fn := range schema.Functions {
|
||||
if fn.Type == entity.FunctionTypeTextEmbedding {
|
||||
outputFields = append(outputFields, fn.OutputFieldNames...)
|
||||
}
|
||||
}
|
||||
return outputFields
|
||||
}
|
||||
|
||||
func GetAllFunctionsOutputFields(schema *entity.Schema) []string {
|
||||
var outputFields []string
|
||||
for _, fn := range schema.Functions {
|
||||
if fn.Type == entity.FunctionTypeBM25 || fn.Type == entity.FunctionTypeTextEmbedding {
|
||||
outputFields = append(outputFields, fn.OutputFieldNames...)
|
||||
}
|
||||
}
|
||||
return outputFields
|
||||
}
|
||||
|
||||
// GenTextDocuments generates count realistic text documents for embedding tests.
//
// lang selects the template set: "english"/"en" or "chinese"/"zh"; any other
// value falls back to a shorter generic English set. Templates are reused
// round-robin and every document gets a ". Document ID: <index>" suffix, so
// all returned documents are distinct. A non-positive count yields an empty
// (non-nil) slice instead of panicking in make.
func GenTextDocuments(count int, lang string) []string {
	if count <= 0 {
		return []string{}
	}

	var templates []string
	switch lang {
	case "english", "en":
		templates = []string{
			"This is a document about artificial intelligence and machine learning technologies in modern computing systems",
			"Vector databases enable efficient similarity search for high-dimensional data in AI applications",
			"Text embeddings transform natural language into numerical representations for semantic understanding",
			"Information retrieval systems help users find relevant documents from large collections of data",
			"Natural language processing enables computers to understand and generate human language effectively",
			"Database management systems provide structured storage and efficient querying of information",
			"Search algorithms rank and retrieve the most relevant results for user queries",
			"Machine learning models learn patterns from data to make predictions and classifications",
			"Deep learning neural networks process complex patterns in images, text, and other data types",
			"Data science combines statistics, programming, and domain knowledge to extract insights",
		}
	case "chinese", "zh":
		templates = []string{
			"这是关于人工智能和机器学习技术的文档,介绍现代计算系统中的应用",
			"向量数据库为高维数据提供高效的相似性搜索功能,支持AI应用开发",
			"文本嵌入技术将自然语言转换为数值表示,实现语义理解和分析",
			"信息检索系统帮助用户从大规模数据集合中找到相关的文档内容",
			"自然语言处理技术使计算机能够理解和生成人类语言",
			"数据库管理系统提供结构化存储和高效的信息查询功能",
			"搜索算法对用户查询结果进行排序和检索,返回最相关的内容",
			"机器学习模型从数据中学习模式,进行预测和分类任务",
			"深度学习神经网络处理图像、文本等复杂数据类型中的模式",
			"数据科学结合统计学、编程和领域知识来提取有价值的洞察",
		}
	default:
		// Default to English
		templates = []string{
			"Document about technology and innovation in the digital age",
			"Analysis of modern computing systems and their applications",
			"Research on data processing and information management",
			"Study of algorithms and their implementation in software",
			"Overview of database systems and their optimization techniques",
		}
	}

	documents := make([]string, count)
	for i := range documents {
		// Cycle through the templates; the ID suffix keeps repeats unique.
		documents[i] = fmt.Sprintf("%s. Document ID: %d", templates[i%len(templates)], i)
	}

	return documents
}
|
||||
|
||||
// CosineSimilarity calculates the cosine similarity between two float32 vectors.
//
// It returns 0 when the vectors differ in length, are empty, or when either
// one has zero magnitude. The dot product and the squared norms are
// accumulated in float64: float32 accumulators overflow to +Inf (producing
// NaN) for large components, underflow to 0 for tiny ones, and lose precision
// on high-dimensional vectors. Only the final result is narrowed to float32.
func CosineSimilarity(a, b []float32) float32 {
	if len(a) != len(b) || len(a) == 0 {
		return 0
	}

	var dotProduct, normA, normB float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dotProduct += x * y
		normA += x * x
		normB += y * y
	}

	if normA == 0 || normB == 0 {
		return 0
	}

	// Take the square roots separately so the product of the two squared
	// norms is never formed.
	return float32(dotProduct / (math.Sqrt(normA) * math.Sqrt(normB)))
}
|
||||
|
||||
// GenLongText generates a space-separated text made of wordCount words.
//
// lang selects the vocabulary: "chinese"/"zh", "english"/"en", or a generic
// English sentence vocabulary for any other value. Words are taken from the
// vocabulary in order and wrap around when wordCount exceeds its size. A
// non-positive wordCount yields the empty string instead of panicking in make.
func GenLongText(wordCount int, lang string) string {
	if wordCount <= 0 {
		return ""
	}

	var words []string
	switch lang {
	case "chinese", "zh":
		words = []string{"人工智能", "机器学习", "深度学习", "神经网络", "数据挖掘", "自然语言", "处理技术", "计算机", "算法优化", "信息检索", "向量数据库", "语义搜索", "文本分析", "知识图谱", "智能系统"}
	case "english", "en":
		words = []string{"artificial", "intelligence", "machine", "learning", "deep", "neural", "network", "algorithm", "database", "search", "vector", "embedding", "semantic", "analysis", "information", "retrieval", "computing", "technology", "system", "data", "processing", "optimization", "performance", "scalability", "efficiency"}
	default:
		words = []string{"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog", "and", "runs", "through", "forest", "with", "great", "speed", "while", "chasing", "rabbit", "under", "bright", "moonlight", "across", "green", "fields", "toward", "distant", "mountains"}
	}

	result := make([]string, wordCount)
	for i := range result {
		result[i] = words[i%len(words)]
	}

	return strings.Join(result, " ")
}
|
||||
|
||||
// CallTEIDirectly calls a TEI endpoint directly to get embeddings.
//
// It POSTs {"inputs": texts} as JSON to <endpoint>/embed and decodes the
// response body as an array of float arrays. Trailing slashes on endpoint are
// ignored. An error is returned when the request cannot be sent, when the
// service answers with a non-2xx status (the status code and response body
// are included in the error), or when the body is not the expected JSON.
func CallTEIDirectly(endpoint string, texts []string) ([][]float32, error) {
	// TEI API request structure
	type TEIRequest struct {
		Inputs []string `json:"inputs"`
	}

	jsonData, err := json.Marshal(TEIRequest{Inputs: texts})
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}

	// Normalize the endpoint so "http://host/" does not become "http://host//embed".
	url := strings.TrimRight(endpoint, "/") + "/embed"
	resp, err := http.Post(url, "application/json", bytes.NewReader(jsonData))
	if err != nil {
		return nil, fmt.Errorf("failed to call TEI endpoint: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	// Error responses carry a message rather than embeddings; report them as
	// such instead of letting them fail (or silently succeed) in Unmarshal.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, fmt.Errorf("TEI endpoint returned HTTP %d: %s", resp.StatusCode, strings.TrimSpace(string(body)))
	}

	// Parse response - TEI returns array of arrays
	var embeddings [][]float32
	if err := json.Unmarshal(body, &embeddings); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response: %w", err)
	}

	return embeddings, nil
}
|
||||
|
||||
func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]column.Column, []column.Column) {
|
||||
if nil == schema || schema.CollectionName == "" {
|
||||
log.Fatal("[GenColumnsBasedSchema] Nil Schema is not expected")
|
||||
@ -557,7 +708,7 @@ func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]colu
|
||||
if option.fieldName == "" {
|
||||
option.fieldName = field.Name
|
||||
}
|
||||
if slices.Contains(GetBm25FunctionsOutputFields(schema), field.Name) {
|
||||
if slices.Contains(GetAllFunctionsOutputFields(schema), field.Name) {
|
||||
continue
|
||||
}
|
||||
log.Info("GenColumnsBasedSchema", zap.Any("field", field))
|
||||
|
||||
@ -99,15 +99,16 @@ type CollectionFieldsType int32
|
||||
|
||||
const (
|
||||
// FieldTypeNone zero value place holder
|
||||
Int64Vec CollectionFieldsType = 1 // int64 + floatVec
|
||||
VarcharBinary CollectionFieldsType = 2 // varchar + binaryVec
|
||||
Int64VecJSON CollectionFieldsType = 3 // int64 + floatVec + json
|
||||
Int64VecArray CollectionFieldsType = 4 // int64 + floatVec + array
|
||||
Int64VarcharSparseVec CollectionFieldsType = 5 // int64 + varchar + sparse vector
|
||||
Int64MultiVec CollectionFieldsType = 6 // int64 + floatVec + binaryVec + fp16Vec + bf16vec
|
||||
AllFields CollectionFieldsType = 7 // all fields excepted sparse
|
||||
Int64VecAllScalar CollectionFieldsType = 8 // int64 + floatVec + all scalar fields
|
||||
FullTextSearch CollectionFieldsType = 9 // int64 + varchar + sparse vector + analyzer + function
|
||||
Int64Vec CollectionFieldsType = 1 // int64 + floatVec
|
||||
VarcharBinary CollectionFieldsType = 2 // varchar + binaryVec
|
||||
Int64VecJSON CollectionFieldsType = 3 // int64 + floatVec + json
|
||||
Int64VecArray CollectionFieldsType = 4 // int64 + floatVec + array
|
||||
Int64VarcharSparseVec CollectionFieldsType = 5 // int64 + varchar + sparse vector
|
||||
Int64MultiVec CollectionFieldsType = 6 // int64 + floatVec + binaryVec + fp16Vec + bf16vec
|
||||
AllFields CollectionFieldsType = 7 // all fields excepted sparse
|
||||
Int64VecAllScalar CollectionFieldsType = 8 // int64 + floatVec + all scalar fields
|
||||
FullTextSearch CollectionFieldsType = 9 // int64 + varchar + sparse vector + analyzer + function
|
||||
TextEmbedding CollectionFieldsType = 10 // int64 + varchar + float_vector + text_embedding_function
|
||||
)
|
||||
|
||||
type GenFieldsOption struct {
|
||||
@ -373,6 +374,23 @@ func (cf FieldsFullTextSearch) GenFields(option GenFieldsOption) []*entity.Field
|
||||
return fields
|
||||
}
|
||||
|
||||
type FieldsTextEmbedding struct{}
|
||||
|
||||
func (cf FieldsTextEmbedding) GenFields(option GenFieldsOption) []*entity.Field {
|
||||
pkField := entity.NewField().WithName(GetFieldNameByFieldType(entity.FieldTypeInt64)).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true)
|
||||
textField := entity.NewField().WithName("document").WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey)
|
||||
vecField := entity.NewField().WithName("dense").WithDataType(entity.FieldTypeFloatVector).WithDim(option.Dim)
|
||||
if option.AutoID {
|
||||
pkField.WithIsAutoID(option.AutoID)
|
||||
}
|
||||
fields := []*entity.Field{
|
||||
pkField,
|
||||
textField,
|
||||
vecField,
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFieldsType, option *GenFieldsOption) []*entity.Field {
|
||||
log.Info("GenFieldsForCollection", zap.Any("GenFieldsOption", option))
|
||||
switch collectionFieldsType {
|
||||
@ -394,7 +412,14 @@ func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFi
|
||||
return FieldsInt64VecAllScalar{}.GenFields(*option)
|
||||
case FullTextSearch:
|
||||
return FieldsFullTextSearch{}.GenFields(*option)
|
||||
case TextEmbedding:
|
||||
return FieldsTextEmbedding{}.GenFields(*option)
|
||||
default:
|
||||
return FieldsInt64Vec{}.GenFields(*option)
|
||||
}
|
||||
}
|
||||
|
||||
// TNewTextEmbeddingFieldsOption creates fields option with text embedding settings
|
||||
func TNewTextEmbeddingFieldsOption() *GenFieldsOption {
|
||||
return TNewFieldsOption().TWithDim(int64(GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535)
|
||||
}
|
||||
|
||||
@ -12,3 +12,19 @@ func TNewBM25Function(inputField, outputField string) *entity.Function {
|
||||
WithOutputFields(outputField).
|
||||
WithType(entity.FunctionTypeBM25)
|
||||
}
|
||||
|
||||
// TNewTextEmbeddingFunction creates a text embedding function for different providers
|
||||
func TNewTextEmbeddingFunction(inputField, outputField string, params map[string]any) *entity.Function {
|
||||
function := entity.NewFunction().
|
||||
WithName(inputField + "_text_emb").
|
||||
WithInputFields(inputField).
|
||||
WithOutputFields(outputField).
|
||||
WithType(entity.FunctionTypeTextEmbedding)
|
||||
|
||||
// Add all parameters including provider
|
||||
for key, value := range params {
|
||||
function.WithParam(key, value)
|
||||
}
|
||||
|
||||
return function
|
||||
}
|
||||
|
||||
@ -75,3 +75,12 @@ func GenSchema(option *GenSchemaOption) *entity.Schema {
|
||||
}
|
||||
return schema
|
||||
}
|
||||
|
||||
// TNewTextEmbeddingSchemaOption creates schema option with text embedding function
|
||||
func TNewTextEmbeddingSchemaOption() *GenSchemaOption {
|
||||
function := TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": GetTEIEndpoint(),
|
||||
})
|
||||
return TNewSchemaOption().TWithFunction(function)
|
||||
}
|
||||
|
||||
@ -19,6 +19,8 @@ var (
|
||||
user = flag.String("user", "root", "user")
|
||||
password = flag.String("password", "Milvus", "password")
|
||||
logLevel = flag.String("log.level", "info", "log level for test")
|
||||
teiEndpoint = flag.String("tei_endpoint", "http://text-embeddings-service.milvus-ci.svc.cluster.local:80", "TEI service endpoint for text embedding tests")
|
||||
teiModelDim = flag.Int("tei_model_dim", 768, "Vector dimension for text embedding model")
|
||||
defaultClientConfig *client.ClientConfig
|
||||
)
|
||||
|
||||
@ -42,6 +44,14 @@ func GetPassword() string {
|
||||
return *password
|
||||
}
|
||||
|
||||
func GetTEIEndpoint() string {
|
||||
return *teiEndpoint
|
||||
}
|
||||
|
||||
func GetTEIModelDim() int {
|
||||
return *teiModelDim
|
||||
}
|
||||
|
||||
func parseLogConfig() {
|
||||
log.Info("Parser Log Level", zap.String("logLevel", *logLevel))
|
||||
switch *logLevel {
|
||||
|
||||
956
tests/go_client/testcases/text_embedding_test.go
Normal file
956
tests/go_client/testcases/text_embedding_test.go
Normal file
@ -0,0 +1,956 @@
|
||||
package testcases
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/milvus-io/milvus/client/v2/column"
|
||||
"github.com/milvus-io/milvus/client/v2/entity"
|
||||
"github.com/milvus-io/milvus/client/v2/index"
|
||||
"github.com/milvus-io/milvus/client/v2/milvusclient"
|
||||
"github.com/milvus-io/milvus/tests/go_client/common"
|
||||
hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper"
|
||||
)
|
||||
|
||||
// TestCreateCollectionWithTextEmbedding tests basic collection creation with text embedding function
|
||||
func TestCreateCollectionWithTextEmbedding(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong))
|
||||
|
||||
// verify collection creation
|
||||
require.NotNil(t, prepare)
|
||||
require.NotNil(t, schema)
|
||||
|
||||
// describe collection to verify function
|
||||
descRes, err := mc.DescribeCollection(ctx, milvusclient.NewDescribeCollectionOption(schema.CollectionName))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Len(t, descRes.Schema.Functions, 1)
|
||||
require.Equal(t, "document_text_emb", descRes.Schema.Functions[0].Name)
|
||||
require.Equal(t, entity.FunctionTypeTextEmbedding, descRes.Schema.Functions[0].Type)
|
||||
require.Equal(t, []string{"document"}, descRes.Schema.Functions[0].InputFieldNames)
|
||||
require.Equal(t, []string{"dense"}, descRes.Schema.Functions[0].OutputFieldNames)
|
||||
}
|
||||
|
||||
// TestCreateCollectionWithTextEmbeddingTwice tests creating collection twice with same schema
|
||||
func TestCreateCollectionWithTextEmbeddingTwice(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": hp.GetTEIEndpoint(),
|
||||
})
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535)
|
||||
|
||||
collectionName := common.GenRandomString("text_embedding", 6)
|
||||
createParams := hp.NewCreateCollectionParams(hp.TextEmbedding)
|
||||
|
||||
// first creation
|
||||
prepare1, schema1 := hp.CollPrepare.CreateCollection(
|
||||
ctx, t, mc, createParams, fieldsOption,
|
||||
schemaOption.TWithName(collectionName),
|
||||
hp.TWithConsistencyLevel(entity.ClStrong),
|
||||
)
|
||||
require.NotNil(t, prepare1)
|
||||
require.NotNil(t, schema1)
|
||||
|
||||
// second creation with same name should succeed (idempotent)
|
||||
prepare2, schema2 := hp.CollPrepare.CreateCollection(
|
||||
ctx, t, mc, createParams, fieldsOption,
|
||||
schemaOption.TWithName(collectionName),
|
||||
hp.TWithConsistencyLevel(entity.ClStrong),
|
||||
)
|
||||
require.NotNil(t, prepare2)
|
||||
require.NotNil(t, schema2)
|
||||
|
||||
// verify function exists
|
||||
descRes, err := mc.DescribeCollection(ctx, milvusclient.NewDescribeCollectionOption(collectionName))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Len(t, descRes.Schema.Functions, 1)
|
||||
}
|
||||
|
||||
// TestCreateCollectionUnsupportedEndpoint tests creation with unsupported endpoint
|
||||
func TestCreateCollectionUnsupportedEndpoint(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with invalid endpoint
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": "http://unsupported_endpoint",
|
||||
})
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535)
|
||||
|
||||
// this should fail during collection creation
|
||||
err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(
|
||||
common.GenRandomString("text_embedding", 6),
|
||||
hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))),
|
||||
))
|
||||
|
||||
// expect error due to unsupported endpoint
|
||||
common.CheckErr(t, err, false, "unsupported_endpoint")
|
||||
}
|
||||
|
||||
// TestCreateCollectionUnmatchedDim tests creation with mismatched dimension
|
||||
func TestCreateCollectionUnmatchedDim(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with wrong dimension (512 instead of expected 768 from TEI model)
|
||||
wrongDim := int64(512)
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": hp.GetTEIEndpoint(),
|
||||
})
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(wrongDim).TWithAutoID(true).TWithMaxLen(65535)
|
||||
|
||||
collectionName := common.GenRandomString("text_embedding", 6)
|
||||
|
||||
// collection creation should fail with dimension mismatch error
|
||||
err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(
|
||||
collectionName,
|
||||
hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))),
|
||||
))
|
||||
|
||||
// Expect error with specific dimension mismatch message
|
||||
expectedError := fmt.Sprintf("required embedding dim is [%d], but the embedding obtained from the model is [%d]", wrongDim, hp.GetTEIModelDim())
|
||||
common.CheckErr(t, err, false, expectedError)
|
||||
}
|
||||
|
||||
// TestInsertWithTextEmbedding tests basic data insertion with text embedding
|
||||
func TestInsertWithTextEmbedding(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong))
|
||||
|
||||
// prepare test data - only provide text, embedding will be auto-generated
|
||||
nb := 10
|
||||
documents := make([]string, nb)
|
||||
for i := 0; i < nb; i++ {
|
||||
documents[i] = fmt.Sprintf("This is test document number %d with some content for embedding", i)
|
||||
}
|
||||
|
||||
// insert data using only text field
|
||||
res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(nb), res.InsertCount)
|
||||
|
||||
// create index and load
|
||||
prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// query to verify vectors were generated
|
||||
resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter("").WithOutputFields("dense").WithLimit(10))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Greater(t, len(resQuery.Fields), 0)
|
||||
|
||||
// verify vector dimension - check first result
|
||||
if resQuery.Len() > 0 {
|
||||
// Query results structure is different - need to check the actual field structure
|
||||
denseColumn := resQuery.GetColumn("dense")
|
||||
require.NotNil(t, denseColumn)
|
||||
// Field should contain vectors for all results
|
||||
}
|
||||
}
|
||||
|
||||
// TestInsertWithTruncateParams tests insertion with different truncate parameters
|
||||
func TestInsertWithTruncateParams(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
truncate bool
|
||||
truncationDirection string
|
||||
shouldSucceed bool
|
||||
}{
|
||||
{"truncate_true_right", true, "Right", true},
|
||||
{"truncate_true_left", true, "Left", true},
|
||||
{"truncate_false", false, "", false}, // should fail with long text
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create TEI function with truncate parameters
|
||||
params := map[string]any{}
|
||||
if tc.truncate {
|
||||
params["truncate"] = "true"
|
||||
params["truncation_direction"] = tc.truncationDirection
|
||||
} else {
|
||||
params["truncate"] = "false"
|
||||
}
|
||||
|
||||
params["provider"] = "TEI"
|
||||
params["endpoint"] = hp.GetTEIEndpoint()
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", params)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535)
|
||||
|
||||
_, schema := hp.CollPrepare.CreateCollection(
|
||||
ctx, t, mc,
|
||||
hp.NewCreateCollectionParams(hp.TextEmbedding),
|
||||
fieldsOption,
|
||||
schemaOption,
|
||||
hp.TWithConsistencyLevel(entity.ClStrong),
|
||||
)
|
||||
|
||||
// prepare long text data that would need truncation
|
||||
// Generate distinctly different left and right parts that will exceed token limits when combined
|
||||
leftPart := "artificial intelligence machine learning deep learning neural networks computer vision natural language processing data science algorithms " + strings.Repeat("technology innovation science research development analysis ", 100)
|
||||
rightPart := "database systems vector search embeddings similarity matching retrieval information storage indexing " + strings.Repeat("query performance optimization scalability distributed computing ", 100)
|
||||
longText := leftPart + " " + rightPart // This will exceed 512 tokens and need truncation
|
||||
|
||||
documents := []string{longText, leftPart, rightPart}
|
||||
|
||||
// insert data
|
||||
res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
|
||||
|
||||
if tc.shouldSucceed {
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(len(documents)), res.InsertCount)
|
||||
|
||||
// create index and load for embedding comparison
|
||||
_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(schema.CollectionName, "dense", index.NewAutoIndex(entity.COSINE)))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
_, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(schema.CollectionName))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
// Query embeddings from Milvus
|
||||
resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
|
||||
WithFilter("").
|
||||
WithOutputFields("dense", "document").
|
||||
WithConsistencyLevel(entity.ClStrong).
|
||||
WithLimit(10))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, len(documents), resQuery.Len())
|
||||
|
||||
// Extract Milvus embeddings
|
||||
denseColumn := resQuery.GetColumn("dense")
|
||||
require.NotNil(t, denseColumn)
|
||||
floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector)
|
||||
require.True(t, ok, "Dense column should be a float vector column")
|
||||
|
||||
// Truncation validation using similarity comparison approach
|
||||
// This follows the Python test logic: compare similarity between combined text and parts
|
||||
// to verify that truncation direction works correctly
|
||||
|
||||
require.Equal(t, 3, resQuery.Len(), "Should have 3 documents: longText, leftPart, rightPart")
|
||||
|
||||
// Get embeddings for: [0]=longText, [1]=leftPart, [2]=rightPart
|
||||
embeddings := make([][]float32, 3)
|
||||
for i := 0; i < 3; i++ {
|
||||
embedding := floatVecColumn.Data()[i]
|
||||
require.Equal(t, hp.GetTEIModelDim(), len(embedding), "Embedding should have correct dimension")
|
||||
|
||||
// Check that embedding is not all zeros (would indicate a failure)
|
||||
var sum float32
|
||||
for _, val := range embedding {
|
||||
sum += val * val
|
||||
}
|
||||
require.Greater(t, sum, float32(0.01), "Embedding should not be all zeros for document %d", i)
|
||||
|
||||
embeddings[i] = embedding
|
||||
}
|
||||
|
||||
// Calculate cosine similarities
|
||||
// similarity_left: longText vs leftPart
|
||||
// similarity_right: longText vs rightPart
|
||||
similarityLeft := hp.CosineSimilarity(embeddings[0], embeddings[1])
|
||||
similarityRight := hp.CosineSimilarity(embeddings[0], embeddings[2])
|
||||
|
||||
t.Logf("Similarity longText vs leftPart: %.6f", similarityLeft)
|
||||
t.Logf("Similarity longText vs rightPart: %.6f", similarityRight)
|
||||
|
||||
// Validation based on truncation direction:
|
||||
// - If truncation_direction = "Left", we keep the right part, so longText should be more similar to rightPart
|
||||
// - If truncation_direction = "Right", we keep the left part, so longText should be more similar to leftPart
|
||||
if tc.truncationDirection == "Left" {
|
||||
require.Greater(t, similarityRight, similarityLeft,
|
||||
"With Left truncation, longText should be more similar to rightPart (%.6f) than leftPart (%.6f)",
|
||||
similarityRight, similarityLeft)
|
||||
t.Logf("Left truncation verified: rightPart similarity (%.6f) > leftPart similarity (%.6f)",
|
||||
similarityRight, similarityLeft)
|
||||
} else { // "Right"
|
||||
require.Greater(t, similarityLeft, similarityRight,
|
||||
"With Right truncation, longText should be more similar to leftPart (%.6f) than rightPart (%.6f)",
|
||||
similarityLeft, similarityRight)
|
||||
t.Logf("Right truncation verified: leftPart similarity (%.6f) > rightPart similarity (%.6f)",
|
||||
similarityLeft, similarityRight)
|
||||
}
|
||||
|
||||
t.Logf("Successfully inserted %d documents with truncate=%v, direction=%s", len(documents), tc.truncate, tc.truncationDirection)
|
||||
} else {
|
||||
common.CheckErr(t, err, false, "Payload Too Large")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestVerifyEmbeddingConsistency verifies that Milvus text embedding function produces same results as direct TEI calls
|
||||
func TestVerifyEmbeddingConsistency(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function (custom fields for autoID=false)
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": hp.GetTEIEndpoint(),
|
||||
})
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(false).TWithMaxLen(65535)
|
||||
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(
|
||||
ctx, t, mc,
|
||||
hp.NewCreateCollectionParams(hp.TextEmbedding),
|
||||
fieldsOption,
|
||||
schemaOption,
|
||||
hp.TWithConsistencyLevel(entity.ClStrong),
|
||||
)
|
||||
|
||||
// Test documents
|
||||
testDocs := []string{
|
||||
"This is a test document about artificial intelligence",
|
||||
"Vector databases enable semantic search capabilities",
|
||||
"Text embeddings transform language into numbers",
|
||||
}
|
||||
|
||||
// Insert documents into Milvus (will use text embedding function)
|
||||
ids := []int64{1, 2, 3}
|
||||
res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
|
||||
WithInt64Column(common.DefaultInt64FieldName, ids).
|
||||
WithVarcharColumn("document", testDocs))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(len(testDocs)), res.InsertCount)
|
||||
|
||||
// Create index and load
|
||||
prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// Query vectors from Milvus
|
||||
resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
|
||||
WithFilter("").
|
||||
WithOutputFields("dense", "document", common.DefaultInt64FieldName).
|
||||
WithConsistencyLevel(entity.ClStrong).
|
||||
WithLimit(10))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, len(testDocs), resQuery.Len())
|
||||
|
||||
// Get embeddings directly from TEI
|
||||
teiEmbeddings, err := hp.CallTEIDirectly(hp.GetTEIEndpoint(), testDocs)
|
||||
if err != nil {
|
||||
t.Skipf("Skip consistency test - could not connect to TEI endpoint: %v", err)
|
||||
return
|
||||
}
|
||||
require.Equal(t, len(testDocs), len(teiEmbeddings))
|
||||
|
||||
// Compare embeddings
|
||||
denseColumn := resQuery.GetColumn("dense")
|
||||
require.NotNil(t, denseColumn)
|
||||
|
||||
// Get ID column to match embeddings with documents
|
||||
idColumn := resQuery.GetColumn(common.DefaultInt64FieldName)
|
||||
require.NotNil(t, idColumn)
|
||||
|
||||
// Extract and compare embeddings - need to handle column type properly
|
||||
floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector)
|
||||
require.True(t, ok, "Dense column should be a float vector column")
|
||||
|
||||
for i := 0; i < resQuery.Len(); i++ {
|
||||
// Get ID to find corresponding TEI embedding
|
||||
id, err := idColumn.GetAsInt64(i)
|
||||
require.NoError(t, err)
|
||||
teiIdx := id - 1 // IDs are 1-based, array is 0-based
|
||||
|
||||
// Get Milvus embedding from the float vector column
|
||||
milvusEmbedding := floatVecColumn.Data()[i]
|
||||
|
||||
require.NotNil(t, milvusEmbedding)
|
||||
require.Equal(t, hp.GetTEIModelDim(), len(milvusEmbedding), "Embedding dimension should match")
|
||||
|
||||
// Calculate cosine similarity
|
||||
similarity := hp.CosineSimilarity(milvusEmbedding, teiEmbeddings[teiIdx])
|
||||
|
||||
t.Logf("Document %d (ID=%d) similarity between Milvus and TEI: %.6f", i, id, similarity)
|
||||
|
||||
// Embeddings should be nearly identical (similarity > 0.99)
|
||||
require.Greater(t, similarity, float32(0.99),
|
||||
"Milvus embedding should be nearly identical to TEI embedding for document ID %d", id)
|
||||
}
|
||||
|
||||
t.Log("Embedding consistency verified: Milvus text embedding function produces same results as direct TEI calls")
|
||||
}
|
||||
|
||||
// TestUpsertTextFieldUpdatesEmbedding tests that upserting text field updates embedding
|
||||
func TestUpsertTextFieldUpdatesEmbedding(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function (custom fields for autoID=false for upsert)
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": hp.GetTEIEndpoint(),
|
||||
})
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(false).TWithMaxLen(65535) // disable auto ID for upsert
|
||||
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(
|
||||
ctx, t, mc,
|
||||
hp.NewCreateCollectionParams(hp.TextEmbedding),
|
||||
fieldsOption,
|
||||
schemaOption,
|
||||
hp.TWithConsistencyLevel(entity.ClStrong),
|
||||
)
|
||||
|
||||
// create index and load first
|
||||
prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// insert initial data with specific ID
|
||||
oldText := "This is the original text content"
|
||||
res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
|
||||
WithInt64Column(common.DefaultInt64FieldName, []int64{1}).
|
||||
WithVarcharColumn("document", []string{oldText}))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(1), res.InsertCount)
|
||||
|
||||
// query original embedding before upsert
|
||||
resQueryBefore, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
|
||||
WithFilter("int64 == 1").
|
||||
WithOutputFields("document", "dense").
|
||||
WithConsistencyLevel(entity.ClStrong))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, 1, resQueryBefore.Len())
|
||||
|
||||
// extract original embedding
|
||||
originalDenseColumn := resQueryBefore.GetColumn("dense")
|
||||
require.NotNil(t, originalDenseColumn)
|
||||
originalFloatVecColumn, ok := originalDenseColumn.(*column.ColumnFloatVector)
|
||||
require.True(t, ok, "Dense column should be a float vector column")
|
||||
originalEmbedding := originalFloatVecColumn.Data()[0]
|
||||
require.Equal(t, hp.GetTEIModelDim(), len(originalEmbedding), "Original embedding dimension should match")
|
||||
|
||||
// verify original text
|
||||
originalDocColumn := resQueryBefore.GetColumn("document")
|
||||
require.NotNil(t, originalDocColumn)
|
||||
originalVarCharColumn, ok := originalDocColumn.(*column.ColumnVarChar)
|
||||
require.True(t, ok, "Document column should be a varchar column")
|
||||
require.Equal(t, oldText, originalVarCharColumn.Data()[0], "Original text should match")
|
||||
|
||||
// upsert with new text
|
||||
newText := "This is completely different updated text content"
|
||||
res2, err := mc.Upsert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
|
||||
WithInt64Column(common.DefaultInt64FieldName, []int64{1}).
|
||||
WithVarcharColumn("document", []string{newText}))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(1), res2.UpsertCount)
|
||||
|
||||
// query updated embedding after upsert
|
||||
resQueryAfter, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
|
||||
WithFilter("int64 == 1").
|
||||
WithOutputFields("document", "dense").
|
||||
WithConsistencyLevel(entity.ClStrong))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, 1, resQueryAfter.Len())
|
||||
|
||||
// extract updated embedding
|
||||
updatedDenseColumn := resQueryAfter.GetColumn("dense")
|
||||
require.NotNil(t, updatedDenseColumn)
|
||||
updatedFloatVecColumn, ok := updatedDenseColumn.(*column.ColumnFloatVector)
|
||||
require.True(t, ok, "Dense column should be a float vector column")
|
||||
updatedEmbedding := updatedFloatVecColumn.Data()[0]
|
||||
require.Equal(t, hp.GetTEIModelDim(), len(updatedEmbedding), "Updated embedding dimension should match")
|
||||
|
||||
// verify updated text
|
||||
updatedDocColumn := resQueryAfter.GetColumn("document")
|
||||
require.NotNil(t, updatedDocColumn)
|
||||
updatedVarCharColumn, ok := updatedDocColumn.(*column.ColumnVarChar)
|
||||
require.True(t, ok, "Document column should be a varchar column")
|
||||
require.Equal(t, newText, updatedVarCharColumn.Data()[0], "Updated text should match")
|
||||
|
||||
// verify embeddings are different (key assertion)
|
||||
similarity := hp.CosineSimilarity(originalEmbedding, updatedEmbedding)
|
||||
require.Less(t, similarity, float32(0.95),
|
||||
"Embeddings should be significantly different after text update (similarity=%.6f)", similarity)
|
||||
|
||||
t.Logf("Upsert verification complete: Original and updated embeddings have cosine similarity %.6f (< 0.95)", similarity)
|
||||
t.Logf(" Original text: %s", oldText)
|
||||
t.Logf(" Updated text: %s", newText)
|
||||
}
|
||||
|
||||
// TestDeleteAndSearch tests that deleted text cannot be searched
|
||||
func TestDeleteAndSearch(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function (custom fields for autoID=false)
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": hp.GetTEIEndpoint(),
|
||||
})
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(false).TWithMaxLen(65535)
|
||||
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(
|
||||
ctx, t, mc,
|
||||
hp.NewCreateCollectionParams(hp.TextEmbedding),
|
||||
fieldsOption,
|
||||
schemaOption,
|
||||
hp.TWithConsistencyLevel(entity.ClStrong),
|
||||
)
|
||||
|
||||
// insert test data
|
||||
documents := []string{
|
||||
"This is test document 0",
|
||||
"This is test document 1",
|
||||
"This is test document 2",
|
||||
}
|
||||
ids := []int64{0, 1, 2}
|
||||
|
||||
res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
|
||||
WithInt64Column(common.DefaultInt64FieldName, ids).
|
||||
WithVarcharColumn("document", documents))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(3), res.InsertCount)
|
||||
|
||||
// create index and load
|
||||
prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// delete document with ID 1
|
||||
res2, err := mc.Delete(ctx, milvusclient.NewDeleteOption(schema.CollectionName).WithExpr("int64 in [1]"))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(1), res2.DeleteCount)
|
||||
|
||||
// search and verify document 1 is not in results
|
||||
searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 3, []entity.Vector{entity.Text("test document 1")}).
|
||||
WithANNSField("dense").
|
||||
WithOutputFields("document", common.DefaultInt64FieldName))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
// verify deleted document is not in results
|
||||
require.Greater(t, len(searchRes), 0)
|
||||
for _, hits := range searchRes {
|
||||
for i := 0; i < hits.Len(); i++ {
|
||||
id, err := hits.IDs.GetAsInt64(i)
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, int64(1), id, "Deleted document should not appear in search results")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestSearchWithTextEmbedding tests search functionality with text embedding
|
||||
func TestSearchWithTextEmbedding(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create -> insert -> index -> load
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong))
|
||||
|
||||
// prepare test data
|
||||
nb := 10
|
||||
documents := make([]string, nb)
|
||||
for i := 0; i < nb; i++ {
|
||||
documents[i] = fmt.Sprintf("This is test document number %d about artificial intelligence and machine learning", i)
|
||||
}
|
||||
|
||||
// insert data using only text field
|
||||
res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(nb), res.InsertCount)
|
||||
|
||||
// create index and load
|
||||
prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// search using text query
|
||||
queryText := "artificial intelligence machine learning"
|
||||
searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 5, []entity.Vector{entity.Text(queryText)}).
|
||||
WithANNSField("dense").
|
||||
WithOutputFields("document"))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
require.Greater(t, len(searchRes), 0)
|
||||
for _, hits := range searchRes {
|
||||
require.Greater(t, hits.Len(), 0, "Should find relevant documents")
|
||||
require.LessOrEqual(t, hits.Len(), 5, "Should respect limit")
|
||||
|
||||
// verify results contain the search terms (semantic similarity)
|
||||
for i := 0; i < hits.Len(); i++ {
|
||||
score := hits.Scores[i]
|
||||
require.Greater(t, score, float32(0), "Score should be positive")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestSearchWithEmptyQuery tests search with empty query (should fail)
|
||||
func TestSearchWithEmptyQuery(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function
|
||||
_, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong))
|
||||
|
||||
// insert some test data
|
||||
documents := []string{"test document"}
|
||||
res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(1), res.InsertCount)
|
||||
|
||||
// create index and load
|
||||
_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(schema.CollectionName, "dense", index.NewAutoIndex(entity.COSINE)))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
_, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(schema.CollectionName))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
// search with empty query should fail
|
||||
_, err = mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 3, []entity.Vector{entity.Text("")}).
|
||||
WithANNSField("dense"))
|
||||
|
||||
common.CheckErr(t, err, false, "TextEmbedding function does not support empty text")
|
||||
}
|
||||
|
||||
// TestHybridSearchTextEmbeddingBM25 tests hybrid search combining TEI text embedding and BM25
|
||||
func TestHybridSearchTextEmbeddingBM25(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with both TEI text embedding and BM25 functions
|
||||
collectionName := common.GenRandomString("hybrid_search", 6)
|
||||
|
||||
// create fields manually to support both dense and sparse vectors
|
||||
fields := []*entity.Field{
|
||||
entity.NewField().WithName(common.DefaultInt64FieldName).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true).WithIsAutoID(true),
|
||||
entity.NewField().WithName("document").WithDataType(entity.FieldTypeVarChar).WithMaxLength(65535).WithEnableAnalyzer(true).WithAnalyzerParams(map[string]any{"tokenizer": "standard"}),
|
||||
entity.NewField().WithName("dense").WithDataType(entity.FieldTypeFloatVector).WithDim(int64(hp.GetTEIModelDim())),
|
||||
entity.NewField().WithName("sparse").WithDataType(entity.FieldTypeSparseVector),
|
||||
}
|
||||
|
||||
// create TEI text embedding function
|
||||
teiFunction := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": hp.GetTEIEndpoint(),
|
||||
})
|
||||
|
||||
// create BM25 function
|
||||
bm25Function := hp.TNewBM25Function("document", "sparse")
|
||||
|
||||
// create schema with both functions
|
||||
schema := entity.NewSchema().
|
||||
WithName(collectionName).
|
||||
WithDescription("Hybrid search collection with TEI and BM25").
|
||||
WithFunction(teiFunction).
|
||||
WithFunction(bm25Function)
|
||||
|
||||
for _, field := range fields {
|
||||
schema.WithField(field)
|
||||
}
|
||||
|
||||
// create collection
|
||||
err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(collectionName, schema))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
// insert test data with diverse content
|
||||
documents := []string{
|
||||
"Artificial intelligence and machine learning are transforming technology",
|
||||
"Vector databases enable semantic search capabilities for AI applications",
|
||||
"Text embeddings capture semantic meaning in numerical representations",
|
||||
"BM25 is a traditional keyword-based search algorithm",
|
||||
"Hybrid search combines semantic and keyword-based retrieval methods",
|
||||
"Large language models use transformer architectures for text understanding",
|
||||
"Information retrieval systems help users find relevant documents",
|
||||
"Natural language processing enables computers to understand human language",
|
||||
"Database systems store and retrieve structured information efficiently",
|
||||
"Search engines use ranking algorithms to order results by relevance",
|
||||
}
|
||||
|
||||
// insert data - both embeddings will be generated automatically
|
||||
res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(collectionName).WithVarcharColumn("document", documents))
|
||||
common.CheckErr(t, err, true)
|
||||
require.Equal(t, int64(len(documents)), res.InsertCount)
|
||||
|
||||
// create indexes
|
||||
_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collectionName, "dense", index.NewAutoIndex(entity.COSINE)))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collectionName, "sparse", index.NewSparseInvertedIndex(entity.BM25, 0.1)))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
// load collection
|
||||
_, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(collectionName))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
// test 1: Dense vector search (TEI semantic search)
|
||||
t.Run("DenseVectorSearch", func(t *testing.T) {
|
||||
queryText := "machine learning artificial intelligence"
|
||||
searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 3, []entity.Vector{entity.Text(queryText)}).
|
||||
WithANNSField("dense").
|
||||
WithOutputFields("document"))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
require.Greater(t, len(searchRes), 0)
|
||||
for _, hits := range searchRes {
|
||||
require.Greater(t, hits.Len(), 0, "Should find semantically similar documents")
|
||||
t.Logf("Dense search found %d results for query: %s", hits.Len(), queryText)
|
||||
}
|
||||
})
|
||||
|
||||
// test 2: Sparse vector search (BM25 keyword search)
|
||||
t.Run("SparseVectorSearch", func(t *testing.T) {
|
||||
queryText := "database systems"
|
||||
searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 3, []entity.Vector{entity.Text(queryText)}).
|
||||
WithANNSField("sparse").
|
||||
WithOutputFields("document"))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
require.Greater(t, len(searchRes), 0)
|
||||
for _, hits := range searchRes {
|
||||
require.Greater(t, hits.Len(), 0, "Should find keyword-matching documents")
|
||||
t.Logf("Sparse search found %d results for query: %s", hits.Len(), queryText)
|
||||
}
|
||||
})
|
||||
|
||||
// test 3: Both search types work independently
|
||||
t.Run("IndependentSearches", func(t *testing.T) {
|
||||
queryText := "vector search"
|
||||
|
||||
// Dense search
|
||||
denseRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, []entity.Vector{entity.Text(queryText)}).
|
||||
WithANNSField("dense").
|
||||
WithOutputFields("document"))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
// Sparse search
|
||||
sparseRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, []entity.Vector{entity.Text(queryText)}).
|
||||
WithANNSField("sparse").
|
||||
WithOutputFields("document"))
|
||||
common.CheckErr(t, err, true)
|
||||
|
||||
// Both should return results
|
||||
require.Greater(t, len(denseRes), 0, "Dense search should return results")
|
||||
require.Greater(t, len(sparseRes), 0, "Sparse search should return results")
|
||||
|
||||
for _, hits := range denseRes {
|
||||
require.Greater(t, hits.Len(), 0, "Dense search should find documents")
|
||||
}
|
||||
|
||||
for _, hits := range sparseRes {
|
||||
require.Greater(t, hits.Len(), 0, "Sparse search should find documents")
|
||||
}
|
||||
|
||||
t.Logf("Dense search found %d results, Sparse search found %d results",
|
||||
len(denseRes), len(sparseRes))
|
||||
})
|
||||
}
|
||||
|
||||
// TestInsertEmptyDocument tests insertion with empty document
|
||||
func TestInsertEmptyDocument(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function
|
||||
_, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong))
|
||||
|
||||
// try to insert empty document
|
||||
documents := []string{"", "normal document"}
|
||||
|
||||
_, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
|
||||
|
||||
// should fail with empty document
|
||||
common.CheckErr(t, err, false, "TextEmbedding function does not support empty text")
|
||||
}
|
||||
|
||||
// TestInsertLongDocument tests insertion with very long document
|
||||
func TestInsertLongDocument(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function (no truncate)
|
||||
params := map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": hp.GetTEIEndpoint(),
|
||||
"truncate": "false",
|
||||
}
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", params)
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535)
|
||||
|
||||
_, schema := hp.CollPrepare.CreateCollection(
|
||||
ctx, t, mc,
|
||||
hp.NewCreateCollectionParams(hp.TextEmbedding),
|
||||
fieldsOption,
|
||||
schemaOption,
|
||||
hp.TWithConsistencyLevel(entity.ClStrong),
|
||||
)
|
||||
|
||||
// try to insert very long document that exceeds model limits
|
||||
longDocument := hp.GenLongText(8192, "english") // Very long text
|
||||
documents := []string{longDocument}
|
||||
|
||||
_, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
|
||||
|
||||
// should fail with long document when truncate is false
|
||||
common.CheckErr(t, err, false, "Call service failed")
|
||||
}
|
||||
|
||||
// TestInvalidEndpointHandling tests various invalid endpoint scenarios
|
||||
func TestInvalidEndpointHandling(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
endpoint string
|
||||
errMsg string
|
||||
}{
|
||||
{"NonExistentHost", "http://nonexistent-host:8080", "nonexistent-host"},
|
||||
{"InvalidPort", "http://localhost:99999", "99999"},
|
||||
{"InvalidProtocol", "ftp://localhost:8080", "ftp"},
|
||||
{"EmptyEndpoint", "", "endpoint"},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with invalid endpoint
|
||||
function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
|
||||
"provider": "TEI",
|
||||
"endpoint": tc.endpoint,
|
||||
})
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535)
|
||||
|
||||
// collection creation should fail for invalid endpoints
|
||||
collectionName := common.GenRandomString("test_invalid", 6)
|
||||
err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(
|
||||
collectionName,
|
||||
hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))),
|
||||
))
|
||||
|
||||
common.CheckErr(t, err, false, tc.errMsg)
|
||||
t.Logf("Expected error for %s: %v", tc.name, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestMissingRequiredParameters tests creation with missing required parameters
|
||||
func TestMissingRequiredParameters(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
testCases := []struct {
|
||||
name string
|
||||
params map[string]any
|
||||
errMsg string
|
||||
}{
|
||||
{"MissingProvider", map[string]any{"endpoint": hp.GetTEIEndpoint()}, "provider"},
|
||||
{"MissingEndpoint", map[string]any{"provider": "TEI"}, "endpoint"},
|
||||
{"WrongProvider", map[string]any{"provider": "InvalidProvider", "endpoint": hp.GetTEIEndpoint()}, "invalidprovider"},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
// create function with incomplete parameters
|
||||
function := entity.NewFunction().
|
||||
WithName("incomplete_func").
|
||||
WithInputFields("document").
|
||||
WithOutputFields("dense").
|
||||
WithType(entity.FunctionTypeTextEmbedding)
|
||||
|
||||
for key, value := range tc.params {
|
||||
function.WithParam(key, value)
|
||||
}
|
||||
|
||||
schemaOption := hp.TNewSchemaOption().TWithFunction(function)
|
||||
fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535)
|
||||
|
||||
// collection creation should fail
|
||||
err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(
|
||||
common.GenRandomString("test_incomplete", 6),
|
||||
hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))),
|
||||
))
|
||||
|
||||
common.CheckErr(t, err, false, tc.errMsg)
|
||||
t.Logf("Expected error for %s: %v", tc.name, err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestConcurrentOperations tests concurrent text embedding operations
|
||||
func TestConcurrentOperations(t *testing.T) {
|
||||
ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout*2) // longer timeout for concurrent ops
|
||||
mc := hp.CreateDefaultMilvusClient(ctx, t)
|
||||
|
||||
// create collection with TEI function
|
||||
prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong))
|
||||
|
||||
// create index and load
|
||||
prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
|
||||
prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))
|
||||
|
||||
// concurrent inserts
|
||||
t.Run("ConcurrentInserts", func(t *testing.T) {
|
||||
numRoutines := 5
|
||||
documentsPerRoutine := 5
|
||||
|
||||
results := make(chan error, numRoutines)
|
||||
|
||||
for i := 0; i < numRoutines; i++ {
|
||||
go func(routineID int) {
|
||||
documents := make([]string, documentsPerRoutine)
|
||||
for j := 0; j < documentsPerRoutine; j++ {
|
||||
documents[j] = fmt.Sprintf("Concurrent document from routine %d, doc %d", routineID, j)
|
||||
}
|
||||
|
||||
_, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
|
||||
results <- err
|
||||
}(i)
|
||||
}
|
||||
|
||||
// wait for all goroutines to complete
|
||||
for i := 0; i < numRoutines; i++ {
|
||||
err := <-results
|
||||
require.NoError(t, err, "Concurrent insert should succeed")
|
||||
}
|
||||
|
||||
t.Logf("Successfully completed %d concurrent inserts with %d documents each", numRoutines, documentsPerRoutine)
|
||||
})
|
||||
|
||||
// concurrent searches
|
||||
t.Run("ConcurrentSearches", func(t *testing.T) {
|
||||
numRoutines := 3
|
||||
|
||||
results := make(chan error, numRoutines)
|
||||
|
||||
for i := 0; i < numRoutines; i++ {
|
||||
go func(routineID int) {
|
||||
queryText := fmt.Sprintf("document routine %d", routineID)
|
||||
_, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 5, []entity.Vector{entity.Text(queryText)}).
|
||||
WithANNSField("dense").
|
||||
WithOutputFields("document"))
|
||||
results <- err
|
||||
}(i)
|
||||
}
|
||||
|
||||
// wait for all searches to complete
|
||||
for i := 0; i < numRoutines; i++ {
|
||||
err := <-results
|
||||
require.NoError(t, err, "Concurrent search should succeed")
|
||||
}
|
||||
|
||||
t.Logf("Successfully completed %d concurrent searches", numRoutines)
|
||||
})
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user