From 1e31ad345b0bb1678312acf825cf7745c391bebd Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Fri, 15 Aug 2025 11:37:43 +0800 Subject: [PATCH] test:add text embedding function testcases in go client (#43875) /kind improvement --------- Signed-off-by: zhuwenxing --- .../go_client/testcases/helper/data_helper.go | 153 ++- .../testcases/helper/field_helper.go | 43 +- .../testcases/helper/function_helper.go | 16 + .../testcases/helper/schema_helper.go | 9 + .../go_client/testcases/helper/test_setup.go | 10 + .../testcases/text_embedding_test.go | 956 ++++++++++++++++++ 6 files changed, 1177 insertions(+), 10 deletions(-) create mode 100644 tests/go_client/testcases/text_embedding_test.go diff --git a/tests/go_client/testcases/helper/data_helper.go b/tests/go_client/testcases/helper/data_helper.go index b1c5637149..04de83b1c6 100644 --- a/tests/go_client/testcases/helper/data_helper.go +++ b/tests/go_client/testcases/helper/data_helper.go @@ -4,7 +4,10 @@ import ( "bytes" "encoding/json" "fmt" + "io" + "math" "math/rand" + "net/http" "slices" "strconv" "strings" @@ -539,6 +542,154 @@ func GetBm25FunctionsOutputFields(schema *entity.Schema) []string { return outputFields } +func GetTextEmbeddingFunctionsOutputFields(schema *entity.Schema) []string { + var outputFields []string + for _, fn := range schema.Functions { + if fn.Type == entity.FunctionTypeTextEmbedding { + outputFields = append(outputFields, fn.OutputFieldNames...) + } + } + return outputFields +} + +func GetAllFunctionsOutputFields(schema *entity.Schema) []string { + var outputFields []string + for _, fn := range schema.Functions { + if fn.Type == entity.FunctionTypeBM25 || fn.Type == entity.FunctionTypeTextEmbedding { + outputFields = append(outputFields, fn.OutputFieldNames...) + } + } + return outputFields +} + +// GenTextDocuments generates realistic text documents for embedding tests +func GenTextDocuments(count int, lang string) []string { + documents := make([]string, count) + + var templates []string + switch lang { + case "english", "en": + templates = []string{ + "This is a document about artificial intelligence and machine learning technologies in modern computing systems", + "Vector databases enable efficient similarity search for high-dimensional data in AI applications", + "Text embeddings transform natural language into numerical representations for semantic understanding", + "Information retrieval systems help users find relevant documents from large collections of data", + "Natural language processing enables computers to understand and generate human language effectively", + "Database management systems provide structured storage and efficient querying of information", + "Search algorithms rank and retrieve the most relevant results for user queries", + "Machine learning models learn patterns from data to make predictions and classifications", + "Deep learning neural networks process complex patterns in images, text, and other data types", + "Data science combines statistics, programming, and domain knowledge to extract insights", + } + case "chinese", "zh": + templates = []string{ + "这是关于人工智能和机器学习技术的文档,介绍现代计算系统中的应用", + "向量数据库为高维数据提供高效的相似性搜索功能,支持AI应用开发", + "文本嵌入技术将自然语言转换为数值表示,实现语义理解和分析", + "信息检索系统帮助用户从大规模数据集合中找到相关的文档内容", + "自然语言处理技术使计算机能够理解和生成人类语言", + "数据库管理系统提供结构化存储和高效的信息查询功能", + "搜索算法对用户查询结果进行排序和检索,返回最相关的内容", + "机器学习模型从数据中学习模式,进行预测和分类任务", + "深度学习神经网络处理图像、文本等复杂数据类型中的模式", + "数据科学结合统计学、编程和领域知识来提取有价值的洞察", + } + default: + // Default to English + templates = []string{ + "Document about technology and innovation in the digital age", + "Analysis of modern computing systems and their applications", + "Research on data processing and information management", + "Study of algorithms and their implementation in software", + "Overview of database systems and their optimization techniques", + } + } + + for i := 0; i < count; i++ { + baseTemplate := templates[i%len(templates)] + documents[i] = fmt.Sprintf("%s. Document ID: %d", baseTemplate, i) + } + + return documents +} + +// CosineSimilarity calculates cosine similarity between two float32 vectors +func CosineSimilarity(a, b []float32) float32 { + if len(a) != len(b) || len(a) == 0 { + return 0 + } + + var dotProduct, normA, normB float32 + for i := 0; i < len(a); i++ { + dotProduct += a[i] * b[i] + normA += a[i] * a[i] + normB += b[i] * b[i] + } + + if normA == 0 || normB == 0 { + return 0 + } + + // Use math.Sqrt for more accurate calculation + return dotProduct / (float32(math.Sqrt(float64(normA))) * float32(math.Sqrt(float64(normB)))) +} + +// GenLongText generates long text with specified word count +func GenLongText(wordCount int, lang string) string { + var words []string + switch lang { + case "chinese", "zh": + words = []string{"人工智能", "机器学习", "深度学习", "神经网络", "数据挖掘", "自然语言", "处理技术", "计算机", "算法优化", "信息检索", "向量数据库", "语义搜索", "文本分析", "知识图谱", "智能系统"} + case "english", "en": + words = []string{"artificial", "intelligence", "machine", "learning", "deep", "neural", "network", "algorithm", "database", "search", "vector", "embedding", "semantic", "analysis", "information", "retrieval", "computing", "technology", "system", "data", "processing", "optimization", "performance", "scalability", "efficiency"} + default: + words = []string{"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog", "and", "runs", "through", "forest", "with", "great", "speed", "while", "chasing", "rabbit", "under", "bright", "moonlight", "across", "green", "fields", "toward", "distant", "mountains"} + } + + result := make([]string, wordCount) + for i := 0; i < wordCount; i++ { + result[i] = words[i%len(words)] + } + + return strings.Join(result, " ") +} + +// CallTEIDirectly calls TEI endpoint directly to get embeddings +func CallTEIDirectly(endpoint string, texts []string) ([][]float32, error) { + // TEI API request structure + type TEIRequest struct { + Inputs []string `json:"inputs"` + } + + // Create request + reqBody := TEIRequest{Inputs: texts} + jsonData, err := json.Marshal(reqBody) + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %w", err) + } + + // Make HTTP request to TEI + resp, err := http.Post(endpoint+"/embed", "application/json", bytes.NewBuffer(jsonData)) + if err != nil { + return nil, fmt.Errorf("failed to call TEI endpoint: %w", err) + } + defer resp.Body.Close() + + // Read response + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response: %w", err) + } + + // Parse response - TEI returns array of arrays + var embeddings [][]float32 + if err := json.Unmarshal(body, &embeddings); err != nil { + return nil, fmt.Errorf("failed to unmarshal response: %w", err) + } + + return embeddings, nil +} + func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]column.Column, []column.Column) { if nil == schema || schema.CollectionName == "" { log.Fatal("[GenColumnsBasedSchema] Nil Schema is not expected") @@ -557,7 +708,7 @@ func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]colu if option.fieldName == "" { option.fieldName = field.Name } - if slices.Contains(GetBm25FunctionsOutputFields(schema), field.Name) { + if slices.Contains(GetAllFunctionsOutputFields(schema), field.Name) { continue } log.Info("GenColumnsBasedSchema", zap.Any("field", field)) diff --git a/tests/go_client/testcases/helper/field_helper.go b/tests/go_client/testcases/helper/field_helper.go index 9153ffc08f..42fe5d79ea 100644 --- a/tests/go_client/testcases/helper/field_helper.go +++ b/tests/go_client/testcases/helper/field_helper.go @@ -99,15 +99,16 @@ type CollectionFieldsType int32 const ( // FieldTypeNone zero value place holder - Int64Vec CollectionFieldsType = 1 // int64 + floatVec - VarcharBinary CollectionFieldsType = 2 // varchar + binaryVec - Int64VecJSON CollectionFieldsType = 3 // int64 + floatVec + json - Int64VecArray CollectionFieldsType = 4 // int64 + floatVec + array - Int64VarcharSparseVec CollectionFieldsType = 5 // int64 + varchar + sparse vector - Int64MultiVec CollectionFieldsType = 6 // int64 + floatVec + binaryVec + fp16Vec + bf16vec - AllFields CollectionFieldsType = 7 // all fields excepted sparse - Int64VecAllScalar CollectionFieldsType = 8 // int64 + floatVec + all scalar fields - FullTextSearch CollectionFieldsType = 9 // int64 + varchar + sparse vector + analyzer + function + Int64Vec CollectionFieldsType = 1 // int64 + floatVec + VarcharBinary CollectionFieldsType = 2 // varchar + binaryVec + Int64VecJSON CollectionFieldsType = 3 // int64 + floatVec + json + Int64VecArray CollectionFieldsType = 4 // int64 + floatVec + array + Int64VarcharSparseVec CollectionFieldsType = 5 // int64 + varchar + sparse vector + Int64MultiVec CollectionFieldsType = 6 // int64 + floatVec + binaryVec + fp16Vec + bf16vec + AllFields CollectionFieldsType = 7 // all fields excepted sparse + Int64VecAllScalar CollectionFieldsType = 8 // int64 + floatVec + all scalar fields + FullTextSearch CollectionFieldsType = 9 // int64 + varchar + sparse vector + analyzer + function + TextEmbedding CollectionFieldsType = 10 // int64 + varchar + float_vector + text_embedding_function ) type GenFieldsOption struct { @@ -373,6 +374,23 @@ func (cf FieldsFullTextSearch) GenFields(option GenFieldsOption) []*entity.Field return fields } +type FieldsTextEmbedding struct{} + +func (cf FieldsTextEmbedding) GenFields(option GenFieldsOption) []*entity.Field { + pkField := entity.NewField().WithName(GetFieldNameByFieldType(entity.FieldTypeInt64)).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true) + textField := entity.NewField().WithName("document").WithDataType(entity.FieldTypeVarChar).WithMaxLength(option.MaxLength).WithIsPartitionKey(option.IsPartitionKey) + vecField := entity.NewField().WithName("dense").WithDataType(entity.FieldTypeFloatVector).WithDim(option.Dim) + if option.AutoID { + pkField.WithIsAutoID(option.AutoID) + } + fields := []*entity.Field{ + pkField, + textField, + vecField, + } + return fields +} + func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFieldsType, option *GenFieldsOption) []*entity.Field { log.Info("GenFieldsForCollection", zap.Any("GenFieldsOption", option)) switch collectionFieldsType { @@ -394,7 +412,14 @@ func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFi return FieldsInt64VecAllScalar{}.GenFields(*option) case FullTextSearch: return FieldsFullTextSearch{}.GenFields(*option) + case TextEmbedding: + return FieldsTextEmbedding{}.GenFields(*option) default: return FieldsInt64Vec{}.GenFields(*option) } } + +// TNewTextEmbeddingFieldsOption creates fields option with text embedding settings +func TNewTextEmbeddingFieldsOption() *GenFieldsOption { + return TNewFieldsOption().TWithDim(int64(GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535) +} diff --git a/tests/go_client/testcases/helper/function_helper.go b/tests/go_client/testcases/helper/function_helper.go index 6f289d6cfa..e2c5069f1c 100644 --- a/tests/go_client/testcases/helper/function_helper.go +++ b/tests/go_client/testcases/helper/function_helper.go @@ -12,3 +12,19 @@ func TNewBM25Function(inputField, outputField string) *entity.Function { WithOutputFields(outputField). WithType(entity.FunctionTypeBM25) } + +// TNewTextEmbeddingFunction creates a text embedding function for different providers +func TNewTextEmbeddingFunction(inputField, outputField string, params map[string]any) *entity.Function { + function := entity.NewFunction(). + WithName(inputField + "_text_emb"). + WithInputFields(inputField). + WithOutputFields(outputField). + WithType(entity.FunctionTypeTextEmbedding) + + // Add all parameters including provider + for key, value := range params { + function.WithParam(key, value) + } + + return function +} diff --git a/tests/go_client/testcases/helper/schema_helper.go b/tests/go_client/testcases/helper/schema_helper.go index aaa8b7c93f..16fc50f2c9 100644 --- a/tests/go_client/testcases/helper/schema_helper.go +++ b/tests/go_client/testcases/helper/schema_helper.go @@ -75,3 +75,12 @@ func GenSchema(option *GenSchemaOption) *entity.Schema { } return schema } + +// TNewTextEmbeddingSchemaOption creates schema option with text embedding function +func TNewTextEmbeddingSchemaOption() *GenSchemaOption { + function := TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": GetTEIEndpoint(), + }) + return TNewSchemaOption().TWithFunction(function) +} diff --git a/tests/go_client/testcases/helper/test_setup.go b/tests/go_client/testcases/helper/test_setup.go index 4f5b977b90..c27b7488a9 100644 --- a/tests/go_client/testcases/helper/test_setup.go +++ b/tests/go_client/testcases/helper/test_setup.go @@ -19,6 +19,8 @@ var ( user = flag.String("user", "root", "user") password = flag.String("password", "Milvus", "password") logLevel = flag.String("log.level", "info", "log level for test") + teiEndpoint = flag.String("tei_endpoint", "http://text-embeddings-service.milvus-ci.svc.cluster.local:80", "TEI service endpoint for text embedding tests") + teiModelDim = flag.Int("tei_model_dim", 768, "Vector dimension for text embedding model") defaultClientConfig *client.ClientConfig ) @@ -42,6 +44,14 @@ func GetPassword() string { return *password } +func GetTEIEndpoint() string { + return *teiEndpoint +} + +func GetTEIModelDim() int { + return *teiModelDim +} + func parseLogConfig() { log.Info("Parser Log Level", zap.String("logLevel", *logLevel)) switch *logLevel { diff --git a/tests/go_client/testcases/text_embedding_test.go b/tests/go_client/testcases/text_embedding_test.go new file mode 100644 index 0000000000..b0a248bd86 --- /dev/null +++ b/tests/go_client/testcases/text_embedding_test.go @@ -0,0 +1,956 @@ +package testcases + +import ( + "fmt" + "strings" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/milvus-io/milvus/client/v2/column" + "github.com/milvus-io/milvus/client/v2/entity" + "github.com/milvus-io/milvus/client/v2/index" + "github.com/milvus-io/milvus/client/v2/milvusclient" + "github.com/milvus-io/milvus/tests/go_client/common" + hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper" +) + +// TestCreateCollectionWithTextEmbedding tests basic collection creation with text embedding function +func TestCreateCollectionWithTextEmbedding(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function + prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong)) + + // verify collection creation + require.NotNil(t, prepare) + require.NotNil(t, schema) + + // describe collection to verify function + descRes, err := mc.DescribeCollection(ctx, milvusclient.NewDescribeCollectionOption(schema.CollectionName)) + common.CheckErr(t, err, true) + require.Len(t, descRes.Schema.Functions, 1) + require.Equal(t, "document_text_emb", descRes.Schema.Functions[0].Name) + require.Equal(t, entity.FunctionTypeTextEmbedding, descRes.Schema.Functions[0].Type) + require.Equal(t, []string{"document"}, descRes.Schema.Functions[0].InputFieldNames) + require.Equal(t, []string{"dense"}, descRes.Schema.Functions[0].OutputFieldNames) +} + +// TestCreateCollectionWithTextEmbeddingTwice tests creating collection twice with same schema +func TestCreateCollectionWithTextEmbeddingTwice(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function + function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": hp.GetTEIEndpoint(), + }) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535) + + collectionName := common.GenRandomString("text_embedding", 6) + createParams := hp.NewCreateCollectionParams(hp.TextEmbedding) + + // first creation + prepare1, schema1 := hp.CollPrepare.CreateCollection( + ctx, t, mc, createParams, fieldsOption, + schemaOption.TWithName(collectionName), + hp.TWithConsistencyLevel(entity.ClStrong), + ) + require.NotNil(t, prepare1) + require.NotNil(t, schema1) + + // second creation with same name should succeed (idempotent) + prepare2, schema2 := hp.CollPrepare.CreateCollection( + ctx, t, mc, createParams, fieldsOption, + schemaOption.TWithName(collectionName), + hp.TWithConsistencyLevel(entity.ClStrong), + ) + require.NotNil(t, prepare2) + require.NotNil(t, schema2) + + // verify function exists + descRes, err := mc.DescribeCollection(ctx, milvusclient.NewDescribeCollectionOption(collectionName)) + common.CheckErr(t, err, true) + require.Len(t, descRes.Schema.Functions, 1) +} + +// TestCreateCollectionUnsupportedEndpoint tests creation with unsupported endpoint +func TestCreateCollectionUnsupportedEndpoint(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with invalid endpoint + function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": "http://unsupported_endpoint", + }) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535) + + // this should fail during collection creation + err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption( + common.GenRandomString("text_embedding", 6), + hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))), + )) + + // expect error due to unsupported endpoint + common.CheckErr(t, err, false, "unsupported_endpoint") +} + +// TestCreateCollectionUnmatchedDim tests creation with mismatched dimension +func TestCreateCollectionUnmatchedDim(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with wrong dimension (512 instead of expected 768 from TEI model) + wrongDim := int64(512) + function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": hp.GetTEIEndpoint(), + }) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(wrongDim).TWithAutoID(true).TWithMaxLen(65535) + + collectionName := common.GenRandomString("text_embedding", 6) + + // collection creation should fail with dimension mismatch error + err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption( + collectionName, + hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))), + )) + + // Expect error with specific dimension mismatch message + expectedError := fmt.Sprintf("required embedding dim is [%d], but the embedding obtained from the model is [%d]", wrongDim, hp.GetTEIModelDim()) + common.CheckErr(t, err, false, expectedError) +} + +// TestInsertWithTextEmbedding tests basic data insertion with text embedding +func TestInsertWithTextEmbedding(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function + prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong)) + + // prepare test data - only provide text, embedding will be auto-generated + nb := 10 + documents := make([]string, nb) + for i := 0; i < nb; i++ { + documents[i] = fmt.Sprintf("This is test document number %d with some content for embedding", i) + } + + // insert data using only text field + res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents)) + common.CheckErr(t, err, true) + require.Equal(t, int64(nb), res.InsertCount) + + // create index and load + prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)})) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // query to verify vectors were generated + resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).WithFilter("").WithOutputFields("dense").WithLimit(10)) + common.CheckErr(t, err, true) + require.Greater(t, len(resQuery.Fields), 0) + + // verify vector dimension - check first result + if resQuery.Len() > 0 { + // Query results structure is different - need to check the actual field structure + denseColumn := resQuery.GetColumn("dense") + require.NotNil(t, denseColumn) + // Field should contain vectors for all results + } +} + +// TestInsertWithTruncateParams tests insertion with different truncate parameters +func TestInsertWithTruncateParams(t *testing.T) { + testCases := []struct { + name string + truncate bool + truncationDirection string + shouldSucceed bool + }{ + {"truncate_true_right", true, "Right", true}, + {"truncate_true_left", true, "Left", true}, + {"truncate_false", false, "", false}, // should fail with long text + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create TEI function with truncate parameters + params := map[string]any{} + if tc.truncate { + params["truncate"] = "true" + params["truncation_direction"] = tc.truncationDirection + } else { + params["truncate"] = "false" + } + + params["provider"] = "TEI" + params["endpoint"] = hp.GetTEIEndpoint() + function := hp.TNewTextEmbeddingFunction("document", "dense", params) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535) + + _, schema := hp.CollPrepare.CreateCollection( + ctx, t, mc, + hp.NewCreateCollectionParams(hp.TextEmbedding), + fieldsOption, + schemaOption, + hp.TWithConsistencyLevel(entity.ClStrong), + ) + + // prepare long text data that would need truncation + // Generate distinctly different left and right parts that will exceed token limits when combined + leftPart := "artificial intelligence machine learning deep learning neural networks computer vision natural language processing data science algorithms " + strings.Repeat("technology innovation science research development analysis ", 100) + rightPart := "database systems vector search embeddings similarity matching retrieval information storage indexing " + strings.Repeat("query performance optimization scalability distributed computing ", 100) + longText := leftPart + " " + rightPart // This will exceed 512 tokens and need truncation + + documents := []string{longText, leftPart, rightPart} + + // insert data + res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents)) + + if tc.shouldSucceed { + common.CheckErr(t, err, true) + require.Equal(t, int64(len(documents)), res.InsertCount) + + // create index and load for embedding comparison + _, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(schema.CollectionName, "dense", index.NewAutoIndex(entity.COSINE))) + common.CheckErr(t, err, true) + + _, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(schema.CollectionName)) + common.CheckErr(t, err, true) + + // Query embeddings from Milvus + resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName). + WithFilter(""). + WithOutputFields("dense", "document"). + WithConsistencyLevel(entity.ClStrong). + WithLimit(10)) + common.CheckErr(t, err, true) + require.Equal(t, len(documents), resQuery.Len()) + + // Extract Milvus embeddings + denseColumn := resQuery.GetColumn("dense") + require.NotNil(t, denseColumn) + floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector) + require.True(t, ok, "Dense column should be a float vector column") + + // Truncation validation using similarity comparison approach + // This follows the Python test logic: compare similarity between combined text and parts + // to verify that truncation direction works correctly + + require.Equal(t, 3, resQuery.Len(), "Should have 3 documents: longText, leftPart, rightPart") + + // Get embeddings for: [0]=longText, [1]=leftPart, [2]=rightPart + embeddings := make([][]float32, 3) + for i := 0; i < 3; i++ { + embedding := floatVecColumn.Data()[i] + require.Equal(t, hp.GetTEIModelDim(), len(embedding), "Embedding should have correct dimension") + + // Check that embedding is not all zeros (would indicate a failure) + var sum float32 + for _, val := range embedding { + sum += val * val + } + require.Greater(t, sum, float32(0.01), "Embedding should not be all zeros for document %d", i) + + embeddings[i] = embedding + } + + // Calculate cosine similarities + // similarity_left: longText vs leftPart + // similarity_right: longText vs rightPart + similarityLeft := hp.CosineSimilarity(embeddings[0], embeddings[1]) + similarityRight := hp.CosineSimilarity(embeddings[0], embeddings[2]) + + t.Logf("Similarity longText vs leftPart: %.6f", similarityLeft) + t.Logf("Similarity longText vs rightPart: %.6f", similarityRight) + + // Validation based on truncation direction: + // - If truncation_direction = "Left", we keep the right part, so longText should be more similar to rightPart + // - If truncation_direction = "Right", we keep the left part, so longText should be more similar to leftPart + if tc.truncationDirection == "Left" { + require.Greater(t, similarityRight, similarityLeft, + "With Left truncation, longText should be more similar to rightPart (%.6f) than leftPart (%.6f)", + similarityRight, similarityLeft) + t.Logf("Left truncation verified: rightPart similarity (%.6f) > leftPart similarity (%.6f)", + similarityRight, similarityLeft) + } else { // "Right" + require.Greater(t, similarityLeft, similarityRight, + "With Right truncation, longText should be more similar to leftPart (%.6f) than rightPart (%.6f)", + similarityLeft, similarityRight) + t.Logf("Right truncation verified: leftPart similarity (%.6f) > rightPart similarity (%.6f)", + similarityLeft, similarityRight) + } + + t.Logf("Successfully inserted %d documents with truncate=%v, direction=%s", len(documents), tc.truncate, tc.truncationDirection) + } else { + common.CheckErr(t, err, false, "Payload Too Large") + } + }) + } +} + +// TestVerifyEmbeddingConsistency verifies that Milvus text embedding function produces same results as direct TEI calls +func TestVerifyEmbeddingConsistency(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function (custom fields for autoID=false) + function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": hp.GetTEIEndpoint(), + }) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(false).TWithMaxLen(65535) + + prepare, schema := hp.CollPrepare.CreateCollection( + ctx, t, mc, + hp.NewCreateCollectionParams(hp.TextEmbedding), + fieldsOption, + schemaOption, + hp.TWithConsistencyLevel(entity.ClStrong), + ) + + // Test documents + testDocs := []string{ + "This is a test document about artificial intelligence", + "Vector databases enable semantic search capabilities", + "Text embeddings transform language into numbers", + } + + // Insert documents into Milvus (will use text embedding function) + ids := []int64{1, 2, 3} + res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName). + WithInt64Column(common.DefaultInt64FieldName, ids). + WithVarcharColumn("document", testDocs)) + common.CheckErr(t, err, true) + require.Equal(t, int64(len(testDocs)), res.InsertCount) + + // Create index and load + prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)})) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // Query vectors from Milvus + resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName). + WithFilter(""). + WithOutputFields("dense", "document", common.DefaultInt64FieldName). + WithConsistencyLevel(entity.ClStrong). + WithLimit(10)) + common.CheckErr(t, err, true) + require.Equal(t, len(testDocs), resQuery.Len()) + + // Get embeddings directly from TEI + teiEmbeddings, err := hp.CallTEIDirectly(hp.GetTEIEndpoint(), testDocs) + if err != nil { + t.Skipf("Skip consistency test - could not connect to TEI endpoint: %v", err) + return + } + require.Equal(t, len(testDocs), len(teiEmbeddings)) + + // Compare embeddings + denseColumn := resQuery.GetColumn("dense") + require.NotNil(t, denseColumn) + + // Get ID column to match embeddings with documents + idColumn := resQuery.GetColumn(common.DefaultInt64FieldName) + require.NotNil(t, idColumn) + + // Extract and compare embeddings - need to handle column type properly + floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector) + require.True(t, ok, "Dense column should be a float vector column") + + for i := 0; i < resQuery.Len(); i++ { + // Get ID to find corresponding TEI embedding + id, err := idColumn.GetAsInt64(i) + require.NoError(t, err) + teiIdx := id - 1 // IDs are 1-based, array is 0-based + + // Get Milvus embedding from the float vector column + milvusEmbedding := floatVecColumn.Data()[i] + + require.NotNil(t, milvusEmbedding) + require.Equal(t, hp.GetTEIModelDim(), len(milvusEmbedding), "Embedding dimension should match") + + // Calculate cosine similarity + similarity := hp.CosineSimilarity(milvusEmbedding, teiEmbeddings[teiIdx]) + + t.Logf("Document %d (ID=%d) similarity between Milvus and TEI: %.6f", i, id, similarity) + + // Embeddings should be nearly identical (similarity > 0.99) + require.Greater(t, similarity, float32(0.99), + "Milvus embedding should be nearly identical to TEI embedding for document ID %d", id) + } + + t.Log("Embedding consistency verified: Milvus text embedding function produces same results as direct TEI calls") +} + +// TestUpsertTextFieldUpdatesEmbedding tests that upserting text field updates embedding +func TestUpsertTextFieldUpdatesEmbedding(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function (custom fields for autoID=false for upsert) + function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": hp.GetTEIEndpoint(), + }) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(false).TWithMaxLen(65535) // disable auto ID for upsert + + prepare, schema := hp.CollPrepare.CreateCollection( + ctx, t, mc, + hp.NewCreateCollectionParams(hp.TextEmbedding), + fieldsOption, + schemaOption, + hp.TWithConsistencyLevel(entity.ClStrong), + ) + + // create index and load first + prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)})) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // insert initial data with specific ID + oldText := "This is the original text content" + res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName). + WithInt64Column(common.DefaultInt64FieldName, []int64{1}). + WithVarcharColumn("document", []string{oldText})) + common.CheckErr(t, err, true) + require.Equal(t, int64(1), res.InsertCount) + + // query original embedding before upsert + resQueryBefore, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName). + WithFilter("int64 == 1"). + WithOutputFields("document", "dense"). + WithConsistencyLevel(entity.ClStrong)) + common.CheckErr(t, err, true) + require.Equal(t, 1, resQueryBefore.Len()) + + // extract original embedding + originalDenseColumn := resQueryBefore.GetColumn("dense") + require.NotNil(t, originalDenseColumn) + originalFloatVecColumn, ok := originalDenseColumn.(*column.ColumnFloatVector) + require.True(t, ok, "Dense column should be a float vector column") + originalEmbedding := originalFloatVecColumn.Data()[0] + require.Equal(t, hp.GetTEIModelDim(), len(originalEmbedding), "Original embedding dimension should match") + + // verify original text + originalDocColumn := resQueryBefore.GetColumn("document") + require.NotNil(t, originalDocColumn) + originalVarCharColumn, ok := originalDocColumn.(*column.ColumnVarChar) + require.True(t, ok, "Document column should be a varchar column") + require.Equal(t, oldText, originalVarCharColumn.Data()[0], "Original text should match") + + // upsert with new text + newText := "This is completely different updated text content" + res2, err := mc.Upsert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName). + WithInt64Column(common.DefaultInt64FieldName, []int64{1}). + WithVarcharColumn("document", []string{newText})) + common.CheckErr(t, err, true) + require.Equal(t, int64(1), res2.UpsertCount) + + // query updated embedding after upsert + resQueryAfter, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName). + WithFilter("int64 == 1"). + WithOutputFields("document", "dense"). + WithConsistencyLevel(entity.ClStrong)) + common.CheckErr(t, err, true) + require.Equal(t, 1, resQueryAfter.Len()) + + // extract updated embedding + updatedDenseColumn := resQueryAfter.GetColumn("dense") + require.NotNil(t, updatedDenseColumn) + updatedFloatVecColumn, ok := updatedDenseColumn.(*column.ColumnFloatVector) + require.True(t, ok, "Dense column should be a float vector column") + updatedEmbedding := updatedFloatVecColumn.Data()[0] + require.Equal(t, hp.GetTEIModelDim(), len(updatedEmbedding), "Updated embedding dimension should match") + + // verify updated text + updatedDocColumn := resQueryAfter.GetColumn("document") + require.NotNil(t, updatedDocColumn) + updatedVarCharColumn, ok := updatedDocColumn.(*column.ColumnVarChar) + require.True(t, ok, "Document column should be a varchar column") + require.Equal(t, newText, updatedVarCharColumn.Data()[0], "Updated text should match") + + // verify embeddings are different (key assertion) + similarity := hp.CosineSimilarity(originalEmbedding, updatedEmbedding) + require.Less(t, similarity, float32(0.95), + "Embeddings should be significantly different after text update (similarity=%.6f)", similarity) + + t.Logf("Upsert verification complete: Original and updated embeddings have cosine similarity %.6f (< 0.95)", similarity) + t.Logf(" Original text: %s", oldText) + t.Logf(" Updated text: %s", newText) +} + +// TestDeleteAndSearch tests that deleted text cannot be searched +func TestDeleteAndSearch(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function (custom fields for autoID=false) + function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": hp.GetTEIEndpoint(), + }) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(false).TWithMaxLen(65535) + + prepare, schema := hp.CollPrepare.CreateCollection( + ctx, t, mc, + hp.NewCreateCollectionParams(hp.TextEmbedding), + fieldsOption, + schemaOption, + hp.TWithConsistencyLevel(entity.ClStrong), + ) + + // insert test data + documents := []string{ + "This is test document 0", + "This is test document 1", + "This is test document 2", + } + ids := []int64{0, 1, 2} + + res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName). + WithInt64Column(common.DefaultInt64FieldName, ids). + WithVarcharColumn("document", documents)) + common.CheckErr(t, err, true) + require.Equal(t, int64(3), res.InsertCount) + + // create index and load + prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)})) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // delete document with ID 1 + res2, err := mc.Delete(ctx, milvusclient.NewDeleteOption(schema.CollectionName).WithExpr("int64 in [1]")) + common.CheckErr(t, err, true) + require.Equal(t, int64(1), res2.DeleteCount) + + // search and verify document 1 is not in results + searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 3, []entity.Vector{entity.Text("test document 1")}). + WithANNSField("dense"). + WithOutputFields("document", common.DefaultInt64FieldName)) + common.CheckErr(t, err, true) + + // verify deleted document is not in results + require.Greater(t, len(searchRes), 0) + for _, hits := range searchRes { + for i := 0; i < hits.Len(); i++ { + id, err := hits.IDs.GetAsInt64(i) + require.NoError(t, err) + require.NotEqual(t, int64(1), id, "Deleted document should not appear in search results") + } + } +} + +// TestSearchWithTextEmbedding tests search functionality with text embedding +func TestSearchWithTextEmbedding(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create -> insert -> index -> load + prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong)) + + // prepare test data + nb := 10 + documents := make([]string, nb) + for i := 0; i < nb; i++ { + documents[i] = fmt.Sprintf("This is test document number %d about artificial intelligence and machine learning", i) + } + + // insert data using only text field + res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents)) + common.CheckErr(t, err, true) + require.Equal(t, int64(nb), res.InsertCount) + + // create index and load + prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)})) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // search using text query + queryText := "artificial intelligence machine learning" + searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 5, []entity.Vector{entity.Text(queryText)}). + WithANNSField("dense"). + WithOutputFields("document")) + common.CheckErr(t, err, true) + + require.Greater(t, len(searchRes), 0) + for _, hits := range searchRes { + require.Greater(t, hits.Len(), 0, "Should find relevant documents") + require.LessOrEqual(t, hits.Len(), 5, "Should respect limit") + + // verify results contain the search terms (semantic similarity) + for i := 0; i < hits.Len(); i++ { + score := hits.Scores[i] + require.Greater(t, score, float32(0), "Score should be positive") + } + } +} + +// TestSearchWithEmptyQuery tests search with empty query (should fail) +func TestSearchWithEmptyQuery(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function + _, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong)) + + // insert some test data + documents := []string{"test document"} + res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents)) + common.CheckErr(t, err, true) + require.Equal(t, int64(1), res.InsertCount) + + // create index and load + _, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(schema.CollectionName, "dense", index.NewAutoIndex(entity.COSINE))) + common.CheckErr(t, err, true) + + _, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(schema.CollectionName)) + common.CheckErr(t, err, true) + + // search with empty query should fail + _, err = mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 3, []entity.Vector{entity.Text("")}). + WithANNSField("dense")) + + common.CheckErr(t, err, false, "TextEmbedding function does not support empty text") +} + +// TestHybridSearchTextEmbeddingBM25 tests hybrid search combining TEI text embedding and BM25 +func TestHybridSearchTextEmbeddingBM25(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with both TEI text embedding and BM25 functions + collectionName := common.GenRandomString("hybrid_search", 6) + + // create fields manually to support both dense and sparse vectors + fields := []*entity.Field{ + entity.NewField().WithName(common.DefaultInt64FieldName).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true).WithIsAutoID(true), + entity.NewField().WithName("document").WithDataType(entity.FieldTypeVarChar).WithMaxLength(65535).WithEnableAnalyzer(true).WithAnalyzerParams(map[string]any{"tokenizer": "standard"}), + entity.NewField().WithName("dense").WithDataType(entity.FieldTypeFloatVector).WithDim(int64(hp.GetTEIModelDim())), + entity.NewField().WithName("sparse").WithDataType(entity.FieldTypeSparseVector), + } + + // create TEI text embedding function + teiFunction := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": hp.GetTEIEndpoint(), + }) + + // create BM25 function + bm25Function := hp.TNewBM25Function("document", "sparse") + + // create schema with both functions + schema := entity.NewSchema(). + WithName(collectionName). + WithDescription("Hybrid search collection with TEI and BM25"). + WithFunction(teiFunction). + WithFunction(bm25Function) + + for _, field := range fields { + schema.WithField(field) + } + + // create collection + err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(collectionName, schema)) + common.CheckErr(t, err, true) + + // insert test data with diverse content + documents := []string{ + "Artificial intelligence and machine learning are transforming technology", + "Vector databases enable semantic search capabilities for AI applications", + "Text embeddings capture semantic meaning in numerical representations", + "BM25 is a traditional keyword-based search algorithm", + "Hybrid search combines semantic and keyword-based retrieval methods", + "Large language models use transformer architectures for text understanding", + "Information retrieval systems help users find relevant documents", + "Natural language processing enables computers to understand human language", + "Database systems store and retrieve structured information efficiently", + "Search engines use ranking algorithms to order results by relevance", + } + + // insert data - both embeddings will be generated automatically + res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(collectionName).WithVarcharColumn("document", documents)) + common.CheckErr(t, err, true) + require.Equal(t, int64(len(documents)), res.InsertCount) + + // create indexes + _, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collectionName, "dense", index.NewAutoIndex(entity.COSINE))) + common.CheckErr(t, err, true) + + _, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collectionName, "sparse", index.NewSparseInvertedIndex(entity.BM25, 0.1))) + common.CheckErr(t, err, true) + + // load collection + _, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(collectionName)) + common.CheckErr(t, err, true) + + // test 1: Dense vector search (TEI semantic search) + t.Run("DenseVectorSearch", func(t *testing.T) { + queryText := "machine learning artificial intelligence" + searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 3, []entity.Vector{entity.Text(queryText)}). + WithANNSField("dense"). + WithOutputFields("document")) + common.CheckErr(t, err, true) + + require.Greater(t, len(searchRes), 0) + for _, hits := range searchRes { + require.Greater(t, hits.Len(), 0, "Should find semantically similar documents") + t.Logf("Dense search found %d results for query: %s", hits.Len(), queryText) + } + }) + + // test 2: Sparse vector search (BM25 keyword search) + t.Run("SparseVectorSearch", func(t *testing.T) { + queryText := "database systems" + searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 3, []entity.Vector{entity.Text(queryText)}). + WithANNSField("sparse"). + WithOutputFields("document")) + common.CheckErr(t, err, true) + + require.Greater(t, len(searchRes), 0) + for _, hits := range searchRes { + require.Greater(t, hits.Len(), 0, "Should find keyword-matching documents") + t.Logf("Sparse search found %d results for query: %s", hits.Len(), queryText) + } + }) + + // test 3: Both search types work independently + t.Run("IndependentSearches", func(t *testing.T) { + queryText := "vector search" + + // Dense search + denseRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, []entity.Vector{entity.Text(queryText)}). + WithANNSField("dense"). + WithOutputFields("document")) + common.CheckErr(t, err, true) + + // Sparse search + sparseRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, []entity.Vector{entity.Text(queryText)}). + WithANNSField("sparse"). + WithOutputFields("document")) + common.CheckErr(t, err, true) + + // Both should return results + require.Greater(t, len(denseRes), 0, "Dense search should return results") + require.Greater(t, len(sparseRes), 0, "Sparse search should return results") + + for _, hits := range denseRes { + require.Greater(t, hits.Len(), 0, "Dense search should find documents") + } + + for _, hits := range sparseRes { + require.Greater(t, hits.Len(), 0, "Sparse search should find documents") + } + + t.Logf("Dense search found %d results, Sparse search found %d results", + len(denseRes), len(sparseRes)) + }) +} + +// TestInsertEmptyDocument tests insertion with empty document +func TestInsertEmptyDocument(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function + _, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong)) + + // try to insert empty document + documents := []string{"", "normal document"} + + _, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents)) + + // should fail with empty document + common.CheckErr(t, err, false, "TextEmbedding function does not support empty text") +} + +// TestInsertLongDocument tests insertion with very long document +func TestInsertLongDocument(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function (no truncate) + params := map[string]any{ + "provider": "TEI", + "endpoint": hp.GetTEIEndpoint(), + "truncate": "false", + } + function := hp.TNewTextEmbeddingFunction("document", "dense", params) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535) + + _, schema := hp.CollPrepare.CreateCollection( + ctx, t, mc, + hp.NewCreateCollectionParams(hp.TextEmbedding), + fieldsOption, + schemaOption, + hp.TWithConsistencyLevel(entity.ClStrong), + ) + + // try to insert very long document that exceeds model limits + longDocument := hp.GenLongText(8192, "english") // Very long text + documents := []string{longDocument} + + _, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents)) + + // should fail with long document when truncate is false + common.CheckErr(t, err, false, "Call service failed") +} + +// TestInvalidEndpointHandling tests various invalid endpoint scenarios +func TestInvalidEndpointHandling(t *testing.T) { + testCases := []struct { + name string + endpoint string + errMsg string + }{ + {"NonExistentHost", "http://nonexistent-host:8080", "nonexistent-host"}, + {"InvalidPort", "http://localhost:99999", "99999"}, + {"InvalidProtocol", "ftp://localhost:8080", "ftp"}, + {"EmptyEndpoint", "", "endpoint"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with invalid endpoint + function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{ + "provider": "TEI", + "endpoint": tc.endpoint, + }) + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535) + + // collection creation should fail for invalid endpoints + collectionName := common.GenRandomString("test_invalid", 6) + err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption( + collectionName, + hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))), + )) + + common.CheckErr(t, err, false, tc.errMsg) + t.Logf("Expected error for %s: %v", tc.name, err) + }) + } +} + +// TestMissingRequiredParameters tests creation with missing required parameters +func TestMissingRequiredParameters(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout) + mc := hp.CreateDefaultMilvusClient(ctx, t) + + testCases := []struct { + name string + params map[string]any + errMsg string + }{ + {"MissingProvider", map[string]any{"endpoint": hp.GetTEIEndpoint()}, "provider"}, + {"MissingEndpoint", map[string]any{"provider": "TEI"}, "endpoint"}, + {"WrongProvider", map[string]any{"provider": "InvalidProvider", "endpoint": hp.GetTEIEndpoint()}, "invalidprovider"}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // create function with incomplete parameters + function := entity.NewFunction(). + WithName("incomplete_func"). + WithInputFields("document"). + WithOutputFields("dense"). + WithType(entity.FunctionTypeTextEmbedding) + + for key, value := range tc.params { + function.WithParam(key, value) + } + + schemaOption := hp.TNewSchemaOption().TWithFunction(function) + fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535) + + // collection creation should fail + err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption( + common.GenRandomString("test_incomplete", 6), + hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))), + )) + + common.CheckErr(t, err, false, tc.errMsg) + t.Logf("Expected error for %s: %v", tc.name, err) + }) + } +} + +// TestConcurrentOperations tests concurrent text embedding operations +func TestConcurrentOperations(t *testing.T) { + ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout*2) // longer timeout for concurrent ops + mc := hp.CreateDefaultMilvusClient(ctx, t) + + // create collection with TEI function + prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), hp.TNewTextEmbeddingFieldsOption(), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong)) + + // create index and load + prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)})) + prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName)) + + // concurrent inserts + t.Run("ConcurrentInserts", func(t *testing.T) { + numRoutines := 5 + documentsPerRoutine := 5 + + results := make(chan error, numRoutines) + + for i := 0; i < numRoutines; i++ { + go func(routineID int) { + documents := make([]string, documentsPerRoutine) + for j := 0; j < documentsPerRoutine; j++ { + documents[j] = fmt.Sprintf("Concurrent document from routine %d, doc %d", routineID, j) + } + + _, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents)) + results <- err + }(i) + } + + // wait for all goroutines to complete + for i := 0; i < numRoutines; i++ { + err := <-results + require.NoError(t, err, "Concurrent insert should succeed") + } + + t.Logf("Successfully completed %d concurrent inserts with %d documents each", numRoutines, documentsPerRoutine) + }) + + // concurrent searches + t.Run("ConcurrentSearches", func(t *testing.T) { + numRoutines := 3 + + results := make(chan error, numRoutines) + + for i := 0; i < numRoutines; i++ { + go func(routineID int) { + queryText := fmt.Sprintf("document routine %d", routineID) + _, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 5, []entity.Vector{entity.Text(queryText)}). + WithANNSField("dense"). + WithOutputFields("document")) + results <- err + }(i) + } + + // wait for all searches to complete + for i := 0; i < numRoutines; i++ { + err := <-results + require.NoError(t, err, "Concurrent search should succeed") + } + + t.Logf("Successfully completed %d concurrent searches", numRoutines) + }) +}