test:add text embedding function testcases in go client (#43875)

/kind improvement

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
zhuwenxing 2025-08-15 11:37:43 +08:00 committed by GitHub
parent c102fa8b0b
commit 1e31ad345b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 1177 additions and 10 deletions

View File

@ -4,7 +4,10 @@ import (
"bytes" "bytes"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io"
"math"
"math/rand" "math/rand"
"net/http"
"slices" "slices"
"strconv" "strconv"
"strings" "strings"
@ -539,6 +542,154 @@ func GetBm25FunctionsOutputFields(schema *entity.Schema) []string {
return outputFields return outputFields
} }
// GetTextEmbeddingFunctionsOutputFields collects, in declaration order, the
// output field names of every function on schema whose type is
// entity.FunctionTypeTextEmbedding. The result is nil when no such function
// contributes a name.
func GetTextEmbeddingFunctionsOutputFields(schema *entity.Schema) []string {
	var names []string
	for _, function := range schema.Functions {
		if function.Type != entity.FunctionTypeTextEmbedding {
			continue
		}
		names = append(names, function.OutputFieldNames...)
	}
	return names
}
// GetAllFunctionsOutputFields collects, in declaration order, the output field
// names of every BM25 or text-embedding function declared on schema. Functions
// of any other type are ignored.
func GetAllFunctionsOutputFields(schema *entity.Schema) []string {
	var names []string
	for _, function := range schema.Functions {
		switch function.Type {
		case entity.FunctionTypeBM25, entity.FunctionTypeTextEmbedding:
			names = append(names, function.OutputFieldNames...)
		}
	}
	return names
}
// GenTextDocuments returns count synthetic documents for embedding tests.
//
// lang selects the sentence pool: "english"/"en" and "chinese"/"zh" each have
// ten sentences; any other value falls back to a five-sentence English pool.
// Document i is the (i mod pool size)-th sentence followed by
// ". Document ID: <i>", so every document is distinct even when the pool wraps.
func GenTextDocuments(count int, lang string) []string {
	var corpus []string
	switch lang {
	case "english", "en":
		corpus = []string{
			"This is a document about artificial intelligence and machine learning technologies in modern computing systems",
			"Vector databases enable efficient similarity search for high-dimensional data in AI applications",
			"Text embeddings transform natural language into numerical representations for semantic understanding",
			"Information retrieval systems help users find relevant documents from large collections of data",
			"Natural language processing enables computers to understand and generate human language effectively",
			"Database management systems provide structured storage and efficient querying of information",
			"Search algorithms rank and retrieve the most relevant results for user queries",
			"Machine learning models learn patterns from data to make predictions and classifications",
			"Deep learning neural networks process complex patterns in images, text, and other data types",
			"Data science combines statistics, programming, and domain knowledge to extract insights",
		}
	case "chinese", "zh":
		corpus = []string{
			"这是关于人工智能和机器学习技术的文档,介绍现代计算系统中的应用",
			"向量数据库为高维数据提供高效的相似性搜索功能支持AI应用开发",
			"文本嵌入技术将自然语言转换为数值表示,实现语义理解和分析",
			"信息检索系统帮助用户从大规模数据集合中找到相关的文档内容",
			"自然语言处理技术使计算机能够理解和生成人类语言",
			"数据库管理系统提供结构化存储和高效的信息查询功能",
			"搜索算法对用户查询结果进行排序和检索,返回最相关的内容",
			"机器学习模型从数据中学习模式,进行预测和分类任务",
			"深度学习神经网络处理图像、文本等复杂数据类型中的模式",
			"数据科学结合统计学、编程和领域知识来提取有价值的洞察",
		}
	default:
		// Unrecognised languages use a short generic English pool.
		corpus = []string{
			"Document about technology and innovation in the digital age",
			"Analysis of modern computing systems and their applications",
			"Research on data processing and information management",
			"Study of algorithms and their implementation in software",
			"Overview of database systems and their optimization techniques",
		}
	}

	docs := make([]string, count)
	for idx := range docs {
		docs[idx] = fmt.Sprintf("%s. Document ID: %d", corpus[idx%len(corpus)], idx)
	}
	return docs
}
// CosineSimilarity returns the cosine of the angle between a and b.
//
// It returns 0 when the vectors have different lengths, when they are empty,
// or when either vector has zero magnitude.
//
// The dot product and squared norms are accumulated in float64. Squaring and
// summing float32 components directly overflows to +Inf for large values
// (yielding NaN) and underflows to 0 for very small values (yielding a bogus
// 0), and also loses precision on high-dimensional vectors.
func CosineSimilarity(a, b []float32) float32 {
	if len(a) != len(b) || len(a) == 0 {
		return 0
	}

	var dot, normA, normB float64
	for i := range a {
		x, y := float64(a[i]), float64(b[i])
		dot += x * y
		normA += x * x
		normB += y * y
	}

	if normA == 0 || normB == 0 {
		return 0
	}
	return float32(dot / (math.Sqrt(normA) * math.Sqrt(normB)))
}
// GenLongText returns wordCount space-separated words drawn cyclically from a
// fixed vocabulary. lang "chinese"/"zh" and "english"/"en" select dedicated
// vocabularies; any other value uses a generic English word list. A wordCount
// of 0 yields the empty string.
func GenLongText(wordCount int, lang string) string {
	var vocab []string
	switch lang {
	case "chinese", "zh":
		vocab = []string{"人工智能", "机器学习", "深度学习", "神经网络", "数据挖掘", "自然语言", "处理技术", "计算机", "算法优化", "信息检索", "向量数据库", "语义搜索", "文本分析", "知识图谱", "智能系统"}
	case "english", "en":
		vocab = []string{"artificial", "intelligence", "machine", "learning", "deep", "neural", "network", "algorithm", "database", "search", "vector", "embedding", "semantic", "analysis", "information", "retrieval", "computing", "technology", "system", "data", "processing", "optimization", "performance", "scalability", "efficiency"}
	default:
		vocab = []string{"the", "quick", "brown", "fox", "jumps", "over", "lazy", "dog", "and", "runs", "through", "forest", "with", "great", "speed", "while", "chasing", "rabbit", "under", "bright", "moonlight", "across", "green", "fields", "toward", "distant", "mountains"}
	}

	picked := make([]string, wordCount)
	for pos := range picked {
		// Wrap around once the vocabulary is exhausted.
		picked[pos] = vocab[pos%len(vocab)]
	}
	return strings.Join(picked, " ")
}
// CallTEIDirectly posts texts to the "/embed" route of a TEI (Text Embeddings
// Inference) service and returns one embedding per input text.
//
// endpoint is the base URL of the service; a trailing "/" is tolerated. The
// request body is {"inputs": [...]} and the response is decoded as a JSON
// array of float arrays.
//
// An error is returned when the request cannot be encoded or sent, when the
// body cannot be read, when the service answers with a status other than
// 200 OK, or when the body is not the expected JSON shape.
func CallTEIDirectly(endpoint string, texts []string) ([][]float32, error) {
	// TEI API request structure.
	type teiRequest struct {
		Inputs []string `json:"inputs"`
	}

	payload, err := json.Marshal(teiRequest{Inputs: texts})
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}

	// Normalise the base URL so "http://host/" does not become "http://host//embed".
	url := strings.TrimRight(endpoint, "/") + "/embed"
	resp, err := http.Post(url, "application/json", bytes.NewReader(payload))
	if err != nil {
		return nil, fmt.Errorf("failed to call TEI endpoint: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %w", err)
	}

	// Reject error replies explicitly instead of trying to decode them as
	// embeddings; include the body so the caller can see the service's message.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("TEI endpoint returned status %d: %s", resp.StatusCode, body)
	}

	// TEI returns an array of arrays, one inner array per input.
	var embeddings [][]float32
	if err := json.Unmarshal(body, &embeddings); err != nil {
		return nil, fmt.Errorf("failed to unmarshal response: %w", err)
	}
	return embeddings, nil
}
func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]column.Column, []column.Column) { func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]column.Column, []column.Column) {
if nil == schema || schema.CollectionName == "" { if nil == schema || schema.CollectionName == "" {
log.Fatal("[GenColumnsBasedSchema] Nil Schema is not expected") log.Fatal("[GenColumnsBasedSchema] Nil Schema is not expected")
@ -557,7 +708,7 @@ func GenColumnsBasedSchema(schema *entity.Schema, option *GenDataOption) ([]colu
if option.fieldName == "" { if option.fieldName == "" {
option.fieldName = field.Name option.fieldName = field.Name
} }
if slices.Contains(GetBm25FunctionsOutputFields(schema), field.Name) { if slices.Contains(GetAllFunctionsOutputFields(schema), field.Name) {
continue continue
} }
log.Info("GenColumnsBasedSchema", zap.Any("field", field)) log.Info("GenColumnsBasedSchema", zap.Any("field", field))

View File

@ -99,15 +99,16 @@ type CollectionFieldsType int32
const ( const (
// FieldTypeNone zero value place holder // FieldTypeNone zero value place holder
Int64Vec CollectionFieldsType = 1 // int64 + floatVec Int64Vec CollectionFieldsType = 1 // int64 + floatVec
VarcharBinary CollectionFieldsType = 2 // varchar + binaryVec VarcharBinary CollectionFieldsType = 2 // varchar + binaryVec
Int64VecJSON CollectionFieldsType = 3 // int64 + floatVec + json Int64VecJSON CollectionFieldsType = 3 // int64 + floatVec + json
Int64VecArray CollectionFieldsType = 4 // int64 + floatVec + array Int64VecArray CollectionFieldsType = 4 // int64 + floatVec + array
Int64VarcharSparseVec CollectionFieldsType = 5 // int64 + varchar + sparse vector Int64VarcharSparseVec CollectionFieldsType = 5 // int64 + varchar + sparse vector
Int64MultiVec CollectionFieldsType = 6 // int64 + floatVec + binaryVec + fp16Vec + bf16vec Int64MultiVec CollectionFieldsType = 6 // int64 + floatVec + binaryVec + fp16Vec + bf16vec
AllFields CollectionFieldsType = 7 // all fields excepted sparse AllFields CollectionFieldsType = 7 // all fields excepted sparse
Int64VecAllScalar CollectionFieldsType = 8 // int64 + floatVec + all scalar fields Int64VecAllScalar CollectionFieldsType = 8 // int64 + floatVec + all scalar fields
FullTextSearch CollectionFieldsType = 9 // int64 + varchar + sparse vector + analyzer + function FullTextSearch CollectionFieldsType = 9 // int64 + varchar + sparse vector + analyzer + function
TextEmbedding CollectionFieldsType = 10 // int64 + varchar + float_vector + text_embedding_function
) )
type GenFieldsOption struct { type GenFieldsOption struct {
@ -373,6 +374,23 @@ func (cf FieldsFullTextSearch) GenFields(option GenFieldsOption) []*entity.Field
return fields return fields
} }
// FieldsTextEmbedding produces the field set for text-embedding collections.
type FieldsTextEmbedding struct{}

// GenFields returns three fields, in this order:
//   - an int64 primary key (marked auto-ID when option.AutoID is set),
//   - a varchar field named "document" using option.MaxLength and
//     option.IsPartitionKey,
//   - a float vector field named "dense" with dimension option.Dim.
func (cf FieldsTextEmbedding) GenFields(option GenFieldsOption) []*entity.Field {
	primary := entity.NewField().
		WithName(GetFieldNameByFieldType(entity.FieldTypeInt64)).
		WithDataType(entity.FieldTypeInt64).
		WithIsPrimaryKey(true)
	if option.AutoID {
		primary.WithIsAutoID(option.AutoID)
	}

	document := entity.NewField().
		WithName("document").
		WithDataType(entity.FieldTypeVarChar).
		WithMaxLength(option.MaxLength).
		WithIsPartitionKey(option.IsPartitionKey)

	dense := entity.NewField().
		WithName("dense").
		WithDataType(entity.FieldTypeFloatVector).
		WithDim(option.Dim)

	return []*entity.Field{primary, document, dense}
}
func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFieldsType, option *GenFieldsOption) []*entity.Field { func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFieldsType, option *GenFieldsOption) []*entity.Field {
log.Info("GenFieldsForCollection", zap.Any("GenFieldsOption", option)) log.Info("GenFieldsForCollection", zap.Any("GenFieldsOption", option))
switch collectionFieldsType { switch collectionFieldsType {
@ -394,7 +412,14 @@ func (ff FieldsFactory) GenFieldsForCollection(collectionFieldsType CollectionFi
return FieldsInt64VecAllScalar{}.GenFields(*option) return FieldsInt64VecAllScalar{}.GenFields(*option)
case FullTextSearch: case FullTextSearch:
return FieldsFullTextSearch{}.GenFields(*option) return FieldsFullTextSearch{}.GenFields(*option)
case TextEmbedding:
return FieldsTextEmbedding{}.GenFields(*option)
default: default:
return FieldsInt64Vec{}.GenFields(*option) return FieldsInt64Vec{}.GenFields(*option)
} }
} }
// TNewTextEmbeddingFieldsOption returns a fields option preset for
// text-embedding collections: vector dimension taken from GetTEIModelDim,
// auto-ID enabled, and a varchar max length of 65535.
func TNewTextEmbeddingFieldsOption() *GenFieldsOption {
	modelDim := int64(GetTEIModelDim())
	return TNewFieldsOption().
		TWithDim(modelDim).
		TWithAutoID(true).
		TWithMaxLen(65535)
}

View File

@ -12,3 +12,19 @@ func TNewBM25Function(inputField, outputField string) *entity.Function {
WithOutputFields(outputField). WithOutputFields(outputField).
WithType(entity.FunctionTypeBM25) WithType(entity.FunctionTypeBM25)
} }
// TNewTextEmbeddingFunction builds a text-embedding function that reads
// inputField and writes outputField. The function is named
// "<inputField>_text_emb". Every entry of params (provider, endpoint, and any
// provider-specific settings) is copied onto the function as a parameter.
func TNewTextEmbeddingFunction(inputField, outputField string, params map[string]any) *entity.Function {
	fn := entity.NewFunction().
		WithName(inputField + "_text_emb").
		WithInputFields(inputField).
		WithOutputFields(outputField).
		WithType(entity.FunctionTypeTextEmbedding)

	// Forward all caller-supplied parameters, provider included.
	for name := range params {
		fn.WithParam(name, params[name])
	}
	return fn
}

View File

@ -75,3 +75,12 @@ func GenSchema(option *GenSchemaOption) *entity.Schema {
} }
return schema return schema
} }
// TNewTextEmbeddingSchemaOption returns a schema option carrying a TEI
// text-embedding function that maps the "document" field to the "dense"
// field, using the endpoint reported by GetTEIEndpoint.
func TNewTextEmbeddingSchemaOption() *GenSchemaOption {
	teiParams := map[string]any{
		"provider": "TEI",
		"endpoint": GetTEIEndpoint(),
	}
	embedFn := TNewTextEmbeddingFunction("document", "dense", teiParams)
	return TNewSchemaOption().TWithFunction(embedFn)
}

View File

@ -19,6 +19,8 @@ var (
user = flag.String("user", "root", "user") user = flag.String("user", "root", "user")
password = flag.String("password", "Milvus", "password") password = flag.String("password", "Milvus", "password")
logLevel = flag.String("log.level", "info", "log level for test") logLevel = flag.String("log.level", "info", "log level for test")
teiEndpoint = flag.String("tei_endpoint", "http://text-embeddings-service.milvus-ci.svc.cluster.local:80", "TEI service endpoint for text embedding tests")
teiModelDim = flag.Int("tei_model_dim", 768, "Vector dimension for text embedding model")
defaultClientConfig *client.ClientConfig defaultClientConfig *client.ClientConfig
) )
@ -42,6 +44,14 @@ func GetPassword() string {
return *password return *password
} }
// GetTEIEndpoint returns the current value of the -tei_endpoint flag, the TEI
// service endpoint used by the text embedding tests.
func GetTEIEndpoint() string {
	endpoint := *teiEndpoint
	return endpoint
}
// GetTEIModelDim returns the current value of the -tei_model_dim flag, the
// vector dimension of the text embedding model.
func GetTEIModelDim() int {
	dim := *teiModelDim
	return dim
}
func parseLogConfig() { func parseLogConfig() {
log.Info("Parser Log Level", zap.String("logLevel", *logLevel)) log.Info("Parser Log Level", zap.String("logLevel", *logLevel))
switch *logLevel { switch *logLevel {

View File

@ -0,0 +1,956 @@
package testcases
import (
"fmt"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/milvus-io/milvus/client/v2/column"
"github.com/milvus-io/milvus/client/v2/entity"
"github.com/milvus-io/milvus/client/v2/index"
"github.com/milvus-io/milvus/client/v2/milvusclient"
"github.com/milvus-io/milvus/tests/go_client/common"
hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper"
)
// TestCreateCollectionWithTextEmbedding creates a collection that carries a TEI
// text embedding function and checks that DescribeCollection reports exactly
// that function with the expected name, type, and input/output fields.
func TestCreateCollectionWithTextEmbedding(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Build the collection from the shared text-embedding presets.
	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		hp.TNewTextEmbeddingFieldsOption(),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong),
	)
	require.NotNil(t, prepare)
	require.NotNil(t, schema)

	// The described schema must expose the single function that was declared.
	described, err := mc.DescribeCollection(ctx, milvusclient.NewDescribeCollectionOption(schema.CollectionName))
	common.CheckErr(t, err, true)
	require.Len(t, described.Schema.Functions, 1)

	fn := described.Schema.Functions[0]
	require.Equal(t, "document_text_emb", fn.Name)
	require.Equal(t, entity.FunctionTypeTextEmbedding, fn.Type)
	require.Equal(t, []string{"document"}, fn.InputFieldNames)
	require.Equal(t, []string{"dense"}, fn.OutputFieldNames)
}
// TestCreateCollectionWithTextEmbeddingTwice creates the same text-embedding
// collection twice under one name and expects both attempts to succeed, with
// the collection still reporting a single function afterwards.
func TestCreateCollectionWithTextEmbeddingTwice(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Shared inputs for both creation attempts.
	teiParams := map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	}
	embedFn := hp.TNewTextEmbeddingFunction("document", "dense", teiParams)
	schemaOption := hp.TNewSchemaOption().TWithFunction(embedFn)
	fieldsOption := hp.TNewFieldsOption().
		TWithDim(int64(hp.GetTEIModelDim())).
		TWithAutoID(true).
		TWithMaxLen(65535)
	collectionName := common.GenRandomString("text_embedding", 6)
	createParams := hp.NewCreateCollectionParams(hp.TextEmbedding)

	// First attempt.
	firstPrepare, firstSchema := hp.CollPrepare.CreateCollection(
		ctx, t, mc, createParams, fieldsOption,
		schemaOption.TWithName(collectionName),
		hp.TWithConsistencyLevel(entity.ClStrong),
	)
	require.NotNil(t, firstPrepare)
	require.NotNil(t, firstSchema)

	// Second attempt with the same name and schema is expected to succeed too.
	secondPrepare, secondSchema := hp.CollPrepare.CreateCollection(
		ctx, t, mc, createParams, fieldsOption,
		schemaOption.TWithName(collectionName),
		hp.TWithConsistencyLevel(entity.ClStrong),
	)
	require.NotNil(t, secondPrepare)
	require.NotNil(t, secondSchema)

	// The function must still be present exactly once.
	described, err := mc.DescribeCollection(ctx, milvusclient.NewDescribeCollectionOption(collectionName))
	common.CheckErr(t, err, true)
	require.Len(t, described.Schema.Functions, 1)
}
// TestCreateCollectionUnsupportedEndpoint declares a TEI function pointing at
// an unreachable endpoint and expects CreateCollection to fail with an error
// that mentions that endpoint.
func TestCreateCollectionUnsupportedEndpoint(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Function bound to an endpoint that is not a TEI service.
	badParams := map[string]any{
		"provider": "TEI",
		"endpoint": "http://unsupported_endpoint",
	}
	embedFn := hp.TNewTextEmbeddingFunction("document", "dense", badParams)
	schemaOption := hp.TNewSchemaOption().TWithFunction(embedFn)
	fieldsOption := hp.TNewFieldsOption().
		TWithDim(int64(hp.GetTEIModelDim())).
		TWithAutoID(true).
		TWithMaxLen(65535)

	// Call the client directly so the creation error can be inspected.
	collectionName := common.GenRandomString("text_embedding", 6)
	fields := hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption)
	schema := hp.GenSchema(schemaOption.TWithFields(fields))
	err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(collectionName, schema))

	// Creation must be rejected because of the unsupported endpoint.
	common.CheckErr(t, err, false, "unsupported_endpoint")
}
// TestCreateCollectionUnmatchedDim declares a vector field whose dimension
// differs from the TEI model's dimension and expects CreateCollection to fail
// with the dimension-mismatch error.
func TestCreateCollectionUnmatchedDim(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Pick a dimension that is guaranteed to differ from the configured model
	// dimension. The model dimension comes from the -tei_model_dim flag, so a
	// fixed 512 would stop being "wrong" if the suite ran against a 512-dim model.
	modelDim := hp.GetTEIModelDim()
	wrongDim := int64(512)
	if wrongDim == int64(modelDim) {
		wrongDim = 256
	}

	function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})
	schemaOption := hp.TNewSchemaOption().TWithFunction(function)
	fieldsOption := hp.TNewFieldsOption().TWithDim(wrongDim).TWithAutoID(true).TWithMaxLen(65535)

	collectionName := common.GenRandomString("text_embedding", 6)

	// Collection creation should fail with the dimension mismatch error.
	err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(
		collectionName,
		hp.GenSchema(schemaOption.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption))),
	))

	// The message names the declared dimension first, then the model's.
	expectedError := fmt.Sprintf("required embedding dim is [%d], but the embedding obtained from the model is [%d]", wrongDim, modelDim)
	common.CheckErr(t, err, false, expectedError)
}
// TestInsertWithTextEmbedding inserts rows that contain only the text field
// and verifies that a "dense" vector of the model's dimension was generated
// for every inserted row.
func TestInsertWithTextEmbedding(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create collection with TEI function
	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		hp.TNewTextEmbeddingFieldsOption(),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong),
	)

	// prepare test data - only provide text, embedding will be auto-generated
	nb := 10
	documents := make([]string, nb)
	for i := 0; i < nb; i++ {
		documents[i] = fmt.Sprintf("This is test document number %d with some content for embedding", i)
	}

	// insert data using only text field
	res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(nb), res.InsertCount)

	// create index and load
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// query to verify vectors were generated
	resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
		WithFilter("").
		WithOutputFields("dense").
		WithConsistencyLevel(entity.ClStrong).
		WithLimit(nb))
	common.CheckErr(t, err, true)
	require.Greater(t, len(resQuery.Fields), 0)
	require.Equal(t, nb, resQuery.Len(), "Every inserted document should be returned")

	// Actually verify the generated vectors: the previous check only asserted
	// that the column was non-nil, and only when rows happened to come back.
	denseColumn := resQuery.GetColumn("dense")
	require.NotNil(t, denseColumn)
	floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector)
	require.True(t, ok, "Dense column should be a float vector column")
	require.Len(t, floatVecColumn.Data(), nb)
	for i, vec := range floatVecColumn.Data() {
		require.Equal(t, hp.GetTEIModelDim(), len(vec), "Embedding %d should have the model dimension", i)
	}
}
// TestInsertWithTruncateParams inserts over-long text through a TEI function
// configured with different truncate settings.
//
// With truncate enabled the insert must succeed, and the embedding of the
// combined text must be closer to the part that the truncation direction
// keeps ("Right" keeps the left part, "Left" keeps the right part). With
// truncate disabled the insert is expected to fail with "Payload Too Large".
func TestInsertWithTruncateParams(t *testing.T) {
	testCases := []struct {
		name                string
		truncate            bool
		truncationDirection string
		shouldSucceed       bool
	}{
		{"truncate_true_right", true, "Right", true},
		{"truncate_true_left", true, "Left", true},
		{"truncate_false", false, "", false}, // should fail with long text
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
			mc := hp.CreateDefaultMilvusClient(ctx, t)

			// create TEI function with truncate parameters
			params := map[string]any{
				"provider": "TEI",
				"endpoint": hp.GetTEIEndpoint(),
			}
			if tc.truncate {
				params["truncate"] = "true"
				params["truncation_direction"] = tc.truncationDirection
			} else {
				params["truncate"] = "false"
			}

			function := hp.TNewTextEmbeddingFunction("document", "dense", params)
			schemaOption := hp.TNewSchemaOption().TWithFunction(function)
			fieldsOption := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim())).TWithAutoID(true).TWithMaxLen(65535)

			prepare, schema := hp.CollPrepare.CreateCollection(
				ctx, t, mc,
				hp.NewCreateCollectionParams(hp.TextEmbedding),
				fieldsOption,
				schemaOption,
				hp.TWithConsistencyLevel(entity.ClStrong),
			)

			// Long text that needs truncation: two distinctly different halves
			// that together are intended to exceed the model's token limit.
			leftPart := "artificial intelligence machine learning deep learning neural networks computer vision natural language processing data science algorithms " + strings.Repeat("technology innovation science research development analysis ", 100)
			rightPart := "database systems vector search embeddings similarity matching retrieval information storage indexing " + strings.Repeat("query performance optimization scalability distributed computing ", 100)
			longText := leftPart + " " + rightPart
			documents := []string{longText, leftPart, rightPart}

			// insert data
			res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
			if !tc.shouldSucceed {
				common.CheckErr(t, err, false, "Payload Too Large")
				return
			}
			common.CheckErr(t, err, true)
			require.Equal(t, int64(len(documents)), res.InsertCount)

			// Index and load through the shared helpers used by the sibling
			// tests, rather than firing the raw client calls and discarding
			// the task handles they return.
			prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
			prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

			// Query embeddings from Milvus
			resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
				WithFilter("").
				WithOutputFields("dense", "document").
				WithConsistencyLevel(entity.ClStrong).
				WithLimit(10))
			common.CheckErr(t, err, true)
			require.Equal(t, len(documents), resQuery.Len(), "Should have 3 documents: longText, leftPart, rightPart")

			denseColumn := resQuery.GetColumn("dense")
			require.NotNil(t, denseColumn)
			floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector)
			require.True(t, ok, "Dense column should be a float vector column")

			docColumn := resQuery.GetColumn("document")
			require.NotNil(t, docColumn)
			varCharColumn, ok := docColumn.(*column.ColumnVarChar)
			require.True(t, ok, "Document column should be a varchar column")

			// Pair each embedding with its text instead of relying on the
			// query returning rows in insertion order.
			embeddingByDoc := make(map[string][]float32, resQuery.Len())
			for i := 0; i < resQuery.Len(); i++ {
				embeddingByDoc[varCharColumn.Data()[i]] = floatVecColumn.Data()[i]
			}

			// embeddings: [0]=longText, [1]=leftPart, [2]=rightPart
			embeddings := make([][]float32, len(documents))
			for i, doc := range documents {
				embedding, found := embeddingByDoc[doc]
				require.True(t, found, "Inserted document %d should be returned by the query", i)
				require.Equal(t, hp.GetTEIModelDim(), len(embedding), "Embedding should have correct dimension")

				// An all-zero embedding would indicate a failed embedding call.
				var sum float32
				for _, val := range embedding {
					sum += val * val
				}
				require.Greater(t, sum, float32(0.01), "Embedding should not be all zeros for document %d", i)

				embeddings[i] = embedding
			}

			// Compare the combined text against each half.
			similarityLeft := hp.CosineSimilarity(embeddings[0], embeddings[1])
			similarityRight := hp.CosineSimilarity(embeddings[0], embeddings[2])

			t.Logf("Similarity longText vs leftPart: %.6f", similarityLeft)
			t.Logf("Similarity longText vs rightPart: %.6f", similarityRight)

			// Validation based on truncation direction:
			// - "Left" drops the left side, so longText should resemble rightPart
			// - "Right" drops the right side, so longText should resemble leftPart
			if tc.truncationDirection == "Left" {
				require.Greater(t, similarityRight, similarityLeft,
					"With Left truncation, longText should be more similar to rightPart (%.6f) than leftPart (%.6f)",
					similarityRight, similarityLeft)
				t.Logf("Left truncation verified: rightPart similarity (%.6f) > leftPart similarity (%.6f)",
					similarityRight, similarityLeft)
			} else { // "Right"
				require.Greater(t, similarityLeft, similarityRight,
					"With Right truncation, longText should be more similar to leftPart (%.6f) than rightPart (%.6f)",
					similarityLeft, similarityRight)
				t.Logf("Right truncation verified: leftPart similarity (%.6f) > rightPart similarity (%.6f)",
					similarityLeft, similarityRight)
			}

			t.Logf("Successfully inserted %d documents with truncate=%v, direction=%s", len(documents), tc.truncate, tc.truncationDirection)
		})
	}
}
// TestVerifyEmbeddingConsistency inserts a few documents through the Milvus
// text embedding function, fetches embeddings for the same texts straight from
// the TEI endpoint, and requires each pair to have cosine similarity above
// 0.99. The test is skipped when the direct TEI call returns an error.
func TestVerifyEmbeddingConsistency(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Collection with a TEI function and caller-supplied primary keys, so rows
	// can be matched back to their source text by ID.
	teiParams := map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	}
	embedFn := hp.TNewTextEmbeddingFunction("document", "dense", teiParams)
	schemaOption := hp.TNewSchemaOption().TWithFunction(embedFn)
	fieldsOption := hp.TNewFieldsOption().
		TWithDim(int64(hp.GetTEIModelDim())).
		TWithAutoID(false).
		TWithMaxLen(65535)

	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		fieldsOption,
		schemaOption,
		hp.TWithConsistencyLevel(entity.ClStrong),
	)

	testDocs := []string{
		"This is a test document about artificial intelligence",
		"Vector databases enable semantic search capabilities",
		"Text embeddings transform language into numbers",
	}

	// Insert with IDs 1..3; Milvus computes the embeddings.
	ids := []int64{1, 2, 3}
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithInt64Column(common.DefaultInt64FieldName, ids).
		WithVarcharColumn("document", testDocs)
	insertRes, err := mc.Insert(ctx, insertOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, int64(len(testDocs)), insertRes.InsertCount)

	// Index and load before querying.
	indexes := map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(indexes))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// Read the stored vectors back.
	queryOpt := milvusclient.NewQueryOption(schema.CollectionName).
		WithFilter("").
		WithOutputFields("dense", "document", common.DefaultInt64FieldName).
		WithConsistencyLevel(entity.ClStrong).
		WithLimit(10)
	queried, err := mc.Query(ctx, queryOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, len(testDocs), queried.Len())

	// Reference embeddings straight from TEI. Skipf stops the test itself.
	reference, err := hp.CallTEIDirectly(hp.GetTEIEndpoint(), testDocs)
	if err != nil {
		t.Skipf("Skip consistency test - could not connect to TEI endpoint: %v", err)
	}
	require.Equal(t, len(testDocs), len(reference))

	denseColumn := queried.GetColumn("dense")
	require.NotNil(t, denseColumn)

	// The ID column ties each returned row to its source document.
	idColumn := queried.GetColumn(common.DefaultInt64FieldName)
	require.NotNil(t, idColumn)

	vectors, ok := denseColumn.(*column.ColumnFloatVector)
	require.True(t, ok, "Dense column should be a float vector column")

	for row := 0; row < queried.Len(); row++ {
		id, err := idColumn.GetAsInt64(row)
		require.NoError(t, err)
		refIdx := id - 1 // IDs are 1-based, the reference slice is 0-based

		stored := vectors.Data()[row]
		require.NotNil(t, stored)
		require.Equal(t, hp.GetTEIModelDim(), len(stored), "Embedding dimension should match")

		similarity := hp.CosineSimilarity(stored, reference[refIdx])
		t.Logf("Document %d (ID=%d) similarity between Milvus and TEI: %.6f", row, id, similarity)

		// The two embeddings should be nearly identical.
		require.Greater(t, similarity, float32(0.99),
			"Milvus embedding should be nearly identical to TEI embedding for document ID %d", id)
	}

	t.Log("Embedding consistency verified: Milvus text embedding function produces same results as direct TEI calls")
}
// TestUpsertTextFieldUpdatesEmbedding inserts one row, upserts the same
// primary key with different text, and checks that both the stored text and
// the generated embedding changed (cosine similarity between the old and new
// embedding must be below 0.95).
func TestUpsertTextFieldUpdatesEmbedding(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Auto-ID is disabled so the upsert can target a known primary key.
	teiParams := map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	}
	embedFn := hp.TNewTextEmbeddingFunction("document", "dense", teiParams)
	schemaOption := hp.TNewSchemaOption().TWithFunction(embedFn)
	fieldsOption := hp.TNewFieldsOption().
		TWithDim(int64(hp.GetTEIModelDim())).
		TWithAutoID(false).
		TWithMaxLen(65535)

	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		fieldsOption,
		schemaOption,
		hp.TWithConsistencyLevel(entity.ClStrong),
	)

	// Index and load up front.
	indexes := map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(indexes))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// Seed a single row with primary key 1.
	oldText := "This is the original text content"
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithInt64Column(common.DefaultInt64FieldName, []int64{1}).
		WithVarcharColumn("document", []string{oldText})
	insertRes, err := mc.Insert(ctx, insertOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, int64(1), insertRes.InsertCount)

	// Snapshot the row before the upsert.
	before, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
		WithFilter("int64 == 1").
		WithOutputFields("document", "dense").
		WithConsistencyLevel(entity.ClStrong))
	common.CheckErr(t, err, true)
	require.Equal(t, 1, before.Len())

	beforeDense := before.GetColumn("dense")
	require.NotNil(t, beforeDense)
	beforeVectors, ok := beforeDense.(*column.ColumnFloatVector)
	require.True(t, ok, "Dense column should be a float vector column")
	originalEmbedding := beforeVectors.Data()[0]
	require.Equal(t, hp.GetTEIModelDim(), len(originalEmbedding), "Original embedding dimension should match")

	beforeDoc := before.GetColumn("document")
	require.NotNil(t, beforeDoc)
	beforeTexts, ok := beforeDoc.(*column.ColumnVarChar)
	require.True(t, ok, "Document column should be a varchar column")
	require.Equal(t, oldText, beforeTexts.Data()[0], "Original text should match")

	// Replace the text for the same primary key.
	newText := "This is completely different updated text content"
	upsertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithInt64Column(common.DefaultInt64FieldName, []int64{1}).
		WithVarcharColumn("document", []string{newText})
	upsertRes, err := mc.Upsert(ctx, upsertOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, int64(1), upsertRes.UpsertCount)

	// Snapshot the row after the upsert.
	after, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
		WithFilter("int64 == 1").
		WithOutputFields("document", "dense").
		WithConsistencyLevel(entity.ClStrong))
	common.CheckErr(t, err, true)
	require.Equal(t, 1, after.Len())

	afterDense := after.GetColumn("dense")
	require.NotNil(t, afterDense)
	afterVectors, ok := afterDense.(*column.ColumnFloatVector)
	require.True(t, ok, "Dense column should be a float vector column")
	updatedEmbedding := afterVectors.Data()[0]
	require.Equal(t, hp.GetTEIModelDim(), len(updatedEmbedding), "Updated embedding dimension should match")

	afterDoc := after.GetColumn("document")
	require.NotNil(t, afterDoc)
	afterTexts, ok := afterDoc.(*column.ColumnVarChar)
	require.True(t, ok, "Document column should be a varchar column")
	require.Equal(t, newText, afterTexts.Data()[0], "Updated text should match")

	// Key assertion: new text must have produced a noticeably different vector.
	similarity := hp.CosineSimilarity(originalEmbedding, updatedEmbedding)
	require.Less(t, similarity, float32(0.95),
		"Embeddings should be significantly different after text update (similarity=%.6f)", similarity)

	t.Logf("Upsert verification complete: Original and updated embeddings have cosine similarity %.6f (< 0.95)", similarity)
	t.Logf(" Original text: %s", oldText)
	t.Logf(" Updated text: %s", newText)
}
// TestDeleteAndSearch inserts three documents with explicit primary keys,
// deletes the one whose key is 1, and asserts that a subsequent text search on
// the dense field never returns the deleted key.
func TestDeleteAndSearch(t *testing.T) {
	const deletedID = int64(1)

	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Collection with a TEI function (document -> dense). AutoID is switched
	// off so the test controls the primary keys it later deletes by.
	teiFn := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})
	schemaOpt := hp.TNewSchemaOption().TWithFunction(teiFn)
	fieldsOpt := hp.TNewFieldsOption().
		TWithDim(int64(hp.GetTEIModelDim())).
		TWithAutoID(false).
		TWithMaxLen(65535)
	prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding), fieldsOpt, schemaOpt,
		hp.TWithConsistencyLevel(entity.ClStrong))

	// Seed the collection; only the primary key and the text are supplied.
	texts := []string{
		"This is test document 0",
		"This is test document 1",
		"This is test document 2",
	}
	pks := []int64{0, 1, 2}
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithInt64Column(common.DefaultInt64FieldName, pks).
		WithVarcharColumn("document", texts)
	insertRes, err := mc.Insert(ctx, insertOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, int64(len(pks)), insertRes.InsertCount)

	// Index the dense field and load the collection.
	denseIndex := map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(denseIndex))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// Remove the document with primary key 1.
	deleteRes, err := mc.Delete(ctx, milvusclient.NewDeleteOption(schema.CollectionName).WithExpr("int64 in [1]"))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(1), deleteRes.DeleteCount)

	// Search with the deleted document's text; its key must not come back.
	searchOpt := milvusclient.NewSearchOption(schema.CollectionName, 3, []entity.Vector{entity.Text("test document 1")}).
		WithANNSField("dense").
		WithOutputFields("document", common.DefaultInt64FieldName)
	resultSets, err := mc.Search(ctx, searchOpt)
	common.CheckErr(t, err, true)
	require.Greater(t, len(resultSets), 0)

	for _, rs := range resultSets {
		for idx := 0; idx < rs.Len(); idx++ {
			pk, err := rs.IDs.GetAsInt64(idx)
			require.NoError(t, err)
			require.NotEqual(t, deletedID, pk, "Deleted document should not appear in search results")
		}
	}
}
// TestSearchWithTextEmbedding inserts ten text documents into a text-embedding
// collection and runs a text query against the dense field, asserting that
// results are returned, the limit is honoured, and every score is positive.
func TestSearchWithTextEmbedding(t *testing.T) {
	const (
		docCount = 10
		topK     = 5
	)

	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		hp.TNewTextEmbeddingFieldsOption(),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong))

	// Build the corpus; each document differs only by its sequence number.
	corpus := make([]string, 0, docCount)
	for n := 0; n < docCount; n++ {
		corpus = append(corpus, fmt.Sprintf("This is test document number %d about artificial intelligence and machine learning", n))
	}

	// Only the text column is supplied on insert.
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", corpus)
	insertRes, err := mc.Insert(ctx, insertOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, int64(docCount), insertRes.InsertCount)

	// Index the dense field and load the collection.
	denseIndex := map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(denseIndex))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// Query with raw text against the dense field.
	query := "artificial intelligence machine learning"
	searchOpt := milvusclient.NewSearchOption(schema.CollectionName, topK, []entity.Vector{entity.Text(query)}).
		WithANNSField("dense").
		WithOutputFields("document")
	resultSets, err := mc.Search(ctx, searchOpt)
	common.CheckErr(t, err, true)
	require.Greater(t, len(resultSets), 0)

	for _, rs := range resultSets {
		require.Greater(t, rs.Len(), 0, "Should find relevant documents")
		require.LessOrEqual(t, rs.Len(), topK, "Should respect limit")

		// Only the scores are checked here; the returned text is not inspected.
		for idx := 0; idx < rs.Len(); idx++ {
			require.Greater(t, rs.Scores[idx], float32(0), "Score should be positive")
		}
	}
}
// TestSearchWithEmptyQuery verifies that a text search with an empty query
// string against a text-embedding collection is rejected with the
// "does not support empty text" error.
func TestSearchWithEmptyQuery(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create collection with TEI function
	prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		hp.TNewTextEmbeddingFieldsOption(),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong))

	// insert some test data
	documents := []string{"test document"}
	res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(1), res.InsertCount)

	// Create the index and load through the shared prepare helpers, exactly as
	// the other search tests in this file do, instead of calling the client
	// directly and throwing away the returned task handles. This keeps the
	// setup identical across tests, so the failure asserted below comes from
	// the empty query and not from a half-prepared collection.
	// NOTE(review): presumably the helpers wait for index/load completion — confirm in hp.
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// search with empty query should fail
	_, err = mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 3, []entity.Vector{entity.Text("")}).
		WithANNSField("dense"))
	common.CheckErr(t, err, false, "TextEmbedding function does not support empty text")
}
// TestHybridSearchTextEmbeddingBM25 builds a collection whose "document" field
// feeds both a TEI text-embedding function (output: "dense") and a BM25
// function (output: "sparse"), then checks that each output field can be
// searched with a raw text query.
//
// Note that the test issues separate dense and sparse searches; it does not
// send a single combined hybrid-search request.
func TestHybridSearchTextEmbeddingBM25(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create collection with both TEI text embedding and BM25 functions
	collectionName := common.GenRandomString("hybrid_search", 6)

	// create fields manually to support both dense and sparse vectors
	fields := []*entity.Field{
		entity.NewField().WithName(common.DefaultInt64FieldName).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true).WithIsAutoID(true),
		// The analyzer is enabled on the text field that the BM25 function reads.
		entity.NewField().WithName("document").WithDataType(entity.FieldTypeVarChar).WithMaxLength(65535).WithEnableAnalyzer(true).WithAnalyzerParams(map[string]any{"tokenizer": "standard"}),
		entity.NewField().WithName("dense").WithDataType(entity.FieldTypeFloatVector).WithDim(int64(hp.GetTEIModelDim())),
		entity.NewField().WithName("sparse").WithDataType(entity.FieldTypeSparseVector),
	}

	// create TEI text embedding function
	teiFunction := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})

	// create BM25 function
	bm25Function := hp.TNewBM25Function("document", "sparse")

	// create schema with both functions
	schema := entity.NewSchema().
		WithName(collectionName).
		WithDescription("Hybrid search collection with TEI and BM25").
		WithFunction(teiFunction).
		WithFunction(bm25Function)
	for _, field := range fields {
		schema.WithField(field)
	}

	// create collection
	err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(collectionName, schema))
	common.CheckErr(t, err, true)

	// insert test data with diverse content
	documents := []string{
		"Artificial intelligence and machine learning are transforming technology",
		"Vector databases enable semantic search capabilities for AI applications",
		"Text embeddings capture semantic meaning in numerical representations",
		"BM25 is a traditional keyword-based search algorithm",
		"Hybrid search combines semantic and keyword-based retrieval methods",
		"Large language models use transformer architectures for text understanding",
		"Information retrieval systems help users find relevant documents",
		"Natural language processing enables computers to understand human language",
		"Database systems store and retrieve structured information efficiently",
		"Search engines use ranking algorithms to order results by relevance",
	}

	// insert data - only the text is supplied; both function outputs are left to the server
	res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(collectionName).WithVarcharColumn("document", documents))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(len(documents)), res.InsertCount)

	// create indexes
	_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collectionName, "dense", index.NewAutoIndex(entity.COSINE)))
	common.CheckErr(t, err, true)
	_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collectionName, "sparse", index.NewSparseInvertedIndex(entity.BM25, 0.1)))
	common.CheckErr(t, err, true)

	// load collection
	loadTask, err := mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(collectionName))
	common.CheckErr(t, err, true)
	// LoadCollection hands back a task handle; wait on it so the searches
	// below cannot race against a load that is still in progress.
	err = loadTask.Await(ctx)
	common.CheckErr(t, err, true)

	// test 1: Dense vector search (TEI semantic search)
	t.Run("DenseVectorSearch", func(t *testing.T) {
		queryText := "machine learning artificial intelligence"
		searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 3, []entity.Vector{entity.Text(queryText)}).
			WithANNSField("dense").
			WithOutputFields("document"))
		common.CheckErr(t, err, true)
		require.Greater(t, len(searchRes), 0)
		for _, hits := range searchRes {
			require.Greater(t, hits.Len(), 0, "Should find semantically similar documents")
			t.Logf("Dense search found %d results for query: %s", hits.Len(), queryText)
		}
	})

	// test 2: Sparse vector search (BM25 keyword search)
	t.Run("SparseVectorSearch", func(t *testing.T) {
		queryText := "database systems"
		searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 3, []entity.Vector{entity.Text(queryText)}).
			WithANNSField("sparse").
			WithOutputFields("document"))
		common.CheckErr(t, err, true)
		require.Greater(t, len(searchRes), 0)
		for _, hits := range searchRes {
			require.Greater(t, hits.Len(), 0, "Should find keyword-matching documents")
			t.Logf("Sparse search found %d results for query: %s", hits.Len(), queryText)
		}
	})

	// test 3: Both search types work independently
	t.Run("IndependentSearches", func(t *testing.T) {
		queryText := "vector search"

		// Dense search
		denseRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, []entity.Vector{entity.Text(queryText)}).
			WithANNSField("dense").
			WithOutputFields("document"))
		common.CheckErr(t, err, true)

		// Sparse search
		sparseRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, []entity.Vector{entity.Text(queryText)}).
			WithANNSField("sparse").
			WithOutputFields("document"))
		common.CheckErr(t, err, true)

		// Both should return results
		require.Greater(t, len(denseRes), 0, "Dense search should return results")
		require.Greater(t, len(sparseRes), 0, "Sparse search should return results")

		// Tally hits while asserting, so the log line below reports how many
		// documents were found rather than how many result sets came back.
		denseHits, sparseHits := 0, 0
		for _, hits := range denseRes {
			require.Greater(t, hits.Len(), 0, "Dense search should find documents")
			denseHits += hits.Len()
		}
		for _, hits := range sparseRes {
			require.Greater(t, hits.Len(), 0, "Sparse search should find documents")
			sparseHits += hits.Len()
		}
		t.Logf("Dense search found %d results, Sparse search found %d results",
			denseHits, sparseHits)
	})
}
// TestInsertEmptyDocument checks that an insert batch containing an empty
// string in the embedding input field is rejected.
func TestInsertEmptyDocument(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Text-embedding collection with the default TEI-backed schema options.
	_, schema := hp.CollPrepare.CreateCollection(ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		hp.TNewTextEmbeddingFieldsOption(),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong))

	// The batch mixes one empty text with one ordinary text.
	batch := []string{"", "normal document"}
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", batch)
	_, insertErr := mc.Insert(ctx, insertOpt)

	// The insert is expected to fail because of the empty entry.
	common.CheckErr(t, insertErr, false, "TextEmbedding function does not support empty text")
}
// TestInsertLongDocument checks that inserting a very long document fails when
// the TEI function is configured with truncation disabled.
func TestInsertLongDocument(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// TEI function with "truncate" set to "false".
	teiParams := map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
		"truncate": "false",
	}
	teiFn := hp.TNewTextEmbeddingFunction("document", "dense", teiParams)
	schemaOpt := hp.TNewSchemaOption().TWithFunction(teiFn)
	fieldsOpt := hp.TNewFieldsOption().
		TWithDim(int64(hp.GetTEIModelDim())).
		TWithAutoID(true).
		TWithMaxLen(65535)

	_, schema := hp.CollPrepare.CreateCollection(ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding), fieldsOpt, schemaOpt,
		hp.TWithConsistencyLevel(entity.ClStrong))

	// A single oversized document generated by the helper.
	oversized := []string{hp.GenLongText(8192, "english")}
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", oversized)
	_, insertErr := mc.Insert(ctx, insertOpt)

	// With truncation off the insert is expected to fail.
	common.CheckErr(t, insertErr, false, "Call service failed")
}
// TestInvalidEndpointHandling runs one subtest per malformed or unreachable
// TEI endpoint and asserts that creating a collection with that endpoint fails
// with an error containing the expected fragment.
func TestInvalidEndpointHandling(t *testing.T) {
	type endpointCase struct {
		name     string
		endpoint string
		errMsg   string
	}
	cases := []endpointCase{
		{name: "NonExistentHost", endpoint: "http://nonexistent-host:8080", errMsg: "nonexistent-host"},
		{name: "InvalidPort", endpoint: "http://localhost:99999", errMsg: "99999"},
		{name: "InvalidProtocol", endpoint: "ftp://localhost:8080", errMsg: "ftp"},
		{name: "EmptyEndpoint", endpoint: "", errMsg: "endpoint"},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// Each subtest gets its own context and client.
			ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
			mc := hp.CreateDefaultMilvusClient(ctx, t)

			// TEI function pointing at the endpoint under test.
			teiFn := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
				"provider": "TEI",
				"endpoint": c.endpoint,
			})
			schemaOpt := hp.TNewSchemaOption().TWithFunction(teiFn)
			fieldsOpt := hp.TNewFieldsOption().
				TWithDim(int64(hp.GetTEIModelDim())).
				TWithAutoID(true).
				TWithMaxLen(65535)

			// Creation itself is expected to be rejected.
			name := common.GenRandomString("test_invalid", 6)
			schema := hp.GenSchema(schemaOpt.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOpt)))
			createErr := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(name, schema))
			common.CheckErr(t, createErr, false, c.errMsg)
			t.Logf("Expected error for %s: %v", c.name, createErr)
		})
	}
}
// TestMissingRequiredParameters runs one subtest per incomplete or invalid
// text-embedding parameter set and asserts that collection creation fails with
// an error containing the expected fragment.
func TestMissingRequiredParameters(t *testing.T) {
	// A single context and client are shared by all subtests.
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	type paramCase struct {
		name   string
		params map[string]any
		errMsg string
	}
	cases := []paramCase{
		{name: "MissingProvider", params: map[string]any{"endpoint": hp.GetTEIEndpoint()}, errMsg: "provider"},
		{name: "MissingEndpoint", params: map[string]any{"provider": "TEI"}, errMsg: "endpoint"},
		{name: "WrongProvider", params: map[string]any{"provider": "InvalidProvider", "endpoint": hp.GetTEIEndpoint()}, errMsg: "invalidprovider"},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// Build the function by hand so only the case's params are set.
			fn := entity.NewFunction().
				WithName("incomplete_func").
				WithInputFields("document").
				WithOutputFields("dense").
				WithType(entity.FunctionTypeTextEmbedding)
			for k, v := range c.params {
				fn.WithParam(k, v)
			}

			schemaOpt := hp.TNewSchemaOption().TWithFunction(fn)
			fieldsOpt := hp.TNewFieldsOption().
				TWithDim(int64(hp.GetTEIModelDim())).
				TWithAutoID(true).
				TWithMaxLen(65535)

			// Creation is expected to be rejected.
			name := common.GenRandomString("test_incomplete", 6)
			schema := hp.GenSchema(schemaOpt.TWithFields(hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOpt)))
			createErr := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(name, schema))
			common.CheckErr(t, createErr, false, c.errMsg)
			t.Logf("Expected error for %s: %v", c.name, createErr)
		})
	}
}
// TestConcurrentOperations issues inserts and then searches from several
// goroutines at once against a single text-embedding collection and asserts
// that every call returns without error.
func TestConcurrentOperations(t *testing.T) {
	// The context timeout is twice the default used elsewhere in this file.
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout*2)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		hp.TNewTextEmbeddingFieldsOption(),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong))

	// Index and load before any data is written.
	denseIndex := map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(denseIndex))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	t.Run("ConcurrentInserts", func(t *testing.T) {
		const (
			workers       = 5
			docsPerWorker = 5
		)

		// insertBatch builds one worker's documents and inserts them.
		insertBatch := func(worker int) error {
			batch := make([]string, 0, docsPerWorker)
			for d := 0; d < docsPerWorker; d++ {
				batch = append(batch, fmt.Sprintf("Concurrent document from routine %d, doc %d", worker, d))
			}
			_, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", batch))
			return err
		}

		// The channel is buffered to the worker count, so no sender blocks.
		errCh := make(chan error, workers)
		for w := 0; w < workers; w++ {
			go func(worker int) {
				errCh <- insertBatch(worker)
			}(w)
		}

		// Collect exactly one outcome per worker on the test goroutine.
		for received := 0; received < workers; received++ {
			require.NoError(t, <-errCh, "Concurrent insert should succeed")
		}
		t.Logf("Successfully completed %d concurrent inserts with %d documents each", workers, docsPerWorker)
	})

	t.Run("ConcurrentSearches", func(t *testing.T) {
		const workers = 3

		// searchOnce runs one text query; only the error is of interest.
		searchOnce := func(worker int) error {
			query := fmt.Sprintf("document routine %d", worker)
			_, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 5, []entity.Vector{entity.Text(query)}).
				WithANNSField("dense").
				WithOutputFields("document"))
			return err
		}

		errCh := make(chan error, workers)
		for w := 0; w < workers; w++ {
			go func(worker int) {
				errCh <- searchOnce(worker)
			}(w)
		}

		// Collect exactly one outcome per worker on the test goroutine.
		for received := 0; received < workers; received++ {
			require.NoError(t, <-errCh, "Concurrent search should succeed")
		}
		t.Logf("Successfully completed %d concurrent searches", workers)
	})
}