milvus/tests/go_client/testcases/text_embedding_test.go
2025-12-01 14:33:10 +08:00

975 lines
40 KiB
Go

package testcases
import (
"fmt"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/milvus-io/milvus/client/v2/column"
"github.com/milvus-io/milvus/client/v2/entity"
"github.com/milvus-io/milvus/client/v2/index"
"github.com/milvus-io/milvus/client/v2/milvusclient"
"github.com/milvus-io/milvus/tests/go_client/common"
hp "github.com/milvus-io/milvus/tests/go_client/testcases/helper"
)
// newTextEmbeddingFieldsOption builds the field options shared by the text
// embedding tests: common.MaxLength as the max length of the "document" field,
// the TEI model dimension for the "dense" field, and the requested auto-ID
// setting for the default int64 field.
func newTextEmbeddingFieldsOption(autoId bool) hp.FieldOptions {
	documentOpt := hp.TNewFieldsOption().TWithMaxLen(common.MaxLength)
	denseOpt := hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim()))
	int64Opt := hp.TNewFieldsOption().TWithAutoID(autoId)

	return hp.TNewFieldOptions().
		WithFieldOption("document", documentOpt).
		WithFieldOption("dense", denseOpt).
		WithFieldOption(common.DefaultInt64FieldName, int64Opt)
}
// TestCreateCollectionWithTextEmbedding creates a collection through the
// text-embedding schema helper and checks that DescribeCollection reports
// exactly one text-embedding function reading "document" and writing "dense".
func TestCreateCollectionWithTextEmbedding(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Create the collection using the helper-provided text embedding schema option.
	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		newTextEmbeddingFieldsOption(true),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong),
	)
	require.NotNil(t, prepare)
	require.NotNil(t, schema)

	// Read the schema back from the server and inspect the registered function.
	described, err := mc.DescribeCollection(ctx, milvusclient.NewDescribeCollectionOption(schema.CollectionName))
	common.CheckErr(t, err, true)

	functions := described.Schema.Functions
	require.Len(t, functions, 1)

	fn := functions[0]
	require.Equal(t, "document_text_emb", fn.Name)
	require.Equal(t, entity.FunctionTypeTextEmbedding, fn.Type)
	require.Equal(t, []string{"document"}, fn.InputFieldNames)
	require.Equal(t, []string{"dense"}, fn.OutputFieldNames)
}
// TestCreateCollectionWithTextEmbeddingTwice issues the same create-collection
// request twice under one collection name and then checks that the described
// schema carries exactly one function.
func TestCreateCollectionWithTextEmbeddingTwice(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// TEI-backed embedding function mapping "document" -> "dense".
	embeddingFn := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})
	schemaOption := hp.TNewSchemaOption().TWithFunction(embeddingFn)
	fieldsOption := newTextEmbeddingFieldsOption(true)
	collectionName := common.GenRandomString("text_embedding", 6)
	createParams := hp.NewCreateCollectionParams(hp.TextEmbedding)

	// Both attempts use identical arguments; the second one is expected to
	// succeed as well (creation with the same name and schema is idempotent).
	for attempt := 0; attempt < 2; attempt++ {
		prepare, schema := hp.CollPrepare.CreateCollection(
			ctx, t, mc, createParams, fieldsOption,
			schemaOption.TWithName(collectionName),
			hp.TWithConsistencyLevel(entity.ClStrong),
		)
		require.NotNil(t, prepare)
		require.NotNil(t, schema)
	}

	// The function must still be registered exactly once.
	described, err := mc.DescribeCollection(ctx, milvusclient.NewDescribeCollectionOption(collectionName))
	common.CheckErr(t, err, true)
	require.Len(t, described.Schema.Functions, 1)
}
// TestCreateCollectionUnsupportedEndpoint configures the TEI function with an
// endpoint that cannot be served and expects CreateCollection to return an
// error mentioning "unsupported_endpoint".
func TestCreateCollectionUnsupportedEndpoint(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// TEI function pointing at an invalid endpoint.
	function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": "http://unsupported_endpoint",
	})
	schemaOption := hp.TNewSchemaOption().TWithFunction(function)

	// Reuse the shared field options so this test stays in sync with the other
	// text embedding tests instead of duplicating the field definitions.
	fieldOpts := newTextEmbeddingFieldsOption(true)
	fields := hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldOpts)

	// The client is called directly (not through hp.CollPrepare) because the
	// creation itself is the operation under test and is expected to fail.
	err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(
		common.GenRandomString("text_embedding", 6),
		hp.GenSchema(schemaOption.TWithFields(fields)),
	))

	// Expect an error caused by the unsupported endpoint.
	common.CheckErr(t, err, false, "unsupported_endpoint")
}
// TestCreateCollectionUnmatchedDim declares the "dense" field with dimension
// 512 and expects CreateCollection to fail with the dimension-mismatch message
// built from that value and hp.GetTEIModelDim().
func TestCreateCollectionUnmatchedDim(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Deliberately wrong dimension for the dense field.
	// NOTE(review): this presumes the TEI model dimension is not 512 — confirm.
	wrongDim := int64(512)

	embeddingFn := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})
	schemaOption := hp.TNewSchemaOption().TWithFunction(embeddingFn)

	documentOpt := hp.TNewFieldsOption().TWithMaxLen(common.MaxLength)
	denseOpt := hp.TNewFieldsOption().TWithDim(wrongDim)
	int64Opt := hp.TNewFieldsOption().TWithAutoID(true)
	fieldsOption := hp.TNewFieldOptions().
		WithFieldOption("document", documentOpt).
		WithFieldOption("dense", denseOpt).
		WithFieldOption(common.DefaultInt64FieldName, int64Opt)

	collectionName := common.GenRandomString("text_embedding", 6)

	// Call the client directly: the creation is expected to be rejected.
	fields := hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldsOption)
	createOpt := milvusclient.NewCreateCollectionOption(collectionName, hp.GenSchema(schemaOption.TWithFields(fields)))
	err := mc.CreateCollection(ctx, createOpt)

	// The error must name both the declared and the model-provided dimension.
	expectedError := fmt.Sprintf("required embedding dim is [%d], but the embedding obtained from the model is [%d]", wrongDim, hp.GetTEIModelDim())
	common.CheckErr(t, err, false, expectedError)
}
// TestInsertWithTextEmbedding inserts rows that carry only the "document"
// text and verifies that a "dense" vector of the TEI model dimension is
// returned for every inserted row.
func TestInsertWithTextEmbedding(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create collection with TEI function
	prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), newTextEmbeddingFieldsOption(true), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong))

	// prepare test data - only the text is provided, no vectors
	nb := 10
	documents := make([]string, nb)
	for i := 0; i < nb; i++ {
		documents[i] = fmt.Sprintf("This is test document number %d with some content for embedding", i)
	}

	// insert data using only text field
	res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(nb), res.InsertCount)

	// create index and load
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// query to verify vectors were generated; strong consistency is requested
	// explicitly so that all rows inserted above are expected in the result
	resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
		WithFilter("").
		WithOutputFields("dense").
		WithConsistencyLevel(entity.ClStrong).
		WithLimit(10))
	common.CheckErr(t, err, true)
	require.Greater(t, len(resQuery.Fields), 0)

	// Every inserted row must come back. Previously the vector check was
	// guarded by `if resQuery.Len() > 0`, so an empty result passed silently.
	require.Equal(t, nb, resQuery.Len(), "Query should return every inserted document")

	// verify the generated vectors: column type and per-row dimension
	denseColumn := resQuery.GetColumn("dense")
	require.NotNil(t, denseColumn)
	floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector)
	require.True(t, ok, "Dense column should be a float vector column")
	for i, embedding := range floatVecColumn.Data() {
		require.Equal(t, hp.GetTEIModelDim(), len(embedding), "Embedding %d should have correct dimension", i)
	}
}
// TestInsertWithTruncateParams inserts over-long text through TEI functions
// configured with different truncate settings.
//
// With truncation enabled the insert must succeed, and the embedding of the
// combined text must be closer (by cosine similarity) to the part that the
// configured truncation direction keeps. With truncation disabled the insert
// is expected to fail with "Payload Too Large".
func TestInsertWithTruncateParams(t *testing.T) {
	testCases := []struct {
		name                string
		truncate            bool
		truncationDirection string
		shouldSucceed       bool
	}{
		{"truncate_true_right", true, "Right", true},
		{"truncate_true_left", true, "Left", true},
		{"truncate_false", false, "", false}, // should fail with long text
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
			mc := hp.CreateDefaultMilvusClient(ctx, t)

			// create TEI function with truncate parameters
			params := map[string]any{
				"provider": "TEI",
				"endpoint": hp.GetTEIEndpoint(),
			}
			if tc.truncate {
				params["truncate"] = "true"
				params["truncation_direction"] = tc.truncationDirection
			} else {
				params["truncate"] = "false"
			}
			function := hp.TNewTextEmbeddingFunction("document", "dense", params)
			schemaOption := hp.TNewSchemaOption().TWithFunction(function)
			fieldsOption := newTextEmbeddingFieldsOption(true)
			_, schema := hp.CollPrepare.CreateCollection(
				ctx, t, mc,
				hp.NewCreateCollectionParams(hp.TextEmbedding),
				fieldsOption,
				schemaOption,
				hp.TWithConsistencyLevel(entity.ClStrong),
			)

			// Prepare long text that is intended to need truncation: two
			// distinctly different halves that together exceed the token limit.
			leftPart := "artificial intelligence machine learning deep learning neural networks computer vision natural language processing data science algorithms " + strings.Repeat("technology innovation science research development analysis ", 100)
			rightPart := "database systems vector search embeddings similarity matching retrieval information storage indexing " + strings.Repeat("query performance optimization scalability distributed computing ", 100)
			longText := leftPart + " " + rightPart
			documents := []string{longText, leftPart, rightPart}

			// insert data
			res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
			if !tc.shouldSucceed {
				common.CheckErr(t, err, false, "Payload Too Large")
				return
			}
			common.CheckErr(t, err, true)
			require.Equal(t, int64(len(documents)), res.InsertCount)

			// create index and load for embedding comparison
			_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(schema.CollectionName, "dense", index.NewAutoIndex(entity.COSINE)))
			common.CheckErr(t, err, true)
			_, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(schema.CollectionName))
			common.CheckErr(t, err, true)

			// Query embeddings (and the source text) from Milvus
			resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
				WithFilter("").
				WithOutputFields("dense", "document").
				WithConsistencyLevel(entity.ClStrong).
				WithLimit(10))
			common.CheckErr(t, err, true)
			require.Equal(t, len(documents), resQuery.Len(), "Should have 3 documents: longText, leftPart, rightPart")

			denseColumn := resQuery.GetColumn("dense")
			require.NotNil(t, denseColumn)
			floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector)
			require.True(t, ok, "Dense column should be a float vector column")

			docColumn := resQuery.GetColumn("document")
			require.NotNil(t, docColumn)
			varCharColumn, ok := docColumn.(*column.ColumnVarChar)
			require.True(t, ok, "Document column should be a varchar column")

			// Key every embedding by the text it was generated from. The query
			// has no ordering clause, so row position must not be used to decide
			// which embedding belongs to longText, leftPart or rightPart.
			texts := varCharColumn.Data()
			vectors := floatVecColumn.Data()
			embeddingByText := make(map[string][]float32, len(documents))
			for i := 0; i < resQuery.Len(); i++ {
				embedding := vectors[i]
				require.Equal(t, hp.GetTEIModelDim(), len(embedding), "Embedding should have correct dimension")
				// Check that embedding is not all zeros (would indicate a failure)
				var sum float32
				for _, val := range embedding {
					sum += val * val
				}
				require.Greater(t, sum, float32(0.01), "Embedding should not be all zeros for document %d", i)
				embeddingByText[texts[i]] = embedding
			}

			longEmbedding, ok := embeddingByText[longText]
			require.True(t, ok, "Query result should contain longText")
			leftEmbedding, ok := embeddingByText[leftPart]
			require.True(t, ok, "Query result should contain leftPart")
			rightEmbedding, ok := embeddingByText[rightPart]
			require.True(t, ok, "Query result should contain rightPart")

			// Truncation validation by similarity comparison:
			// similarityLeft  = longText vs leftPart
			// similarityRight = longText vs rightPart
			similarityLeft := hp.CosineSimilarity(longEmbedding, leftEmbedding)
			similarityRight := hp.CosineSimilarity(longEmbedding, rightEmbedding)
			t.Logf("Similarity longText vs leftPart: %.6f", similarityLeft)
			t.Logf("Similarity longText vs rightPart: %.6f", similarityRight)

			// - truncation_direction = "Left" keeps the right part, so longText
			//   should be more similar to rightPart
			// - truncation_direction = "Right" keeps the left part, so longText
			//   should be more similar to leftPart
			switch tc.truncationDirection {
			case "Left":
				require.Greater(t, similarityRight, similarityLeft,
					"With Left truncation, longText should be more similar to rightPart (%.6f) than leftPart (%.6f)",
					similarityRight, similarityLeft)
				t.Logf("Left truncation verified: rightPart similarity (%.6f) > leftPart similarity (%.6f)",
					similarityRight, similarityLeft)
			default: // "Right"
				require.Greater(t, similarityLeft, similarityRight,
					"With Right truncation, longText should be more similar to leftPart (%.6f) than rightPart (%.6f)",
					similarityLeft, similarityRight)
				t.Logf("Right truncation verified: leftPart similarity (%.6f) > rightPart similarity (%.6f)",
					similarityLeft, similarityRight)
			}
			t.Logf("Successfully inserted %d documents with truncate=%v, direction=%s", len(documents), tc.truncate, tc.truncationDirection)
		})
	}
}
// TestVerifyEmbeddingConsistency inserts documents with explicit IDs, reads
// the generated "dense" vectors back, and compares each of them with the
// embedding obtained by calling the TEI endpoint directly for the same text.
// The test is skipped when the direct TEI call returns an error.
func TestVerifyEmbeddingConsistency(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create collection with TEI function (custom fields for autoID=false)
	function := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})
	schemaOption := hp.TNewSchemaOption().TWithFunction(function)
	fieldsOption := newTextEmbeddingFieldsOption(false)
	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		fieldsOption,
		schemaOption,
		hp.TWithConsistencyLevel(entity.ClStrong),
	)

	// Test documents
	testDocs := []string{
		"This is a test document about artificial intelligence",
		"Vector databases enable semantic search capabilities",
		"Text embeddings transform language into numbers",
	}

	// Insert documents into Milvus; IDs are 1-based so ID n maps to testDocs[n-1]
	ids := []int64{1, 2, 3}
	res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithInt64Column(common.DefaultInt64FieldName, ids).
		WithVarcharColumn("document", testDocs))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(len(testDocs)), res.InsertCount)

	// Create index and load
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// Query vectors from Milvus
	resQuery, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
		WithFilter("").
		WithOutputFields("dense", "document", common.DefaultInt64FieldName).
		WithConsistencyLevel(entity.ClStrong).
		WithLimit(10))
	common.CheckErr(t, err, true)
	require.Equal(t, len(testDocs), resQuery.Len())

	// Get embeddings directly from TEI. Skipf stops the test, so no explicit
	// return is needed after it.
	teiEmbeddings, err := hp.CallTEIDirectly(hp.GetTEIEndpoint(), testDocs)
	if err != nil {
		t.Skipf("Skip consistency test - could not connect to TEI endpoint: %v", err)
	}
	require.Equal(t, len(testDocs), len(teiEmbeddings))

	// Compare embeddings
	denseColumn := resQuery.GetColumn("dense")
	require.NotNil(t, denseColumn)

	// Get ID column to match embeddings with documents
	idColumn := resQuery.GetColumn(common.DefaultInt64FieldName)
	require.NotNil(t, idColumn)

	floatVecColumn, ok := denseColumn.(*column.ColumnFloatVector)
	require.True(t, ok, "Dense column should be a float vector column")

	for i := 0; i < resQuery.Len(); i++ {
		// Get ID to find corresponding TEI embedding
		id, err := idColumn.GetAsInt64(i)
		require.NoError(t, err)
		teiIdx := id - 1 // IDs are 1-based, array is 0-based

		// Fail with a message instead of panicking on an out-of-range index
		// should the server ever return an ID that was not inserted.
		require.True(t, teiIdx >= 0 && teiIdx < int64(len(teiEmbeddings)),
			"Unexpected document ID %d in query result", id)

		// Get Milvus embedding from the float vector column
		milvusEmbedding := floatVecColumn.Data()[i]
		require.NotNil(t, milvusEmbedding)
		require.Equal(t, hp.GetTEIModelDim(), len(milvusEmbedding), "Embedding dimension should match")

		// Calculate cosine similarity
		similarity := hp.CosineSimilarity(milvusEmbedding, teiEmbeddings[teiIdx])
		t.Logf("Document %d (ID=%d) similarity between Milvus and TEI: %.6f", i, id, similarity)

		// Embeddings should be nearly identical (similarity > 0.99)
		require.Greater(t, similarity, float32(0.99),
			"Milvus embedding should be nearly identical to TEI embedding for document ID %d", id)
	}
	t.Log("Embedding consistency verified: Milvus text embedding function produces same results as direct TEI calls")
}
// TestUpsertTextFieldUpdatesEmbedding inserts one row with ID 1, upserts the
// same ID with different text, and checks that the stored text changed and
// that the cosine similarity between the old and new "dense" vectors is
// below 0.95.
func TestUpsertTextFieldUpdatesEmbedding(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Collection with a TEI function and caller-supplied IDs (autoID=false),
	// which the upsert below relies on.
	embeddingFn := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})
	schemaOption := hp.TNewSchemaOption().TWithFunction(embeddingFn)
	fieldsOption := newTextEmbeddingFieldsOption(false)
	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		fieldsOption,
		schemaOption,
		hp.TWithConsistencyLevel(entity.ClStrong),
	)

	// Index and load before any data is written.
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// fetchRow queries the row with ID 1 and returns its stored text and
	// embedding; dimMsg is the failure message for the dimension assertion.
	fetchRow := func(dimMsg string) (string, []float32) {
		rows, err := mc.Query(ctx, milvusclient.NewQueryOption(schema.CollectionName).
			WithFilter("int64 == 1").
			WithOutputFields("document", "dense").
			WithConsistencyLevel(entity.ClStrong))
		common.CheckErr(t, err, true)
		require.Equal(t, 1, rows.Len())

		denseCol := rows.GetColumn("dense")
		require.NotNil(t, denseCol)
		vecCol, isVec := denseCol.(*column.ColumnFloatVector)
		require.True(t, isVec, "Dense column should be a float vector column")
		embedding := vecCol.Data()[0]
		require.Equal(t, hp.GetTEIModelDim(), len(embedding), dimMsg)

		docCol := rows.GetColumn("document")
		require.NotNil(t, docCol)
		textCol, isText := docCol.(*column.ColumnVarChar)
		require.True(t, isText, "Document column should be a varchar column")
		return textCol.Data()[0], embedding
	}

	// Initial insert with a fixed ID.
	oldText := "This is the original text content"
	insertRes, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithInt64Column(common.DefaultInt64FieldName, []int64{1}).
		WithVarcharColumn("document", []string{oldText}))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(1), insertRes.InsertCount)

	// State before the upsert.
	storedText, originalEmbedding := fetchRow("Original embedding dimension should match")
	require.Equal(t, oldText, storedText, "Original text should match")

	// Upsert the same ID with new text.
	newText := "This is completely different updated text content"
	upsertRes, err := mc.Upsert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithInt64Column(common.DefaultInt64FieldName, []int64{1}).
		WithVarcharColumn("document", []string{newText}))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(1), upsertRes.UpsertCount)

	// State after the upsert.
	storedText, updatedEmbedding := fetchRow("Updated embedding dimension should match")
	require.Equal(t, newText, storedText, "Updated text should match")

	// Key assertion: the embedding must have changed along with the text.
	similarity := hp.CosineSimilarity(originalEmbedding, updatedEmbedding)
	require.Less(t, similarity, float32(0.95),
		"Embeddings should be significantly different after text update (similarity=%.6f)", similarity)
	t.Logf("Upsert verification complete: Original and updated embeddings have cosine similarity %.6f (< 0.95)", similarity)
	t.Logf(" Original text: %s", oldText)
	t.Logf(" Updated text: %s", newText)
}
// TestDeleteAndSearch inserts three documents with IDs 0..2, deletes ID 1,
// and checks that a text search over "dense" never returns the deleted ID.
func TestDeleteAndSearch(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Collection with a TEI function and caller-supplied IDs (autoID=false).
	embeddingFn := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})
	schemaOption := hp.TNewSchemaOption().TWithFunction(embeddingFn)
	fieldsOption := newTextEmbeddingFieldsOption(false)
	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		fieldsOption,
		schemaOption,
		hp.TWithConsistencyLevel(entity.ClStrong),
	)

	// Rows: ID i paired with "This is test document i".
	const rowCount = 3
	ids := make([]int64, rowCount)
	documents := make([]string, rowCount)
	for i := range ids {
		ids[i] = int64(i)
		documents[i] = fmt.Sprintf("This is test document %d", i)
	}
	insertRes, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithInt64Column(common.DefaultInt64FieldName, ids).
		WithVarcharColumn("document", documents))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(rowCount), insertRes.InsertCount)

	// Index and load so the collection can be searched.
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// Remove the row with ID 1.
	deleteRes, err := mc.Delete(ctx, milvusclient.NewDeleteOption(schema.CollectionName).WithExpr("int64 in [1]"))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(1), deleteRes.DeleteCount)

	// Search with the deleted document's text as the query.
	queryVectors := []entity.Vector{entity.Text("test document 1")}
	searchOpt := milvusclient.NewSearchOption(schema.CollectionName, 3, queryVectors).
		WithANNSField("dense").
		WithOutputFields("document", common.DefaultInt64FieldName)
	searchRes, err := mc.Search(ctx, searchOpt)
	common.CheckErr(t, err, true)

	// No hit in any result set may carry the deleted ID.
	require.Greater(t, len(searchRes), 0)
	for _, hits := range searchRes {
		for row := 0; row < hits.Len(); row++ {
			hitID, err := hits.IDs.GetAsInt64(row)
			require.NoError(t, err)
			require.NotEqual(t, int64(1), hitID, "Deleted document should not appear in search results")
		}
	}
}
// TestSearchWithTextEmbedding inserts ten text-only rows and runs a text
// query against the "dense" field, checking that hits are returned, that the
// limit of 5 is respected, and that every score is positive.
func TestSearchWithTextEmbedding(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create -> insert -> index -> load
	prepare, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		newTextEmbeddingFieldsOption(true),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong),
	)

	// Test data: only the text column is supplied.
	const rowCount = 10
	documents := make([]string, rowCount)
	for i := range documents {
		documents[i] = fmt.Sprintf("This is test document number %d about artificial intelligence and machine learning", i)
	}
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents)
	insertRes, err := mc.Insert(ctx, insertOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, int64(rowCount), insertRes.InsertCount)

	// Index and load.
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// Search with raw text as the query vector.
	queryText := "artificial intelligence machine learning"
	searchOpt := milvusclient.NewSearchOption(schema.CollectionName, 5, []entity.Vector{entity.Text(queryText)}).
		WithANNSField("dense").
		WithOutputFields("document")
	searchRes, err := mc.Search(ctx, searchOpt)
	common.CheckErr(t, err, true)
	require.Greater(t, len(searchRes), 0)

	for _, hits := range searchRes {
		hitCount := hits.Len()
		require.Greater(t, hitCount, 0, "Should find relevant documents")
		require.LessOrEqual(t, hitCount, 5, "Should respect limit")

		// Only the sign of each score is checked here.
		for _, score := range hits.Scores[:hitCount] {
			require.Greater(t, score, float32(0), "Score should be positive")
		}
	}
}
// TestSearchWithEmptyQuery verifies that searching the "dense" field with an
// empty text query is rejected with the "does not support empty text" error.
func TestSearchWithEmptyQuery(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Collection with the helper-provided TEI schema option.
	_, schema := hp.CollPrepare.CreateCollection(
		ctx, t, mc,
		hp.NewCreateCollectionParams(hp.TextEmbedding),
		newTextEmbeddingFieldsOption(true),
		hp.TNewTextEmbeddingSchemaOption(),
		hp.TWithConsistencyLevel(entity.ClStrong),
	)
	collName := schema.CollectionName

	// One row is enough; the search below is expected to fail regardless.
	insertOpt := milvusclient.NewColumnBasedInsertOption(collName).WithVarcharColumn("document", []string{"test document"})
	insertRes, err := mc.Insert(ctx, insertOpt)
	common.CheckErr(t, err, true)
	require.Equal(t, int64(1), insertRes.InsertCount)

	// Index and load through the client directly.
	_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collName, "dense", index.NewAutoIndex(entity.COSINE)))
	common.CheckErr(t, err, true)
	_, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(collName))
	common.CheckErr(t, err, true)

	// An empty text query must be rejected.
	emptyQuery := []entity.Vector{entity.Text("")}
	_, err = mc.Search(ctx, milvusclient.NewSearchOption(collName, 3, emptyQuery).WithANNSField("dense"))
	common.CheckErr(t, err, false, "TextEmbedding function does not support empty text")
}
// TestHybridSearchTextEmbeddingBM25 builds a collection whose "document"
// field feeds two functions — a TEI text embedding writing "dense" and a BM25
// function writing "sparse" — and verifies that text searches against each
// vector field return hits. The two searches are run separately; no combined
// hybrid-search request is issued here.
func TestHybridSearchTextEmbeddingBM25(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create collection with both TEI text embedding and BM25 functions
	collectionName := common.GenRandomString("hybrid_search", 6)

	// create fields manually to support both dense and sparse vectors
	fields := []*entity.Field{
		entity.NewField().WithName(common.DefaultInt64FieldName).WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true).WithIsAutoID(true),
		entity.NewField().WithName("document").WithDataType(entity.FieldTypeVarChar).WithMaxLength(65535).WithEnableAnalyzer(true).WithAnalyzerParams(map[string]any{"tokenizer": "standard"}),
		entity.NewField().WithName("dense").WithDataType(entity.FieldTypeFloatVector).WithDim(int64(hp.GetTEIModelDim())),
		entity.NewField().WithName("sparse").WithDataType(entity.FieldTypeSparseVector),
	}

	// create TEI text embedding function
	teiFunction := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
	})

	// create BM25 function
	bm25Function := hp.TNewBM25Function("document", "sparse")

	// create schema with both functions
	schema := entity.NewSchema().
		WithName(collectionName).
		WithDescription("Hybrid search collection with TEI and BM25").
		WithFunction(teiFunction).
		WithFunction(bm25Function)
	for _, field := range fields {
		schema.WithField(field)
	}

	// create collection
	err := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(collectionName, schema).WithConsistencyLevel(entity.ClStrong))
	common.CheckErr(t, err, true)

	// The collection is created through the client directly, so nothing else
	// in this test removes it. Drop it when the test (including its subtests)
	// finishes; a failure here is only logged because it does not affect the
	// behavior under test.
	t.Cleanup(func() {
		if dropErr := mc.DropCollection(ctx, milvusclient.NewDropCollectionOption(collectionName)); dropErr != nil {
			t.Logf("failed to drop collection %s: %v", collectionName, dropErr)
		}
	})

	// insert test data with diverse content
	documents := []string{
		"Artificial intelligence and machine learning are transforming technology",
		"Vector databases enable semantic search capabilities for AI applications",
		"Text embeddings capture semantic meaning in numerical representations",
		"BM25 is a traditional keyword-based search algorithm",
		"Hybrid search combines semantic and keyword-based retrieval methods",
		"Large language models use transformer architectures for text understanding",
		"Information retrieval systems help users find relevant documents",
		"Natural language processing enables computers to understand human language",
		"Database systems store and retrieve structured information efficiently",
		"Search engines use ranking algorithms to order results by relevance",
	}

	// insert data - only text is supplied; both vector fields are function outputs
	res, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(collectionName).WithVarcharColumn("document", documents))
	common.CheckErr(t, err, true)
	require.Equal(t, int64(len(documents)), res.InsertCount)

	// create indexes
	_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collectionName, "dense", index.NewAutoIndex(entity.COSINE)))
	common.CheckErr(t, err, true)
	_, err = mc.CreateIndex(ctx, milvusclient.NewCreateIndexOption(collectionName, "sparse", index.NewSparseInvertedIndex(entity.BM25, 0.1)))
	common.CheckErr(t, err, true)

	// load collection
	_, err = mc.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(collectionName))
	common.CheckErr(t, err, true)

	// test 1: Dense vector search (TEI semantic search)
	t.Run("DenseVectorSearch", func(t *testing.T) {
		queryText := "machine learning artificial intelligence"
		searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 3, []entity.Vector{entity.Text(queryText)}).
			WithANNSField("dense").
			WithOutputFields("document"))
		common.CheckErr(t, err, true)
		require.Greater(t, len(searchRes), 0)
		for _, hits := range searchRes {
			require.Greater(t, hits.Len(), 0, "Should find semantically similar documents")
			t.Logf("Dense search found %d results for query: %s", hits.Len(), queryText)
		}
	})

	// test 2: Sparse vector search (BM25 keyword search)
	t.Run("SparseVectorSearch", func(t *testing.T) {
		queryText := "database systems"
		searchRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 3, []entity.Vector{entity.Text(queryText)}).
			WithANNSField("sparse").
			WithOutputFields("document"))
		common.CheckErr(t, err, true)
		require.Greater(t, len(searchRes), 0)
		for _, hits := range searchRes {
			require.Greater(t, hits.Len(), 0, "Should find keyword-matching documents")
			t.Logf("Sparse search found %d results for query: %s", hits.Len(), queryText)
		}
	})

	// test 3: Both search types work independently for the same query text
	t.Run("IndependentSearches", func(t *testing.T) {
		queryText := "vector search"

		// Dense search
		denseRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, []entity.Vector{entity.Text(queryText)}).
			WithANNSField("dense").
			WithOutputFields("document"))
		common.CheckErr(t, err, true)

		// Sparse search
		sparseRes, err := mc.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, []entity.Vector{entity.Text(queryText)}).
			WithANNSField("sparse").
			WithOutputFields("document"))
		common.CheckErr(t, err, true)

		// Both should return results
		require.Greater(t, len(denseRes), 0, "Dense search should return results")
		require.Greater(t, len(sparseRes), 0, "Sparse search should return results")

		// Count hits rather than result sets so the log below reports how many
		// documents each search actually found.
		denseHits, sparseHits := 0, 0
		for _, hits := range denseRes {
			require.Greater(t, hits.Len(), 0, "Dense search should find documents")
			denseHits += hits.Len()
		}
		for _, hits := range sparseRes {
			require.Greater(t, hits.Len(), 0, "Sparse search should find documents")
			sparseHits += hits.Len()
		}
		t.Logf("Dense search found %d results, Sparse search found %d results",
			denseHits, sparseHits)
	})
}
// TestInsertEmptyDocument checks that inserting a batch which contains an
// empty document into a text embedding collection is reported as an error.
func TestInsertEmptyDocument(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// Collection with the TEI function attached.
	createParams := hp.NewCreateCollectionParams(hp.TextEmbedding)
	fieldOpts := newTextEmbeddingFieldsOption(true)
	schemaOpt := hp.TNewTextEmbeddingSchemaOption()
	_, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldOpts, schemaOpt, hp.TWithConsistencyLevel(entity.ClStrong))

	// The first row is the empty string, the second is an ordinary document.
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithVarcharColumn("document", []string{"", "normal document"})
	_, insertErr := mc.Insert(ctx, insertOpt)

	// The insert is expected to fail with the empty-text error message.
	common.CheckErr(t, insertErr, false, "TextEmbedding function does not support empty text")
}
// TestInsertLongDocument checks that a very long document is reported as an
// error when the TEI function is created with "truncate" set to "false".
func TestInsertLongDocument(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// TEI function mapping "document" to "dense" with truncation turned off.
	embedFn := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
		"provider": "TEI",
		"endpoint": hp.GetTEIEndpoint(),
		"truncate": "false",
	})
	schemaOpt := hp.TNewSchemaOption().TWithFunction(embedFn)
	fieldOpts := newTextEmbeddingFieldsOption(true)
	createParams := hp.NewCreateCollectionParams(hp.TextEmbedding)
	_, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, createParams, fieldOpts, schemaOpt, hp.TWithConsistencyLevel(entity.ClStrong))

	// Single-row batch holding one long generated text, intended to exceed
	// the model's input limit.
	docs := []string{hp.GenLongText(8192, "english")}
	insertOpt := milvusclient.NewColumnBasedInsertOption(schema.CollectionName).
		WithVarcharColumn("document", docs)
	_, insertErr := mc.Insert(ctx, insertOpt)

	// With truncation disabled the insert is expected to fail.
	common.CheckErr(t, insertErr, false, "Call service failed")
}
// TestInvalidEndpointHandling creates a collection with a TEI function that
// points at an unusable endpoint and expects creation to fail with an error
// mentioning the offending part of the endpoint.
func TestInvalidEndpointHandling(t *testing.T) {
	type endpointCase struct {
		name     string // subtest name
		endpoint string // endpoint handed to the TEI function
		errMsg   string // text the returned error is expected to contain
	}
	cases := []endpointCase{
		{name: "NonExistentHost", endpoint: "http://nonexistent-host:8080", errMsg: "nonexistent-host"},
		{name: "InvalidPort", endpoint: "http://localhost:99999", errMsg: "99999"},
		{name: "InvalidProtocol", endpoint: "ftp://localhost:8080", errMsg: "ftp"},
		{name: "EmptyEndpoint", endpoint: "", errMsg: "endpoint"},
	}

	for _, ec := range cases {
		t.Run(ec.name, func(t *testing.T) {
			// Each subtest gets its own context and client.
			ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
			mc := hp.CreateDefaultMilvusClient(ctx, t)

			// TEI function configured with the endpoint under test.
			embedFn := hp.TNewTextEmbeddingFunction("document", "dense", map[string]any{
				"provider": "TEI",
				"endpoint": ec.endpoint,
			})
			schemaOpt := hp.TNewSchemaOption().TWithFunction(embedFn)

			fieldOpts := hp.TNewFieldOptions().
				WithFieldOption("document", hp.TNewFieldsOption().TWithMaxLen(common.MaxLength)).
				WithFieldOption("dense", hp.TNewFieldsOption().TWithDim(int64(hp.GetTEIModelDim()))).
				WithFieldOption(common.DefaultInt64FieldName, hp.TNewFieldsOption().TWithAutoID(true))

			// Build the schema step by step, then attempt the creation.
			collName := common.GenRandomString("test_invalid", 6)
			fields := hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldOpts)
			collSchema := hp.GenSchema(schemaOpt.TWithFields(fields))
			createErr := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(collName, collSchema))

			// Creation is expected to fail for every invalid endpoint.
			common.CheckErr(t, createErr, false, ec.errMsg)
			t.Logf("Expected error for %s: %v", ec.name, createErr)
		})
	}
}
// TestMissingRequiredParameters creates collections whose text embedding
// function lacks a required parameter (or names an unknown provider) and
// expects creation to fail with an error mentioning the problem.
func TestMissingRequiredParameters(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout)
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	type paramCase struct {
		name   string         // subtest name
		params map[string]any // parameters set on the function
		errMsg string         // text the returned error is expected to contain
	}
	cases := []paramCase{
		{name: "MissingProvider", params: map[string]any{"endpoint": hp.GetTEIEndpoint()}, errMsg: "provider"},
		{name: "MissingEndpoint", params: map[string]any{"provider": "TEI"}, errMsg: "endpoint"},
		{name: "WrongProvider", params: map[string]any{"provider": "InvalidProvider", "endpoint": hp.GetTEIEndpoint()}, errMsg: "invalidprovider"},
	}

	for _, pc := range cases {
		t.Run(pc.name, func(t *testing.T) {
			// Assemble the function by hand so that only the parameters
			// listed in the case are set.
			embedFn := entity.NewFunction().
				WithName("incomplete_func").
				WithInputFields("document").
				WithOutputFields("dense").
				WithType(entity.FunctionTypeTextEmbedding)
			for k, v := range pc.params {
				embedFn.WithParam(k, v)
			}

			schemaOpt := hp.TNewSchemaOption().TWithFunction(embedFn)
			fieldOpts := newTextEmbeddingFieldsOption(true)

			// Build the schema step by step, then attempt the creation.
			collName := common.GenRandomString("test_incomplete", 6)
			fields := hp.FieldsFact.GenFieldsForCollection(hp.TextEmbedding, fieldOpts)
			collSchema := hp.GenSchema(schemaOpt.TWithFields(fields))
			createErr := mc.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(collName, collSchema))

			// Creation is expected to fail for every incomplete configuration.
			common.CheckErr(t, createErr, false, pc.errMsg)
			t.Logf("Expected error for %s: %v", pc.name, createErr)
		})
	}
}
// TestConcurrentOperations runs inserts and searches against a text embedding
// collection from several goroutines at once and expects all of them to
// succeed.
//
// Each subtest first collects the result of every goroutine it started and
// only then asserts on them. Asserting inside the receive loop would abort on
// the first failure (require calls t.FailNow) while sibling goroutines are
// still issuing requests through the shared client and context.
func TestConcurrentOperations(t *testing.T) {
	ctx := hp.CreateContext(t, time.Second*common.DefaultTimeout*2) // longer timeout for concurrent ops
	mc := hp.CreateDefaultMilvusClient(ctx, t)

	// create collection with TEI function
	prepare, schema := hp.CollPrepare.CreateCollection(ctx, t, mc, hp.NewCreateCollectionParams(hp.TextEmbedding), newTextEmbeddingFieldsOption(true), hp.TNewTextEmbeddingSchemaOption(), hp.TWithConsistencyLevel(entity.ClStrong))

	// create index and load
	prepare.CreateIndex(ctx, t, mc, hp.TNewIndexParams(schema).TWithFieldIndex(map[string]index.Index{"dense": index.NewAutoIndex(entity.COSINE)}))
	prepare.Load(ctx, t, mc, hp.NewLoadParams(schema.CollectionName))

	// concurrent inserts
	t.Run("ConcurrentInserts", func(t *testing.T) {
		numRoutines := 5
		documentsPerRoutine := 5

		// Buffered to numRoutines so no sender can block on the channel.
		results := make(chan error, numRoutines)
		for i := 0; i < numRoutines; i++ {
			go func(routineID int) {
				documents := make([]string, documentsPerRoutine)
				for j := 0; j < documentsPerRoutine; j++ {
					documents[j] = fmt.Sprintf("Concurrent document from routine %d, doc %d", routineID, j)
				}
				_, err := mc.Insert(ctx, milvusclient.NewColumnBasedInsertOption(schema.CollectionName).WithVarcharColumn("document", documents))
				results <- err
			}(i)
		}

		// Wait for every goroutine to finish before asserting anything.
		errs := make([]error, 0, numRoutines)
		for i := 0; i < numRoutines; i++ {
			errs = append(errs, <-results)
		}
		for _, err := range errs {
			require.NoError(t, err, "Concurrent insert should succeed")
		}
		t.Logf("Successfully completed %d concurrent inserts with %d documents each", numRoutines, documentsPerRoutine)
	})

	// concurrent searches
	t.Run("ConcurrentSearches", func(t *testing.T) {
		numRoutines := 3

		// Buffered to numRoutines so no sender can block on the channel.
		results := make(chan error, numRoutines)
		for i := 0; i < numRoutines; i++ {
			go func(routineID int) {
				queryText := fmt.Sprintf("document routine %d", routineID)
				_, err := mc.Search(ctx, milvusclient.NewSearchOption(schema.CollectionName, 5, []entity.Vector{entity.Text(queryText)}).
					WithANNSField("dense").
					WithOutputFields("document"))
				results <- err
			}(i)
		}

		// Wait for every goroutine to finish before asserting anything.
		errs := make([]error, 0, numRoutines)
		for i := 0; i < numRoutines; i++ {
			errs = append(errs, <-results)
		}
		for _, err := range errs {
			require.NoError(t, err, "Concurrent search should succeed")
		}
		t.Logf("Successfully completed %d concurrent searches", numRoutines)
	})
}