milvus/tests/restful_client_v2/testcases/test_text_embedding_search.py
zhuwenxing 8bda337f52
test: add text embedding function restful api test (#42977)
/kind improvement

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
2025-06-27 10:46:41 +08:00

1035 lines
40 KiB
Python

import pytest
import numpy as np
from faker import Faker
from base.testbase import TestBase
from utils.utils import gen_collection_name
from utils.util_log import test_log as logger
fake_en = Faker("en_US")
prefix = "text_embedding_search"
@pytest.mark.L0
class TestTextEmbeddingSearch(TestBase):
"""
******************************************************************
The following cases are used to test text embedding function search via RESTful API
******************************************************************
"""
def _create_basic_collection_payload(self, name, tei_endpoint, dim=768, with_bm25=False):
"""Helper method to create basic collection payload with TEI function"""
fields = [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
]
functions = [{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint
}
}]
if with_bm25:
fields[1]["elementTypeParams"].update({
"enable_analyzer": True,
"analyzer_params": {"tokenizer": "standard"},
"enable_match": True
})
fields.append({"fieldName": "sparse", "dataType": "SparseFloatVector"})
functions.append({
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document"],
"outputFieldNames": ["sparse"],
"params": {}
})
return {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": fields,
"functions": functions
}
}
def _create_and_verify_collection(self, name, tei_endpoint, dim=768, with_bm25=False):
"""Helper method to create collection and verify creation"""
payload = self._create_basic_collection_payload(name, tei_endpoint, dim, with_bm25)
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0, f"Collection creation failed: {rsp}"
# Verify collection was created
rsp = self.collection_client.collection_describe(name)
assert rsp['code'] == 0, f"Collection describe failed: {rsp}"
assert rsp['data']['collectionName'] == name, f"Collection name mismatch: expected {name}, got {rsp['data']['collectionName']}"
return payload
def _insert_and_verify_data(self, name, data):
"""Helper method to insert data and verify insertion"""
payload = {"collectionName": name, "data": data}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0, f"Insert failed: {rsp}"
assert rsp['data']['insertCount'] == len(data), f"Expected {len(data)} inserts, got {rsp['data']['insertCount']}"
return rsp
def _create_index_and_load(self, name, index_fields=None):
"""Helper method to create index and load collection"""
if index_fields is None:
index_fields = [{"fieldName": "dense", "indexName": "dense_index", "metricType": "COSINE"}]
index_payload = {
"collectionName": name,
"indexParams": [
{**field, "indexType": "AUTOINDEX", "params": {}}
for field in index_fields
]
}
rsp = self.index_client.index_create(index_payload)
assert rsp['code'] == 0, f"Index creation failed: {rsp}"
# Load collection
rsp = self.collection_client.collection_load(collection_name=name)
assert rsp['code'] == 0, f"Collection load failed: {rsp}"
def test_simple_tei_text_embedding_workflow(self, tei_endpoint):
"""
target: test simple TEI text embedding workflow
method: create collection, insert data, create index, load, and search
expected: all operations succeed
"""
name = gen_collection_name(prefix)
# Create collection with TEI text embedding function
self._create_and_verify_collection(name, tei_endpoint)
# Insert simple text data
data = [
{"id": 1, "document": "This is a test document"},
{"id": 2, "document": "Another test document"}
]
self._insert_and_verify_data(name, data)
# Create index and load collection
self._create_index_and_load(name)
# Search
search_payload = {
"collectionName": name,
"data": ["test document"],
"limit": 2,
"outputFields": ["id", "document"]
}
rsp = self.vector_client.vector_search(search_payload)
assert rsp['code'] == 0, f"Search failed: {rsp}"
assert len(rsp['data']) > 0, f"Search returned no results: {rsp['data']}"
def test_create_collection_with_tei_text_embedding_function(self, tei_endpoint):
"""
target: test create collection with TEI text embedding function via REST API (equivalent to ORM example)
method: create collection with TEI text embedding function using RESTful API
expected: create collection successfully
"""
name = gen_collection_name(prefix)
# Create collection with additional truncation parameters
payload = self._create_basic_collection_payload(name, tei_endpoint)
payload["schema"]["functions"][0]["params"].update({
"truncate": True,
"truncation_direction": "Right"
})
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0, f"Collection creation failed: {rsp}"
# Verify collection was created with function
rsp = self.collection_client.collection_describe(name)
assert rsp['code'] == 0, f"Collection describe failed: {rsp}"
assert rsp['data']['collectionName'] == name, f"Collection name mismatch: expected {name}, got {rsp['data']['collectionName']}"
@pytest.mark.parametrize("truncate", [True, False])
@pytest.mark.parametrize("truncation_direction", ["Left", "Right"])
def test_insert_with_tei_text_embedding_truncation(self, tei_endpoint, truncate, truncation_direction):
"""
target: test insert data with TEI text embedding function with truncation parameters
method: insert long text data with different truncation settings
expected: insert successfully and truncation works as expected
"""
name = gen_collection_name(prefix)
dim = 768
# Create collection with TEI text embedding function including truncation params
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint,
"truncate": truncate,
"truncation_direction": truncation_direction
}
}
]
}
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# Prepare test data with long text similar to ORM test
left_text = " ".join([fake_en.word() for _ in range(512)])
right_text = " ".join([fake_en.word() for _ in range(512)])
data = [
{
"id": 0,
"document": left_text + " " + right_text
},
{
"id": 1,
"document": left_text
},
{
"id": 2,
"document": right_text
}
]
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
if not truncate:
logger.info(f"Truncate is False, insertion result: {rsp}")
return
assert rsp['code'] == 0, f"Insert failed: {rsp}"
assert rsp['data']['insertCount'] == len(data), f"Expected {len(data)} inserts, got {rsp['data']['insertCount']}"
# Create index and load for similarity comparison
index_payload = {
"collectionName": name,
"indexParams": [
{
"fieldName": "dense",
"indexName": "dense_index",
"metricType": "COSINE",
"indexType": "AUTOINDEX",
"params": {}
}
]
}
rsp = self.index_client.index_create(index_payload)
assert rsp['code'] == 0, f"Index creation failed: {rsp}"
# Load collection
rsp = self.collection_client.collection_load(collection_name=name)
assert rsp['code'] == 0
# Query to get embeddings for similarity comparison
query_payload = {
"collectionName": name,
"filter": "id >= 0",
"outputFields": ["id", "dense"],
"limit": 10
}
rsp = self.vector_client.vector_query(query_payload)
assert rsp['code'] == 0, f"Query failed: {rsp}"
assert len(rsp['data']) == 3, f"Expected 3 results, got {len(rsp['data'])}"
# Compare similarity between embeddings to verify truncation direction
embeddings = {}
for result in rsp['data']:
embeddings[result['id']] = result['dense']
# Calculate cosine similarity
similarity_left = np.dot(embeddings[0], embeddings[1]) / (
np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
)
similarity_right = np.dot(embeddings[0], embeddings[2]) / (
np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[2])
)
logger.info(f"Similarity with left: {similarity_left}, with right: {similarity_right}")
if truncation_direction == "Left":
# When truncating from left, the combined text should be more similar to right text
assert similarity_left < similarity_right, (
f"Left truncation failed: left_sim={similarity_left:.4f}, right_sim={similarity_right:.4f}"
)
else: # Right truncation
# When truncating from right, the combined text should be more similar to left text
assert similarity_left > similarity_right, (
f"Right truncation failed: left_sim={similarity_left:.4f}, right_sim={similarity_right:.4f}"
)
def test_insert_with_tei_text_embedding_function(self, tei_endpoint):
"""
target: test insert data with TEI text embedding function via REST API
method: insert text data, embeddings should be automatically generated by TEI
expected: insert successfully and embeddings are generated
"""
name = gen_collection_name(prefix)
dim = 768
# Create collection with TEI text embedding function
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint
}
}
]
}
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# Insert text data without embedding vectors (they should be auto-generated by TEI)
nb = 10
data = []
for i in range(nb):
data.append({
"id": i,
"document": fake_en.text()
})
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0, f"Insert failed: {rsp}"
assert rsp['data']['insertCount'] == nb, f"Expected {nb} inserts, got {rsp['data']['insertCount']}"
def test_search_with_tei_text_embedding_function(self, tei_endpoint):
"""
target: test search with TEI text embedding function via REST API
method: 1. create collection with TEI text embedding function
2. insert text data
3. search with text query (should auto-generate embedding via TEI)
expected: search successfully with relevant results
"""
name = gen_collection_name(prefix)
dim = 768
# Create collection with TEI text embedding function
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint
}
}
]
},
"indexParams": [
{
"fieldName": "dense",
"indexName": "dense_index",
"metricType": "COSINE"
}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# Insert text data
nb = 100
documents = [
"Machine learning is a subset of artificial intelligence",
"Deep learning uses neural networks with multiple layers",
"Natural language processing helps computers understand text",
"Computer vision enables machines to interpret visual information",
"Reinforcement learning trains agents through rewards and penalties"
]
data = []
for i in range(nb):
data.append({
"id": i,
"document": documents[i % len(documents)] + f" Document {i}"
})
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
# Search with text query (TEI will auto-generate embedding)
search_payload = {
"collectionName": name,
"data": ["artificial intelligence and machine learning"],
"limit": 10,
"outputFields": ["id", "document"]
}
rsp = self.vector_client.vector_search(search_payload)
assert rsp['code'] == 0, f"Search failed: {rsp}"
assert len(rsp['data']) > 0, f"Search returned no results"
# Verify search results contain relevant documents
found_relevant = any(
"machine learning" in result.get('document', '').lower() or
"artificial intelligence" in result.get('document', '').lower()
for result in rsp['data']
)
assert found_relevant, f"Search should return relevant documents, got: {[r.get('document', '') for r in rsp['data']]}"
def test_tei_and_bm25_collection_creation(self, tei_endpoint):
"""
target: test create collection with both TEI and BM25 functions using correct format
method: create collection with TEI text embedding and BM25 functions based on working example
expected: collection creation succeeds
"""
name = gen_collection_name(prefix)
dim = 768
# Create collection with both TEI and BM25 functions using correct format
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{
"fieldName": "document",
"dataType": "VarChar",
"elementTypeParams": {
"max_length": "1000",
"enable_analyzer": True,
"analyzer_params": {"tokenizer": "standard"},
"enable_match": True
}
},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}},
{"fieldName": "sparse", "dataType": "SparseFloatVector"}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint
}
},
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document"],
"outputFieldNames": ["sparse"],
"params": {}
}
]
},
"indexParams": [
{
"fieldName": "dense",
"indexName": "dense_index",
"metricType": "COSINE"
},
{
"fieldName": "sparse",
"indexName": "sparse_index",
"metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}
}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# Insert test data
data = []
for i in range(10):
data.append({
"id": i,
"document": fake_en.text().lower()
})
payload = {"collectionName": name, "data": data}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0, f"Insert failed: {rsp}"
assert rsp['data']['insertCount'] == 10, f"Expected 10 inserts, got {rsp['data']['insertCount']}"
# Test search with BM25 (sparse vector)
search_payload = {
"collectionName": name,
"data": [fake_en.text().lower()],
"annsField": "sparse",
"limit": 5,
"outputFields": ["id", "document"]
}
rsp = self.vector_client.vector_search(search_payload)
assert rsp['code'] == 0, f"BM25 search failed: {rsp}"
assert len(rsp['data']) > 0, f"BM25 search returned no results"
# test search with dense vector
search_payload = {
"collectionName": name,
"data": [fake_en.text().lower()],
"annsField": "dense",
"limit": 5,
"outputFields": ["id", "document"]
}
rsp = self.vector_client.vector_search(search_payload)
assert rsp['code'] == 0, f"Dense search failed: {rsp}"
assert len(rsp['data']) > 0, f"Dense search returned no results"
def test_hybrid_search_with_text_embedding_and_bm25(self, tei_endpoint):
"""
target: test hybrid search combining text embedding and BM25 via REST API
method: 1. create collection with both text embedding and BM25 functions
2. insert text data
3. perform hybrid search
expected: hybrid search returns combined results
"""
name = gen_collection_name(prefix)
dim = 768
# Create collection with both text embedding and BM25 functions
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{
"fieldName": "document",
"dataType": "VarChar",
"elementTypeParams": {
"max_length": "65535",
"enable_analyzer": True,
"analyzer_params": {"tokenizer": "standard"},
"enable_match": True
}
},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}},
{"fieldName": "sparse", "dataType": "SparseFloatVector"}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint
}
},
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document"],
"outputFieldNames": ["sparse"],
"params": {}
}
]
},
"indexParams": [
{
"fieldName": "dense",
"indexName": "dense_index",
"metricType": "COSINE"
},
{
"fieldName": "sparse",
"indexName": "sparse_index",
"metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}
}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# Insert diverse text data
documents = [
"Python is a popular programming language for data science",
"JavaScript is widely used for web development",
"Machine learning algorithms can predict future trends",
"Database systems store and manage large amounts of data",
"Cloud computing provides scalable infrastructure solutions",
"Artificial intelligence transforms various industries",
"Software engineering practices improve code quality",
"Data visualization helps understand complex datasets",
"Cybersecurity protects digital assets from threats",
"Mobile applications provide convenient user experiences"
]
data = []
for i in range(50):
data.append({
"id": i,
"document": documents[i % len(documents)] + f" Extended content {i}"
})
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
# Perform hybrid search using advanced search
hybrid_search_payload = {
"collectionName": name,
"search": [
{
"data": ["programming language data science"],
"annsField": "dense",
"limit": 20
},
{
"data": ["programming language data science"],
"annsField": "sparse",
"limit": 20
}
],
"rerank": {
"strategy": "weighted",
"params": {"weights": [0.7, 0.3]}
},
"limit": 10,
"outputFields": ["id", "document"]
}
rsp = self.vector_client.vector_advanced_search(hybrid_search_payload)
assert rsp['code'] == 0, f"Hybrid search failed: {rsp}"
assert len(rsp['data']) > 0, f"Hybrid search returned no results"
# Verify hybrid search results are relevant
found_relevant = any(
any(term in result.get('document', '').lower() for term in ['python', 'programming', 'data'])
for result in rsp['data']
)
assert found_relevant, f"Hybrid search should return relevant documents, got: {[r.get('document', '') for r in rsp['data']]}"
@pytest.mark.L1
class TestTextEmbeddingSearchAdvanced(TestBase):
"""
******************************************************************
Advanced test cases for text embedding function search via RESTful API
******************************************************************
"""
def test_search_with_filter_and_text_embedding(self, tei_endpoint):
"""
target: test search with both text embedding and scalar filters
method: 1. create collection with text embedding function and metadata fields
2. insert text data with metadata
3. search with text query and scalar filters
expected: search returns filtered and relevant results
"""
name = gen_collection_name(prefix)
dim = 768
# Create collection with text embedding function and metadata fields
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "category", "dataType": "VarChar", "elementTypeParams": {"max_length": "100"}},
{"fieldName": "year", "dataType": "Int64"},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint
}
}
]
},
"indexParams": [
{
"fieldName": "dense",
"indexName": "dense_index",
"metricType": "COSINE"
}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# Insert text data with metadata
categories = ["technology", "science", "business", "education"]
years = [2020, 2021, 2022, 2023, 2024]
data = []
for i in range(100):
data.append({
"id": i,
"document": fake_en.text(),
"category": categories[i % len(categories)],
"year": years[i % len(years)]
})
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
# Search with text query and filters
search_payload = {
"collectionName": name,
"data": ["technology innovation"],
"filter": "category == 'technology' and year >= 2022",
"limit": 10,
"outputFields": ["id", "document", "category", "year"]
}
rsp = self.vector_client.vector_search(search_payload)
assert rsp['code'] == 0
# Verify all results match the filter criteria
for result in rsp['data']:
assert result['category'] == 'technology', f"Category mismatch: expected 'technology', got '{result['category']}'"
assert result['year'] >= 2022, f"Year filter failed: expected >= 2022, got {result['year']}"
def test_upsert_with_text_embedding_function(self, tei_endpoint):
"""
target: test upsert operation with text embedding function
method: 1. insert initial text data
2. upsert with modified text content
3. verify embeddings are updated
expected: upsert successfully updates both text and embeddings
"""
name = gen_collection_name(prefix)
dim = 768
# Create collection with text embedding function
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint
}
}
]
},
"indexParams": [
{
"fieldName": "dense",
"indexName": "dense_index",
"metricType": "COSINE"
}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# Insert initial data
original_text = "The original document about machine learning"
data = [{"id": 1, "document": original_text}]
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
# Query original embedding
query_payload = {
"collectionName": name,
"filter": "id == 1",
"outputFields": ["id", "document", "dense"],
"limit": 10
}
rsp = self.vector_client.vector_query(query_payload)
assert rsp['code'] == 0, f"Original query failed: {rsp}"
assert len(rsp['data']) > 0, f"Original query returned no results"
original_embedding = rsp['data'][0]['dense']
# Upsert with modified text
updated_text = "The updated document about deep learning and neural networks"
upsert_data = [{"id": 1, "document": updated_text}]
payload = {
"collectionName": name,
"data": upsert_data
}
rsp = self.vector_client.vector_upsert(payload)
assert rsp['code'] == 0, f"Upsert failed: {rsp}"
# Query updated embedding
rsp = self.vector_client.vector_query(query_payload)
assert rsp['code'] == 0, f"Updated query failed: {rsp}"
assert len(rsp['data']) > 0, f"Updated query returned no results"
updated_embedding = rsp['data'][0]['dense']
# Verify text was updated
assert rsp['data'][0]['document'] == updated_text, f"Text not updated: expected '{updated_text}', got '{rsp['data'][0]['document']}'"
# Verify embedding was updated (embeddings should be different)
similarity = np.dot(original_embedding, updated_embedding) / (
np.linalg.norm(original_embedding) * np.linalg.norm(updated_embedding)
)
assert similarity < 0.99, f"Embedding should be significantly different after text update, similarity: {similarity:.4f}"
@pytest.mark.L2
class TestTextEmbeddingSearchNegative(TestBase):
"""
******************************************************************
Negative test cases for text embedding function search via RESTful API
******************************************************************
"""
def test_create_collection_with_invalid_text_embedding_params(self):
"""
target: test create collection with invalid text embedding function parameters
method: create collection with invalid embedding provider/model
expected: collection creation should fail with appropriate error
"""
name = gen_collection_name(prefix)
dim = 1024
# Create collection with invalid text embedding function
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
],
"functions": [
{
"name": "text_embedding_fn",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "invalid_provider",
"model_name": "invalid_model",
"api_key": "invalid_key"
}
}
]
}
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] != 0, f"Expected creation to fail with invalid provider, but got: {rsp}"
def test_search_with_empty_query_text(self, tei_endpoint):
"""
target: test search with empty text query
method: 1. create collection with text embedding function
2. insert data
3. search with empty string
expected: search should handle empty query appropriately
"""
name = gen_collection_name(prefix)
dim = 768
# Create collection with text embedding function
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint
}
}
]
}
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
# Insert sample data
data = [{"id": i, "document": fake_en.text()} for i in range(10)]
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
# Search with empty query
search_payload = {
"collectionName": name,
"data": [""],
"limit": 5,
"outputFields": ["id", "document"]
}
rsp = self.vector_client.vector_search(search_payload)
assert rsp['code'] != 0, f"Expected search to fail with empty query, but got: {rsp}"
def test_dimension_mismatch_with_text_embedding(self, tei_endpoint):
"""
target: test dimension mismatch between text embedding function and vector field
method: create collection with mismatched dimensions
expected: collection creation should fail
"""
name = gen_collection_name(prefix)
wrong_dim = 512 # TEI produces 768-dim vectors
# Create collection with mismatched dimensions
payload = {
"collectionName": name,
"schema": {
"autoId": False,
"enableDynamicField": True,
"description": "test collection",
"fields": [
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(wrong_dim)}}
],
"functions": [
{
"name": "tei",
"type": "TextEmbedding",
"inputFieldNames": ["document"],
"outputFieldNames": ["dense"],
"params": {
"provider": "TEI",
"endpoint": tei_endpoint # This produces 768-dim vectors
}
}
]
}
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] != 0, f"Expected creation to fail with dimension mismatch, but got: {rsp}"