milvus/tests/restful_client_v2/testcases/test_embedding_rerank_function.py
zhuwenxing f1e75085e8
test: add restful api case for rerank function (#42987)
/kind improvement

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
2025-07-08 19:36:46 +08:00

2109 lines
84 KiB
Python

import pytest
import numpy as np
from faker import Faker
from base.testbase import TestBase
from utils.utils import gen_collection_name
from utils.util_log import test_log as logger
# Module-wide Faker instance (en_US locale) used by the tests to generate random English text.
fake_en = Faker("en_US")
# Prefix handed to gen_collection_name() when naming the collections created by these tests.
prefix = "text_embedding_search"
@pytest.mark.L0
class TestTextEmbeddingSearch(TestBase):
    """
    ******************************************************************
    The following cases are used to test text embedding function search via RESTful API
    ******************************************************************

    The collections under test declare a TextEmbedding function (provider "TEI")
    that fills the "dense" vector field from the "document" text field; some cases
    additionally declare a BM25 function that fills a "sparse" field.
    """
def _create_basic_collection_payload(self, name, tei_endpoint, dim=768, with_bm25=False):
    """Assemble the create-collection request body for a TEI-embedded collection.

    The schema always holds an Int64 primary key ``id``, a VarChar ``document`` and a
    FloatVector ``dense`` (dimension sent as a string), plus a TextEmbedding function
    mapping ``document`` to ``dense``. When ``with_bm25`` is truthy the ``document``
    field also enables the standard analyzer and text match, and a ``sparse`` field
    fed by a BM25 function is appended.

    :param name: collection name placed in the payload
    :param tei_endpoint: endpoint value passed through to the TEI function params
    :param dim: dimension of the ``dense`` field
    :param with_bm25: whether to add the BM25 sparse field and function
    :return: dict with ``collectionName`` and ``schema`` keys
    """
    document_params = {"max_length": "65535"}
    if with_bm25:
        # Analyzer settings are added to the text field for the BM25 variant
        document_params["enable_analyzer"] = True
        document_params["analyzer_params"] = {"tokenizer": "standard"}
        document_params["enable_match"] = True

    schema_fields = [
        {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
        {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": document_params},
        {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}},
    ]
    tei_function = {
        "name": "tei",
        "type": "TextEmbedding",
        "inputFieldNames": ["document"],
        "outputFieldNames": ["dense"],
        "params": {"provider": "TEI", "endpoint": tei_endpoint},
    }
    schema_functions = [tei_function]

    if with_bm25:
        schema_fields.append({"fieldName": "sparse", "dataType": "SparseFloatVector"})
        schema_functions.append({
            "name": "bm25_fn",
            "type": "BM25",
            "inputFieldNames": ["document"],
            "outputFieldNames": ["sparse"],
            "params": {},
        })

    schema = {
        "autoId": False,
        "enableDynamicField": True,
        "description": "test collection",
        "fields": schema_fields,
        "functions": schema_functions,
    }
    return {"collectionName": name, "schema": schema}
def _create_and_verify_collection(self, name, tei_endpoint, dim=768, with_bm25=False):
    """Create the collection, confirm it via describe, and return the request body used."""
    create_body = self._create_basic_collection_payload(name, tei_endpoint, dim, with_bm25)

    create_rsp = self.collection_client.collection_create(create_body)
    assert create_rsp['code'] == 0, f"Collection creation failed: {create_rsp}"

    # Describe round-trip: the server must report the collection under the same name
    describe_rsp = self.collection_client.collection_describe(name)
    assert describe_rsp['code'] == 0, f"Collection describe failed: {describe_rsp}"
    reported_name = describe_rsp['data']['collectionName']
    assert reported_name == name, f"Collection name mismatch: expected {name}, got {reported_name}"

    return create_body
def _insert_and_verify_data(self, name, data):
    """Insert ``data`` into collection ``name``, check the reported insert count, return the response."""
    insert_rsp = self.vector_client.vector_insert({"collectionName": name, "data": data})
    assert insert_rsp['code'] == 0, f"Insert failed: {insert_rsp}"

    expected_count = len(data)
    actual_count = insert_rsp['data']['insertCount']
    assert actual_count == expected_count, f"Expected {expected_count} inserts, got {actual_count}"
    return insert_rsp
def _create_index_and_load(self, name, index_fields=None):
    """Helper method to create index(es) on a collection and then load it.

    :param name: collection name
    :param index_fields: list of per-field index specs (``fieldName``, ``indexName``,
        ``metricType``, ...). Defaults to a single COSINE index on ``dense``.
        Each spec gets ``indexType`` = ``AUTOINDEX`` and ``params`` = ``{}`` unless
        the spec provides its own value for that key.
    """
    if index_fields is None:
        index_fields = [{"fieldName": "dense", "indexName": "dense_index", "metricType": "COSINE"}]
    index_payload = {
        "collectionName": name,
        "indexParams": [
            # Defaults come first so an explicit indexType/params in the spec is not overwritten
            {"indexType": "AUTOINDEX", "params": {}, **field}
            for field in index_fields
        ]
    }
    rsp = self.index_client.index_create(index_payload)
    assert rsp['code'] == 0, f"Index creation failed: {rsp}"
    # Load collection
    rsp = self.collection_client.collection_load(collection_name=name)
    assert rsp['code'] == 0, f"Collection load failed: {rsp}"
def test_simple_tei_text_embedding_workflow(self, tei_endpoint):
    """
    target: verify the minimal end-to-end flow of a TEI text embedding collection
    method: create the collection, insert two documents, build the index, load, then search by text
    expected: every step returns code 0 and the search yields at least one hit
    """
    collection = gen_collection_name(prefix)
    self._create_and_verify_collection(collection, tei_endpoint)

    # Rows carry text only; no "dense" value is supplied by the test
    rows = [
        {"id": doc_id, "document": text}
        for doc_id, text in ((1, "This is a test document"), (2, "Another test document"))
    ]
    self._insert_and_verify_data(collection, rows)

    self._create_index_and_load(collection)

    # The search input is raw text rather than a vector
    search_rsp = self.vector_client.vector_search({
        "collectionName": collection,
        "data": ["test document"],
        "limit": 2,
        "outputFields": ["id", "document"]
    })
    assert search_rsp['code'] == 0, f"Search failed: {search_rsp}"
    hits = search_rsp['data']
    assert len(hits) > 0, f"Search returned no results: {hits}"
def test_create_collection_with_tei_text_embedding_function(self, tei_endpoint):
    """
    target: verify a collection with a TEI text embedding function can be created over REST
    method: build the basic payload, add truncation params to the TEI function, create, then describe
    expected: create and describe both return code 0 and the described name matches
    """
    collection = gen_collection_name(prefix)

    create_body = self._create_basic_collection_payload(collection, tei_endpoint)
    # Extend the TEI function params in place with the truncation settings
    tei_params = create_body["schema"]["functions"][0]["params"]
    tei_params["truncate"] = True
    tei_params["truncation_direction"] = "Right"

    create_rsp = self.collection_client.collection_create(create_body)
    assert create_rsp['code'] == 0, f"Collection creation failed: {create_rsp}"

    describe_rsp = self.collection_client.collection_describe(collection)
    assert describe_rsp['code'] == 0, f"Collection describe failed: {describe_rsp}"
    reported_name = describe_rsp['data']['collectionName']
    assert reported_name == collection, f"Collection name mismatch: expected {collection}, got {reported_name}"
@pytest.mark.parametrize("truncate", [True, False])
@pytest.mark.parametrize("truncation_direction", ["Left", "Right"])
def test_insert_with_tei_text_embedding_truncation(self, tei_endpoint, truncate, truncation_direction):
    """
    target: test insert data with TEI text embedding function with truncation parameters
    method: insert long text data with different truncation settings
    expected: insert successfully and truncation works as expected
    """
    name = gen_collection_name(prefix)
    # Reuse the shared payload builder; only the truncation params under test are added
    payload = self._create_basic_collection_payload(name, tei_endpoint, dim=768)
    payload["schema"]["functions"][0]["params"].update({
        "truncate": truncate,
        "truncation_direction": truncation_direction
    })
    rsp = self.collection_client.collection_create(payload)
    assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

    # Prepare test data with long text similar to ORM test:
    # id 0 = left + right halves, id 1 = left half only, id 2 = right half only
    left_text = " ".join(fake_en.word() for _ in range(512))
    right_text = " ".join(fake_en.word() for _ in range(512))
    data = [
        {"id": 0, "document": left_text + " " + right_text},
        {"id": 1, "document": left_text},
        {"id": 2, "document": right_text}
    ]
    rsp = self.vector_client.vector_insert({"collectionName": name, "data": data})
    if not truncate:
        # With truncation disabled the insert outcome is not asserted (presumably it depends
        # on the embedding service's input limit — TODO confirm); it is only logged.
        logger.info(f"Truncate is False, insertion result: {rsp}")
        return
    assert rsp['code'] == 0, f"Insert failed: {rsp}"
    assert rsp['data']['insertCount'] == len(data), f"Expected {len(data)} inserts, got {rsp['data']['insertCount']}"

    # Create index and load for similarity comparison (default: COSINE AUTOINDEX on "dense")
    self._create_index_and_load(name)

    # Query to get embeddings for similarity comparison
    query_payload = {
        "collectionName": name,
        "filter": "id >= 0",
        "outputFields": ["id", "dense"],
        "limit": 10
    }
    rsp = self.vector_client.vector_query(query_payload)
    assert rsp['code'] == 0, f"Query failed: {rsp}"
    assert len(rsp['data']) == 3, f"Expected 3 results, got {len(rsp['data'])}"
    embeddings = {row['id']: row['dense'] for row in rsp['data']}

    def cosine(a, b):
        """Cosine similarity of two vectors."""
        return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

    # Compare the combined document against each half to see which side survived truncation
    similarity_left = cosine(embeddings[0], embeddings[1])
    similarity_right = cosine(embeddings[0], embeddings[2])
    logger.info(f"Similarity with left: {similarity_left}, with right: {similarity_right}")
    if truncation_direction == "Left":
        # When truncating from left, the combined text should be more similar to right text
        assert similarity_left < similarity_right, (
            f"Left truncation failed: left_sim={similarity_left:.4f}, right_sim={similarity_right:.4f}"
        )
    else:  # Right truncation
        # When truncating from right, the combined text should be more similar to left text
        assert similarity_left > similarity_right, (
            f"Right truncation failed: left_sim={similarity_left:.4f}, right_sim={similarity_right:.4f}"
        )
def test_insert_with_tei_text_embedding_function(self, tei_endpoint):
    """
    target: test insert data with TEI text embedding function via REST API
    method: insert text data, embeddings should be automatically generated by TEI
    expected: insert successfully and embeddings are generated
    """
    name = gen_collection_name(prefix)
    # Shared helper builds the same schema (id / document / dense + TEI function),
    # asserts creation succeeded and confirms the collection via describe
    self._create_and_verify_collection(name, tei_endpoint, dim=768)

    # Insert text data without embedding vectors (they should be auto-generated by TEI)
    nb = 10
    data = [{"id": i, "document": fake_en.text()} for i in range(nb)]
    # Helper asserts code == 0 and insertCount == len(data)
    self._insert_and_verify_data(name, data)
def test_search_with_tei_text_embedding_function(self, tei_endpoint):
    """
    target: test search with TEI text embedding function via REST API
    method: 1. create collection with TEI text embedding function
            2. insert text data
            3. search with text query (should auto-generate embedding via TEI)
    expected: search successfully with relevant results
    """
    name = gen_collection_name(prefix)
    # Shared schema payload, plus index params supplied at creation time
    payload = self._create_basic_collection_payload(name, tei_endpoint, dim=768)
    payload["indexParams"] = [
        {
            "fieldName": "dense",
            "indexName": "dense_index",
            "metricType": "COSINE"
        }
    ]
    rsp = self.collection_client.collection_create(payload)
    assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

    # Insert text data: five base sentences cycled, each suffixed with its row number
    nb = 100
    documents = [
        "Machine learning is a subset of artificial intelligence",
        "Deep learning uses neural networks with multiple layers",
        "Natural language processing helps computers understand text",
        "Computer vision enables machines to interpret visual information",
        "Reinforcement learning trains agents through rewards and penalties"
    ]
    data = [
        {"id": i, "document": documents[i % len(documents)] + f" Document {i}"}
        for i in range(nb)
    ]
    rsp = self.vector_client.vector_insert({"collectionName": name, "data": data})
    assert rsp['code'] == 0, f"Insert failed: {rsp}"

    # Search with text query (TEI will auto-generate embedding)
    search_payload = {
        "collectionName": name,
        "data": ["artificial intelligence and machine learning"],
        "limit": 10,
        "outputFields": ["id", "document"]
    }
    rsp = self.vector_client.vector_search(search_payload)
    assert rsp['code'] == 0, f"Search failed: {rsp}"
    assert len(rsp['data']) > 0, "Search returned no results"

    # Verify search results contain relevant documents
    returned_docs = [result.get('document', '') for result in rsp['data']]
    found_relevant = any(
        "machine learning" in doc.lower() or "artificial intelligence" in doc.lower()
        for doc in returned_docs
    )
    assert found_relevant, f"Search should return relevant documents, got: {returned_docs}"
def test_tei_and_bm25_collection_creation(self, tei_endpoint):
    """
    target: verify a collection can carry a TEI text embedding function and a BM25 function together
    method: create the collection with both functions and their indexes, insert text, then search
            the sparse (BM25) field and the dense field with text queries
    expected: creation, insert and both searches succeed; each search returns results
    """
    collection = gen_collection_name(prefix)
    dim = 768

    # Text field with analyzer and match enabled; it is the input of both functions below
    document_field = {
        "fieldName": "document",
        "dataType": "VarChar",
        "elementTypeParams": {
            "max_length": "1000",
            "enable_analyzer": True,
            "analyzer_params": {"tokenizer": "standard"},
            "enable_match": True
        }
    }
    schema = {
        "autoId": False,
        "enableDynamicField": True,
        "description": "test collection",
        "fields": [
            {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
            document_field,
            {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}},
            {"fieldName": "sparse", "dataType": "SparseFloatVector"}
        ],
        "functions": [
            {
                "name": "tei",
                "type": "TextEmbedding",
                "inputFieldNames": ["document"],
                "outputFieldNames": ["dense"],
                "params": {"provider": "TEI", "endpoint": tei_endpoint}
            },
            {
                "name": "bm25_fn",
                "type": "BM25",
                "inputFieldNames": ["document"],
                "outputFieldNames": ["sparse"],
                "params": {}
            }
        ]
    }
    index_params = [
        {"fieldName": "dense", "indexName": "dense_index", "metricType": "COSINE"},
        {
            "fieldName": "sparse",
            "indexName": "sparse_index",
            "metricType": "BM25",
            "params": {"index_type": "SPARSE_INVERTED_INDEX"}
        }
    ]
    create_rsp = self.collection_client.collection_create({
        "collectionName": collection,
        "schema": schema,
        "indexParams": index_params
    })
    assert create_rsp['code'] == 0

    # Ten rows of lower-cased random text
    rows = [{"id": i, "document": fake_en.text().lower()} for i in range(10)]
    insert_rsp = self.vector_client.vector_insert({"collectionName": collection, "data": rows})
    assert insert_rsp['code'] == 0, f"Insert failed: {insert_rsp}"
    assert insert_rsp['data']['insertCount'] == 10, f"Expected 10 inserts, got {insert_rsp['data']['insertCount']}"

    # Same request shape against each vector field, sparse first; fresh random query text each time
    for anns_field, label in (("sparse", "BM25"), ("dense", "Dense")):
        search_rsp = self.vector_client.vector_search({
            "collectionName": collection,
            "data": [fake_en.text().lower()],
            "annsField": anns_field,
            "limit": 5,
            "outputFields": ["id", "document"]
        })
        assert search_rsp['code'] == 0, f"{label} search failed: {search_rsp}"
        assert len(search_rsp['data']) > 0, f"{label} search returned no results"
def test_hybrid_search_with_text_embedding_and_bm25(self, tei_endpoint):
    """
    target: test hybrid search combining text embedding and BM25 via REST API
    method: 1. create collection with both text embedding and BM25 functions
            2. insert text data
            3. perform hybrid search
    expected: hybrid search returns combined results
    """
    name = gen_collection_name(prefix)
    # The shared builder with with_bm25=True yields the dense + sparse schema needed here;
    # index params for both vector fields are supplied at creation time
    payload = self._create_basic_collection_payload(name, tei_endpoint, dim=768, with_bm25=True)
    payload["indexParams"] = [
        {
            "fieldName": "dense",
            "indexName": "dense_index",
            "metricType": "COSINE"
        },
        {
            "fieldName": "sparse",
            "indexName": "sparse_index",
            "metricType": "BM25",
            "params": {"index_type": "SPARSE_INVERTED_INDEX"}
        }
    ]
    rsp = self.collection_client.collection_create(payload)
    assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

    # Insert diverse text data: ten base sentences cycled, each suffixed with its row number
    documents = [
        "Python is a popular programming language for data science",
        "JavaScript is widely used for web development",
        "Machine learning algorithms can predict future trends",
        "Database systems store and manage large amounts of data",
        "Cloud computing provides scalable infrastructure solutions",
        "Artificial intelligence transforms various industries",
        "Software engineering practices improve code quality",
        "Data visualization helps understand complex datasets",
        "Cybersecurity protects digital assets from threats",
        "Mobile applications provide convenient user experiences"
    ]
    data = [
        {"id": i, "document": documents[i % len(documents)] + f" Extended content {i}"}
        for i in range(50)
    ]
    rsp = self.vector_client.vector_insert({"collectionName": name, "data": data})
    assert rsp['code'] == 0, f"Insert failed: {rsp}"

    # Perform hybrid search using advanced search: the same text query is sent to the
    # dense and the sparse field, then combined with a weighted rerank
    query_text = "programming language data science"
    hybrid_search_payload = {
        "collectionName": name,
        "search": [
            {"data": [query_text], "annsField": "dense", "limit": 20},
            {"data": [query_text], "annsField": "sparse", "limit": 20}
        ],
        "rerank": {
            "strategy": "weighted",
            "params": {"weights": [0.7, 0.3]}
        },
        "limit": 10,
        "outputFields": ["id", "document"]
    }
    rsp = self.vector_client.vector_advanced_search(hybrid_search_payload)
    assert rsp['code'] == 0, f"Hybrid search failed: {rsp}"
    assert len(rsp['data']) > 0, "Hybrid search returned no results"

    # Verify hybrid search results are relevant
    returned_docs = [result.get('document', '') for result in rsp['data']]
    found_relevant = any(
        any(term in doc.lower() for term in ['python', 'programming', 'data'])
        for doc in returned_docs
    )
    assert found_relevant, f"Hybrid search should return relevant documents, got: {returned_docs}"
@pytest.mark.L1
class TestTextEmbeddingSearchAdvanced(TestBase):
    """
    ******************************************************************
    Advanced test cases for text embedding function search via RESTful API
    ******************************************************************

    Covers text search combined with a scalar filter, and upsert of a row whose
    text (and therefore embedding) changes.
    """
def test_search_with_filter_and_text_embedding(self, tei_endpoint):
    """
    target: test search with both text embedding and scalar filters
    method: 1. create collection with text embedding function and metadata fields
            2. insert text data with metadata
            3. search with text query and scalar filters
    expected: search returns filtered and relevant results
    """
    name = gen_collection_name(prefix)
    dim = 768
    # Create collection with text embedding function and metadata fields
    payload = {
        "collectionName": name,
        "schema": {
            "autoId": False,
            "enableDynamicField": True,
            "description": "test collection",
            "fields": [
                {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
                {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
                {"fieldName": "category", "dataType": "VarChar", "elementTypeParams": {"max_length": "100"}},
                {"fieldName": "year", "dataType": "Int64"},
                {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
            ],
            "functions": [
                {
                    "name": "tei",
                    "type": "TextEmbedding",
                    "inputFieldNames": ["document"],
                    "outputFieldNames": ["dense"],
                    "params": {
                        "provider": "TEI",
                        "endpoint": tei_endpoint
                    }
                }
            ]
        },
        "indexParams": [
            {
                "fieldName": "dense",
                "indexName": "dense_index",
                "metricType": "COSINE"
            }
        ]
    }
    rsp = self.collection_client.collection_create(payload)
    assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

    # Insert text data with metadata; category cycles every 4 rows, year every 5 rows
    categories = ["technology", "science", "business", "education"]
    years = [2020, 2021, 2022, 2023, 2024]
    data = [
        {
            "id": i,
            "document": fake_en.text(),
            "category": categories[i % len(categories)],
            "year": years[i % len(years)]
        }
        for i in range(100)
    ]
    rsp = self.vector_client.vector_insert({"collectionName": name, "data": data})
    assert rsp['code'] == 0, f"Insert failed: {rsp}"

    # Search with text query and filters
    search_payload = {
        "collectionName": name,
        "data": ["technology innovation"],
        "filter": "category == 'technology' and year >= 2022",
        "limit": 10,
        "outputFields": ["id", "document", "category", "year"]
    }
    rsp = self.vector_client.vector_search(search_payload)
    assert rsp['code'] == 0, f"Search failed: {rsp}"
    # 15 of the 100 inserted rows satisfy the filter, so an empty result would make the
    # per-row checks below pass without verifying anything
    assert len(rsp['data']) > 0, f"Filtered search returned no results: {rsp}"

    # Verify all results match the filter criteria
    for result in rsp['data']:
        assert result['category'] == 'technology', f"Category mismatch: expected 'technology', got '{result['category']}'"
        assert result['year'] >= 2022, f"Year filter failed: expected >= 2022, got {result['year']}"
def test_upsert_with_text_embedding_function(self, tei_endpoint):
    """
    target: verify upsert re-embeds a row when its text changes
    method: insert one document, read back its embedding, upsert the same id with new text,
            read the row again and compare text and embedding
    expected: the stored text is the new text and the two embeddings have cosine similarity < 0.99
    """
    collection = gen_collection_name(prefix)
    dim = 768

    schema = {
        "autoId": False,
        "enableDynamicField": True,
        "description": "test collection",
        "fields": [
            {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
            {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
            {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
        ],
        "functions": [
            {
                "name": "tei",
                "type": "TextEmbedding",
                "inputFieldNames": ["document"],
                "outputFieldNames": ["dense"],
                "params": {"provider": "TEI", "endpoint": tei_endpoint}
            }
        ]
    }
    create_rsp = self.collection_client.collection_create({
        "collectionName": collection,
        "schema": schema,
        "indexParams": [
            {"fieldName": "dense", "indexName": "dense_index", "metricType": "COSINE"}
        ]
    })
    assert create_rsp['code'] == 0

    # The same query (row id 1, text + vector) is issued before and after the upsert
    row_query = {
        "collectionName": collection,
        "filter": "id == 1",
        "outputFields": ["id", "document", "dense"],
        "limit": 10
    }

    def fetch_row(stage):
        """Run the row query and return its first row; ``stage`` labels the failure messages."""
        query_rsp = self.vector_client.vector_query(row_query)
        assert query_rsp['code'] == 0, f"{stage} query failed: {query_rsp}"
        assert len(query_rsp['data']) > 0, f"{stage} query returned no results"
        return query_rsp['data'][0]

    original_text = "The original document about machine learning"
    insert_rsp = self.vector_client.vector_insert({
        "collectionName": collection,
        "data": [{"id": 1, "document": original_text}]
    })
    assert insert_rsp['code'] == 0
    embedding_before = fetch_row("Original")['dense']

    updated_text = "The updated document about deep learning and neural networks"
    upsert_rsp = self.vector_client.vector_upsert({
        "collectionName": collection,
        "data": [{"id": 1, "document": updated_text}]
    })
    assert upsert_rsp['code'] == 0, f"Upsert failed: {upsert_rsp}"

    row_after = fetch_row("Updated")
    embedding_after = row_after['dense']
    assert row_after['document'] == updated_text, f"Text not updated: expected '{updated_text}', got '{row_after['document']}'"

    # Cosine similarity between the embedding before and after the upsert
    norm_product = np.linalg.norm(embedding_before) * np.linalg.norm(embedding_after)
    similarity = np.dot(embedding_before, embedding_after) / norm_product
    assert similarity < 0.99, f"Embedding should be significantly different after text update, similarity: {similarity:.4f}"
@pytest.mark.L2
class TestTextEmbeddingSearchNegative(TestBase):
    """
    ******************************************************************
    Negative test cases for text embedding function search via RESTful API
    ******************************************************************

    Each case expects a non-zero response code: invalid function params,
    an empty query string, and a dense field dimension of 512.
    """
def test_create_collection_with_invalid_text_embedding_params(self):
    """
    target: verify collection creation rejects an invalid text embedding function definition
    method: create a collection whose TextEmbedding function names an invalid provider, model and key
    expected: the create call returns a non-zero code
    """
    collection = gen_collection_name(prefix)
    dim = 1024

    # Function params deliberately point at a provider/model/key that do not exist
    bad_function = {
        "name": "text_embedding_fn",
        "type": "TextEmbedding",
        "inputFieldNames": ["document"],
        "outputFieldNames": ["dense"],
        "params": {
            "provider": "invalid_provider",
            "model_name": "invalid_model",
            "api_key": "invalid_key"
        }
    }
    schema = {
        "autoId": False,
        "enableDynamicField": True,
        "fields": [
            {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
            {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
            {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
        ],
        "functions": [bad_function]
    }
    create_rsp = self.collection_client.collection_create({"collectionName": collection, "schema": schema})
    assert create_rsp['code'] != 0, f"Expected creation to fail with invalid provider, but got: {create_rsp}"
def test_search_with_empty_query_text(self, tei_endpoint):
    """
    target: test search with empty text query
    method: 1. create collection with text embedding function (indexed at creation)
            2. insert data
            3. search with empty string
    expected: search should handle empty query appropriately
    """
    name = gen_collection_name(prefix)
    dim = 768
    # Create collection with text embedding function.
    # indexParams are supplied so the dense field is indexed like in the positive search
    # cases; without them the search below could fail simply because the collection was
    # never indexed/loaded, and the test would pass without exercising the empty query.
    payload = {
        "collectionName": name,
        "schema": {
            "autoId": False,
            "enableDynamicField": True,
            "description": "test collection",
            "fields": [
                {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
                {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
                {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
            ],
            "functions": [
                {
                    "name": "tei",
                    "type": "TextEmbedding",
                    "inputFieldNames": ["document"],
                    "outputFieldNames": ["dense"],
                    "params": {
                        "provider": "TEI",
                        "endpoint": tei_endpoint
                    }
                }
            ]
        },
        "indexParams": [
            {
                "fieldName": "dense",
                "indexName": "dense_index",
                "metricType": "COSINE"
            }
        ]
    }
    rsp = self.collection_client.collection_create(payload)
    assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

    # Insert sample data
    data = [{"id": i, "document": fake_en.text()} for i in range(10)]
    rsp = self.vector_client.vector_insert({"collectionName": name, "data": data})
    assert rsp['code'] == 0, f"Insert failed: {rsp}"

    # Search with empty query
    search_payload = {
        "collectionName": name,
        "data": [""],
        "limit": 5,
        "outputFields": ["id", "document"]
    }
    rsp = self.vector_client.vector_search(search_payload)
    assert rsp['code'] != 0, f"Expected search to fail with empty query, but got: {rsp}"
def test_dimension_mismatch_with_text_embedding(self, tei_endpoint):
    """
    target: verify creation fails when the dense field dimension disagrees with the embedding function
    method: declare the dense field with dim 512 while the TEI function is expected to emit 768-dim vectors
    expected: the create call returns a non-zero code
    """
    collection = gen_collection_name(prefix)
    wrong_dim = 512  # TEI produces 768-dim vectors

    dense_field = {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(wrong_dim)}}
    tei_function = {
        "name": "tei",
        "type": "TextEmbedding",
        "inputFieldNames": ["document"],
        "outputFieldNames": ["dense"],
        # This endpoint produces 768-dim vectors
        "params": {"provider": "TEI", "endpoint": tei_endpoint}
    }
    schema = {
        "autoId": False,
        "enableDynamicField": True,
        "description": "test collection",
        "fields": [
            {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
            {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
            dense_field
        ],
        "functions": [tei_function]
    }
    create_rsp = self.collection_client.collection_create({"collectionName": collection, "schema": schema})
    assert create_rsp['code'] != 0, f"Expected creation to fail with dimension mismatch, but got: {create_rsp}"
class TestModelRerankFunction(TestBase):
"""
******************************************************************
Test cases for model rerank function via RESTful API
******************************************************************
"""
def _create_collection_with_all_vector_types(self, name, tei_endpoint):
    """Create and populate a collection holding dense, user-supplied sparse and BM25 vector fields.

    ``dense`` is produced by a TEI TextEmbedding function and ``bm25`` by a BM25 function,
    both reading ``document``; ``sparse`` is filled with random values supplied by this
    helper. Fifty rows are inserted. Returns ``name``.
    """
    import random

    document_field = {
        "fieldName": "document",
        "dataType": "VarChar",
        "elementTypeParams": {
            "max_length": "65535",
            "enable_analyzer": True,
            "analyzer_params": {"tokenizer": "standard"},
            "enable_match": True
        }
    }
    schema = {
        "autoId": False,
        "enableDynamicField": True,
        "description": "test collection with all vector types",
        "fields": [
            {"fieldName": "doc_id", "dataType": "Int64", "isPrimary": True},
            document_field,
            {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
            {"fieldName": "sparse", "dataType": "SparseFloatVector"},
            {"fieldName": "bm25", "dataType": "SparseFloatVector"}
        ],
        "functions": [
            {
                "name": "tei",
                "type": "TextEmbedding",
                "inputFieldNames": ["document"],
                "outputFieldNames": ["dense"],
                "params": {"provider": "TEI", "endpoint": tei_endpoint}
            },
            {
                "name": "bm25_fn",
                "type": "BM25",
                "inputFieldNames": ["document"],
                "outputFieldNames": ["bm25"],
                "params": {}
            }
        ]
    }

    # One index per vector field; the two sparse-typed fields share the same index type
    index_params = [{"fieldName": "dense", "indexName": "dense_index", "metricType": "COSINE"}]
    for field_name, metric in (("sparse", "IP"), ("bm25", "BM25")):
        index_params.append({
            "fieldName": field_name,
            "indexName": f"{field_name}_index",
            "metricType": metric,
            "params": {"index_type": "SPARSE_INVERTED_INDEX"}
        })

    create_rsp = self.collection_client.collection_create({
        "collectionName": name,
        "schema": schema,
        "indexParams": index_params
    })
    assert create_rsp['code'] == 0, f"Collection creation failed: {create_rsp}"

    def random_sparse_vector():
        """Random index -> weight mapping built from 100 draws (duplicate indices collapse)."""
        return {random.randint(1, 10000): random.random() for _ in range(100)}

    rows = [
        {"doc_id": i, "document": fake_en.text(), "sparse": random_sparse_vector()}
        for i in range(50)
    ]
    insert_rsp = self.vector_client.vector_insert({"collectionName": name, "data": rows})
    assert insert_rsp['code'] == 0, f"Insert failed: {insert_rsp}"
    return name
@pytest.mark.parametrize("enable_truncate", [False, True])
def test_single_vector_search_with_model_rerank(self, tei_endpoint, enable_truncate,
                                                tei_reranker_endpoint):
    """
    target: verify single-field vector search works together with a model reranker over REST
    method: search the dense, sparse and bm25 fields one after another, each time attaching
            the same TEI model rerank function via functionScore
    expected: every search returns code 0 and a non-empty result
    """
    import random

    collection = gen_collection_name(prefix)
    self._create_collection_with_all_vector_types(collection, tei_endpoint)

    nq = 2
    query_texts = [fake_en.text() for _ in range(nq)]
    if enable_truncate:
        # Replace the queries with 1024-word texts for the truncation variant
        query_texts = [" ".join([fake_en.word() for _ in range(1024)]) for _ in range(nq)]

    # Rerank function definition passed under "functionScore" in every search
    rerank_function = {
        "name": "tei_reranker",
        "description": "",
        "type": "Rerank",
        "inputFieldNames": ["document"],
        "params": {
            "reranker": "model",
            "provider": "tei",
            "queries": query_texts,
            "endpoint": tei_reranker_endpoint,
            "truncate": enable_truncate,
            "truncation_direction": "Right"
        }
    }
    reranker_params = {"functions": [rerank_function]}

    def build_search_payload(field):
        """Build the search body for one vector field; the field name doubles as annsField."""
        if field == "dense":
            query_data = [[random.random() for _ in range(768)] for _ in range(nq)]
        elif field == "sparse":
            query_data = [{random.randint(1, 10000): random.random() for _ in range(100)} for _ in range(nq)]
        else:
            # bm25 takes the raw query texts
            query_data = query_texts
        body = {
            "collectionName": collection,
            "data": query_data,
            "annsField": field,
            "limit": 10,
            "outputFields": ["doc_id", "document"]
        }
        if field == "bm25":
            body["searchParams"] = {"metric_type": "BM25"}
        body["functionScore"] = reranker_params
        return body

    for search_type in ("dense", "sparse", "bm25"):
        logger.info(f"Executing {search_type} search with model reranker")
        search_rsp = self.vector_client.vector_search(build_search_payload(search_type))
        assert search_rsp['code'] == 0, f"{search_type} search with model reranker failed: {search_rsp}"
        assert len(search_rsp['data']) > 0, f"{search_type} search returned no results"
        logger.info(f"{search_type} search with TEI reranker succeeded")
def test_hybrid_vector_search_with_model_rerank(self, tei_endpoint,
                                                tei_reranker_endpoint):
    """
    target: test hybrid vector search with model rerank using RESTful API
    method: issue dense+sparse, dense+bm25 and sparse+bm25 hybrid searches, each
            carrying the same TEI model reranker in its functionScore
    expected: every hybrid search returns code 0 and a non-empty result list
    """
    import random

    collection = gen_collection_name(prefix)
    self._create_collection_with_all_vector_types(collection, tei_endpoint)

    # The query texts feed both the BM25 sub-search and the reranker.
    num_queries = 2
    queries = [fake_en.text() for _ in range(num_queries)]

    # Reranker definition in functionScore format
    function_score = {
        "functions": [{
            "name": "tei_reranker",
            "description": "",
            "type": "Rerank",
            "inputFieldNames": ["document"],
            "params": {
                "reranker": "model",
                "provider": "tei",
                "queries": queries,
                "endpoint": tei_reranker_endpoint
            }
        }]
    }

    def dense_sub_search():
        return {
            "data": [[random.random() for _ in range(768)] for _ in range(num_queries)],
            "annsField": "dense",
            "limit": 5
        }

    def sparse_sub_search():
        return {
            "data": [{random.randint(1, 10000): random.random() for _ in range(100)}
                     for _ in range(num_queries)],
            "annsField": "sparse",
            "limit": 5
        }

    def bm25_sub_search():
        return {
            "data": queries,
            "annsField": "bm25",
            "limit": 5,
            "params": {"metric_type": "BM25"}
        }

    sub_search_factories = {
        "dense": dense_sub_search,
        "sparse": sparse_sub_search,
        "bm25": bm25_sub_search,
    }

    # Each combination name lists its two sub-searches in request order.
    for combination in ["dense+sparse", "dense+bm25", "sparse+bm25"]:
        logger.info(f"Executing {combination} hybrid search with model reranker")
        request = {
            "collectionName": collection,
            "search": [sub_search_factories[field]() for field in combination.split("+")],
            "functionScore": function_score,
            "limit": 10,
            "outputFields": ["doc_id", "document"]
        }
        response = self.vector_client.vector_advanced_search(request)
        assert response['code'] == 0, f"{combination} hybrid search with model reranker failed: {response}"
        assert len(response['data']) > 0, f"{combination} hybrid search returned no results"
        logger.info(f"{combination} hybrid search with TEI reranker succeeded")
@pytest.mark.L1
class TestDecayRerank(TestBase):
    """
    ******************************************************************
    Test cases for Decay rerank function via RESTful API
    ******************************************************************
    """

    def _create_collection_with_timestamp_field(self, name, tei_endpoint):
        """Create and populate a collection for the decay rerank tests.

        The schema has an Int64 primary key, an analyzer-enabled ``document``
        text field, a ``dense`` vector produced from ``document`` by a TEI
        TextEmbedding function, a ``sparse`` vector produced from ``document``
        by a BM25 function, and an Int64 ``timestamp`` field. Ten documents are
        inserted with timestamps three days apart; the last document carries
        ``current_time`` and the first one is 27 days older.

        Args:
            name: collection name to create.
            tei_endpoint: endpoint passed to the TEI TextEmbedding function.

        Returns:
            tuple ``(name, current_time)`` where ``current_time`` is the epoch
            second (``int(time.time())``) that the timestamps are based on.
        """
        import time

        payload = {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "description": "test collection for decay rerank",
                "fields": [
                    {"fieldName": "doc_id", "dataType": "Int64", "isPrimary": True},
                    {
                        "fieldName": "document",
                        "dataType": "VarChar",
                        "elementTypeParams": {
                            "max_length": "65535",
                            "enable_analyzer": True,
                            "analyzer_params": {"tokenizer": "standard"},
                            "enable_match": True
                        }
                    },
                    {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
                    {"fieldName": "sparse", "dataType": "SparseFloatVector"},
                    {"fieldName": "timestamp", "dataType": "Int64"}
                ],
                "functions": [
                    {
                        "name": "tei",
                        "type": "TextEmbedding",
                        "inputFieldNames": ["document"],
                        "outputFieldNames": ["dense"],
                        "params": {
                            "provider": "TEI",
                            "endpoint": tei_endpoint
                        }
                    },
                    {
                        "name": "bm25_fn",
                        "type": "BM25",
                        "inputFieldNames": ["document"],
                        "outputFieldNames": ["sparse"],
                        "params": {}
                    }
                ]
            },
            "indexParams": [
                {
                    "fieldName": "dense",
                    "indexName": "dense_index",
                    "metricType": "COSINE",
                    "indexType": "AUTOINDEX",
                    "params": {}
                },
                {
                    "fieldName": "sparse",
                    "indexName": "sparse_index",
                    "metricType": "BM25",
                    "indexType": "SPARSE_INVERTED_INDEX",
                    "params": {"bm25_k1": 1.2, "bm25_b": 0.75}
                }
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

        # Insert sample data with different timestamps
        current_time = int(time.time())
        news_documents = [
            "Artificial intelligence helps medical breakthroughs",
            "Analysis of artificial intelligence trends in 2023",
            "Artificial intelligence ethical disputes continue to ferment",
            "The latest progress in deep learning technology",
            "Machine learning algorithms improve healthcare diagnosis",
            "Neural networks advance computer vision capabilities",
            "Natural language processing enables better chatbots",
            "Robotics automation transforms manufacturing industry",
            "Quantum computing promises revolutionary breakthroughs",
            "Blockchain technology secures digital transactions"
        ]
        # Timestamps are 3 days apart: document 0 is the oldest (27 days before
        # current_time) and the last document is stamped with current_time.
        three_days = 24 * 60 * 60 * 3
        newest_index = len(news_documents) - 1
        data = [
            {
                "doc_id": i,
                "document": document,
                "timestamp": current_time - (newest_index - i) * three_days
            }
            for i, document in enumerate(news_documents)
        ]
        payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 0, f"Insert failed: {rsp}"
        return name, current_time

    @pytest.mark.parametrize("decay_function", ["gauss", "exp", "linear"])
    def test_single_vector_search_with_decay_rerank(self, tei_endpoint, decay_function):
        """
        target: test single vector search with decay rerank using different decay functions
        method: run a text search on the dense field and on the sparse field, each with a
                gauss/exp/linear decay reranker over the timestamp field
        expected: each search returns code 0 and a non-empty result list; the returned
                  scores are logged for inspection but their order is not asserted
        """
        name = gen_collection_name(prefix)
        collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
        # Prepare decay reranker parameters
        decay_params = {
            "functions": [{
                "name": f"{decay_function}_decay_ranker",
                "description": "",
                "type": "Rerank",
                "inputFieldNames": ["timestamp"],
                "params": {
                    "reranker": "decay",
                    "function": decay_function,
                    "origin": current_time,  # Current time as origin
                    "offset": 0,
                    "decay": 0.5,
                    "scale": 7 * 24 * 60 * 60  # 7 days in seconds
                }
            }]
        }
        # Both requests are identical apart from the anns field: the same query
        # text is sent to the dense field and to the sparse field.
        for search_type in ["dense", "sparse"]:
            logger.info(f"Executing {search_type} search with {decay_function} decay reranker")
            search_payload = {
                "collectionName": collection_name,
                "data": ["artificial intelligence technology progress"],
                "annsField": search_type,
                "limit": 10,
                "outputFields": ["doc_id", "document", "timestamp"],
                "functionScore": decay_params
            }
            rsp = self.vector_client.vector_search(search_payload)
            assert rsp['code'] == 0, f"{search_type} search with {decay_function} decay reranker failed: {rsp}"
            assert len(rsp['data']) > 0, f"{search_type} search returned no results"
            # Scores are only logged here for manual inspection; nothing below
            # asserts that more recent documents rank higher.
            if len(rsp['data']) > 1:
                scores = [result.get('distance', 0) for result in rsp['data']]
                logger.info(f"{decay_function} decay results scores: {scores}")
            logger.info(f"{search_type} search with {decay_function} decay reranker succeeded")

    @pytest.mark.parametrize("decay_function", ["gauss", "exp", "linear"])
    def test_hybrid_search_with_decay_rerank(self, tei_endpoint, decay_function):
        """
        target: test hybrid search with decay rerank using different decay functions
        method: run a dense+sparse hybrid search with a decay reranker over the timestamp field
        expected: the hybrid search returns code 0 and a non-empty result list
        """
        name = gen_collection_name(prefix)
        collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
        # Prepare decay reranker parameters
        decay_params = {
            "functions": [{
                "name": f"{decay_function}_decay_ranker",
                "description": "",
                "type": "Rerank",
                "inputFieldNames": ["timestamp"],
                "params": {
                    "reranker": "decay",
                    "function": decay_function,
                    "origin": current_time,
                    "offset": 2 * 24 * 60 * 60,  # 2 days offset
                    "decay": 0.3,
                    "scale": 5 * 24 * 60 * 60  # 5 days scale
                }
            }]
        }
        logger.info(f"Executing hybrid search with {decay_function} decay reranker")
        # Hybrid search with decay rerank: same query text on both vector fields
        hybrid_search_payload = {
            "collectionName": collection_name,
            "search": [
                {
                    "data": ["artificial intelligence machine learning"],
                    "annsField": "dense",
                    "limit": 5
                },
                {
                    "data": ["artificial intelligence machine learning"],
                    "annsField": "sparse",
                    "limit": 5
                }
            ],
            "functionScore": decay_params,
            "limit": 10,
            "outputFields": ["doc_id", "document", "timestamp"]
        }
        rsp = self.vector_client.vector_advanced_search(hybrid_search_payload)
        assert rsp['code'] == 0, f"Hybrid search with {decay_function} decay reranker failed: {rsp}"
        assert len(rsp['data']) > 0, "Hybrid search returned no results"
        # Log results for manual verification
        logger.info(f"Hybrid search with {decay_function} decay reranker results:")
        for i, result in enumerate(rsp['data'][:3]):  # Show top 3 results
            logger.info(f"  {i+1}. Doc: {result.get('document', '')[:50]}... Timestamp: {result.get('timestamp', 0)}")
        logger.info(f"Hybrid search with {decay_function} decay reranker succeeded")

    def test_decay_rerank_with_different_parameters(self, tei_endpoint):
        """
        target: test decay rerank with different parameter combinations
        method: run the same dense search with three gauss decay configurations that
                vary origin, offset, decay and scale
        expected: every search returns code 0 and a non-empty result list
        """
        name = gen_collection_name(prefix)
        collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
        # Test different parameter combinations
        test_configs = [
            {
                "name": "no_offset_high_decay",
                "params": {
                    "origin": current_time,
                    "offset": 0,
                    "decay": 0.8,
                    "scale": 3 * 24 * 60 * 60
                }
            },
            {
                "name": "with_offset_low_decay",
                "params": {
                    "origin": current_time,
                    "offset": 5 * 24 * 60 * 60,  # 5 days offset
                    "decay": 0.2,
                    "scale": 10 * 24 * 60 * 60
                }
            },
            {
                "name": "past_origin",
                "params": {
                    "origin": current_time - 15 * 24 * 60 * 60,  # 15 days ago
                    "offset": 0,
                    "decay": 0.5,
                    "scale": 7 * 24 * 60 * 60
                }
            }
        ]
        for config in test_configs:
            logger.info(f"Testing decay rerank with config: {config['name']}")
            decay_params = {
                "functions": [{
                    "name": f"decay_ranker_{config['name']}",
                    "description": "",
                    "type": "Rerank",
                    "inputFieldNames": ["timestamp"],
                    "params": {
                        "reranker": "decay",
                        "function": "gauss",
                        **config["params"]
                    }
                }]
            }
            search_payload = {
                "collectionName": collection_name,
                "data": ["technology progress artificial intelligence"],
                "annsField": "dense",
                "limit": 10,
                "outputFields": ["doc_id", "document", "timestamp"],
                "functionScore": decay_params
            }
            rsp = self.vector_client.vector_search(search_payload)
            assert rsp['code'] == 0, f"Search with {config['name']} failed: {rsp}"
            assert len(rsp['data']) > 0, f"Search with {config['name']} returned no results"
            logger.info(f"Decay rerank with {config['name']} succeeded, returned {len(rsp['data'])} results")
@pytest.mark.L2
class TestDecayRerankNegative(TestBase):
    """
    ******************************************************************
    Negative test cases for Decay rerank function via RESTful API
    ******************************************************************
    """

    def _create_collection_with_timestamp_field(self, name, tei_endpoint):
        """Create and populate a collection for the negative decay rerank tests.

        The schema has an Int64 primary key, an analyzer-enabled ``document``
        text field, a ``dense`` vector produced from ``document`` by a TEI
        TextEmbedding function, a ``sparse`` vector produced from ``document``
        by a BM25 function, and an Int64 ``timestamp`` field. Five documents are
        inserted with timestamps three days apart; the last document carries
        ``current_time`` and the first one is 12 days older.

        Args:
            name: collection name to create.
            tei_endpoint: endpoint passed to the TEI TextEmbedding function.

        Returns:
            tuple ``(name, current_time)`` where ``current_time`` is the epoch
            second (``int(time.time())``) that the timestamps are based on.
        """
        import time

        payload = {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "description": "test collection for decay rerank",
                "fields": [
                    {"fieldName": "doc_id", "dataType": "Int64", "isPrimary": True},
                    {
                        "fieldName": "document",
                        "dataType": "VarChar",
                        "elementTypeParams": {
                            "max_length": "65535",
                            "enable_analyzer": True,
                            "analyzer_params": {"tokenizer": "standard"},
                            "enable_match": True
                        }
                    },
                    {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
                    {"fieldName": "sparse", "dataType": "SparseFloatVector"},
                    {"fieldName": "timestamp", "dataType": "Int64"}
                ],
                "functions": [
                    {
                        "name": "tei",
                        "type": "TextEmbedding",
                        "inputFieldNames": ["document"],
                        "outputFieldNames": ["dense"],
                        "params": {
                            "provider": "TEI",
                            "endpoint": tei_endpoint
                        }
                    },
                    {
                        "name": "bm25_fn",
                        "type": "BM25",
                        "inputFieldNames": ["document"],
                        "outputFieldNames": ["sparse"],
                        "params": {}
                    }
                ]
            },
            "indexParams": [
                {
                    "fieldName": "dense",
                    "indexName": "dense_index",
                    "metricType": "COSINE",
                    "indexType": "AUTOINDEX",
                    "params": {}
                },
                {
                    "fieldName": "sparse",
                    "indexName": "sparse_index",
                    "metricType": "BM25",
                    "indexType": "SPARSE_INVERTED_INDEX",
                    "params": {"bm25_k1": 1.2, "bm25_b": 0.75}
                }
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

        # Insert sample data with different timestamps
        current_time = int(time.time())
        news_documents = [
            "Artificial intelligence helps medical breakthroughs",
            "Analysis of artificial intelligence trends in 2023",
            "Artificial intelligence ethical disputes continue to ferment",
            "The latest progress in deep learning technology",
            "Machine learning algorithms improve healthcare diagnosis"
        ]
        # Timestamps are 3 days apart: document 0 is the oldest (12 days before
        # current_time) and the last document is stamped with current_time.
        three_days = 24 * 60 * 60 * 3
        newest_index = len(news_documents) - 1
        data = [
            {
                "doc_id": i,
                "document": document,
                "timestamp": current_time - (newest_index - i) * three_days
            }
            for i, document in enumerate(news_documents)
        ]
        payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 0, f"Insert failed: {rsp}"
        return name, current_time

    def test_decay_rerank_with_invalid_function_type(self, tei_endpoint):
        """
        target: test decay rerank with invalid function type
        method: search with a decay reranker whose "function" is not gauss/exp/linear
        expected: the search response carries a non-zero code
        """
        name = gen_collection_name(prefix)
        collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
        # Test with invalid function type
        decay_params = {
            "functions": [{
                "name": "invalid_decay_ranker",
                "description": "",
                "type": "Rerank",
                "inputFieldNames": ["timestamp"],
                "params": {
                    "reranker": "decay",
                    "function": "invalid_function",  # Invalid function type
                    "origin": current_time,
                    "offset": 0,
                    "decay": 0.5,
                    "scale": 7 * 24 * 60 * 60
                }
            }]
        }
        search_payload = {
            "collectionName": collection_name,
            "data": ["artificial intelligence"],
            "annsField": "dense",
            "limit": 10,
            "outputFields": ["doc_id", "document", "timestamp"],
            "functionScore": decay_params
        }
        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] != 0, f"Expected search to fail with invalid function type, but got: {rsp}"

    def test_decay_rerank_with_invalid_field_type(self, tei_endpoint):
        """
        target: test decay rerank with non-numeric field
        method: create an indexed collection whose extra scalar field is a VarChar,
                then search with a decay reranker that takes that VarChar field as input
        expected: the search response carries a non-zero code
        """
        name = gen_collection_name(prefix)
        # Create collection with string field instead of numeric timestamp.
        # An index on the vector field is declared (same settings as the helper
        # above) so the collection is searchable; without it the search could be
        # rejected for reasons unrelated to the rerank field and the negative
        # assertion below would pass vacuously.
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "description": "test collection for decay rerank negative test",
                "fields": [
                    {"fieldName": "doc_id", "dataType": "Int64", "isPrimary": True},
                    {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
                    {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
                    {"fieldName": "category", "dataType": "VarChar", "elementTypeParams": {"max_length": "100"}}  # String field
                ],
                "functions": [
                    {
                        "name": "tei",
                        "type": "TextEmbedding",
                        "inputFieldNames": ["document"],
                        "outputFieldNames": ["dense"],
                        "params": {
                            "provider": "TEI",
                            "endpoint": tei_endpoint
                        }
                    }
                ]
            },
            "indexParams": [
                {
                    "fieldName": "dense",
                    "indexName": "dense_index",
                    "metricType": "COSINE",
                    "indexType": "AUTOINDEX",
                    "params": {}
                }
            ]
        }
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0, f"Collection creation failed: {rsp}"
        # Insert data
        data = [{"doc_id": i, "document": fake_en.text(), "category": "tech"} for i in range(5)]
        payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 0, f"Insert failed: {rsp}"
        # Test decay rerank with string field
        decay_params = {
            "functions": [{
                "name": "invalid_field_decay_ranker",
                "description": "",
                "type": "Rerank",
                "inputFieldNames": ["category"],  # String field, should fail
                "params": {
                    "reranker": "decay",
                    "function": "gauss",
                    "origin": 100,
                    "offset": 0,
                    "decay": 0.5,
                    "scale": 10
                }
            }]
        }
        search_payload = {
            "collectionName": name,
            "data": ["technology"],
            "annsField": "dense",
            "limit": 10,
            "outputFields": ["doc_id", "document", "category"],
            "functionScore": decay_params
        }
        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] != 0, f"Expected search to fail with non-numeric field, but got: {rsp}"

    def test_decay_rerank_with_invalid_parameters(self, tei_endpoint):
        """
        target: test decay rerank with invalid parameter values
        method: search with gauss decay rerankers using a negative scale, a decay of 1.5
                and a negative offset, one configuration per search
        expected: every search response carries a non-zero code
        """
        name = gen_collection_name(prefix)
        collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
        # Each configuration has exactly one value the test expects to be rejected
        invalid_configs = [
            {
                "name": "negative_scale",
                "params": {
                    "origin": current_time,
                    "scale": -100,  # negative scale
                    "decay": 0.5
                }
            },
            {
                "name": "invalid_decay_range",
                "params": {
                    "origin": current_time,
                    "scale": 100,
                    "decay": 1.5  # decay above 1
                }
            },
            {
                "name": "negative_offset",
                "params": {
                    "origin": current_time,
                    "scale": 100,
                    "decay": 0.5,
                    "offset": -10  # negative offset
                }
            }
        ]
        for config in invalid_configs:
            logger.info(f"Testing invalid config: {config['name']}")
            decay_params = {
                "functions": [{
                    "name": f"invalid_decay_ranker_{config['name']}",
                    "description": "",
                    "type": "Rerank",
                    "inputFieldNames": ["timestamp"],
                    "params": {
                        "reranker": "decay",
                        "function": "gauss",
                        **config["params"]
                    }
                }]
            }
            search_payload = {
                "collectionName": collection_name,
                "data": ["artificial intelligence"],
                "annsField": "dense",
                "limit": 10,
                "outputFields": ["doc_id", "document", "timestamp"],
                "functionScore": decay_params
            }
            rsp = self.vector_client.vector_search(search_payload)
            assert rsp['code'] != 0, f"Expected search to fail with {config['name']}, but got: {rsp}"
            logger.info(f"Invalid config {config['name']} correctly failed")
@pytest.mark.L1
class TestRRFWeightedRerank(TestBase):
"""
******************************************************************
Test cases for RRF and Weighted rerank function via RESTful API
******************************************************************
"""
def _create_collection_with_bm25_function(self, name):
    """Create a collection with dense, sparse and bm25 vector fields and fill it.

    The ``bm25`` field is the output of a BM25 function over ``document``;
    ``dense`` and ``sparse`` are supplied by the caller on insert. All three
    vector fields get an index. 100 rows of random data are inserted.

    Returns the collection name that was passed in.
    """
    import random

    fields = [
        {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
        {"fieldName": "doc_id", "dataType": "VarChar", "elementTypeParams": {"max_length": "100"}},
        {
            "fieldName": "document",
            "dataType": "VarChar",
            "elementTypeParams": {
                "max_length": "10000",
                "enable_analyzer": True,
                "analyzer_params": {"tokenizer": "standard"},
                "enable_match": True
            }
        },
        {"fieldName": "sparse", "dataType": "SparseFloatVector"},
        {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
        {"fieldName": "bm25", "dataType": "SparseFloatVector"}
    ]
    bm25_function = {
        "name": "bm25_fn",
        "type": "BM25",
        "inputFieldNames": ["document"],
        "outputFieldNames": ["bm25"],
        "params": {}
    }
    index_params = [
        {
            "fieldName": "dense",
            "indexName": "dense_index",
            "metricType": "COSINE",
            "indexType": "FLAT",
            "params": {}
        },
        {
            "fieldName": "sparse",
            "indexName": "sparse_index",
            "metricType": "IP",
            "indexType": "SPARSE_INVERTED_INDEX",
            "params": {}
        },
        {
            "fieldName": "bm25",
            "indexName": "bm25_index",
            "metricType": "BM25",
            "indexType": "SPARSE_INVERTED_INDEX",
            "params": {"bm25_k1": 1.2, "bm25_b": 0.75}
        }
    ]
    create_request = {
        "collectionName": name,
        "schema": {
            "autoId": True,
            "enableDynamicField": False,
            "description": "test collection for rrf/weighted rerank",
            "fields": fields,
            "functions": [bm25_function]
        },
        "indexParams": index_params
    }
    create_response = self.collection_client.collection_create(create_request)
    assert create_response['code'] == 0, f"Collection creation failed: {create_response}"

    # Sample rows: the primary key is auto-generated and bm25 is derived from
    # the document text, so only doc_id/document/sparse/dense are supplied.
    row_count = 100
    rows = [
        {
            "doc_id": str(row_index),
            "document": fake_en.text(),
            "sparse": {random.randint(1, 10000): random.random() for _ in range(100)},
            "dense": [random.random() for _ in range(768)]
        }
        for row_index in range(row_count)
    ]
    insert_response = self.vector_client.vector_insert({"collectionName": name, "data": rows})
    assert insert_response['code'] == 0, f"Insert failed: {insert_response}"
    return name
@pytest.mark.parametrize("ranker_model", ["rrf", "weighted"])
def test_hybrid_vector_search_with_rrf_weighted_rerank(self, ranker_model):
    """
    target: test hybrid vector search with RRF/Weighted rerank using RESTful API
    method: issue dense+sparse, dense+bm25 and sparse+bm25 hybrid searches, each
            carrying the parametrized RRF or Weighted reranker in its functionScore
    expected: every hybrid search returns code 0 and a non-empty result list
    """
    import random

    collection = gen_collection_name(prefix)
    self._create_collection_with_bm25_function(collection)

    # Text queries for the BM25 sub-search
    num_queries = 2
    queries = [fake_en.text() for _ in range(num_queries)]

    # Only the ranker name and its params differ between the two rerankers;
    # the surrounding functionScore envelope is the same.
    if ranker_model == "rrf":
        ranker_name = "rrf_ranker"
        ranker_settings = {
            "reranker": "rrf",
            "k": 100
        }
    else:  # weighted
        ranker_name = "weighted_ranker"
        ranker_settings = {
            "reranker": "weighted",
            "weights": [0.1, 0.9],
            "norm_score": True
        }
    function_score = {
        "functions": [{
            "name": ranker_name,
            "description": "",
            "type": "Rerank",
            "inputFieldNames": [],
            "params": ranker_settings
        }]
    }

    def dense_sub_search():
        return {
            "data": [[random.random() for _ in range(768)] for _ in range(num_queries)],
            "annsField": "dense",
            "limit": 5
        }

    def sparse_sub_search():
        return {
            "data": [{random.randint(1, 10000): random.random() for _ in range(100)}
                     for _ in range(num_queries)],
            "annsField": "sparse",
            "limit": 5
        }

    def bm25_sub_search():
        return {
            "data": queries,
            "annsField": "bm25",
            "limit": 5
        }

    sub_search_factories = {
        "dense": dense_sub_search,
        "sparse": sparse_sub_search,
        "bm25": bm25_sub_search,
    }
    # Only the sparse+bm25 request carries an additional top-level searchParams entry.
    extra_top_level_keys = {
        "sparse+bm25": {"searchParams": {"metric_type": "BM25"}},
    }

    # Each combination name lists its two sub-searches in request order.
    for combination in ["dense+sparse", "dense+bm25", "sparse+bm25"]:
        logger.info(f"Executing {combination} hybrid search with {ranker_model} reranker")
        request = {
            "collectionName": collection,
            "search": [sub_search_factories[field]() for field in combination.split("+")],
            "functionScore": function_score,
            "limit": 10,
            "outputFields": ["doc_id", "document"],
            **extra_top_level_keys.get(combination, {})
        }
        response = self.vector_client.vector_advanced_search(request)
        assert response['code'] == 0, f"{combination} hybrid search with {ranker_model} reranker failed: {response}"
        assert len(response['data']) > 0, f"{combination} hybrid search returned no results"
        logger.info(f"{combination} hybrid search with {ranker_model} reranker succeeded")