import pytest
import numpy as np
from faker import Faker
from base.testbase import TestBase
from utils.utils import gen_collection_name
from utils.util_log import test_log as logger

fake_en = Faker("en_US")

prefix = "text_embedding_search"


@pytest.mark.L0
class TestTextEmbeddingSearch(TestBase):
    """
    ******************************************************************
    The following cases are used to test text embedding function search via RESTful API
    ******************************************************************
    """
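    # NOTE: the `tei_endpoint` and `tei_reranker_endpoint` arguments used throughout this
    # module are pytest fixtures; they are assumed to be defined in the suite's conftest.py
    # and to point at running TEI embedding / reranking services.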
    def _create_basic_collection_payload(self, name, tei_endpoint, dim=768, with_bm25=False):
        """Helper method to create basic collection payload with TEI function"""
        fields = [
            {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
            {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
            {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
        ]

        functions = [{
            "name": "tei",
            "type": "TextEmbedding",
            "inputFieldNames": ["document"],
            "outputFieldNames": ["dense"],
            "params": {
                "provider": "TEI",
                "endpoint": tei_endpoint
            }
        }]

        if with_bm25:
            fields[1]["elementTypeParams"].update({
                "enable_analyzer": True,
                "analyzer_params": {"tokenizer": "standard"},
                "enable_match": True
            })
            fields.append({"fieldName": "sparse", "dataType": "SparseFloatVector"})
            functions.append({
                "name": "bm25_fn",
                "type": "BM25",
                "inputFieldNames": ["document"],
                "outputFieldNames": ["sparse"],
                "params": {}
            })

        return {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "description": "test collection",
                "fields": fields,
                "functions": functions
            }
        }
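    # A minimal sketch of the raw RESTful call that `collection_client.collection_create` is
    # assumed to wrap (the endpoint path is an assumption here, shown only for orientation):
    #   POST {base_url}/v2/vectordb/collections/create
    #   Content-Type: application/json
    #   body: the dict returned by _create_basic_collection_payload(...)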
    def _create_and_verify_collection(self, name, tei_endpoint, dim=768, with_bm25=False):
        """Helper method to create collection and verify creation"""
        payload = self._create_basic_collection_payload(name, tei_endpoint, dim, with_bm25)
        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

        # Verify collection was created
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0, f"Collection describe failed: {rsp}"
        assert rsp['data']['collectionName'] == name, f"Collection name mismatch: expected {name}, got {rsp['data']['collectionName']}"
        return payload
    def _insert_and_verify_data(self, name, data):
        """Helper method to insert data and verify insertion"""
        payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 0, f"Insert failed: {rsp}"
        assert rsp['data']['insertCount'] == len(data), f"Expected {len(data)} inserts, got {rsp['data']['insertCount']}"
        return rsp
    def _create_index_and_load(self, name, index_fields=None):
        """Helper method to create index and load collection"""
        if index_fields is None:
            index_fields = [{"fieldName": "dense", "indexName": "dense_index", "metricType": "COSINE"}]

        index_payload = {
            "collectionName": name,
            "indexParams": [
                {**field, "indexType": "AUTOINDEX", "params": {}}
                for field in index_fields
            ]
        }
        rsp = self.index_client.index_create(index_payload)
        assert rsp['code'] == 0, f"Index creation failed: {rsp}"

        # Load collection
        rsp = self.collection_client.collection_load(collection_name=name)
        assert rsp['code'] == 0, f"Collection load failed: {rsp}"
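    # Similarly, `index_client.index_create` and `collection_client.collection_load` are assumed
    # to wrap POST {base_url}/v2/vectordb/indexes/create and POST {base_url}/v2/vectordb/collections/load.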
    def test_simple_tei_text_embedding_workflow(self, tei_endpoint):
        """
        target: test simple TEI text embedding workflow
        method: create collection, insert data, create index, load, and search
        expected: all operations succeed
        """
        name = gen_collection_name(prefix)

        # Create collection with TEI text embedding function
        self._create_and_verify_collection(name, tei_endpoint)

        # Insert simple text data
        data = [
            {"id": 1, "document": "This is a test document"},
            {"id": 2, "document": "Another test document"}
        ]
        self._insert_and_verify_data(name, data)

        # Create index and load collection
        self._create_index_and_load(name)

        # Search
        search_payload = {
            "collectionName": name,
            "data": ["test document"],
            "limit": 2,
            "outputFields": ["id", "document"]
        }
        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] == 0, f"Search failed: {rsp}"
        assert len(rsp['data']) > 0, f"Search returned no results: {rsp['data']}"
    def test_create_collection_with_tei_text_embedding_function(self, tei_endpoint):
        """
        target: test create collection with TEI text embedding function via REST API (equivalent to ORM example)
        method: create collection with TEI text embedding function using RESTful API
        expected: create collection successfully
        """
        name = gen_collection_name(prefix)

        # Create collection with additional truncation parameters
        payload = self._create_basic_collection_payload(name, tei_endpoint)
        payload["schema"]["functions"][0]["params"].update({
            "truncate": True,
            "truncation_direction": "Right"
        })

        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

        # Verify collection was created with function
        rsp = self.collection_client.collection_describe(name)
        assert rsp['code'] == 0, f"Collection describe failed: {rsp}"
        assert rsp['data']['collectionName'] == name, f"Collection name mismatch: expected {name}, got {rsp['data']['collectionName']}"
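    # The parametrized truncation tests below verify `truncation_direction` indirectly:
    # the embedding of `left_text + " " + right_text` is compared, via cosine similarity
    # (dot(a, b) / (norm(a) * norm(b))), against the embeddings of the two halves.
    # Truncating from the Left should leave the combined text closer to `right_text`,
    # while truncating from the Right should leave it closer to `left_text`.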
@pytest.mark.parametrize("truncate", [True, False])
|
|
@pytest.mark.parametrize("truncation_direction", ["Left", "Right"])
|
|
def test_insert_with_tei_text_embedding_truncation(self, tei_endpoint, truncate, truncation_direction):
|
|
"""
|
|
target: test insert data with TEI text embedding function with truncation parameters
|
|
method: insert long text data with different truncation settings
|
|
expected: insert successfully and truncation works as expected
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 768
|
|
|
|
# Create collection with TEI text embedding function including truncation params
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint,
|
|
"truncate": truncate,
|
|
"truncation_direction": truncation_direction
|
|
}
|
|
}
|
|
]
|
|
}
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Prepare test data with long text similar to ORM test
|
|
left_text = " ".join([fake_en.word() for _ in range(512)])
|
|
right_text = " ".join([fake_en.word() for _ in range(512)])
|
|
data = [
|
|
{
|
|
"id": 0,
|
|
"document": left_text + " " + right_text
|
|
},
|
|
{
|
|
"id": 1,
|
|
"document": left_text
|
|
},
|
|
{
|
|
"id": 2,
|
|
"document": right_text
|
|
}
|
|
]
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data
|
|
}
|
|
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
|
|
if not truncate:
|
|
logger.info(f"Truncate is False, insertion result: {rsp}")
|
|
return
|
|
|
|
assert rsp['code'] == 0, f"Insert failed: {rsp}"
|
|
assert rsp['data']['insertCount'] == len(data), f"Expected {len(data)} inserts, got {rsp['data']['insertCount']}"
|
|
|
|
# Create index and load for similarity comparison
|
|
index_payload = {
|
|
"collectionName": name,
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE",
|
|
"indexType": "AUTOINDEX",
|
|
"params": {}
|
|
}
|
|
]
|
|
}
|
|
rsp = self.index_client.index_create(index_payload)
|
|
assert rsp['code'] == 0, f"Index creation failed: {rsp}"
|
|
|
|
# Load collection
|
|
rsp = self.collection_client.collection_load(collection_name=name)
|
|
assert rsp['code'] == 0
|
|
|
|
# Query to get embeddings for similarity comparison
|
|
query_payload = {
|
|
"collectionName": name,
|
|
"filter": "id >= 0",
|
|
"outputFields": ["id", "dense"],
|
|
"limit": 10
|
|
}
|
|
|
|
rsp = self.vector_client.vector_query(query_payload)
|
|
assert rsp['code'] == 0, f"Query failed: {rsp}"
|
|
assert len(rsp['data']) == 3, f"Expected 3 results, got {len(rsp['data'])}"
|
|
|
|
# Compare similarity between embeddings to verify truncation direction
|
|
embeddings = {}
|
|
for result in rsp['data']:
|
|
embeddings[result['id']] = result['dense']
|
|
|
|
# Calculate cosine similarity
|
|
similarity_left = np.dot(embeddings[0], embeddings[1]) / (
|
|
np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
|
|
)
|
|
similarity_right = np.dot(embeddings[0], embeddings[2]) / (
|
|
np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[2])
|
|
)
|
|
|
|
logger.info(f"Similarity with left: {similarity_left}, with right: {similarity_right}")
|
|
|
|
if truncation_direction == "Left":
|
|
# When truncating from left, the combined text should be more similar to right text
|
|
assert similarity_left < similarity_right, (
|
|
f"Left truncation failed: left_sim={similarity_left:.4f}, right_sim={similarity_right:.4f}"
|
|
)
|
|
else: # Right truncation
|
|
# When truncating from right, the combined text should be more similar to left text
|
|
assert similarity_left > similarity_right, (
|
|
f"Right truncation failed: left_sim={similarity_left:.4f}, right_sim={similarity_right:.4f}"
|
|
)
|
|
|
|
|
|
def test_insert_with_tei_text_embedding_function(self, tei_endpoint):
|
|
"""
|
|
target: test insert data with TEI text embedding function via REST API
|
|
method: insert text data, embeddings should be automatically generated by TEI
|
|
expected: insert successfully and embeddings are generated
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 768
|
|
|
|
# Create collection with TEI text embedding function
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
}
|
|
]
|
|
}
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Insert text data without embedding vectors (they should be auto-generated by TEI)
|
|
nb = 10
|
|
data = []
|
|
for i in range(nb):
|
|
data.append({
|
|
"id": i,
|
|
"document": fake_en.text()
|
|
})
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data
|
|
}
|
|
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0, f"Insert failed: {rsp}"
|
|
assert rsp['data']['insertCount'] == nb, f"Expected {nb} inserts, got {rsp['data']['insertCount']}"
|
|
|
|
def test_search_with_tei_text_embedding_function(self, tei_endpoint):
|
|
"""
|
|
target: test search with TEI text embedding function via REST API
|
|
method: 1. create collection with TEI text embedding function
|
|
2. insert text data
|
|
3. search with text query (should auto-generate embedding via TEI)
|
|
expected: search successfully with relevant results
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 768
|
|
|
|
# Create collection with TEI text embedding function
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE"
|
|
}
|
|
]
|
|
}
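        # When indexParams are supplied together with the schema, the single create call is assumed
        # to also build the index and load the collection, which is why this test goes straight
        # from insert to search without separate index/load steps.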
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Insert text data
|
|
nb = 100
|
|
documents = [
|
|
"Machine learning is a subset of artificial intelligence",
|
|
"Deep learning uses neural networks with multiple layers",
|
|
"Natural language processing helps computers understand text",
|
|
"Computer vision enables machines to interpret visual information",
|
|
"Reinforcement learning trains agents through rewards and penalties"
|
|
]
|
|
|
|
data = []
|
|
for i in range(nb):
|
|
data.append({
|
|
"id": i,
|
|
"document": documents[i % len(documents)] + f" Document {i}"
|
|
})
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data
|
|
}
|
|
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Search with text query (TEI will auto-generate embedding)
|
|
search_payload = {
|
|
"collectionName": name,
|
|
"data": ["artificial intelligence and machine learning"],
|
|
"limit": 10,
|
|
"outputFields": ["id", "document"]
|
|
}
|
|
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] == 0, f"Search failed: {rsp}"
|
|
        assert len(rsp['data']) > 0, "Search returned no results"
|
|
|
|
# Verify search results contain relevant documents
|
|
found_relevant = any(
|
|
"machine learning" in result.get('document', '').lower() or
|
|
"artificial intelligence" in result.get('document', '').lower()
|
|
for result in rsp['data']
|
|
)
|
|
assert found_relevant, f"Search should return relevant documents, got: {[r.get('document', '') for r in rsp['data']]}"
|
|
|
|
|
|
def test_tei_and_bm25_collection_creation(self, tei_endpoint):
|
|
"""
|
|
target: test create collection with both TEI and BM25 functions using correct format
|
|
method: create collection with TEI text embedding and BM25 functions based on working example
|
|
expected: collection creation succeeds
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 768
|
|
|
|
# Create collection with both TEI and BM25 functions using correct format
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{
|
|
"fieldName": "document",
|
|
"dataType": "VarChar",
|
|
"elementTypeParams": {
|
|
"max_length": "1000",
|
|
"enable_analyzer": True,
|
|
"analyzer_params": {"tokenizer": "standard"},
|
|
"enable_match": True
|
|
}
|
|
},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}},
|
|
{"fieldName": "sparse", "dataType": "SparseFloatVector"}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
},
|
|
{
|
|
"name": "bm25_fn",
|
|
"type": "BM25",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["sparse"],
|
|
"params": {}
|
|
}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE"
|
|
},
|
|
{
|
|
"fieldName": "sparse",
|
|
"indexName": "sparse_index",
|
|
"metricType": "BM25",
|
|
"params": {"index_type": "SPARSE_INVERTED_INDEX"}
|
|
}
|
|
]
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Insert test data
|
|
data = []
|
|
for i in range(10):
|
|
data.append({
|
|
"id": i,
|
|
"document": fake_en.text().lower()
|
|
})
|
|
|
|
payload = {"collectionName": name, "data": data}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0, f"Insert failed: {rsp}"
|
|
assert rsp['data']['insertCount'] == 10, f"Expected 10 inserts, got {rsp['data']['insertCount']}"
|
|
|
|
# Test search with BM25 (sparse vector)
|
|
search_payload = {
|
|
"collectionName": name,
|
|
"data": [fake_en.text().lower()],
|
|
"annsField": "sparse",
|
|
"limit": 5,
|
|
"outputFields": ["id", "document"]
|
|
}
|
|
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] == 0, f"BM25 search failed: {rsp}"
|
|
        assert len(rsp['data']) > 0, "BM25 search returned no results"
|
|
|
|
# test search with dense vector
|
|
search_payload = {
|
|
"collectionName": name,
|
|
"data": [fake_en.text().lower()],
|
|
"annsField": "dense",
|
|
"limit": 5,
|
|
"outputFields": ["id", "document"]
|
|
}
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] == 0, f"Dense search failed: {rsp}"
|
|
        assert len(rsp['data']) > 0, "Dense search returned no results"
|
|
|
|
|
|
def test_hybrid_search_with_text_embedding_and_bm25(self, tei_endpoint):
|
|
"""
|
|
target: test hybrid search combining text embedding and BM25 via REST API
|
|
method: 1. create collection with both text embedding and BM25 functions
|
|
2. insert text data
|
|
3. perform hybrid search
|
|
expected: hybrid search returns combined results
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 768
|
|
|
|
# Create collection with both text embedding and BM25 functions
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{
|
|
"fieldName": "document",
|
|
"dataType": "VarChar",
|
|
"elementTypeParams": {
|
|
"max_length": "65535",
|
|
"enable_analyzer": True,
|
|
"analyzer_params": {"tokenizer": "standard"},
|
|
"enable_match": True
|
|
}
|
|
},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}},
|
|
{"fieldName": "sparse", "dataType": "SparseFloatVector"}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
},
|
|
{
|
|
"name": "bm25_fn",
|
|
"type": "BM25",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["sparse"],
|
|
"params": {}
|
|
}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE"
|
|
},
|
|
{
|
|
"fieldName": "sparse",
|
|
"indexName": "sparse_index",
|
|
"metricType": "BM25",
|
|
"params": {"index_type": "SPARSE_INVERTED_INDEX"}
|
|
}
|
|
]
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Insert diverse text data
|
|
documents = [
|
|
"Python is a popular programming language for data science",
|
|
"JavaScript is widely used for web development",
|
|
"Machine learning algorithms can predict future trends",
|
|
"Database systems store and manage large amounts of data",
|
|
"Cloud computing provides scalable infrastructure solutions",
|
|
"Artificial intelligence transforms various industries",
|
|
"Software engineering practices improve code quality",
|
|
"Data visualization helps understand complex datasets",
|
|
"Cybersecurity protects digital assets from threats",
|
|
"Mobile applications provide convenient user experiences"
|
|
]
|
|
|
|
data = []
|
|
for i in range(50):
|
|
data.append({
|
|
"id": i,
|
|
"document": documents[i % len(documents)] + f" Extended content {i}"
|
|
})
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data
|
|
}
|
|
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Perform hybrid search using advanced search
|
|
hybrid_search_payload = {
|
|
"collectionName": name,
|
|
"search": [
|
|
{
|
|
"data": ["programming language data science"],
|
|
"annsField": "dense",
|
|
"limit": 20
|
|
},
|
|
{
|
|
"data": ["programming language data science"],
|
|
"annsField": "sparse",
|
|
"limit": 20
|
|
}
|
|
],
|
|
"rerank": {
|
|
"strategy": "weighted",
|
|
"params": {"weights": [0.7, 0.3]}
|
|
},
|
|
"limit": 10,
|
|
"outputFields": ["id", "document"]
|
|
}
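        # The "weighted" rerank weights map positionally onto the requests in "search":
        # 0.7 for the dense (TEI embedding) request and 0.3 for the sparse (BM25) request.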
|
|
|
|
rsp = self.vector_client.vector_advanced_search(hybrid_search_payload)
|
|
assert rsp['code'] == 0, f"Hybrid search failed: {rsp}"
|
|
        assert len(rsp['data']) > 0, "Hybrid search returned no results"
|
|
|
|
# Verify hybrid search results are relevant
|
|
found_relevant = any(
|
|
any(term in result.get('document', '').lower() for term in ['python', 'programming', 'data'])
|
|
for result in rsp['data']
|
|
)
|
|
assert found_relevant, f"Hybrid search should return relevant documents, got: {[r.get('document', '') for r in rsp['data']]}"
|
|
|
|
|
|
@pytest.mark.L1
|
|
class TestTextEmbeddingSearchAdvanced(TestBase):
|
|
"""
|
|
******************************************************************
|
|
Advanced test cases for text embedding function search via RESTful API
|
|
******************************************************************
|
|
"""
|
|
|
|
def test_search_with_filter_and_text_embedding(self, tei_endpoint):
|
|
"""
|
|
target: test search with both text embedding and scalar filters
|
|
method: 1. create collection with text embedding function and metadata fields
|
|
2. insert text data with metadata
|
|
3. search with text query and scalar filters
|
|
expected: search returns filtered and relevant results
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 768
|
|
|
|
# Create collection with text embedding function and metadata fields
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
|
|
{"fieldName": "category", "dataType": "VarChar", "elementTypeParams": {"max_length": "100"}},
|
|
{"fieldName": "year", "dataType": "Int64"},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE"
|
|
}
|
|
]
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Insert text data with metadata
|
|
categories = ["technology", "science", "business", "education"]
|
|
years = [2020, 2021, 2022, 2023, 2024]
|
|
|
|
data = []
|
|
for i in range(100):
|
|
data.append({
|
|
"id": i,
|
|
"document": fake_en.text(),
|
|
"category": categories[i % len(categories)],
|
|
"year": years[i % len(years)]
|
|
})
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data
|
|
}
|
|
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Search with text query and filters
|
|
search_payload = {
|
|
"collectionName": name,
|
|
"data": ["technology innovation"],
|
|
"filter": "category == 'technology' and year >= 2022",
|
|
"limit": 10,
|
|
"outputFields": ["id", "document", "category", "year"]
|
|
}
|
|
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Verify all results match the filter criteria
|
|
for result in rsp['data']:
|
|
assert result['category'] == 'technology', f"Category mismatch: expected 'technology', got '{result['category']}'"
|
|
assert result['year'] >= 2022, f"Year filter failed: expected >= 2022, got {result['year']}"
|
|
|
|
|
|
def test_upsert_with_text_embedding_function(self, tei_endpoint):
|
|
"""
|
|
target: test upsert operation with text embedding function
|
|
method: 1. insert initial text data
|
|
2. upsert with modified text content
|
|
3. verify embeddings are updated
|
|
expected: upsert successfully updates both text and embeddings
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 768
|
|
|
|
# Create collection with text embedding function
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE"
|
|
}
|
|
]
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Insert initial data
|
|
original_text = "The original document about machine learning"
|
|
data = [{"id": 1, "document": original_text}]
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data
|
|
}
|
|
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Query original embedding
|
|
query_payload = {
|
|
"collectionName": name,
|
|
"filter": "id == 1",
|
|
"outputFields": ["id", "document", "dense"],
|
|
"limit": 10
|
|
}
|
|
|
|
rsp = self.vector_client.vector_query(query_payload)
|
|
assert rsp['code'] == 0, f"Original query failed: {rsp}"
|
|
        assert len(rsp['data']) > 0, "Original query returned no results"
|
|
original_embedding = rsp['data'][0]['dense']
|
|
|
|
# Upsert with modified text
|
|
updated_text = "The updated document about deep learning and neural networks"
|
|
upsert_data = [{"id": 1, "document": updated_text}]
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": upsert_data
|
|
}
|
|
|
|
rsp = self.vector_client.vector_upsert(payload)
|
|
assert rsp['code'] == 0, f"Upsert failed: {rsp}"
|
|
|
|
# Query updated embedding
|
|
rsp = self.vector_client.vector_query(query_payload)
|
|
assert rsp['code'] == 0, f"Updated query failed: {rsp}"
|
|
        assert len(rsp['data']) > 0, "Updated query returned no results"
|
|
updated_embedding = rsp['data'][0]['dense']
|
|
|
|
# Verify text was updated
|
|
assert rsp['data'][0]['document'] == updated_text, f"Text not updated: expected '{updated_text}', got '{rsp['data'][0]['document']}'"
|
|
|
|
# Verify embedding was updated (embeddings should be different)
|
|
similarity = np.dot(original_embedding, updated_embedding) / (
|
|
np.linalg.norm(original_embedding) * np.linalg.norm(updated_embedding)
|
|
)
|
|
assert similarity < 0.99, f"Embedding should be significantly different after text update, similarity: {similarity:.4f}"
|
|
|
|
|
|
@pytest.mark.L2
|
|
class TestTextEmbeddingSearchNegative(TestBase):
|
|
"""
|
|
******************************************************************
|
|
Negative test cases for text embedding function search via RESTful API
|
|
******************************************************************
|
|
"""
|
|
|
|
def test_create_collection_with_invalid_text_embedding_params(self):
|
|
"""
|
|
target: test create collection with invalid text embedding function parameters
|
|
method: create collection with invalid embedding provider/model
|
|
expected: collection creation should fail with appropriate error
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 1024
|
|
|
|
# Create collection with invalid text embedding function
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "text_embedding_fn",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "invalid_provider",
|
|
"model_name": "invalid_model",
|
|
"api_key": "invalid_key"
|
|
}
|
|
}
|
|
]
|
|
}
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] != 0, f"Expected creation to fail with invalid provider, but got: {rsp}"
|
|
|
|
def test_search_with_empty_query_text(self, tei_endpoint):
|
|
"""
|
|
target: test search with empty text query
|
|
method: 1. create collection with text embedding function
|
|
2. insert data
|
|
3. search with empty string
|
|
expected: search should handle empty query appropriately
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
dim = 768
|
|
|
|
# Create collection with text embedding function
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(dim)}}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
}
|
|
]
|
|
}
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Insert sample data
|
|
data = [{"id": i, "document": fake_en.text()} for i in range(10)]
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"data": data
|
|
}
|
|
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0
|
|
|
|
# Search with empty query
|
|
search_payload = {
|
|
"collectionName": name,
|
|
"data": [""],
|
|
"limit": 5,
|
|
"outputFields": ["id", "document"]
|
|
}
|
|
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] != 0, f"Expected search to fail with empty query, but got: {rsp}"
|
|
|
|
|
|
def test_dimension_mismatch_with_text_embedding(self, tei_endpoint):
|
|
"""
|
|
target: test dimension mismatch between text embedding function and vector field
|
|
method: create collection with mismatched dimensions
|
|
expected: collection creation should fail
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
wrong_dim = 512 # TEI produces 768-dim vectors
|
|
|
|
# Create collection with mismatched dimensions
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection",
|
|
"fields": [
|
|
{"fieldName": "id", "dataType": "Int64", "isPrimary": True},
|
|
{"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": str(wrong_dim)}}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint # This produces 768-dim vectors
|
|
}
|
|
}
|
|
]
|
|
}
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] != 0, f"Expected creation to fail with dimension mismatch, but got: {rsp}"
|
|
|
|
class TestModelRerankFunction(TestBase):
|
|
"""
|
|
******************************************************************
|
|
Test cases for model rerank function via RESTful API
|
|
******************************************************************
|
|
"""
|
|
|
|
def _create_collection_with_all_vector_types(self, name, tei_endpoint):
|
|
"""Helper method to create collection with dense, sparse, and bm25 fields"""
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection with all vector types",
|
|
"fields": [
|
|
{"fieldName": "doc_id", "dataType": "Int64", "isPrimary": True},
|
|
{
|
|
"fieldName": "document",
|
|
"dataType": "VarChar",
|
|
"elementTypeParams": {
|
|
"max_length": "65535",
|
|
"enable_analyzer": True,
|
|
"analyzer_params": {"tokenizer": "standard"},
|
|
"enable_match": True
|
|
}
|
|
},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
|
|
{"fieldName": "sparse", "dataType": "SparseFloatVector"},
|
|
{"fieldName": "bm25", "dataType": "SparseFloatVector"}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
},
|
|
{
|
|
"name": "bm25_fn",
|
|
"type": "BM25",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["bm25"],
|
|
"params": {}
|
|
}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE"
|
|
},
|
|
{
|
|
"fieldName": "sparse",
|
|
"indexName": "sparse_index",
|
|
"metricType": "IP",
|
|
"params": {"index_type": "SPARSE_INVERTED_INDEX"}
|
|
},
|
|
{
|
|
"fieldName": "bm25",
|
|
"indexName": "bm25_index",
|
|
"metricType": "BM25",
|
|
"params": {"index_type": "SPARSE_INVERTED_INDEX"}
|
|
}
|
|
]
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0, f"Collection creation failed: {rsp}"
|
|
|
|
# Insert sample data
|
|
import random
|
|
data = []
|
|
for i in range(50):
|
|
data.append({
|
|
"doc_id": i,
|
|
"document": fake_en.text(),
|
|
"sparse": {random.randint(1, 10000): random.random() for _ in range(100)}
|
|
})
|
|
|
|
payload = {"collectionName": name, "data": data}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0, f"Insert failed: {rsp}"
|
|
|
|
return name
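    # NOTE: in the helper above, "dense" and "bm25" are populated automatically by the
    # TextEmbedding and BM25 functions, while "sparse" values are random client-side data,
    # so all three ANN fields can be searched independently in the rerank tests below.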
|
|
|
|
@pytest.mark.parametrize("enable_truncate", [False, True])
|
|
def test_single_vector_search_with_model_rerank(self, tei_endpoint, enable_truncate,
|
|
tei_reranker_endpoint):
|
|
"""
|
|
target: test single vector search with model rerank using RESTful API
|
|
method: test dense/sparse/bm25 search with model reranker separately
|
|
expected: search should succeed with model reranker
|
|
"""
|
|
import random
|
|
|
|
name = gen_collection_name(prefix)
|
|
self._create_collection_with_all_vector_types(name, tei_endpoint)
|
|
|
|
# Prepare search parameters for reranker
|
|
nq = 2
|
|
query_texts = [fake_en.text() for _ in range(nq)]
|
|
if enable_truncate:
|
|
# Make query texts larger for truncation test
|
|
query_texts = [" ".join([fake_en.word() for _ in range(1024)]) for _ in range(nq)]
|
|
|
|
# Prepare reranker parameters (functionScore format)
|
|
reranker_params = {
|
|
"functions": [{
|
|
"name": "tei_reranker",
|
|
"description": "",
|
|
"type": "Rerank",
|
|
"inputFieldNames": ["document"],
|
|
"params": {
|
|
"reranker": "model",
|
|
"provider": "tei",
|
|
"queries": query_texts,
|
|
"endpoint": tei_reranker_endpoint,
|
|
"truncate": enable_truncate,
|
|
"truncation_direction": "Right"
|
|
}
|
|
}]
|
|
}
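        # Model-rerank contract as exercised here: "queries" supplies one rerank query per search
        # request (len(queries) == nq), and the TEI reranker endpoint is expected to score each
        # (query, document) pair; "truncate"/"truncation_direction" are forwarded to that endpoint.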
|
|
|
|
# Test different search types
|
|
for search_type in ["dense", "sparse", "bm25"]:
|
|
logger.info(f"Executing {search_type} search with model reranker")
|
|
|
|
if search_type == "dense":
|
|
# Dense vector search
|
|
search_payload = {
|
|
"collectionName": name,
|
|
"data": [[random.random() for _ in range(768)] for _ in range(nq)],
|
|
"annsField": "dense",
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document"],
|
|
"functionScore": reranker_params
|
|
}
|
|
|
|
elif search_type == "sparse":
|
|
# Sparse vector search
|
|
search_payload = {
|
|
"collectionName": name,
|
|
"data": [{random.randint(1, 10000): random.random() for _ in range(100)} for _ in range(nq)],
|
|
"annsField": "sparse",
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document"],
|
|
"functionScore": reranker_params
|
|
}
|
|
|
|
elif search_type == "bm25":
|
|
# BM25 search
|
|
search_payload = {
|
|
"collectionName": name,
|
|
"data": query_texts,
|
|
"annsField": "bm25",
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document"],
|
|
"searchParams": {"metric_type": "BM25"},
|
|
"functionScore": reranker_params
|
|
}
|
|
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] == 0, f"{search_type} search with model reranker failed: {rsp}"
|
|
assert len(rsp['data']) > 0, f"{search_type} search returned no results"
|
|
logger.info(f"{search_type} search with TEI reranker succeeded")
|
|
|
|
def test_hybrid_vector_search_with_model_rerank(self, tei_endpoint,
|
|
tei_reranker_endpoint):
|
|
"""
|
|
target: test hybrid vector search with model rerank using RESTful API
|
|
method: test dense+sparse/dense+bm25/sparse+bm25 search with model reranker
|
|
expected: hybrid search should succeed with model reranker
|
|
"""
|
|
import random
|
|
|
|
name = gen_collection_name(prefix)
|
|
self._create_collection_with_all_vector_types(name, tei_endpoint)
|
|
|
|
# Prepare search parameters for reranker
|
|
nq = 2
|
|
query_texts = [fake_en.text() for _ in range(nq)]
|
|
|
|
# Prepare reranker parameters (functionScore format)
|
|
reranker_params = {
|
|
"functions": [{
|
|
"name": "tei_reranker",
|
|
"description": "",
|
|
"type": "Rerank",
|
|
"inputFieldNames": ["document"],
|
|
"params": {
|
|
"reranker": "model",
|
|
"provider": "tei",
|
|
"queries": query_texts,
|
|
"endpoint": tei_reranker_endpoint
|
|
}
|
|
}]
|
|
}
|
|
|
|
# Test different hybrid search combinations
|
|
for search_type in ["dense+sparse", "dense+bm25", "sparse+bm25"]:
|
|
logger.info(f"Executing {search_type} hybrid search with model reranker")
|
|
|
|
if search_type == "dense+sparse":
|
|
hybrid_search_payload = {
|
|
"collectionName": name,
|
|
"search": [
|
|
{
|
|
"data": [[random.random() for _ in range(768)] for _ in range(nq)],
|
|
"annsField": "dense",
|
|
"limit": 5
|
|
},
|
|
{
|
|
"data": [{random.randint(1, 10000): random.random() for _ in range(100)} for _ in range(nq)],
|
|
"annsField": "sparse",
|
|
"limit": 5
|
|
}
|
|
],
|
|
"functionScore": reranker_params,
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document"]
|
|
}
|
|
|
|
elif search_type == "dense+bm25":
|
|
hybrid_search_payload = {
|
|
"collectionName": name,
|
|
"search": [
|
|
{
|
|
"data": [[random.random() for _ in range(768)] for _ in range(nq)],
|
|
"annsField": "dense",
|
|
"limit": 5
|
|
},
|
|
{
|
|
"data": query_texts,
|
|
"annsField": "bm25",
|
|
"limit": 5,
|
|
"params": {"metric_type": "BM25"}
|
|
}
|
|
],
|
|
"functionScore": reranker_params,
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document"]
|
|
}
|
|
|
|
elif search_type == "sparse+bm25":
|
|
hybrid_search_payload = {
|
|
"collectionName": name,
|
|
"search": [
|
|
{
|
|
"data": [{random.randint(1, 10000): random.random() for _ in range(100)} for _ in range(nq)],
|
|
"annsField": "sparse",
|
|
"limit": 5
|
|
},
|
|
{
|
|
"data": query_texts,
|
|
"annsField": "bm25",
|
|
"limit": 5,
|
|
"params": {"metric_type": "BM25"}
|
|
}
|
|
],
|
|
"functionScore": reranker_params,
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document"]
|
|
}
|
|
|
|
rsp = self.vector_client.vector_advanced_search(hybrid_search_payload)
|
|
assert rsp['code'] == 0, f"{search_type} hybrid search with model reranker failed: {rsp}"
|
|
assert len(rsp['data']) > 0, f"{search_type} hybrid search returned no results"
|
|
logger.info(f"{search_type} hybrid search with TEI reranker succeeded")
|
|
|
|
|
|
@pytest.mark.L1
|
|
class TestDecayRerank(TestBase):
|
|
"""
|
|
******************************************************************
|
|
Test cases for Decay rerank function via RESTful API
|
|
******************************************************************
|
|
"""
|
|
|
|
def _create_collection_with_timestamp_field(self, name, tei_endpoint):
|
|
"""Helper method to create collection with timestamp field for decay rerank"""
|
|
import time
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection for decay rerank",
|
|
"fields": [
|
|
{"fieldName": "doc_id", "dataType": "Int64", "isPrimary": True},
|
|
{
|
|
"fieldName": "document",
|
|
"dataType": "VarChar",
|
|
"elementTypeParams": {
|
|
"max_length": "65535",
|
|
"enable_analyzer": True,
|
|
"analyzer_params": {"tokenizer": "standard"},
|
|
"enable_match": True
|
|
}
|
|
},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
|
|
{"fieldName": "sparse", "dataType": "SparseFloatVector"},
|
|
{"fieldName": "timestamp", "dataType": "Int64"}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
},
|
|
{
|
|
"name": "bm25_fn",
|
|
"type": "BM25",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["sparse"],
|
|
"params": {}
|
|
}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE",
|
|
"indexType": "AUTOINDEX",
|
|
"params": {}
|
|
},
|
|
{
|
|
"fieldName": "sparse",
|
|
"indexName": "sparse_index",
|
|
"metricType": "BM25",
|
|
"indexType": "SPARSE_INVERTED_INDEX",
|
|
"params": {"bm25_k1": 1.2, "bm25_b": 0.75}
|
|
}
|
|
]
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0, f"Collection creation failed: {rsp}"
|
|
|
|
# Insert sample data with different timestamps
|
|
current_time = int(time.time())
|
|
data = []
|
|
news_documents = [
|
|
"Artificial intelligence helps medical breakthroughs",
|
|
"Analysis of artificial intelligence trends in 2023",
|
|
"Artificial intelligence ethical disputes continue to ferment",
|
|
"The latest progress in deep learning technology",
|
|
"Machine learning algorithms improve healthcare diagnosis",
|
|
"Neural networks advance computer vision capabilities",
|
|
"Natural language processing enables better chatbots",
|
|
"Robotics automation transforms manufacturing industry",
|
|
"Quantum computing promises revolutionary breakthroughs",
|
|
"Blockchain technology secures digital transactions"
|
|
]
|
|
|
|
        # Create data with timestamps spaced 3 days apart (oldest is ~27 days before current_time)
|
|
for i in range(len(news_documents)):
|
|
timestamp_offset = (len(news_documents) - i - 1) * 24 * 60 * 60 * 3 # 3 days apart
|
|
data.append({
|
|
"doc_id": i,
|
|
"document": news_documents[i],
|
|
"timestamp": current_time - timestamp_offset
|
|
})
|
|
|
|
payload = {"collectionName": name, "data": data}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0, f"Insert failed: {rsp}"
|
|
|
|
return name, current_time
|
|
|
|
@pytest.mark.parametrize("decay_function", ["gauss", "exp", "linear"])
|
|
def test_single_vector_search_with_decay_rerank(self, tei_endpoint, decay_function):
|
|
"""
|
|
target: test single vector search with decay rerank using different decay functions
|
|
method: test dense/sparse search with gauss/exp/linear decay reranker
|
|
expected: search should succeed with decay reranker and time-based ranking
|
|
"""
|
|
import random
|
|
|
|
name = gen_collection_name(prefix)
|
|
collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
|
|
|
|
# Prepare decay reranker parameters
|
|
decay_params = {
|
|
"functions": [{
|
|
"name": f"{decay_function}_decay_ranker",
|
|
"description": "",
|
|
"type": "Rerank",
|
|
"inputFieldNames": ["timestamp"],
|
|
"params": {
|
|
"reranker": "decay",
|
|
"function": decay_function,
|
|
"origin": current_time, # Current time as origin
|
|
"offset": 0,
|
|
"decay": 0.5,
|
|
"scale": 7 * 24 * 60 * 60 # 7 days in seconds
|
|
}
|
|
}]
|
|
}
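        # Decay-rerank parameters as used here: `origin` is the reference value, distances within
        # `offset` of it are not penalized, and `scale` is roughly the distance at which the score
        # drops to `decay`. The gauss/exp/linear variants are assumed to follow the usual
        # Elasticsearch-style decay curves, e.g. gauss ~ exp(-max(0, |x - origin| - offset)^2 / (2 * sigma^2)).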
|
|
|
|
# Test different search types
|
|
for search_type in ["dense", "sparse"]:
|
|
logger.info(f"Executing {search_type} search with {decay_function} decay reranker")
|
|
|
|
if search_type == "dense":
|
|
# Dense vector search
|
|
search_payload = {
|
|
"collectionName": collection_name,
|
|
"data": ["artificial intelligence technology progress"],
|
|
"annsField": "dense",
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document", "timestamp"],
|
|
"functionScore": decay_params
|
|
}
|
|
|
|
elif search_type == "sparse":
|
|
# Sparse vector search
|
|
search_payload = {
|
|
"collectionName": collection_name,
|
|
"data": ["artificial intelligence technology progress"],
|
|
"annsField": "sparse",
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document", "timestamp"],
|
|
"functionScore": decay_params
|
|
}
|
|
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] == 0, f"{search_type} search with {decay_function} decay reranker failed: {rsp}"
|
|
assert len(rsp['data']) > 0, f"{search_type} search returned no results"
|
|
|
|
# Verify time-based ranking: more recent documents should have higher scores
|
|
if len(rsp['data']) > 1:
|
|
# Check that results are ordered by final score (which includes decay)
|
|
scores = [result.get('distance', 0) for result in rsp['data']]
|
|
logger.info(f"{decay_function} decay results scores: {scores}")
|
|
|
|
logger.info(f"{search_type} search with {decay_function} decay reranker succeeded")
|
|
|
|
@pytest.mark.parametrize("decay_function", ["gauss", "exp", "linear"])
|
|
def test_hybrid_search_with_decay_rerank(self, tei_endpoint, decay_function):
|
|
"""
|
|
target: test hybrid search with decay rerank using different decay functions
|
|
method: test dense+sparse hybrid search with decay reranker
|
|
expected: hybrid search should succeed with decay reranker
|
|
"""
|
|
import random
|
|
|
|
name = gen_collection_name(prefix)
|
|
collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
|
|
|
|
# Prepare decay reranker parameters
|
|
decay_params = {
|
|
"functions": [{
|
|
"name": f"{decay_function}_decay_ranker",
|
|
"description": "",
|
|
"type": "Rerank",
|
|
"inputFieldNames": ["timestamp"],
|
|
"params": {
|
|
"reranker": "decay",
|
|
"function": decay_function,
|
|
"origin": current_time,
|
|
"offset": 2 * 24 * 60 * 60, # 2 days offset
|
|
"decay": 0.3, # More aggressive decay
|
|
"scale": 5 * 24 * 60 * 60 # 5 days scale
|
|
}
|
|
}]
|
|
}
|
|
|
|
logger.info(f"Executing hybrid search with {decay_function} decay reranker")
|
|
|
|
# Hybrid search with decay rerank
|
|
hybrid_search_payload = {
|
|
"collectionName": collection_name,
|
|
"search": [
|
|
{
|
|
"data": ["artificial intelligence machine learning"],
|
|
"annsField": "dense",
|
|
"limit": 5
|
|
},
|
|
{
|
|
"data": ["artificial intelligence machine learning"],
|
|
"annsField": "sparse",
|
|
"limit": 5
|
|
}
|
|
],
|
|
"functionScore": decay_params,
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document", "timestamp"]
|
|
}
|
|
|
|
rsp = self.vector_client.vector_advanced_search(hybrid_search_payload)
|
|
assert rsp['code'] == 0, f"Hybrid search with {decay_function} decay reranker failed: {rsp}"
|
|
        assert len(rsp['data']) > 0, "Hybrid search returned no results"
|
|
|
|
# Log results for manual verification
|
|
logger.info(f"Hybrid search with {decay_function} decay reranker results:")
|
|
for i, result in enumerate(rsp['data'][:3]): # Show top 3 results
|
|
logger.info(f" {i+1}. Doc: {result.get('document', '')[:50]}... Timestamp: {result.get('timestamp', 0)}")
|
|
|
|
logger.info(f"Hybrid search with {decay_function} decay reranker succeeded")
|
|
|
|
def test_decay_rerank_with_different_parameters(self, tei_endpoint):
|
|
"""
|
|
target: test decay rerank with different parameter combinations
|
|
method: test different origin, offset, decay, scale parameters
|
|
expected: search should succeed with different parameter configurations
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
|
|
|
|
# Test different parameter combinations
|
|
test_configs = [
|
|
{
|
|
"name": "no_offset_high_decay",
|
|
"params": {
|
|
"origin": current_time,
|
|
"offset": 0,
|
|
"decay": 0.8, # High decay rate
|
|
"scale": 3 * 24 * 60 * 60
|
|
}
|
|
},
|
|
{
|
|
"name": "with_offset_low_decay",
|
|
"params": {
|
|
"origin": current_time,
|
|
"offset": 5 * 24 * 60 * 60, # 5 days offset
|
|
"decay": 0.2, # Low decay rate
|
|
"scale": 10 * 24 * 60 * 60
|
|
}
|
|
},
|
|
{
|
|
"name": "past_origin",
|
|
"params": {
|
|
"origin": current_time - 15 * 24 * 60 * 60, # 15 days ago
|
|
"offset": 0,
|
|
"decay": 0.5,
|
|
"scale": 7 * 24 * 60 * 60
|
|
}
|
|
}
|
|
]
|
|
|
|
for config in test_configs:
|
|
logger.info(f"Testing decay rerank with config: {config['name']}")
|
|
|
|
decay_params = {
|
|
"functions": [{
|
|
"name": f"decay_ranker_{config['name']}",
|
|
"description": "",
|
|
"type": "Rerank",
|
|
"inputFieldNames": ["timestamp"],
|
|
"params": {
|
|
"reranker": "decay",
|
|
"function": "gauss",
|
|
**config["params"]
|
|
}
|
|
}]
|
|
}
|
|
|
|
search_payload = {
|
|
"collectionName": collection_name,
|
|
"data": ["technology progress artificial intelligence"],
|
|
"annsField": "dense",
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document", "timestamp"],
|
|
"functionScore": decay_params
|
|
}
|
|
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] == 0, f"Search with {config['name']} failed: {rsp}"
|
|
assert len(rsp['data']) > 0, f"Search with {config['name']} returned no results"
|
|
|
|
logger.info(f"Decay rerank with {config['name']} succeeded, returned {len(rsp['data'])} results")
|
|
|
|
|
|
@pytest.mark.L2
|
|
class TestDecayRerankNegative(TestBase):
|
|
"""
|
|
******************************************************************
|
|
Negative test cases for Decay rerank function via RESTful API
|
|
******************************************************************
|
|
"""
|
|
|
|
def _create_collection_with_timestamp_field(self, name, tei_endpoint):
|
|
"""Helper method to create collection with timestamp field for decay rerank"""
|
|
import time
|
|
|
|
payload = {
|
|
"collectionName": name,
|
|
"schema": {
|
|
"autoId": False,
|
|
"enableDynamicField": True,
|
|
"description": "test collection for decay rerank",
|
|
"fields": [
|
|
{"fieldName": "doc_id", "dataType": "Int64", "isPrimary": True},
|
|
{
|
|
"fieldName": "document",
|
|
"dataType": "VarChar",
|
|
"elementTypeParams": {
|
|
"max_length": "65535",
|
|
"enable_analyzer": True,
|
|
"analyzer_params": {"tokenizer": "standard"},
|
|
"enable_match": True
|
|
}
|
|
},
|
|
{"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
|
|
{"fieldName": "sparse", "dataType": "SparseFloatVector"},
|
|
{"fieldName": "timestamp", "dataType": "Int64"}
|
|
],
|
|
"functions": [
|
|
{
|
|
"name": "tei",
|
|
"type": "TextEmbedding",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["dense"],
|
|
"params": {
|
|
"provider": "TEI",
|
|
"endpoint": tei_endpoint
|
|
}
|
|
},
|
|
{
|
|
"name": "bm25_fn",
|
|
"type": "BM25",
|
|
"inputFieldNames": ["document"],
|
|
"outputFieldNames": ["sparse"],
|
|
"params": {}
|
|
}
|
|
]
|
|
},
|
|
"indexParams": [
|
|
{
|
|
"fieldName": "dense",
|
|
"indexName": "dense_index",
|
|
"metricType": "COSINE",
|
|
"indexType": "AUTOINDEX",
|
|
"params": {}
|
|
},
|
|
{
|
|
"fieldName": "sparse",
|
|
"indexName": "sparse_index",
|
|
"metricType": "BM25",
|
|
"indexType": "SPARSE_INVERTED_INDEX",
|
|
"params": {"bm25_k1": 1.2, "bm25_b": 0.75}
|
|
}
|
|
]
|
|
}
|
|
|
|
rsp = self.collection_client.collection_create(payload)
|
|
assert rsp['code'] == 0, f"Collection creation failed: {rsp}"
|
|
|
|
# Insert sample data with different timestamps
|
|
current_time = int(time.time())
|
|
data = []
|
|
news_documents = [
|
|
"Artificial intelligence helps medical breakthroughs",
|
|
"Analysis of artificial intelligence trends in 2023",
|
|
"Artificial intelligence ethical disputes continue to ferment",
|
|
"The latest progress in deep learning technology",
|
|
"Machine learning algorithms improve healthcare diagnosis"
|
|
]
|
|
|
|
        # Create data with timestamps spaced 3 days apart (oldest is ~12 days before current_time)
|
|
for i in range(len(news_documents)):
|
|
timestamp_offset = (len(news_documents) - i - 1) * 24 * 60 * 60 * 3 # 3 days apart
|
|
data.append({
|
|
"doc_id": i,
|
|
"document": news_documents[i],
|
|
"timestamp": current_time - timestamp_offset
|
|
})
|
|
|
|
payload = {"collectionName": name, "data": data}
|
|
rsp = self.vector_client.vector_insert(payload)
|
|
assert rsp['code'] == 0, f"Insert failed: {rsp}"
|
|
|
|
return name, current_time
|
|
|
|
def test_decay_rerank_with_invalid_function_type(self, tei_endpoint):
|
|
"""
|
|
target: test decay rerank with invalid function type
|
|
method: create decay rerank with invalid function type
|
|
expected: search should fail with appropriate error
|
|
"""
|
|
name = gen_collection_name(prefix)
|
|
collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)
|
|
|
|
# Test with invalid function type
|
|
decay_params = {
|
|
"functions": [{
|
|
"name": "invalid_decay_ranker",
|
|
"description": "",
|
|
"type": "Rerank",
|
|
"inputFieldNames": ["timestamp"],
|
|
"params": {
|
|
"reranker": "decay",
|
|
"function": "invalid_function", # Invalid function type
|
|
"origin": current_time,
|
|
"offset": 0,
|
|
"decay": 0.5,
|
|
"scale": 7 * 24 * 60 * 60
|
|
}
|
|
}]
|
|
}
|
|
|
|
search_payload = {
|
|
"collectionName": collection_name,
|
|
"data": ["artificial intelligence"],
|
|
"annsField": "dense",
|
|
"limit": 10,
|
|
"outputFields": ["doc_id", "document", "timestamp"],
|
|
"functionScore": decay_params
|
|
}
|
|
|
|
rsp = self.vector_client.vector_search(search_payload)
|
|
assert rsp['code'] != 0, f"Expected search to fail with invalid function type, but got: {rsp}"
|
|
|
|
    def test_decay_rerank_with_invalid_field_type(self, tei_endpoint):
        """
        target: test decay rerank with non-numeric field
        method: apply a decay reranker to a VarChar (non-numeric) field
        expected: search should fail appropriately
        """
        name = gen_collection_name(prefix)

        # Create collection with a string field instead of a numeric timestamp
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": False,
                "enableDynamicField": True,
                "description": "test collection for decay rerank negative test",
                "fields": [
                    {"fieldName": "doc_id", "dataType": "Int64", "isPrimary": True},
                    {"fieldName": "document", "dataType": "VarChar", "elementTypeParams": {"max_length": "65535"}},
                    {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
                    {"fieldName": "category", "dataType": "VarChar", "elementTypeParams": {"max_length": "100"}}  # String field
                ],
                "functions": [
                    {
                        "name": "tei",
                        "type": "TextEmbedding",
                        "inputFieldNames": ["document"],
                        "outputFieldNames": ["dense"],
                        "params": {
                            "provider": "TEI",
                            "endpoint": tei_endpoint
                        }
                    }
                ]
            }
        }

        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

        # Insert data
        data = [{"doc_id": i, "document": fake_en.text(), "category": "tech"} for i in range(5)]
        payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 0, f"Insert failed: {rsp}"

        # Test decay rerank with the string field
        decay_params = {
            "functions": [{
                "name": "invalid_field_decay_ranker",
                "description": "",
                "type": "Rerank",
                "inputFieldNames": ["category"],  # String field, should fail
                "params": {
                    "reranker": "decay",
                    "function": "gauss",
                    "origin": 100,
                    "offset": 0,
                    "decay": 0.5,
                    "scale": 10
                }
            }]
        }

        search_payload = {
            "collectionName": name,
            "data": ["technology"],
            "annsField": "dense",
            "limit": 10,
            "outputFields": ["doc_id", "document", "category"],
            "functionScore": decay_params
        }

        rsp = self.vector_client.vector_search(search_payload)
        assert rsp['code'] != 0, f"Expected search to fail with non-numeric field, but got: {rsp}"
    def test_decay_rerank_with_invalid_parameters(self, tei_endpoint):
        """
        target: test decay rerank with invalid parameter values
        method: search with out-of-range scale, decay, and offset values
        expected: search should fail with appropriate error
        """
        name = gen_collection_name(prefix)
        collection_name, current_time = self._create_collection_with_timestamp_field(name, tei_endpoint)

        # Test invalid parameter combinations
        invalid_configs = [
            {
                "name": "negative_scale",
                "params": {
                    "origin": current_time,
                    "scale": -100,  # Invalid: scale must be > 0
                    "decay": 0.5
                }
            },
            {
                "name": "invalid_decay_range",
                "params": {
                    "origin": current_time,
                    "scale": 100,
                    "decay": 1.5  # Invalid: decay must be between 0 and 1
                }
            },
            {
                "name": "negative_offset",
                "params": {
                    "origin": current_time,
                    "scale": 100,
                    "decay": 0.5,
                    "offset": -10  # Invalid: offset must be >= 0
                }
            }
        ]

        for config in invalid_configs:
            logger.info(f"Testing invalid config: {config['name']}")

            decay_params = {
                "functions": [{
                    "name": f"invalid_decay_ranker_{config['name']}",
                    "description": "",
                    "type": "Rerank",
                    "inputFieldNames": ["timestamp"],
                    "params": {
                        "reranker": "decay",
                        "function": "gauss",
                        **config["params"]
                    }
                }]
            }

            search_payload = {
                "collectionName": collection_name,
                "data": ["artificial intelligence"],
                "annsField": "dense",
                "limit": 10,
                "outputFields": ["doc_id", "document", "timestamp"],
                "functionScore": decay_params
            }

            rsp = self.vector_client.vector_search(search_payload)
            assert rsp['code'] != 0, f"Expected search to fail with {config['name']}, but got: {rsp}"
            logger.info(f"Invalid config {config['name']} correctly failed")
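

# ---------------------------------------------------------------------------
# Reference sketch (not exercised by the tests above): an assumed Gaussian
# decay weight, analogous to Elasticsearch-style decay scoring, kept here only
# to illustrate why the negative cases above are rejected -- `scale` must be
# > 0, `decay` must lie in (0, 1) so that ln(decay) < 0, and `offset` must be
# >= 0. The server's actual formula may differ; treat this as an illustration.
# ---------------------------------------------------------------------------
def _gauss_decay_weight_sketch(value, origin, scale, offset=0.0, decay=0.5):
    """Illustrative only: a Gaussian decay weight over a numeric field value."""
    import math

    if scale <= 0 or not (0 < decay < 1) or offset < 0:
        raise ValueError("invalid decay parameters")
    # decay in (0, 1) makes ln(decay) negative, so sigma_squared stays positive
    sigma_squared = -(scale ** 2) / (2 * math.log(decay))
    distance = max(0.0, abs(value - origin) - offset)
    return math.exp(-(distance ** 2) / (2 * sigma_squared))

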
@pytest.mark.L1
class TestRRFWeightedRerank(TestBase):
    """
    ******************************************************************
    Test cases for RRF and Weighted rerank functions via RESTful API
    ******************************************************************
    """

    def _create_collection_with_bm25_function(self, name):
        """Helper method to create a collection with dense, sparse, and BM25 fields"""
        payload = {
            "collectionName": name,
            "schema": {
                "autoId": True,
                "enableDynamicField": False,
                "description": "test collection for rrf/weighted rerank",
                "fields": [
                    {"fieldName": "id", "dataType": "Int64", "isPrimary": True},
                    {"fieldName": "doc_id", "dataType": "VarChar", "elementTypeParams": {"max_length": "100"}},
                    {
                        "fieldName": "document",
                        "dataType": "VarChar",
                        "elementTypeParams": {
                            "max_length": "10000",
                            "enable_analyzer": True,
                            "analyzer_params": {"tokenizer": "standard"},
                            "enable_match": True
                        }
                    },
                    {"fieldName": "sparse", "dataType": "SparseFloatVector"},
                    {"fieldName": "dense", "dataType": "FloatVector", "elementTypeParams": {"dim": "768"}},
                    {"fieldName": "bm25", "dataType": "SparseFloatVector"}
                ],
                "functions": [
                    {
                        "name": "bm25_fn",
                        "type": "BM25",
                        "inputFieldNames": ["document"],
                        "outputFieldNames": ["bm25"],
                        "params": {}
                    }
                ]
            },
            "indexParams": [
                {
                    "fieldName": "dense",
                    "indexName": "dense_index",
                    "metricType": "COSINE",
                    "indexType": "FLAT",
                    "params": {}
                },
                {
                    "fieldName": "sparse",
                    "indexName": "sparse_index",
                    "metricType": "IP",
                    "indexType": "SPARSE_INVERTED_INDEX",
                    "params": {}
                },
                {
                    "fieldName": "bm25",
                    "indexName": "bm25_index",
                    "metricType": "BM25",
                    "indexType": "SPARSE_INVERTED_INDEX",
                    # bm25_k1 controls term-frequency saturation, bm25_b controls length normalization
                    "params": {"bm25_k1": 1.2, "bm25_b": 0.75}
                }
            ]
        }

        rsp = self.collection_client.collection_create(payload)
        assert rsp['code'] == 0, f"Collection creation failed: {rsp}"

        # Insert sample data
        import random
        data = []
        data_size = 100  # Reduced size for faster testing
        for i in range(data_size):
            data.append({
                "doc_id": str(i),
                "document": fake_en.text(),
                "sparse": {random.randint(1, 10000): random.random() for _ in range(100)},
                "dense": [random.random() for _ in range(768)]
            })

        payload = {"collectionName": name, "data": data}
        rsp = self.vector_client.vector_insert(payload)
        assert rsp['code'] == 0, f"Insert failed: {rsp}"

        return name
@pytest.mark.parametrize("ranker_model", ["rrf", "weighted"])
|
|
def test_hybrid_vector_search_with_rrf_weighted_rerank(self, ranker_model):
|
|
"""
|
|
target: test hybrid vector search with RRF/Weighted rerank using RESTful API
|
|
method: test dense+sparse/dense+bm25/sparse+bm25 search with RRF/Weighted reranker
|
|
expected: hybrid search should succeed with RRF/Weighted reranker
|
|
"""
|
|
import random
|
|
|
|
name = gen_collection_name(prefix)
|
|
self._create_collection_with_bm25_function(name)
|
|
|
|
# Prepare search parameters for reranker
|
|
nq = 2 # Reduced for faster testing
|
|
query_texts = [fake_en.text() for _ in range(nq)]
|
|
|
|
# Prepare reranker parameters (functionScore format)
|
|
if ranker_model == "rrf":
|
|
reranker_params = {
|
|
"functions": [{
|
|
"name": "rrf_ranker",
|
|
"description": "",
|
|
"type": "Rerank",
|
|
"inputFieldNames": [],
|
|
"params": {
|
|
"reranker": "rrf",
|
|
"k": 100
|
|
}
|
|
}]
|
|
}
|
|
else: # weighted
|
|
reranker_params = {
|
|
"functions": [{
|
|
"name": "weighted_ranker",
|
|
"description": "",
|
|
"type": "Rerank",
|
|
"inputFieldNames": [],
|
|
"params": {
|
|
"reranker": "weighted",
|
|
"weights": [0.1, 0.9],
|
|
"norm_score": True
|
|
}
|
|
}]
|
|
}
|
|
|
|
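
        # Rough intuition for the two rerankers (assumed behaviour, not asserted
        # by this test): with "rrf", each sub-search contributes roughly
        # 1 / (k + rank) per hit, so a larger k flattens rank differences; with
        # "weighted", per-request scores are combined as a weighted sum (0.1 for
        # the first sub-search, 0.9 for the second), and norm_score=True asks
        # for the scores to be normalized before weighting.
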
        # Test different hybrid search combinations
        for search_type in ["dense+sparse", "dense+bm25", "sparse+bm25"]:
            logger.info(f"Executing {search_type} hybrid search with {ranker_model} reranker")

            if search_type == "dense+sparse":
                hybrid_search_payload = {
                    "collectionName": name,
                    "search": [
                        {
                            "data": [[random.random() for _ in range(768)] for _ in range(nq)],
                            "annsField": "dense",
                            "limit": 5
                        },
                        {
                            "data": [{random.randint(1, 10000): random.random() for _ in range(100)} for _ in range(nq)],
                            "annsField": "sparse",
                            "limit": 5
                        }
                    ],
                    "functionScore": reranker_params,
                    "limit": 10,
                    "outputFields": ["doc_id", "document"]
                }

            elif search_type == "dense+bm25":
                hybrid_search_payload = {
                    "collectionName": name,
                    "search": [
                        {
                            "data": [[random.random() for _ in range(768)] for _ in range(nq)],
                            "annsField": "dense",
                            "limit": 5
                        },
                        {
                            "data": query_texts,
                            "annsField": "bm25",
                            "limit": 5
                        }
                    ],
                    "functionScore": reranker_params,
                    "limit": 10,
                    "outputFields": ["doc_id", "document"]
                }

            elif search_type == "sparse+bm25":
                hybrid_search_payload = {
                    "collectionName": name,
                    "search": [
                        {
                            "data": [{random.randint(1, 10000): random.random() for _ in range(100)} for _ in range(nq)],
                            "annsField": "sparse",
                            "limit": 5
                        },
                        {
                            "data": query_texts,
                            "annsField": "bm25",
                            "limit": 5
                        }
                    ],
                    "functionScore": reranker_params,
                    "limit": 10,
                    "outputFields": ["doc_id", "document"],
                    "searchParams": {"metric_type": "BM25"}
                }

            rsp = self.vector_client.vector_advanced_search(hybrid_search_payload)
            assert rsp['code'] == 0, f"{search_type} hybrid search with {ranker_model} reranker failed: {rsp}"
            assert len(rsp['data']) > 0, f"{search_type} hybrid search returned no results"
            logger.info(f"{search_type} hybrid search with {ranker_model} reranker succeeded")