test: add restful cases for full text search and some minor fix (#37148)

/kind improvement

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
zhuwenxing 2024-10-31 21:18:23 +08:00 committed by GitHub
parent d24970c090
commit 247f75180f
7 changed files with 528 additions and 53 deletions
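For orientation, the new cases exercise full text search end to end over the v2 RESTful API: create a collection whose BM25 function derives a sparse vector from a tokenized VarChar field, insert raw text, and search with query text instead of vectors. The sketch below is illustrative only; /v2/vectordb/collections/create appears in the hunks that follow, while the entities insert/search paths and the endpoint address are assumptions.

import requests

ENDPOINT = "http://localhost:19530"  # assumed local Milvus endpoint
HEADERS = {"Content-Type": "application/json"}

# Schema: the BM25 function turns the tokenized VarChar field into a sparse vector.
create_payload = {
    "collectionName": "full_text_demo",
    "schema": {
        "autoId": True,
        "fields": [
            {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
            {"fieldName": "document_content", "dataType": "VarChar",
             "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
                                   "analyzer_params": {"tokenizer": "default"},
                                   "enable_match": True}},
            {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
        ],
        "functions": [
            {"name": "bm25_fn", "type": "BM25",
             "inputFieldNames": ["document_content"],
             "outputFieldNames": ["sparse_vector"], "params": {}}
        ],
    },
    "indexParams": [
        {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
         "params": {"index_type": "SPARSE_INVERTED_INDEX"}}
    ],
}
requests.post(f"{ENDPOINT}/v2/vectordb/collections/create", headers=HEADERS, json=create_payload)

# Insert raw text; the sparse vector is produced server-side by the BM25 function.
requests.post(f"{ENDPOINT}/v2/vectordb/entities/insert", headers=HEADERS,  # assumed path
              json={"collectionName": "full_text_demo",
                    "data": [{"document_content": "milvus is a vector database"}]})

# Search with query text rather than a vector.
requests.post(f"{ENDPOINT}/v2/vectordb/entities/search", headers=HEADERS,  # assumed path
              json={"collectionName": "full_text_demo", "data": ["vector database"],
                    "limit": 10, "searchParams": {"params": {"drop_ratio_search": "0.2"}}})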


@@ -334,6 +334,7 @@ class CollectionClient(Requests):
self.endpoint = endpoint
self.api_key = token
self.db_name = None
self.name_list = []
self.headers = self.update_headers()
@classmethod
@@ -435,6 +436,10 @@ class CollectionClient(Requests):
def collection_create(self, payload, db_name="default"):
time.sleep(1) # wait for collection created and in case of rate limit
c_name = payload.get("collectionName", None)
db_name = payload.get("dbName", db_name)
self.name_list.append((db_name, c_name))
url = f'{self.endpoint}/v2/vectordb/collections/create'
if self.db_name is not None:
payload["dbName"] = self.db_name


@@ -50,6 +50,17 @@ class TestBase(Base):
rsp = self.collection_client.collection_drop(payload)
except Exception as e:
logger.error(e)
for item in self.collection_client.name_list:
db_name = item[0]
c_name = item[1]
payload = {
"collectionName": c_name,
"dbName": db_name
}
try:
self.collection_client.collection_drop(payload)
except Exception as e:
logger.error(e)
@pytest.fixture(scope="function", autouse=True)
def init_client(self, endpoint, token, minio_host, bucket_name, root_path):


@@ -103,7 +103,7 @@ class TestCreateCollection(TestBase):
"collectionName": name,
"dimension": dim,
"metricType": metric_type,
"params":{
"params": {
"enableDynamicField": enable_dynamic_field,
"shardsNum": request_shards_num,
"consistencyLevel": f"{consistency_level}",
@@ -147,7 +147,7 @@ class TestCreateCollection(TestBase):
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
@pytest.mark.parametrize("consistency_level", ["Strong", "Bounded"])
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("index_type", ["AUTOINDEX","IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("dim", [128])
def test_create_collections_with_all_params(self,
dim,
@@ -179,6 +179,7 @@ class TestCreateCollection(TestBase):
"FLAT": {},
"IVF_SQ8": {"nlist": 16384},
"HNSW": {"M": 16, "efConstruction": 500},
"BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5},
"AUTOINDEX": {}
}
@@ -197,15 +198,32 @@ class TestCreateCollection(TestBase):
{"fieldName": "book_id", "dataType": "Int64",
"isPrimary": primary_key_field == "book_id", "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64",
"isPartitionKey": partition_key_field == "word_count", "isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}},
"isPartitionKey": partition_key_field == "word_count",
"isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}},
{"fieldName": "book_category", "dataType": "Int64",
"isPartitionKey": partition_key_field == "book_category",
"isClusteringKey": clustering_key_field == "book_category", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": "default"
},
"enable_match": True}},
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
{"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [
@@ -214,7 +232,14 @@ class TestCreateCollection(TestBase):
"metricType": f"{metric_type}",
"indexType": index_type,
"params": index_param_map[index_type]
}]
},
{"fieldName": "sparse_vector",
"indexName": "sparse_vector_index",
"metricType": "BM25",
"indexType": "SPARSE_INVERTED_INDEX",
"params": index_param_map["BM25_SPARSE_INVERTED_INDEX"]
}
]
}
logging.info(f"create collection {name} with payload: {payload}")
@@ -244,6 +269,7 @@ class TestCreateCollection(TestBase):
assert rsp['data']['partitionsNum'] == num_partitions
assert rsp['data']['consistencyLevel'] == consistency_level
assert ttl_seconds_actual == ttl_seconds
assert len(rsp['data']["functions"]) == len(payload["schema"]["functions"])
#
# # check fields properties
fields = rsp['data']['fields']
@@ -259,11 +285,16 @@ class TestCreateCollection(TestBase):
# check index
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
assert len(index_info) == 1
assert index_info[0]["index_param"]['metric_type'] == metric_type
assert index_info[0]["index_param"]['index_type'] == index_type
assert index_info[0]["index_param"].get("params", {}) == index_param_map[index_type]
assert len(index_info) == 2
for index in index_info:
index_param = index["index_param"]
if index_param["index_type"] == "SPARSE_INVERTED_INDEX":
assert index_param["metric_type"] == "BM25"
assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"]
else:
assert index_param["metric_type"] == metric_type
assert index_param["index_type"] == index_type
assert index_param.get("params", {}) == index_param_map[index_type]
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@@ -686,9 +717,6 @@ class TestCreateCollectionNegative(TestBase):
rsp = client.collection_create(payload)
assert rsp['code'] == 1801
@pytest.mark.parametrize("name",
[" ", "test_collection_" * 100, "test collection", "test/collection", "test\collection"])
def test_create_collections_with_invalid_collection_name(self, name):
@@ -797,6 +825,7 @@ class TestGetCollectionStats(TestBase):
rsp = client.collection_stats(collection_name=name)
assert rsp['data']['rowCount'] == nb
@pytest.mark.L0
class TestLoadReleaseCollection(TestBase):
@@ -845,6 +874,7 @@ class TestLoadReleaseCollection(TestBase):
rsp = client.collection_load_state(collection_name=name)
assert rsp['data']['loadState'] == "LoadStateNotLoad"
@pytest.mark.L0
class TestGetCollectionLoadState(TestBase):
@@ -1126,6 +1156,7 @@ class TestRenameCollection(TestBase):
assert new_name in all_collections
assert name not in all_collections
@pytest.mark.L1
class TestCollectionWithAuth(TestBase):
def test_drop_collections_with_invalid_api_key(self):


@@ -1,28 +1,42 @@
import random
from sklearn import preprocessing
import numpy as np
import sys
import json
import time
from utils import constant
from utils.utils import gen_collection_name
from utils.utils import gen_collection_name, patch_faker_text, en_vocabularies_distribution, \
zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
from utils.utils import gen_vector
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)
index_param_map = {
"FLAT": {},
"IVF_SQ8": {"nlist": 128},
"HNSW": {"M": 16, "efConstruction": 200},
"BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5},
"AUTOINDEX": {}
}
@pytest.mark.L0
class TestCreateIndex(TestBase):
@pytest.mark.parametrize("metric_type", ["L2"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "HNSW"])
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("dim", [128])
def test_index_e2e(self, dim, metric_type, index_type):
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365")
def test_index_default(self, dim, metric_type, index_type):
"""
target: test create collection
method: create a collection with a simple schema
@@ -43,38 +57,21 @@ class TestCreateIndex(TestBase):
}
logger.info(f"create collection {name} with payload: {payload}")
rsp = client.collection_create(payload)
# insert data
for i in range(1):
data = []
for j in range(3000):
tmp = {
"book_id": j,
"word_count": j,
"book_describe": f"book_{j}",
"book_intro": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
c = Collection(name)
c.flush()
# list index, expect empty
rsp = self.index_client.index_list(name)
# create index
payload = {
"collectionName": name,
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector",
"metricType": f"{metric_type}"}]
"indexParams": [
{"fieldName": "book_intro", "indexName": "book_intro_vector",
"metricType": f"{metric_type}",
"indexType": f"{index_type}",
"params": index_param_map[index_type]
}
]
}
if index_type == "HNSW":
payload["indexParams"][0]["params"] = {"index_type": "HNSW", "M": "16", "efConstruction": "200"}
if index_type == "AUTOINDEX":
payload["indexParams"][0]["params"] = {"index_type": "AUTOINDEX"}
rsp = self.index_client.index_create(payload)
assert rsp['code'] == 0
time.sleep(10)
@@ -90,8 +87,19 @@ class TestCreateIndex(TestBase):
assert expected_index[i]['fieldName'] == actual_index[i]['fieldName']
assert expected_index[i]['indexName'] == actual_index[i]['indexName']
assert expected_index[i]['metricType'] == actual_index[i]['metricType']
assert expected_index[i]["params"]['index_type'] == actual_index[i]['indexType']
assert expected_index[i]["indexType"] == actual_index[i]['indexType']
# check index by pymilvus
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
for index in index_info:
index_param = index["index_param"]
if index_param["index_type"] == "SPARSE_INVERTED_INDEX":
assert index_param["metric_type"] == "BM25"
assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"]
else:
assert index_param["metric_type"] == metric_type
assert index_param["index_type"] == index_type
assert index_param.get("params", {}) == index_param_map[index_type]
# drop index
for i in range(len(actual_index)):
payload = {
@@ -241,6 +249,119 @@ class TestCreateIndex(TestBase):
assert expected_index[i]['indexName'] == actual_index[i]['indexName']
assert expected_index[i]['params']['index_type'] == actual_index[i]['indexType']
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("tokenizer", ['default', 'jieba'])
@pytest.mark.parametrize("index_type", ['SPARSE_INVERTED_INDEX', 'SPARSE_WAND'])
@pytest.mark.parametrize("bm25_k1", [1.2, 1.5])
@pytest.mark.parametrize("bm25_b", [0.7, 0.5])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365")
def test_create_index_for_full_text_search(self, nb, dim, insert_round, auto_id, is_partition_key,
enable_dynamic_schema, tokenizer, index_type, bm25_k1, bm25_b):
"""
Create a collection with a BM25 function, insert text data, build a sparse index for full text search, then verify the index parameters.
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0
# create index
payload = {
"collectionName": name,
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector",
"metricType": "BM25",
"indexType": index_type,
"params": {"bm25_k1": bm25_k1, "bm25_b": bm25_b}
}
]
}
rsp = self.index_client.index_create(payload)
c = Collection(name)
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
for info in index_info:
assert info['index_param']['metric_type'] == 'BM25'
assert info['index_param']["params"]['bm25_k1'] == bm25_k1
assert info['index_param']["params"]['bm25_b'] == bm25_b
assert info['index_param']['index_type'] == index_type
@pytest.mark.L1
class TestCreateIndexNegative(TestBase):


@@ -101,7 +101,6 @@ class TestCreateImportJob(TestBase):
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],
@@ -192,7 +191,6 @@ class TestCreateImportJob(TestBase):
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],
@@ -285,7 +283,6 @@ class TestCreateImportJob(TestBase):
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],
@@ -376,6 +373,7 @@ class TestCreateImportJob(TestBase):
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
@@ -456,6 +454,7 @@ class TestCreateImportJob(TestBase):
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
@@ -541,6 +540,7 @@ class TestCreateImportJob(TestBase):
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
@@ -665,6 +665,7 @@ class TestCreateImportJob(TestBase):
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 6000
# assert import data can be queried
payload = {
@@ -915,6 +916,7 @@ class TestImportJobAdvance(TestBase):
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size
# assert import data can be queried
payload = {
@@ -1007,6 +1009,7 @@ class TestCreateImportJobAdvance(TestBase):
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size * task_num
# assert import data can be queried
payload = {
@@ -1096,6 +1099,7 @@ class TestCreateImportJobAdvance(TestBase):
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size * task_num
# assert import data can be queried
payload = {


@@ -6,7 +6,7 @@ import sys
import json
import time
from utils import constant
from utils.utils import gen_collection_name, get_sorted_distance
from utils.utils import gen_collection_name, get_sorted_distance, patch_faker_text, en_vocabularies_distribution, zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
@@ -20,6 +20,9 @@ Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)
@pytest.mark.L0
class TestInsertVector(TestBase):
@@ -1193,14 +1196,108 @@ class TestSearchVector(TestBase):
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['default'])
def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
Full text search over English text: BM25 function on document_content, querying with raw text instead of vectors.
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0
# search data
payload = {
"collectionName": name,
"data": [gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="coo")],
"data": [fake.text().lower() for _ in range(1)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"metricType": "IP",
"params": {
"drop_ratio_search": "0.2",
}
@@ -1211,6 +1308,125 @@ class TestSearchVector(TestBase):
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) > 0
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['jieba'])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751")
def test_search_vector_for_zh_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
Full text search over Chinese text with the jieba tokenizer: BM25 function on document_content, querying with raw text.
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0
# search data
payload = {
"collectionName": name,
"data": [fake.text().lower() for _ in range(2)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"params": {
"drop_ratio_search": "0.2",
}
},
"limit": 500,
}
if groupingField:
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) > 0
@pytest.mark.parametrize("insert_round", [2])
@pytest.mark.parametrize("auto_id", [True])
@@ -1790,6 +2006,29 @@ class TestSearchVectorNegative(TestBase):
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 1802
@pytest.mark.parametrize("invalid_metric_type", ["L2", "IP", "UNSUPPORTED"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37138")
def test_search_vector_with_invalid_metric_type(self, invalid_metric_type):
"""
Search with an unsupported metricType and expect a non-zero error code.
"""
name = gen_collection_name()
self.name = name
self.init_collection(name, metric_type="COSINE")
# search data
dim = 128
payload = {
"collectionName": name,
"data": [preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()],
"searchParams": {
"metricType": invalid_metric_type
}
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] != 0
@pytest.mark.parametrize("limit", [0, 16385])
def test_search_vector_with_invalid_limit(self, limit):
"""


@@ -14,10 +14,74 @@ from sklearn.metrics import pairwise_distances
from collections import Counter
import bm25s
import jieba
fake = Faker()
fake.seed_instance(19530)
rng = np.random.default_rng()
en_vocabularies_distribution = {
"hello": 0.01,
"milvus": 0.01,
"vector": 0.01,
"database": 0.01
}
zh_vocabularies_distribution = {
"你好": 0.01,
"向量": 0.01,
"数据": 0.01,
"": 0.01
}
def patch_faker_text(fake_instance, vocabularies_distribution):
"""
Monkey patch the text() method of a Faker instance to include custom vocabulary.
Each word in vocabularies_distribution has an independent chance to be inserted.
Args:
fake_instance: Faker instance to patch
vocabularies_distribution: Dictionary where:
- key: word to insert
- value: probability (0-1) of inserting this word into each sentence
Example:
vocabularies_distribution = {
"hello": 0.1, # 10% chance to insert "hello" in each sentence
"milvus": 0.1, # 10% chance to insert "milvus" in each sentence
}
"""
original_text = fake_instance.text
def new_text(*args, **kwargs):
sentences = []
# Split original text into sentences
original_sentences = original_text(*args,**kwargs).split('.')
original_sentences = [s.strip() for s in original_sentences if s.strip()]
for base_sentence in original_sentences:
words = base_sentence.split()
# Independently decide whether to insert each word
for word, probability in vocabularies_distribution.items():
if random.random() < probability:
# Choose random position to insert the word
insert_pos = random.randint(0, len(words))
words.insert(insert_pos, word)
# Reconstruct the sentence
base_sentence = ' '.join(words)
# Ensure proper capitalization
base_sentence = base_sentence[0].upper() + base_sentence[1:]
sentences.append(base_sentence)
return '. '.join(sentences) + '.'
# Replace the original text method with our custom one
fake_instance.text = new_text
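# Illustrative usage sketch, not part of this commit: after patching, text()
# still produces ordinary Faker sentences, but each word in the distribution
# has an independent ~1% chance of being spliced into every sentence, so a
# generated corpus always shares some vocabulary with BM25 query text.
def _demo_patch_faker_text():
    demo_fake = Faker("en_US")
    demo_fake.seed_instance(19530)
    patch_faker_text(demo_fake, en_vocabularies_distribution)
    # e.g. "Task purpose milvus nature reveal. Hold hello increase yourself."
    return demo_fake.text()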
def analyze_documents(texts, language="en"):
stopwords = "en"
if language in ["en", "english"]: