Mirror of https://gitee.com/milvus-io/milvus.git
test: add restful cases for full text search and some minor fixes (#37148)

/kind improvement

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
parent d24970c090
commit 247f75180f
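The new cases exercise full text search over the v2 RESTful API: a VarChar field with the tokenizer and text match enabled, a SparseFloatVector output field, a BM25 function wiring the two together, a sparse index built with the BM25 metric, and search requests that pass raw text as the query data. A condensed sketch of the three payloads, assembled from the test code in the diff below (field names, parameters, and values are the ones the tests use; the literal collection name is illustrative, the tests generate one with gen_collection_name()):

schema_payload = {
    "collectionName": "full_text_search_demo",
    "schema": {
        "fields": [
            {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
            {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
            {"fieldName": "document_content", "dataType": "VarChar",
             "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
                                   "analyzer_params": {"tokenizer": "default"},
                                   "enable_match": True}},
            {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
        ],
        "functions": [
            {"name": "bm25_fn", "type": "BM25",
             "inputFieldNames": ["document_content"],
             "outputFieldNames": ["sparse_vector"],
             "params": {}},
        ],
    },
}

index_payload = {
    "collectionName": "full_text_search_demo",
    "indexParams": [
        {"fieldName": "sparse_vector", "indexName": "sparse_vector",
         "metricType": "BM25", "indexType": "SPARSE_INVERTED_INDEX",
         "params": {"bm25_k1": 1.2, "bm25_b": 0.7}},
    ],
}

search_payload = {
    "collectionName": "full_text_search_demo",
    "data": ["a raw text query, tokenized and scored with BM25 on the server side"],
    "filter": "word_count > 100",
    "outputFields": ["*"],
    "searchParams": {"params": {"drop_ratio_search": "0.2"}},
    "limit": 500,
}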
@@ -334,6 +334,7 @@ class CollectionClient(Requests):
self.endpoint = endpoint
self.api_key = token
self.db_name = None
self.name_list = []
self.headers = self.update_headers()

@classmethod

@@ -435,6 +436,10 @@ class CollectionClient(Requests):

def collection_create(self, payload, db_name="default"):
time.sleep(1)  # wait for collection created and in case of rate limit
c_name = payload.get("collectionName", None)
db_name = payload.get("dbName", db_name)
self.name_list.append((db_name, c_name))

url = f'{self.endpoint}/v2/vectordb/collections/create'
if self.db_name is not None:
payload["dbName"] = self.db_name
@@ -50,6 +50,17 @@ class TestBase(Base):
rsp = self.collection_client.collection_drop(payload)
except Exception as e:
logger.error(e)
for item in self.collection_client.name_list:
db_name = item[0]
c_name = item[1]
payload = {
"collectionName": c_name,
"dbName": db_name
}
try:
self.collection_client.collection_drop(payload)
except Exception as e:
logger.error(e)

@pytest.fixture(scope="function", autouse=True)
def init_client(self, endpoint, token, minio_host, bucket_name, root_path):
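The two hunks above work as a pair: CollectionClient.collection_create() now records every (db_name, collection_name) it creates in name_list, and the TestBase teardown walks that list and drops whatever is still there, so collections leak less often when a test fails before its own cleanup. A minimal sketch of the same bookkeeping outside the test classes (the client methods are the ones shown above; the two wrapper functions are illustrative):

def create_tracked(client, payload, db_name="default"):
    # mirrors the bookkeeping added to CollectionClient.collection_create
    client.name_list.append((payload.get("dbName", db_name), payload.get("collectionName")))
    return client.collection_create(payload)

def drop_all_tracked(client, logger):
    # mirrors the TestBase teardown loop
    for db_name, c_name in client.name_list:
        try:
            client.collection_drop({"collectionName": c_name, "dbName": db_name})
        except Exception as e:
            logger.error(e)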
@@ -103,7 +103,7 @@ class TestCreateCollection(TestBase):
"collectionName": name,
"dimension": dim,
"metricType": metric_type,
"params":{
"params": {
"enableDynamicField": enable_dynamic_field,
"shardsNum": request_shards_num,
"consistencyLevel": f"{consistency_level}",

@@ -147,7 +147,7 @@
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
@pytest.mark.parametrize("consistency_level", ["Strong", "Bounded"])
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("index_type", ["AUTOINDEX","IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("dim", [128])
def test_create_collections_with_all_params(self,
dim,

@@ -179,6 +179,7 @@
"FLAT": {},
"IVF_SQ8": {"nlist": 16384},
"HNSW": {"M": 16, "efConstruction": 500},
"BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5},
"AUTOINDEX": {}
}
@@ -197,15 +198,32 @@
{"fieldName": "book_id", "dataType": "Int64",
"isPrimary": primary_key_field == "book_id", "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64",
"isPartitionKey": partition_key_field == "word_count", "isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}},
"isPartitionKey": partition_key_field == "word_count",
"isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}},
{"fieldName": "book_category", "dataType": "Int64",
"isPartitionKey": partition_key_field == "book_category",
"isClusteringKey": clustering_key_field == "book_category", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": "default"
},
"enable_match": True}},
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
{"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [

@@ -214,7 +232,14 @@
"metricType": f"{metric_type}",
"indexType": index_type,
"params": index_param_map[index_type]
}]
},
{"fieldName": "sparse_vector",
"indexName": "sparse_vector_index",
"metricType": "BM25",
"indexType": "SPARSE_INVERTED_INDEX",
"params": index_param_map["BM25_SPARSE_INVERTED_INDEX"]
}
]
}

logging.info(f"create collection {name} with payload: {payload}")
@@ -244,6 +269,7 @@
assert rsp['data']['partitionsNum'] == num_partitions
assert rsp['data']['consistencyLevel'] == consistency_level
assert ttl_seconds_actual == ttl_seconds
assert len(rsp['data']["functions"]) == len(payload["schema"]["functions"])
#
# # check fields properties
fields = rsp['data']['fields']

@@ -259,11 +285,16 @@
# check index
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
assert len(index_info) == 1
assert index_info[0]["index_param"]['metric_type'] == metric_type
assert index_info[0]["index_param"]['index_type'] == index_type
assert index_info[0]["index_param"].get("params", {}) == index_param_map[index_type]

assert len(index_info) == 2
for index in index_info:
index_param = index["index_param"]
if index_param["index_type"] == "SPARSE_INVERTED_INDEX":
assert index_param["metric_type"] == "BM25"
assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"]
else:
assert index_param["metric_type"] == metric_type
assert index_param["index_type"] == index_type
assert index_param.get("params", {}) == index_param_map[index_type]

@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@@ -686,9 +717,6 @@ class TestCreateCollectionNegative(TestBase):
rsp = client.collection_create(payload)
assert rsp['code'] == 1801


@pytest.mark.parametrize("name",
[" ", "test_collection_" * 100, "test collection", "test/collection", "test\collection"])
def test_create_collections_with_invalid_collection_name(self, name):

@@ -797,6 +825,7 @@ class TestGetCollectionStats(TestBase):
rsp = client.collection_stats(collection_name=name)
assert rsp['data']['rowCount'] == nb


@pytest.mark.L0
class TestLoadReleaseCollection(TestBase):

@@ -845,6 +874,7 @@ class TestLoadReleaseCollection(TestBase):
rsp = client.collection_load_state(collection_name=name)
assert rsp['data']['loadState'] == "LoadStateNotLoad"


@pytest.mark.L0
class TestGetCollectionLoadState(TestBase):

@@ -1126,6 +1156,7 @@ class TestRenameCollection(TestBase):
assert new_name in all_collections
assert name not in all_collections


@pytest.mark.L1
class TestCollectionWithAuth(TestBase):
def test_drop_collections_with_invalid_api_key(self):
@@ -1,28 +1,42 @@
import random
from sklearn import preprocessing
import numpy as np
import sys
import json
import time
from utils import constant
from utils.utils import gen_collection_name
from utils.utils import gen_collection_name, patch_faker_text, en_vocabularies_distribution, \
zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
from utils.utils import gen_vector
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from faker import Faker

Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)

index_param_map = {
"FLAT": {},
"IVF_SQ8": {"nlist": 128},
"HNSW": {"M": 16, "efConstruction": 200},
"BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5},
"AUTOINDEX": {}
}


@pytest.mark.L0
class TestCreateIndex(TestBase):

@pytest.mark.parametrize("metric_type", ["L2"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "HNSW"])
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("dim", [128])
def test_index_e2e(self, dim, metric_type, index_type):
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365")
def test_index_default(self, dim, metric_type, index_type):
"""
target: test create collection
method: create a collection with a simple schema
@@ -43,38 +57,21 @@ class TestCreateIndex(TestBase):
}
logger.info(f"create collection {name} with payload: {payload}")
rsp = client.collection_create(payload)
# insert data
for i in range(1):
data = []
for j in range(3000):
tmp = {
"book_id": j,
"word_count": j,
"book_describe": f"book_{j}",
"book_intro": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
c = Collection(name)
c.flush()
# list index, expect empty
rsp = self.index_client.index_list(name)

# create index
payload = {
"collectionName": name,
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector",
"metricType": f"{metric_type}"}]
"indexParams": [
{"fieldName": "book_intro", "indexName": "book_intro_vector",
"metricType": f"{metric_type}",
"indexType": f"{index_type}",
"params": index_param_map[index_type]
}
]
}
if index_type == "HNSW":
payload["indexParams"][0]["params"] = {"index_type": "HNSW", "M": "16", "efConstruction": "200"}
if index_type == "AUTOINDEX":
payload["indexParams"][0]["params"] = {"index_type": "AUTOINDEX"}
rsp = self.index_client.index_create(payload)
assert rsp['code'] == 0
time.sleep(10)
@@ -90,8 +87,19 @@
assert expected_index[i]['fieldName'] == actual_index[i]['fieldName']
assert expected_index[i]['indexName'] == actual_index[i]['indexName']
assert expected_index[i]['metricType'] == actual_index[i]['metricType']
assert expected_index[i]["params"]['index_type'] == actual_index[i]['indexType']

assert expected_index[i]["indexType"] == actual_index[i]['indexType']
# check index by pymilvus
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
for index in index_info:
index_param = index["index_param"]
if index_param["index_type"] == "SPARSE_INVERTED_INDEX":
assert index_param["metric_type"] == "BM25"
assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"]
else:
assert index_param["metric_type"] == metric_type
assert index_param["index_type"] == index_type
assert index_param.get("params", {}) == index_param_map[index_type]
# drop index
for i in range(len(actual_index)):
payload = {
@@ -241,6 +249,119 @@
assert expected_index[i]['indexName'] == actual_index[i]['indexName']
assert expected_index[i]['params']['index_type'] == actual_index[i]['indexType']

@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("tokenizer", ['default', 'jieba'])
@pytest.mark.parametrize("index_type", ['SPARSE_INVERTED_INDEX', 'SPARSE_WAND'])
@pytest.mark.parametrize("bm25_k1", [1.2, 1.5])
@pytest.mark.parametrize("bm25_b", [0.7, 0.5])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365")
def test_create_index_for_full_text_search(self, nb, dim, insert_round, auto_id, is_partition_key,
enable_dynamic_schema, tokenizer, index_type, bm25_k1, bm25_b):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")

# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0

# create index
payload = {
"collectionName": name,
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector",
"metricType": "BM25",
"indexType": index_type,
"params": {"bm25_k1": bm25_k1, "bm25_b": bm25_b}
}
]
}
rsp = self.index_client.index_create(payload)
c = Collection(name)
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
for info in index_info:
assert info['index_param']['metric_type'] == 'BM25'
assert info['index_param']["params"]['bm25_k1'] == bm25_k1
assert info['index_param']["params"]['bm25_b'] == bm25_b
assert info['index_param']['index_type'] == index_type


@pytest.mark.L1
class TestCreateIndexNegative(TestBase):
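For reference, the bm25_k1 and bm25_b values parametrized in the test above are the standard Okapi BM25 knobs: k1 caps how quickly repeated occurrences of a term saturate the score, and b controls how strongly scores are normalized by document length. The exact scoring inside Milvus is not shown in this diff, but the classic form these parameters come from can be sketched as follows (a sketch of the textbook formula, not the engine's implementation):

def bm25_term_score(tf, doc_len, avg_doc_len, idf, k1=1.2, b=0.75):
    # Contribution of a single query term to a single document's BM25 score.
    # k1: term-frequency saturation; b: document-length normalization.
    return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))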
@@ -101,7 +101,6 @@ class TestCreateImportJob(TestBase):
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],

@@ -192,7 +191,6 @@
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],

@@ -285,7 +283,6 @@
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],

@@ -376,6 +373,7 @@
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {

@@ -456,6 +454,7 @@
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {

@@ -541,6 +540,7 @@
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {

@@ -665,6 +665,7 @@
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 6000
# assert import data can be queried
payload = {

@@ -915,6 +916,7 @@ class TestImportJobAdvance(TestBase):
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size
# assert import data can be queried
payload = {

@@ -1007,6 +1009,7 @@ class TestCreateImportJobAdvance(TestBase):
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size * task_num
# assert import data can be queried
payload = {

@@ -1096,6 +1099,7 @@
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size * task_num
# assert import data can be queried
payload = {
@@ -6,7 +6,7 @@ import sys
import json
import time
from utils import constant
from utils.utils import gen_collection_name, get_sorted_distance
from utils.utils import gen_collection_name, get_sorted_distance, patch_faker_text, en_vocabularies_distribution, zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase

@@ -20,6 +20,9 @@ Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")

patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)


@pytest.mark.L0
class TestInsertVector(TestBase):
@@ -1193,14 +1196,108 @@ class TestSearchVector(TestBase):
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0


@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['default'])
def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},

"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")

# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0

# search data
payload = {
"collectionName": name,
"data": [gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="coo")],
"data": [fake.text().lower() for _ in range(1)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"metricType": "IP",
"params": {
"drop_ratio_search": "0.2",
}
@@ -1211,6 +1308,125 @@
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) > 0


@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['jieba'])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751")
def test_search_vector_for_zh_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
Insert a vector with a simple payload
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},

"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")

# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0

# search data
payload = {
"collectionName": name,
"data": [fake.text().lower() for _ in range(2)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"params": {
"drop_ratio_search": "0.2",
}
},
"limit": 500,
}
if groupingField:
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) > 0


@pytest.mark.parametrize("insert_round", [2])
@pytest.mark.parametrize("auto_id", [True])
@@ -1790,6 +2006,29 @@ class TestSearchVectorNegative(TestBase):
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 1802

@pytest.mark.parametrize("invalid_metric_type", ["L2", "IP", "UNSUPPORTED"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37138")
def test_search_vector_with_invalid_metric_type(self, invalid_metric_type):
"""
Search a vector with a simple payload
"""
name = gen_collection_name()
self.name = name
self.init_collection(name, metric_type="COSINE")

# search data
dim = 128
payload = {
"collectionName": name,
"data": [preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()],
"searchParams": {
"metricType": invalid_metric_type
}
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] != 0


@pytest.mark.parametrize("limit", [0, 16385])
def test_search_vector_with_invalid_limit(self, limit):
"""
@@ -14,10 +14,74 @@ from sklearn.metrics import pairwise_distances
from collections import Counter
import bm25s
import jieba


fake = Faker()
fake.seed_instance(19530)
rng = np.random.default_rng()


en_vocabularies_distribution = {
"hello": 0.01,
"milvus": 0.01,
"vector": 0.01,
"database": 0.01
}

zh_vocabularies_distribution = {
"你好": 0.01,
"向量": 0.01,
"数据": 0.01,
"库": 0.01
}


def patch_faker_text(fake_instance, vocabularies_distribution):
"""
Monkey patch the text() method of a Faker instance to include custom vocabulary.
Each word in vocabularies_distribution has an independent chance to be inserted.
Args:
fake_instance: Faker instance to patch
vocabularies_distribution: Dictionary where:
- key: word to insert
- value: probability (0-1) of inserting this word into each sentence
Example:
vocabularies_distribution = {
"hello": 0.1,  # 10% chance to insert "hello" in each sentence
"milvus": 0.1,  # 10% chance to insert "milvus" in each sentence
}
"""
original_text = fake_instance.text

def new_text(*args, **kwargs):
sentences = []
# Split original text into sentences
original_sentences = original_text(*args,**kwargs).split('.')
original_sentences = [s.strip() for s in original_sentences if s.strip()]

for base_sentence in original_sentences:
words = base_sentence.split()

# Independently decide whether to insert each word
for word, probability in vocabularies_distribution.items():
if random.random() < probability:
# Choose random position to insert the word
insert_pos = random.randint(0, len(words))
words.insert(insert_pos, word)

# Reconstruct the sentence
base_sentence = ' '.join(words)

# Ensure proper capitalization
base_sentence = base_sentence[0].upper() + base_sentence[1:]
sentences.append(base_sentence)

return '. '.join(sentences) + '.'

# Replace the original text method with our custom one
fake_instance.text = new_text


def analyze_documents(texts, language="en"):
stopwords = "en"
if language in ["en", "english"]:
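For reference, this is how the test modules earlier in the diff consume the helper: patch a seeded Faker instance once at import time, and every later fake.text() call has a small, independent chance of containing the planted vocabulary, which gives the full text search tests query terms with a known, non-zero document frequency. A small standalone sketch mirroring that usage (output varies from run to run):

from faker import Faker
from utils.utils import patch_faker_text, en_vocabularies_distribution  # same import the test files use

Faker.seed(19530)
fake_en = Faker("en_US")
patch_faker_text(fake_en, en_vocabularies_distribution)

# Mostly ordinary Faker sentences; a few now contain "hello", "milvus", "vector" or "database".
print(fake_en.text())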