From 247f75180f8f9f35404b01969feffded39ee2b6a Mon Sep 17 00:00:00 2001
From: zhuwenxing
Date: Thu, 31 Oct 2024 21:18:23 +0800
Subject: [PATCH] test: add restful cases for full text search and some minor
 fixes (#37148)

/kind improvement

---------

Signed-off-by: zhuwenxing
---
 tests/restful_client_v2/api/milvus.py         |   5 +
 tests/restful_client_v2/base/testbase.py      |  11 +
 .../testcases/test_collection_operations.py   |  57 +++-
 .../testcases/test_index_operation.py         | 189 +++++++++++---
 .../testcases/test_jobs_operation.py          |  10 +-
 .../testcases/test_vector_operations.py       | 245 +++++++++++++++++-
 tests/restful_client_v2/utils/utils.py        |  64 +++++
 7 files changed, 528 insertions(+), 53 deletions(-)

diff --git a/tests/restful_client_v2/api/milvus.py b/tests/restful_client_v2/api/milvus.py
index 9c1dabbdbb..ae94cd959c 100644
--- a/tests/restful_client_v2/api/milvus.py
+++ b/tests/restful_client_v2/api/milvus.py
@@ -334,6 +334,7 @@ class CollectionClient(Requests):
         self.endpoint = endpoint
         self.api_key = token
         self.db_name = None
+        self.name_list = []
         self.headers = self.update_headers()
 
     @classmethod
@@ -435,6 +436,10 @@ class CollectionClient(Requests):
 
     def collection_create(self, payload, db_name="default"):
         time.sleep(1)  # wait for collection created and in case of rate limit
+        c_name = payload.get("collectionName", None)
+        db_name = payload.get("dbName", db_name)
+        self.name_list.append((db_name, c_name))
+
         url = f'{self.endpoint}/v2/vectordb/collections/create'
         if self.db_name is not None:
             payload["dbName"] = self.db_name
diff --git a/tests/restful_client_v2/base/testbase.py b/tests/restful_client_v2/base/testbase.py
index 3c08fea27e..d4556aefeb 100644
--- a/tests/restful_client_v2/base/testbase.py
+++ b/tests/restful_client_v2/base/testbase.py
@@ -50,6 +50,17 @@ class TestBase(Base):
             rsp = self.collection_client.collection_drop(payload)
         except Exception as e:
             logger.error(e)
+        for item in self.collection_client.name_list:
+            db_name = item[0]
+            c_name = item[1]
+            payload = {
+                "collectionName": c_name,
+                "dbName": db_name
+            }
+            try:
+                self.collection_client.collection_drop(payload)
+            except Exception as e:
+                logger.error(e)
 
     @pytest.fixture(scope="function", autouse=True)
     def init_client(self, endpoint, token, minio_host, bucket_name, root_path):
diff --git a/tests/restful_client_v2/testcases/test_collection_operations.py b/tests/restful_client_v2/testcases/test_collection_operations.py
index b08da39925..5f6be5807a 100644
--- a/tests/restful_client_v2/testcases/test_collection_operations.py
+++ b/tests/restful_client_v2/testcases/test_collection_operations.py
@@ -103,7 +103,7 @@ class TestCreateCollection(TestBase):
             "collectionName": name,
             "dimension": dim,
             "metricType": metric_type,
-            "params":{
+            "params": {
                 "enableDynamicField": enable_dynamic_field,
                 "shardsNum": request_shards_num,
                 "consistencyLevel": f"{consistency_level}",
@@ -147,7 +147,7 @@ class TestCreateCollection(TestBase):
     @pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
     @pytest.mark.parametrize("consistency_level", ["Strong", "Bounded"])
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
-    @pytest.mark.parametrize("index_type", ["AUTOINDEX","IVF_SQ8", "HNSW"])
+    @pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
     @pytest.mark.parametrize("dim", [128])
     def test_create_collections_with_all_params(self,
                                                 dim,
@@ -179,6 +179,7 @@ class TestCreateCollection(TestBase):
             "FLAT": {},
             "IVF_SQ8": {"nlist": 16384},
             "HNSW": {"M": 16, "efConstruction": 500},
+            "BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5,
"bm25_b": 0.5}, "AUTOINDEX": {} } @@ -197,15 +198,32 @@ class TestCreateCollection(TestBase): {"fieldName": "book_id", "dataType": "Int64", "isPrimary": primary_key_field == "book_id", "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", - "isPartitionKey": partition_key_field == "word_count", "isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}}, + "isPartitionKey": partition_key_field == "word_count", + "isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}}, {"fieldName": "book_category", "dataType": "Int64", "isPartitionKey": partition_key_field == "book_category", "isClusteringKey": clustering_key_field == "book_category", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, + {"fieldName": "document_content", "dataType": "VarChar", + "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True, + "analyzer_params": { + "tokenizer": "default" + }, + "enable_match": True}}, {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}}, {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64", "elementTypeParams": {"max_capacity": "1024"}}, - {"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}} + {"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, + {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"} + ], + "functions": [ + { + "name": "bm25_fn", + "type": "BM25", + "inputFieldNames": ["document_content"], + "outputFieldNames": ["sparse_vector"], + "params": {} + } ] }, "indexParams": [ @@ -214,7 +232,14 @@ class TestCreateCollection(TestBase): "metricType": f"{metric_type}", "indexType": index_type, "params": index_param_map[index_type] - }] + }, + {"fieldName": "sparse_vector", + "indexName": "sparse_vector_index", + "metricType": "BM25", + "indexType": "SPARSE_INVERTED_INDEX", + "params": index_param_map["BM25_SPARSE_INVERTED_INDEX"] + } + ] } logging.info(f"create collection {name} with payload: {payload}") @@ -244,6 +269,7 @@ class TestCreateCollection(TestBase): assert rsp['data']['partitionsNum'] == num_partitions assert rsp['data']['consistencyLevel'] == consistency_level assert ttl_seconds_actual == ttl_seconds + assert len(rsp['data']["functions"]) == len(payload["schema"]["functions"]) # # # check fields properties fields = rsp['data']['fields'] @@ -259,11 +285,16 @@ class TestCreateCollection(TestBase): # check index index_info = [index.to_dict() for index in c.indexes] logger.info(f"index_info: {index_info}") - assert len(index_info) == 1 - assert index_info[0]["index_param"]['metric_type'] == metric_type - assert index_info[0]["index_param"]['index_type'] == index_type - assert index_info[0]["index_param"].get("params", {}) == index_param_map[index_type] - + assert len(index_info) == 2 + for index in index_info: + index_param = index["index_param"] + if index_param["index_type"] == "SPARSE_INVERTED_INDEX": + assert index_param["metric_type"] == "BM25" + assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"] + else: + assert index_param["metric_type"] == metric_type + assert index_param["index_type"] == index_type + assert index_param.get("params", {}) == index_param_map[index_type] @pytest.mark.parametrize("auto_id", [True, False]) @pytest.mark.parametrize("enable_dynamic_field", [True, False]) @@ -686,9 +717,6 @@ class TestCreateCollectionNegative(TestBase): rsp = 
client.collection_create(payload) assert rsp['code'] == 1801 - - - @pytest.mark.parametrize("name", [" ", "test_collection_" * 100, "test collection", "test/collection", "test\collection"]) def test_create_collections_with_invalid_collection_name(self, name): @@ -797,6 +825,7 @@ class TestGetCollectionStats(TestBase): rsp = client.collection_stats(collection_name=name) assert rsp['data']['rowCount'] == nb + @pytest.mark.L0 class TestLoadReleaseCollection(TestBase): @@ -845,6 +874,7 @@ class TestLoadReleaseCollection(TestBase): rsp = client.collection_load_state(collection_name=name) assert rsp['data']['loadState'] == "LoadStateNotLoad" + @pytest.mark.L0 class TestGetCollectionLoadState(TestBase): @@ -1126,6 +1156,7 @@ class TestRenameCollection(TestBase): assert new_name in all_collections assert name not in all_collections + @pytest.mark.L1 class TestCollectionWithAuth(TestBase): def test_drop_collections_with_invalid_api_key(self): diff --git a/tests/restful_client_v2/testcases/test_index_operation.py b/tests/restful_client_v2/testcases/test_index_operation.py index 534684c9bf..c399e62e3c 100644 --- a/tests/restful_client_v2/testcases/test_index_operation.py +++ b/tests/restful_client_v2/testcases/test_index_operation.py @@ -1,28 +1,42 @@ import random from sklearn import preprocessing import numpy as np -import sys -import json import time -from utils import constant -from utils.utils import gen_collection_name +from utils.utils import gen_collection_name, patch_faker_text, en_vocabularies_distribution, \ + zh_vocabularies_distribution from utils.util_log import test_log as logger import pytest from base.testbase import TestBase from utils.utils import gen_vector from pymilvus import ( - FieldSchema, CollectionSchema, DataType, Collection ) +from faker import Faker + +Faker.seed(19530) +fake_en = Faker("en_US") +fake_zh = Faker("zh_CN") + +patch_faker_text(fake_en, en_vocabularies_distribution) +patch_faker_text(fake_zh, zh_vocabularies_distribution) + +index_param_map = { + "FLAT": {}, + "IVF_SQ8": {"nlist": 128}, + "HNSW": {"M": 16, "efConstruction": 200}, + "BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5}, + "AUTOINDEX": {} +} @pytest.mark.L0 class TestCreateIndex(TestBase): - @pytest.mark.parametrize("metric_type", ["L2"]) - @pytest.mark.parametrize("index_type", ["AUTOINDEX", "HNSW"]) + @pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"]) + @pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"]) @pytest.mark.parametrize("dim", [128]) - def test_index_e2e(self, dim, metric_type, index_type): + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365") + def test_index_default(self, dim, metric_type, index_type): """ target: test create collection method: create a collection with a simple schema @@ -43,38 +57,21 @@ class TestCreateIndex(TestBase): } logger.info(f"create collection {name} with payload: {payload}") rsp = client.collection_create(payload) - # insert data - for i in range(1): - data = [] - for j in range(3000): - tmp = { - "book_id": j, - "word_count": j, - "book_describe": f"book_{j}", - "book_intro": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ - 0].tolist(), - } - data.append(tmp) - payload = { - "collectionName": name, - "data": data - } - rsp = self.vector_client.vector_insert(payload) c = Collection(name) c.flush() # list index, expect empty rsp = self.index_client.index_list(name) - # create index payload = { "collectionName": name, - "indexParams": [{"fieldName": 
"book_intro", "indexName": "book_intro_vector", - "metricType": f"{metric_type}"}] + "indexParams": [ + {"fieldName": "book_intro", "indexName": "book_intro_vector", + "metricType": f"{metric_type}", + "indexType": f"{index_type}", + "params": index_param_map[index_type] + } + ] } - if index_type == "HNSW": - payload["indexParams"][0]["params"] = {"index_type": "HNSW", "M": "16", "efConstruction": "200"} - if index_type == "AUTOINDEX": - payload["indexParams"][0]["params"] = {"index_type": "AUTOINDEX"} rsp = self.index_client.index_create(payload) assert rsp['code'] == 0 time.sleep(10) @@ -90,8 +87,19 @@ class TestCreateIndex(TestBase): assert expected_index[i]['fieldName'] == actual_index[i]['fieldName'] assert expected_index[i]['indexName'] == actual_index[i]['indexName'] assert expected_index[i]['metricType'] == actual_index[i]['metricType'] - assert expected_index[i]["params"]['index_type'] == actual_index[i]['indexType'] - + assert expected_index[i]["indexType"] == actual_index[i]['indexType'] + # check index by pymilvus + index_info = [index.to_dict() for index in c.indexes] + logger.info(f"index_info: {index_info}") + for index in index_info: + index_param = index["index_param"] + if index_param["index_type"] == "SPARSE_INVERTED_INDEX": + assert index_param["metric_type"] == "BM25" + assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"] + else: + assert index_param["metric_type"] == metric_type + assert index_param["index_type"] == index_type + assert index_param.get("params", {}) == index_param_map[index_type] # drop index for i in range(len(actual_index)): payload = { @@ -241,6 +249,119 @@ class TestCreateIndex(TestBase): assert expected_index[i]['indexName'] == actual_index[i]['indexName'] assert expected_index[i]['params']['index_type'] == actual_index[i]['indexType'] + @pytest.mark.parametrize("insert_round", [1]) + @pytest.mark.parametrize("auto_id", [True]) + @pytest.mark.parametrize("is_partition_key", [True]) + @pytest.mark.parametrize("enable_dynamic_schema", [True]) + @pytest.mark.parametrize("nb", [3000]) + @pytest.mark.parametrize("dim", [128]) + @pytest.mark.parametrize("tokenizer", ['default', 'jieba']) + @pytest.mark.parametrize("index_type", ['SPARSE_INVERTED_INDEX', 'SPARSE_WAND']) + @pytest.mark.parametrize("bm25_k1", [1.2, 1.5]) + @pytest.mark.parametrize("bm25_b", [0.7, 0.5]) + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365") + def test_create_index_for_full_text_search(self, nb, dim, insert_round, auto_id, is_partition_key, + enable_dynamic_schema, tokenizer, index_type, bm25_k1, bm25_b): + """ + Insert a vector with a simple payload + """ + # create a collection + name = gen_collection_name() + payload = { + "collectionName": name, + "schema": { + "autoId": auto_id, + "enableDynamicField": enable_dynamic_schema, + "fields": [ + {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, + {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, + "elementTypeParams": {}}, + {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, + {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, + {"fieldName": "document_content", "dataType": "VarChar", + "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True, + "analyzer_params": { + "tokenizer": tokenizer, + }, + "enable_match": True}}, + {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}, + ], + "functions": 
+                    {
+                        "name": "bm25_fn",
+                        "type": "BM25",
+                        "inputFieldNames": ["document_content"],
+                        "outputFieldNames": ["sparse_vector"],
+                        "params": {}
+                    }
+                ]
+            },
+        }
+        rsp = self.collection_client.collection_create(payload)
+        assert rsp['code'] == 0
+        rsp = self.collection_client.collection_describe(name)
+        logger.info(f"rsp: {rsp}")
+        assert rsp['code'] == 0
+        if tokenizer == 'default':
+            fake = fake_en
+        elif tokenizer == 'jieba':
+            fake = fake_zh
+        else:
+            raise Exception("Invalid tokenizer")
+
+        # insert data
+        for i in range(insert_round):
+            data = []
+            for j in range(nb):
+                idx = i * nb + j
+                if auto_id:
+                    tmp = {
+                        "user_id": idx % 100,
+                        "word_count": j,
+                        "book_describe": f"book_{idx}",
+                        "document_content": fake.text().lower(),
+                    }
+                else:
+                    tmp = {
+                        "book_id": idx,
+                        "user_id": idx % 100,
+                        "word_count": j,
+                        "book_describe": f"book_{idx}",
+                        "document_content": fake.text().lower(),
+                    }
+                if enable_dynamic_schema:
+                    tmp.update({f"dynamic_field_{i}": i})
+                data.append(tmp)
+            payload = {
+                "collectionName": name,
+                "data": data,
+            }
+            rsp = self.vector_client.vector_insert(payload)
+            assert rsp['code'] == 0
+            assert rsp['data']['insertCount'] == nb
+
+        # create index
+        payload = {
+            "collectionName": name,
+            "indexParams": [
+                {"fieldName": "sparse_vector", "indexName": "sparse_vector",
+                 "metricType": "BM25",
+                 "indexType": index_type,
+                 "params": {"bm25_k1": bm25_k1, "bm25_b": bm25_b}
+                 }
+            ]
+        }
+        rsp = self.index_client.index_create(payload)
+        assert rsp['code'] == 0
+        c = Collection(name)
+        index_info = [index.to_dict() for index in c.indexes]
+        logger.info(f"index_info: {index_info}")
+        for info in index_info:
+            assert info['index_param']['metric_type'] == 'BM25'
+            assert info['index_param']["params"]['bm25_k1'] == bm25_k1
+            assert info['index_param']["params"]['bm25_b'] == bm25_b
+            assert info['index_param']['index_type'] == index_type
+
 
 @pytest.mark.L1
 class TestCreateIndexNegative(TestBase):
diff --git a/tests/restful_client_v2/testcases/test_jobs_operation.py b/tests/restful_client_v2/testcases/test_jobs_operation.py
index 46f058cb3f..9cbe9b9688 100644
--- a/tests/restful_client_v2/testcases/test_jobs_operation.py
+++ b/tests/restful_client_v2/testcases/test_jobs_operation.py
@@ -101,7 +101,6 @@ class TestCreateImportJob(TestBase):
             assert False, "import job timeout"
         c = Collection(name)
         c.load(_refresh=True)
-        time.sleep(10)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -192,7 +191,6 @@ class TestCreateImportJob(TestBase):
             assert False, "import job timeout"
         c = Collection(name)
         c.load(_refresh=True)
-        time.sleep(10)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -285,7 +283,6 @@ class TestCreateImportJob(TestBase):
             assert False, "import job timeout"
         c = Collection(name)
         c.load(_refresh=True)
-        time.sleep(10)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -376,6 +373,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -456,6 +454,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -541,6 +540,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -665,6 +665,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == 6000
         # assert import data can be queried
         payload = {
@@ -915,6 +916,7 @@ class TestImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == file_nums * batch_size
         # assert import data can be queried
         payload = {
@@ -1007,6 +1009,7 @@ class TestCreateImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == file_nums * batch_size * task_num
         # assert import data can be queried
         payload = {
@@ -1096,6 +1099,7 @@ class TestCreateImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == file_nums * batch_size * task_num
         # assert import data can be queried
         payload = {
diff --git a/tests/restful_client_v2/testcases/test_vector_operations.py b/tests/restful_client_v2/testcases/test_vector_operations.py
index 991c12fe49..ecab26be90 100644
--- a/tests/restful_client_v2/testcases/test_vector_operations.py
+++ b/tests/restful_client_v2/testcases/test_vector_operations.py
@@ -6,7 +6,7 @@ import sys
 import json
 import time
 from utils import constant
-from utils.utils import gen_collection_name, get_sorted_distance
+from utils.utils import gen_collection_name, get_sorted_distance, patch_faker_text, en_vocabularies_distribution, zh_vocabularies_distribution
 from utils.util_log import test_log as logger
 import pytest
 from base.testbase import TestBase
@@ -20,6 +20,9 @@ Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
 
+patch_faker_text(fake_en, en_vocabularies_distribution)
+patch_faker_text(fake_zh, zh_vocabularies_distribution)
+
 
 @pytest.mark.L0
 class TestInsertVector(TestBase):
@@ -1193,14 +1196,108 @@ class TestSearchVector(TestBase):
         rsp = self.vector_client.vector_search(payload)
         assert rsp['code'] == 0
 
+
+    @pytest.mark.parametrize("insert_round", [1])
+    @pytest.mark.parametrize("auto_id", [True, False])
+    @pytest.mark.parametrize("is_partition_key", [True, False])
+    @pytest.mark.parametrize("enable_dynamic_schema", [True])
+    @pytest.mark.parametrize("nb", [3000])
+    @pytest.mark.parametrize("dim", [128])
+    @pytest.mark.parametrize("groupingField", ['user_id', None])
+    @pytest.mark.parametrize("tokenizer", ['default'])
+    def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
+                                                   is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
+        """
+        Full text search (BM25) over English documents, using raw text as the search input
+        """
+        # create a collection
+        name = gen_collection_name()
+        payload = {
+            "collectionName": name,
+            "schema": {
+                "autoId": auto_id,
+                "enableDynamicField": enable_dynamic_schema,
+                "fields": [
+                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
+                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
+                     "elementTypeParams": {}},
+                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
+                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
+                    {"fieldName": "document_content", "dataType": "VarChar",
+                     "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
+                                           "analyzer_params": {
+                                               "tokenizer": tokenizer,
+                                           },
+                                           "enable_match": True}},
+                    {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
"SparseFloatVector"}, + ], + "functions": [ + { + "name": "bm25_fn", + "type": "BM25", + "inputFieldNames": ["document_content"], + "outputFieldNames": ["sparse_vector"], + "params": {} + } + ] + }, + + "indexParams": [ + {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25", + "params": {"index_type": "SPARSE_INVERTED_INDEX"}} + ] + } + rsp = self.collection_client.collection_create(payload) + assert rsp['code'] == 0 + rsp = self.collection_client.collection_describe(name) + logger.info(f"rsp: {rsp}") + assert rsp['code'] == 0 + if tokenizer == 'default': + fake = fake_en + elif tokenizer == 'jieba': + fake = fake_zh + else: + raise Exception("Invalid tokenizer") + + # insert data + for i in range(insert_round): + data = [] + for j in range(nb): + idx = i * nb + j + if auto_id: + tmp = { + "user_id": idx%100, + "word_count": j, + "book_describe": f"book_{idx}", + "document_content": fake.text().lower(), + } + else: + tmp = { + "book_id": idx, + "user_id": idx%100, + "word_count": j, + "book_describe": f"book_{idx}", + "document_content": fake.text().lower(), + } + if enable_dynamic_schema: + tmp.update({f"dynamic_field_{i}": i}) + data.append(tmp) + payload = { + "collectionName": name, + "data": data, + } + rsp = self.vector_client.vector_insert(payload) + assert rsp['code'] == 0 + assert rsp['data']['insertCount'] == nb + assert rsp['code'] == 0 + # search data payload = { "collectionName": name, - "data": [gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="coo")], + "data": [fake.text().lower() for _ in range(1)], "filter": "word_count > 100", "outputFields": ["*"], "searchParams": { - "metricType": "IP", "params": { "drop_ratio_search": "0.2", } @@ -1211,6 +1308,125 @@ class TestSearchVector(TestBase): payload["groupingField"] = groupingField rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 + assert len(rsp['data']) > 0 + + + @pytest.mark.parametrize("insert_round", [1]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("is_partition_key", [True, False]) + @pytest.mark.parametrize("enable_dynamic_schema", [True]) + @pytest.mark.parametrize("nb", [3000]) + @pytest.mark.parametrize("dim", [128]) + @pytest.mark.parametrize("groupingField", ['user_id', None]) + @pytest.mark.parametrize("tokenizer", ['jieba']) + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751") + def test_search_vector_for_zh_full_text_search(self, nb, dim, insert_round, auto_id, + is_partition_key, enable_dynamic_schema, groupingField, tokenizer): + """ + Insert a vector with a simple payload + """ + # create a collection + name = gen_collection_name() + payload = { + "collectionName": name, + "schema": { + "autoId": auto_id, + "enableDynamicField": enable_dynamic_schema, + "fields": [ + {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, + {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, + "elementTypeParams": {}}, + {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, + {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, + {"fieldName": "document_content", "dataType": "VarChar", + "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True, + "analyzer_params": { + "tokenizer": tokenizer, + }, + "enable_match": True}}, + {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}, + ], + "functions": [ + { + "name": "bm25_fn", + "type": 
"BM25", + "inputFieldNames": ["document_content"], + "outputFieldNames": ["sparse_vector"], + "params": {} + } + ] + }, + + "indexParams": [ + {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25", + "params": {"index_type": "SPARSE_INVERTED_INDEX"}} + ] + } + rsp = self.collection_client.collection_create(payload) + assert rsp['code'] == 0 + rsp = self.collection_client.collection_describe(name) + logger.info(f"rsp: {rsp}") + assert rsp['code'] == 0 + if tokenizer == 'default': + fake = fake_en + elif tokenizer == 'jieba': + fake = fake_zh + else: + raise Exception("Invalid tokenizer") + + # insert data + for i in range(insert_round): + data = [] + for j in range(nb): + idx = i * nb + j + if auto_id: + tmp = { + "user_id": idx%100, + "word_count": j, + "book_describe": f"book_{idx}", + "document_content": fake.text().lower(), + } + else: + tmp = { + "book_id": idx, + "user_id": idx%100, + "word_count": j, + "book_describe": f"book_{idx}", + "document_content": fake.text().lower(), + } + if enable_dynamic_schema: + tmp.update({f"dynamic_field_{i}": i}) + data.append(tmp) + payload = { + "collectionName": name, + "data": data, + } + rsp = self.vector_client.vector_insert(payload) + assert rsp['code'] == 0 + assert rsp['data']['insertCount'] == nb + assert rsp['code'] == 0 + + # search data + payload = { + "collectionName": name, + "data": [fake.text().lower() for _ in range(2)], + "filter": "word_count > 100", + "outputFields": ["*"], + "searchParams": { + "params": { + "drop_ratio_search": "0.2", + } + }, + "limit": 500, + } + if groupingField: + payload["groupingField"] = groupingField + rsp = self.vector_client.vector_search(payload) + assert rsp['code'] == 0 + assert len(rsp['data']) > 0 + + + @pytest.mark.parametrize("insert_round", [2]) @pytest.mark.parametrize("auto_id", [True]) @@ -1790,6 +2006,29 @@ class TestSearchVectorNegative(TestBase): rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 1802 + @pytest.mark.parametrize("invalid_metric_type", ["L2", "IP", "UNSUPPORTED"]) + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37138") + def test_search_vector_with_invalid_metric_type(self, invalid_metric_type): + """ + Search a vector with a simple payload + """ + name = gen_collection_name() + self.name = name + self.init_collection(name, metric_type="COSINE") + + # search data + dim = 128 + payload = { + "collectionName": name, + "data": [preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()], + "searchParams": { + "metricType": invalid_metric_type + } + } + rsp = self.vector_client.vector_search(payload) + assert rsp['code'] != 0 + + @pytest.mark.parametrize("limit", [0, 16385]) def test_search_vector_with_invalid_limit(self, limit): """ diff --git a/tests/restful_client_v2/utils/utils.py b/tests/restful_client_v2/utils/utils.py index 0cd6a6e10e..d7fee27f1d 100644 --- a/tests/restful_client_v2/utils/utils.py +++ b/tests/restful_client_v2/utils/utils.py @@ -14,10 +14,74 @@ from sklearn.metrics import pairwise_distances from collections import Counter import bm25s import jieba + + fake = Faker() +fake.seed_instance(19530) rng = np.random.default_rng() +en_vocabularies_distribution = { + "hello": 0.01, + "milvus": 0.01, + "vector": 0.01, + "database": 0.01 +} + +zh_vocabularies_distribution = { + "你好": 0.01, + "向量": 0.01, + "数据": 0.01, + "库": 0.01 +} + + +def patch_faker_text(fake_instance, vocabularies_distribution): + """ + Monkey patch the text() method of a Faker instance to 
+    Each word in vocabularies_distribution has an independent chance to be inserted.
+    Args:
+        fake_instance: Faker instance to patch
+        vocabularies_distribution: Dictionary where:
+            - key: word to insert
+            - value: probability (0-1) of inserting this word into each sentence
+    Example:
+        vocabularies_distribution = {
+            "hello": 0.1,   # 10% chance to insert "hello" in each sentence
+            "milvus": 0.1,  # 10% chance to insert "milvus" in each sentence
+        }
+    """
+    original_text = fake_instance.text
+
+    def new_text(*args, **kwargs):
+        sentences = []
+        # Split original text into sentences
+        original_sentences = original_text(*args, **kwargs).split('.')
+        original_sentences = [s.strip() for s in original_sentences if s.strip()]
+
+        for base_sentence in original_sentences:
+            words = base_sentence.split()
+
+            # Independently decide whether to insert each word
+            for word, probability in vocabularies_distribution.items():
+                if random.random() < probability:
+                    # Choose random position to insert the word
+                    insert_pos = random.randint(0, len(words))
+                    words.insert(insert_pos, word)
+
+            # Reconstruct the sentence
+            base_sentence = ' '.join(words)
+
+            # Ensure proper capitalization
+            base_sentence = base_sentence[0].upper() + base_sentence[1:]
+            sentences.append(base_sentence)
+
+        return '. '.join(sentences) + '.'
+
+    # Replace the original text method with our custom one
+    fake_instance.text = new_text
+
+
 def analyze_documents(texts, language="en"):
     stopwords = "en"
     if language in ["en", "english"]: