From fd971a434f246b073e212b4c99b1507add8be801 Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Mon, 15 Apr 2024 19:39:31 +0800 Subject: [PATCH] test: add case for sparse vector and verify group by (#32217) add case for sparse vector and verify group by --------- Signed-off-by: zhuwenxing --- .../testcases/test_vector_operations.py | 94 ++++++++++++++++++- tests/restful_client_v2/utils/utils.py | 4 +- 2 files changed, 95 insertions(+), 3 deletions(-) diff --git a/tests/restful_client_v2/testcases/test_vector_operations.py b/tests/restful_client_v2/testcases/test_vector_operations.py index c6b35c9c32..869fee020f 100644 --- a/tests/restful_client_v2/testcases/test_vector_operations.py +++ b/tests/restful_client_v2/testcases/test_vector_operations.py @@ -586,7 +586,7 @@ class TestSearchVector(TestBase): for i in range(nb): if auto_id: tmp = { - "user_id": i%100, + "user_id": i%10, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), @@ -597,7 +597,7 @@ class TestSearchVector(TestBase): else: tmp = { "book_id": i, - "user_id": i%100, + "user_id": i%10, "word_count": i, "book_describe": f"book_{i}", "float_vector": gen_vector(datatype="FloatVector", dim=dim), @@ -634,6 +634,9 @@ class TestSearchVector(TestBase): } rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 200 + # assert no dup user_id + user_ids = [r["user_id"]for r in rsp['data']] + assert len(user_ids) == len(set(user_ids)) @pytest.mark.parametrize("insert_round", [1]) @pytest.mark.parametrize("auto_id", [True]) @@ -721,6 +724,93 @@ class TestSearchVector(TestBase): assert len(rsp['data']) == 100 + @pytest.mark.parametrize("insert_round", [1]) + @pytest.mark.parametrize("auto_id", [True]) + @pytest.mark.parametrize("is_partition_key", [True]) + @pytest.mark.parametrize("enable_dynamic_schema", [True]) + @pytest.mark.parametrize("nb", [3000]) + @pytest.mark.parametrize("dim", [128]) + @pytest.mark.xfail(reason="issue https://github.com/milvus-io/milvus/issues/32214") + def test_search_vector_with_sparse_float_vector_datatype(self, nb, dim, insert_round, auto_id, + is_partition_key, enable_dynamic_schema): + """ + Insert a vector with a simple payload + """ + # create a collection + name = gen_collection_name() + payload = { + "collectionName": name, + "schema": { + "autoId": auto_id, + "enableDynamicField": enable_dynamic_schema, + "fields": [ + {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, + {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, + "elementTypeParams": {}}, + {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, + {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, + {"fieldName": "sparse_float_vector", "dataType": "SparseFloatVector"}, + ] + }, + "indexParams": [ + {"fieldName": "sparse_float_vector", "indexName": "sparse_float_vector", "metricType": "IP", + "indexConfig": {"index_type": "SPARSE_INVERTED_INDEX", "drop_ratio_build": "0.2"}} + ] + } + rsp = self.collection_client.collection_create(payload) + assert rsp['code'] == 200 + rsp = self.collection_client.collection_describe(name) + logger.info(f"rsp: {rsp}") + assert rsp['code'] == 200 + # insert data + for i in range(insert_round): + data = [] + for i in range(nb): + if auto_id: + tmp = { + "user_id": i%100, + "word_count": i, + "book_describe": f"book_{i}", + "sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim), + } + else: + tmp = { + "book_id": i, + "user_id": i%100, + "word_count": i, + "book_describe": f"book_{i}", + "sparse_float_vector": gen_vector(datatype="SparseFloatVector", dim=dim), + } + if enable_dynamic_schema: + tmp.update({f"dynamic_field_{i}": i}) + data.append(tmp) + payload = { + "collectionName": name, + "data": data, + } + rsp = self.vector_client.vector_insert(payload) + assert rsp['code'] == 200 + assert rsp['data']['insertCount'] == nb + # search data + payload = { + "collectionName": name, + "data": [gen_vector(datatype="SparseFloatVector", dim=dim)], + "filter": "word_count > 100", + "groupingField": "user_id", + "outputFields": ["*"], + "searchParams": { + "metricType": "IP", + "params": { + "drop_ratio_search": "0.2", + } + }, + "limit": 100, + } + rsp = self.vector_client.vector_search(payload) + assert rsp['code'] == 200 + assert len(rsp['data']) == 100 + + @pytest.mark.parametrize("insert_round", [2]) @pytest.mark.parametrize("auto_id", [True]) diff --git a/tests/restful_client_v2/utils/utils.py b/tests/restful_client_v2/utils/utils.py index efa8fe9e0a..466bf1d453 100644 --- a/tests/restful_client_v2/utils/utils.py +++ b/tests/restful_client_v2/utils/utils.py @@ -12,7 +12,7 @@ from loguru import logger import datetime fake = Faker() - +rng = np.random.default_rng() def random_string(length=8): letters = string.ascii_letters @@ -201,6 +201,8 @@ def gen_vector(datatype="float_vector", dim=128, binary_data=False): value = None if datatype == "FloatVector": return preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist() + if datatype == "SparseFloatVector": + return {d: rng.random() for d in random.sample(range(dim), random.randint(20, 30))} if datatype == "BinaryVector": value = gen_binary_vectors(1, dim)[1][0] if datatype == "Float16Vector":