From 13a52016ac1acb6a8575be2c32e5f66fea2ddabc Mon Sep 17 00:00:00 2001 From: yanliang567 <82361606+yanliang567@users.noreply.github.com> Date: Tue, 2 Dec 2025 18:11:10 +0800 Subject: [PATCH] test: Update hybrid search tests with milvus client (#46003) related issue: https://github.com/milvus-io/milvus/issues/45326 Signed-off-by: yanliang567 --- tests/python_client/check/func_check.py | 51 +- .../test_milvus_client_hybrid_search_v2.py | 1474 +++++++---------- .../test_milvus_client_search_v2_new.py | 4 +- tests/python_client/requirements.txt | 4 +- .../async_milvus_client/test_e2e_async.py | 5 +- .../async_milvus_client/test_index_async.py | 8 +- 6 files changed, 646 insertions(+), 900 deletions(-) diff --git a/tests/python_client/check/func_check.py b/tests/python_client/check/func_check.py index 4bf3b40ad1..45c0108110 100644 --- a/tests/python_client/check/func_check.py +++ b/tests/python_client/check/func_check.py @@ -436,14 +436,13 @@ class ResponseChecker: original_entities = pandas.DataFrame(original_entities) pc.output_field_value_check(search_res, original_entities, pk_name=pk_name) if len(search_res) != check_items["nq"]: - log.error("search_results_check: Numbers of query searched (%d) " + log.error("search_results_check: Numbers of query searched(nq) (%d) " "is not equal with expected (%d)" % (len(search_res), check_items["nq"])) assert len(search_res) == check_items["nq"] else: log.info("search_results_check: Numbers of query searched is correct") # log.debug(search_res) - nq_i = 0 for hits in search_res: ids = [] distances = [] @@ -461,25 +460,24 @@ class ResponseChecker: % (len(hits), check_items["limit"])) assert len(hits) == check_items["limit"] assert len(ids) == check_items["limit"] - else: - if check_items.get("ids", None) is not None: - ids_match = pc.list_contain_check(ids, list(check_items["ids"])) - if not ids_match: - log.error("search_results_check: ids searched not match") - assert ids_match - elif check_items.get("metric", None) is not 
None: - # verify the distances are already sorted - if check_items.get("metric").upper() in ["IP", "COSINE", "BM25"]: - assert pc.compare_lists_with_epsilon_ignore_dict_order(distances, sorted(distances, reverse=True)) - else: - assert pc.compare_lists_with_epsilon_ignore_dict_order(distances, sorted(distances, reverse=False)) - if check_items.get("vector_nq") is None or check_items.get("original_vectors") is None: - log.debug("skip distance check for knowhere does not return the precise distances") - else: - pass + if check_items.get("ids", None) is not None: + ids_match = pc.list_contain_check(ids, list(check_items["ids"])) + if not ids_match: + log.error("search_results_check: ids searched not match") + assert ids_match + if check_items.get("metric", None) is not None: + # verify the distances are already sorted + num_to_check = min(100, len(distances)) # check 100 items if more than that + if check_items.get("metric").upper() in ["IP", "COSINE", "BM25"]: + assert distances[:num_to_check] == sorted(distances[:num_to_check], reverse=True) else: - pass # just check nq and topk, not specific ids need check - nq_i += 1 + assert distances[:num_to_check] == sorted(distances[:num_to_check], reverse=False) + if check_items.get("vector_nq") is None or check_items.get("original_vectors") is None: + log.debug("skip distance check for knowhere does not return the precise distances") + else: + pass + else: + pass # just check nq and topk, not specific ids need check log.info("search_results_check: limit (topK) and " "ids searched for %d queries are correct" % len(search_res)) @@ -586,12 +584,13 @@ class ResponseChecker: for single_query_result in query_res: single_query_result[vector_field] = np.frombuffer(single_query_result[vector_field][0], dtype=np.int8).tolist() if isinstance(query_res, list): - result = pc.compare_lists_with_epsilon_ignore_dict_order(a=query_res, b=exp_res) - if result is False: - # Only for debug, compare the result with deepdiff - 
pc.compare_lists_with_epsilon_ignore_dict_order_deepdiff(a=query_res, b=exp_res) - assert result - return result + debug_mode = check_items.get("debug_mode", False) + if debug_mode is True: + assert pc.compare_lists_with_epsilon_ignore_dict_order_deepdiff(a=query_res, b=exp_res) + else: + assert pc.compare_lists_with_epsilon_ignore_dict_order(a=query_res, b=exp_res), \ + f"there exists different values between query_results and expected_results, " \ + f"use debug_mode in check_items to print the difference entity by entity(but it is slow)" else: log.error(f"Query result {query_res} is not list") return False diff --git a/tests/python_client/milvus_client_v2/test_milvus_client_hybrid_search_v2.py b/tests/python_client/milvus_client_v2/test_milvus_client_hybrid_search_v2.py index 949a8b500c..8fec126dfa 100644 --- a/tests/python_client/milvus_client_v2/test_milvus_client_hybrid_search_v2.py +++ b/tests/python_client/milvus_client_v2/test_milvus_client_hybrid_search_v2.py @@ -29,57 +29,13 @@ cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution) cf.patch_faker_text(fake_zh, cf.zh_vocabularies_distribution) pd.set_option("expand_frame_repr", False) - -prefix = "search_collection" -search_num = 10 -max_dim = ct.max_dim -min_dim = ct.min_dim -epsilon = ct.epsilon -hybrid_search_epsilon = 0.01 -gracefulTime = ct.gracefulTime -default_nb = ct.default_nb -default_nb_medium = ct.default_nb_medium -default_nq = ct.default_nq -default_dim = ct.default_dim -default_limit = ct.default_limit -max_limit = ct.max_limit -default_search_exp = "int64 >= 0" -default_search_string_exp = "varchar >= \"0\"" -default_search_mix_exp = "int64 >= 0 && varchar >= \"0\"" -default_invaild_string_exp = "varchar >= 0" -default_json_search_exp = "json_field[\"number\"] >= 0" -perfix_expr = 'varchar like "0%"' -default_search_field = ct.default_float_vec_field_name -default_search_params = ct.default_search_params -default_int64_field_name = ct.default_int64_field_name 
-default_float_field_name = ct.default_float_field_name -default_bool_field_name = ct.default_bool_field_name -default_string_field_name = ct.default_string_field_name -default_json_field_name = ct.default_json_field_name -default_index_params = ct.default_index -vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)] -uid = "test_search" -nq = 1 epsilon = 0.001 -binary_field_name = default_binary_vec_field_name -search_param = {"nprobe": 1} -entity = gen_entities(1, is_normal=True) -entities = gen_entities(default_nb, is_normal=True) -raw_vectors, binary_entities = gen_binary_entities(default_nb) -index_name1 = cf.gen_unique_str("float") -index_name2 = cf.gen_unique_str("varhar") -half_nb = ct.default_nb // 2 -max_hybrid_search_req_num = ct.max_hybrid_search_req_num +hybrid_search_epsilon = 0.01 # test parameters for test client v2 base class default_primary_key_field_name = "id" default_vector_field_name = "vector" -partition_names = ["partition_1", "partition_2"] -float_vector_field_name1 = "float_vector1" -float_vector_field_name2 = "float_vector2" -sparse_vector_field_name1 = "text_sparse_emb1" -sparse_vector_field_name2 = "text_sparse_emb2" -max_nq = 16384 +default_limit = 100 @pytest.mark.xdist_group("TestMilvusClientHybridSearch") @@ -89,14 +45,37 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): def setup_class(self): super().setup_class(self) self.collection_name = "TestMilvusClientHybridSearch" + cf.gen_unique_str("_") - self.partition_names = partition_names - self.float_vector_field_name1 = float_vector_field_name1 - self.float_vector_field_name2 = float_vector_field_name2 - self.sparse_vector_field_name1 = sparse_vector_field_name1 - self.sparse_vector_field_name2 = sparse_vector_field_name2 + self.partition_names = ["partition_1", "partition_2"] + self.primary_key_field_name = "id" + self.float_vector_field_name1 = "float_vector1" + self.float_vector_field_name2 = "float_vector2" + 
self.sparse_vector_field_name1 = "sparse_vector1" + self.sparse_vector_field_name2 = "sparse_vector2" + self.dynamic_field_name1 = "dynamic_1" + self.dynamic_field_name2 = "dynamic_2" + self.text_field_name1 = "text1" + self.text_field_name2 = "text2" + self.json_field_name = "json" + self.string_field_name = "string" + self.int64_field_name = "int64" + self.all_fields = [ + self.primary_key_field_name, + self.float_vector_field_name1, + self.float_vector_field_name2, + self.sparse_vector_field_name1, + self.sparse_vector_field_name2, + self.dynamic_field_name1, + self.dynamic_field_name2, + self.text_field_name1, + self.text_field_name2, + self.json_field_name, + self.string_field_name, + self.int64_field_name + ] + self.float_vector_dim = 128 self.primary_keys = [] - self.enable_dynamic_field = False + self.enable_dynamic_field = True self.datas = [] @pytest.fixture(scope="class", autouse=True) @@ -111,37 +90,37 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): } # Create collection - collection_schema = self.create_schema(client)[0] - collection_schema.add_field(default_primary_key_field_name, DataType.INT64, is_primary=True, auto_id=False) + collection_schema = self.create_schema(client, enable_dynamic_field=self.enable_dynamic_field)[0] + collection_schema.add_field(self.primary_key_field_name, DataType.INT64, is_primary=True, auto_id=False) collection_schema.add_field(self.float_vector_field_name1, DataType.FLOAT_VECTOR, dim=self.float_vector_dim) collection_schema.add_field(self.float_vector_field_name2, DataType.FLOAT_VECTOR, dim=self.float_vector_dim) collection_schema.add_field(self.sparse_vector_field_name1, DataType.SPARSE_FLOAT_VECTOR) collection_schema.add_field(self.sparse_vector_field_name2, DataType.SPARSE_FLOAT_VECTOR) - collection_schema.add_field('text1', DataType.VARCHAR, max_length=6553, + collection_schema.add_field(self.text_field_name1, DataType.VARCHAR, max_length=6553, enable_analyzer=True, analyzer_params=analyzer_params) - 
collection_schema.add_field('text2', DataType.VARCHAR, max_length=6553, + collection_schema.add_field(self.text_field_name2, DataType.VARCHAR, max_length=6553, enable_analyzer=True, analyzer_params=analyzer_params) - collection_schema.add_field(default_int64_field_name, DataType.INT64) - collection_schema.add_field('json', DataType.JSON) - collection_schema.add_field(default_string_field_name, DataType.VARCHAR, max_length=256) + collection_schema.add_field(self.int64_field_name, DataType.INT64) + collection_schema.add_field(self.json_field_name, DataType.JSON) + collection_schema.add_field(self.string_field_name, DataType.VARCHAR, max_length=256) bm25_function1 = Function( name=self.sparse_vector_field_name1, function_type=FunctionType.BM25, - input_field_names=["text1"], + input_field_names=[self.text_field_name1], output_field_names=self.sparse_vector_field_name1, params={}, ) bm25_function2 = Function( name=self.sparse_vector_field_name2, function_type=FunctionType.BM25, - input_field_names=["text2"], + input_field_names=[self.text_field_name2], output_field_names=self.sparse_vector_field_name2, params={}, ) collection_schema.add_function(bm25_function1) collection_schema.add_function(bm25_function2) self.create_collection(client, self.collection_name, schema=collection_schema, - enable_dynamic_field=self.enable_dynamic_field, force_teardown=False) + force_teardown=False) for partition_name in self.partition_names: self.create_partition(client, self.collection_name, partition_name=partition_name) @@ -166,14 +145,16 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): for i in range(default_nb): pk = i + j * default_nb row = { - default_primary_key_field_name: pk, + self.primary_key_field_name: pk, self.float_vector_field_name1: list(float_vectors[pk]), self.float_vector_field_name2: list(float_vectors2[pk]), - "text1": texts1[pk], - "text2": texts2[pk], - "json": {"float": pk * 1.0, "str": str(pk)}, - default_string_field_name: str(pk), - 
default_int64_field_name: pk + self.text_field_name1: texts1[pk], + self.text_field_name2: texts2[pk], + self.json_field_name: {"float": pk * 1.0, "str": str(pk)}, + self.string_field_name: str(pk), + self.int64_field_name: pk, + self.dynamic_field_name1: f"dynamic_value_{pk}", + self.dynamic_field_name2: pk * 1.0, } self.datas.append(row) @@ -263,15 +244,15 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=WeightedRanker(*[0.6, 0.4]), limit=default_limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name, + "pk_name": self.primary_key_field_name, "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, - default_string_field_name]}) + "output_fields": [self.primary_key_field_name, + self.string_field_name]}) @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("req_limit_ratio", [1, 2]) @@ -304,32 +285,32 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): hybrid_search_0 = self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=WeightedRanker(*[0.6, 0.4]), limit=default_limit, - filter=f"{default_int64_field_name} > 1000", - output_fields=[default_primary_key_field_name, default_string_field_name], + filter=f"{self.int64_field_name} > 1000", + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": default_limit, "enable_milvus_client_api": True, - "pk_name": default_primary_key_field_name, - "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, - default_string_field_name]})[0] + "pk_name": self.primary_key_field_name, + 
"original_entities": self.datas, + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] hybrid_search_1 = self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=WeightedRanker(*[0.6, 0.4]), limit=default_limit, - filter=f"{default_int64_field_name} > 1000", - output_fields=[default_primary_key_field_name, default_string_field_name], + filter=f"{self.int64_field_name} > 1000", + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": default_limit, "enable_milvus_client_api": True, - "pk_name": default_primary_key_field_name, + "pk_name": self.primary_key_field_name, "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, - default_string_field_name]})[0] + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] # verify the hybrid search results are consistent for i in range(nq): assert hybrid_search_0[i].ids == hybrid_search_1[i].ids @@ -359,7 +340,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): "anns_field": field_name, "param": {}, "limit": default_limit, - "expr": f"{filter_min_value} < {default_primary_key_field_name} <= {filter_max_value}" + "expr": f"{filter_min_value} < {self.primary_key_field_name} <= {filter_max_value}" }) req_list.append(req) @@ -368,15 +349,15 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): ranker=ranker, limit=default_limit, # fitler=f"{default_primary_key_field_name} <= {filter_max_value}", - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name, - "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, - 
default_string_field_name]})[0] + "pk_name": self.primary_key_field_name, + "original_entities": self.datas, + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] # verify the hybrid search results meet the filter for i in range(nq): @@ -385,20 +366,20 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): # hybrid search again with filter filter_max_value2 = np.mean(res[0].ids) - filter = f"{default_primary_key_field_name} <= {filter_max_value2}" + filter = f"{self.primary_key_field_name} <= {filter_max_value2}" res2 = self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=ranker, limit=default_limit, fitler=filter, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name, - "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, - default_string_field_name]})[0] + "pk_name": self.primary_key_field_name, + "original_entities": self.datas, + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] # verify filter in hybrid search is not effective assert max(res2[i].ids) > filter_max_value2 @@ -426,7 +407,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): "anns_field": self.float_vector_field_name1, # on the same anns field "param": {}, "limit": default_limit, - "expr": f"{default_int64_field_name} > 100" + "expr": f"{self.int64_field_name} > 100" }) req_list.append(req) @@ -441,18 +422,138 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): check_items = {"nq": nq, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name, + "pk_name": self.primary_key_field_name, "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, 
default_string_field_name]} + "output_fields": [self.primary_key_field_name, self.string_field_name]} self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=ranker, limit=default_limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=check_task, check_items=check_items) @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("req_limit", [None, 1, ct.default_limit * 2, max_limit]) + def test_hybrid_search_over_max_limit(self): + """ + target: test hybrid search with over maximum limit + method: create connection, collection, insert and search + expected: hybrid search successfully with limit(topK) + """ + over_max_limit = ct.max_limit + 1 + # 1. initialize client + client = self._client() + # 2. extract vector field name + vector_name_list = [self.float_vector_field_name1, self.float_vector_field_name2] + search_data = cf.gen_vectors(1, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + # 3. 
prepare search params + weights = [0.3, 0.7] + ranker = WeightedRanker(*weights) + req_list1 = [] + for i in range(len(vector_name_list)): + _search_param = { + "data": search_data, + "anns_field": vector_name_list[i], + "param": {}, + "limit": 50, + } + req = AnnSearchRequest(**_search_param) + req_list1.append(req) + + # hybrid search with over max limit + error = {"err_code": 65535, "err_msg": f"invalid max query result window, (offset+limit) " + f"should be in range [1, 16384], but got {over_max_limit}"} + self.hybrid_search(client, self.collection_name, reqs=req_list1, + ranker=ranker, limit=over_max_limit, + check_task=CheckTasks.err_res, check_items=error) + + # hybrid search with over max limit in sub requests + req_list = [] + for i in range(len(vector_name_list)): + search_data = cf.gen_vectors(1, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + search_param = { + "data": search_data, + "anns_field": vector_name_list[i], + "param": {}, + "limit": over_max_limit, + } + req = AnnSearchRequest(**search_param) + req_list.append(req) + error = {"err_code": 65535, "err_msg": f"topk [{over_max_limit}] is invalid, " + f"it should be in range [1, 16384], but got {over_max_limit}"} + self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=ranker, limit=default_limit, + check_task=CheckTasks.err_res, check_items=error) + + # TODO: hybrid search with over max limit+offset in sub requests after #45939 fixed + # req_list = [] + # offset = 10 + # for i in range(len(vector_name_list)): + # search_data = cf.gen_vectors(1, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + # search_param = { + # "data": search_data, + # "anns_field": vector_name_list[i], + # "param": {"offset": offset}, + # "limit": over_max_limit - offset, + # } + # req = AnnSearchRequest(**search_param) + # req_list.append(req) + # error = {"err_code": 65535, "err_msg": f"topk [{over_max_limit}] is invalid, " + # f"it should be in range [1, 16384], but got 
{over_max_limit}"} + # self.hybrid_search(client, self.collection_name, reqs=req_list, + # ranker=ranker, limit=default_limit, + # check_task=CheckTasks.err_res, check_items=error) + + @pytest.mark.tags(CaseLabel.L1) + def test_hybrid_search_with_less_than_min_limit(self): + """ + target: test hybrid search with less than minimum limit + method: create connection, collection, insert and search + expected: hybrid search successfully with limit(topK) + """ + # 1. initialize collection with data + client = self._client() + # 2. extract vector field name + vector_name_list = [self.float_vector_field_name1, self.float_vector_field_name2] + # 3. hybrid search with less than minimum limit + req_list = [] + ranker = WeightedRanker(*[0.5, 0.5]) + limit = ct.min_limit - 1 + for i in range(len(vector_name_list)): + search_data = cf.gen_vectors(1, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + req = AnnSearchRequest(**{ + "data": search_data, + "anns_field": vector_name_list[i], + "param": {}, + "limit": ct.default_limit, + }) + req_list.append(req) + error = {"err_code": 1, + "err_msg": f"`limit` value {limit} is illegal"} + self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=ranker, limit=limit, + check_task=CheckTasks.err_res, check_items=error) + + # 4. 
hybrid search with less than minimum limit in sub request + req_list = [] + ranker = WeightedRanker(*[0.5, 0.5]) + limit = ct.min_limit - 1 + for i in range(len(vector_name_list)): + search_data = cf.gen_vectors(1, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + req = AnnSearchRequest(**{ + "data": search_data, + "anns_field": vector_name_list[i], + "param": {}, + "limit": limit, + }) + req_list.append(req) + error = {"err_code": 1, + "err_msg": f"topk [{limit}] is invalid, it should be in range [1, 16384], but got {limit}"} + self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=ranker, limit=ct.default_limit, + check_task=CheckTasks.err_res, check_items=error) + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("req_limit", [None, 1, ct.default_limit * 2, ct.max_limit]) def test_hybrid_search_diff_limits_in_search_req(self, req_limit): """ Test case: Hybrid search where individual search requests omit 'limit' @@ -463,7 +564,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): - Hybrid search completes successfully and returns results up to the specified topK limit. 
""" client = self._client() - + nq = 2 search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) # generate hybrid search request list req_list = [] @@ -481,18 +582,18 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=ranker, limit=ct.default_limit, - fitler=f"{default_int64_field_name} <= 18000", - output_fields=[default_primary_key_field_name, default_string_field_name], + fitler=f"{self.int64_field_name} <= 18000", + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": expected_limit, - "pk_name": default_primary_key_field_name, + "pk_name": self.primary_key_field_name, "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, - default_string_field_name]}) + "output_fields": [self.primary_key_field_name, + self.string_field_name]}) - @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.tags(CaseLabel.L0) def test_hybrid_search_as_search(self): """ target: test hybrid search to search as the original search interface @@ -521,30 +622,30 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): hybrid_res = self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=WeightedRanker(1), limit=default_limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name, + "pk_name": self.primary_key_field_name, "enable_milvus_client_api": True, - "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, - default_string_field_name]})[0] + "original_entities": self.datas, + "output_fields": [self.primary_key_field_name, + 
self.string_field_name]})[0] search_res = self.search(client, self.collection_name, data=search_data, anns_field=field_name, search_params={}, limit=default_limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": default_limit, "enable_milvus_client_api": True, - "pk_name": default_primary_key_field_name, + "pk_name": self.primary_key_field_name, "original_entities": self.datas, - "output_fields": [default_primary_key_field_name, - default_string_field_name]})[0] + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] for i in range(nq): assert hybrid_res[i].ids == search_res[i].ids @@ -557,7 +658,8 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): - Connects to the collection and prepares two different vector search requests. - Performs a standard search on each vector field to build reference scoring. - Merges the results using the RRFRanker (default parameters) for hybrid search. - - Compares the hybrid search scores with the expected RRFRanker baseline (ignoring IDs, since matches can have equal scores and non-deterministic IDs). + - Compares the hybrid search scores with the expected RRFRanker baseline (ignoring IDs, + since matches can have equal scores and non-deterministic IDs). - Verifies repeated hybrid searches with the same parameters produce consistent results. The test passes if hybrid search completes successfully and the scores match the manually computed baseline. @@ -566,6 +668,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): vector_name_list = [self.float_vector_field_name1, self.float_vector_field_name2] # 3. 
prepare search params for each vector field req_list = [] + nq = 1 # only works for nq=1, as the limitation of get_hybrid_search_base_results_rrf() search_res_dict_array = [] for field_name in vector_name_list: search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) @@ -575,7 +678,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): "anns_field": field_name, "param": {}, "limit": default_limit, - "expr": f"{default_int64_field_name} > 0"} + "expr": f"{self.int64_field_name} > 0"} req = AnnSearchRequest(**search_param) req_list.append(req) # search for get the baseline of hybrid_search @@ -583,12 +686,12 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): anns_field=field_name, search_params={}, limit=default_limit, - filter=f"{default_int64_field_name} > 0", + filter=f"{self.int64_field_name} > 0", check_task=CheckTasks.check_search_results, check_items={"nq": 1, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name})[0] + "pk_name": self.primary_key_field_name})[0] ids = search_res[0].ids for j in range(len(ids)): search_res_dict[ids[j]] = 1 / (j + 60 + 1) @@ -603,7 +706,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): check_items={"nq": 1, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name})[0] + "pk_name": self.primary_key_field_name})[0] # 6. 
compare results through the re-calculated distances for i in range(len(score_answer[:default_limit])): assert score_answer[i] - hybrid_search_0[0].distances[i] < hybrid_search_epsilon @@ -615,7 +718,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): check_items={"nq": 1, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name})[0] + "pk_name": self.primary_key_field_name})[0] assert hybrid_search_0[0].ids == hybrid_search_1[0].ids assert hybrid_search_0[0].distances == hybrid_search_1[0].distances @@ -648,14 +751,14 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): anns_field=vector_filed_names[i], search_params={}, limit=limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq, "ids": self.primary_keys, "limit": limit, - "pk_name": default_primary_key_field_name, - "output_fields": [default_primary_key_field_name, - default_string_field_name]})[0] + "pk_name": self.primary_key_field_name, + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] for j in range(nq): id_list_nq[j].extend(search_res[j].ids) @@ -675,82 +778,346 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): hybrid_search_res = self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=ranker, limit=larger_limit, - output_fields=[default_primary_key_field_name, - default_string_field_name], + output_fields=[self.primary_key_field_name, + self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq})[0] # verify the hybrid search results are consistent for i in range(nq): assert len(hybrid_search_res[i].ids) == len(list(set(id_list_nq[i]))) - # @pytest.mark.tags(CaseLabel.L2) - # def test_hybrid_search_with_range_search(self): - # """ - # target: test hybrid search with range search - # method: - # 
expected: raise exception (not support yet) - # """ - # client = self._client() - # nq = 2 - # limit = 200 - # - # field_names = [self.sparse_vector_field_name1, self.sparse_vector_field_name2] - # search_data = cf.gen_varchar_data(length=10, nb=nq, text_mode=True) - # - # # 0. search - # mid_distances = [] - # for i in range(len(field_names)): - # field_name = field_names[i] - # res_search = self.search(client, self.collection_name, data=search_data, - # anns_field=field_name, - # limit=limit)[0] - # field_mid_distances = [] - # for j in range(nq): - # field_mid_distances.append(res_search[j].distances[limit // 2 - 1]) - # mid_distances.append(np.mean(field_mid_distances)) - # - # # 1. hybrid search without range search - # req_list = [] - # for field_name in field_names: - # req = AnnSearchRequest(**{ - # "data": search_data, - # "anns_field": field_name, - # "param": {}, - # "limit": limit, - # }) - # req_list.append(req) - # res1 = self.hybrid_search(client, self.collection_name, reqs=req_list, - # ranker=WeightedRanker(0.5, 0.5), - # limit=limit, - # output_fields=[default_primary_key_field_name, default_string_field_name], - # check_task=CheckTasks.check_search_results, - # check_items={"nq": nq, "ids": self.primary_keys, "limit": limit, - # "pk_name": default_primary_key_field_name, - # "output_fields": [default_primary_key_field_name, - # default_string_field_name]})[0] - # - # # 2. 
hybrid search with range search one nq by one nq - # for i in range(nq): - # req_list2 = [] - # for j in range(len(field_names)): - # field_name = field_names[j] - # req = AnnSearchRequest(**{ - # "data": [search_data[i]], - # "anns_field": field_name, - # "param": {"params": {"radius": float(mid_distances[i]), "range_filter": 9999}}, - # "limit": limit//2, - # }) - # req_list2.append(req) - # res2 = self.hybrid_search(client, self.collection_name, reqs=req_list2, - # ranker=WeightedRanker(0.5, 0.5), - # limit=limit//2, - # output_fields=[default_primary_key_field_name, default_string_field_name], - # check_task=CheckTasks.check_search_results, - # check_items={"nq": 1, "ids": self.primary_keys, "limit": limit//2, - # "pk_name": default_primary_key_field_name, - # "output_fields": [default_primary_key_field_name, - # default_string_field_name]})[0] - # assert len(set(res2[0].ids).intersection(set(res1[0].ids[:limit//2]))) / len(res2[0].ids) >= 0.7, f"failed in nq={i}" + @pytest.mark.tags(CaseLabel.L2) + def test_hybrid_search_with_range_search(self): + """ + target: test hybrid search with range search + method: + expected: raise exception (not support yet) + """ + client = self._client() + limit = 200 + + field_names = [self.sparse_vector_field_name1, self.sparse_vector_field_name2] + nq = len(field_names) # nq should equal to number of filed names, as it would search nq by nq next + search_data = cf.gen_varchar_data(length=10, nb=nq, text_mode=True) + + # 0. search + mid_distances = [] + for i in range(len(field_names)): + field_name = field_names[i] + res_search = self.search(client, self.collection_name, data=search_data, + anns_field=field_name, + limit=limit)[0] + field_mid_distances = [] + for j in range(nq): + field_mid_distances.append(res_search[j].distances[limit // 2 - 1]) + mid_distances.append(np.mean(field_mid_distances)) + + # 1. 
hybrid search without range search + req_list = [] + for field_name in field_names: + req = AnnSearchRequest(**{ + "data": search_data, + "anns_field": field_name, + "param": {}, + "limit": limit, + }) + req_list.append(req) + res1 = self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(0.5, 0.5), + limit=limit, + output_fields=[self.primary_key_field_name, self.string_field_name], + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.primary_keys, "limit": limit, + "pk_name": self.primary_key_field_name, + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] + + # 2. hybrid search with range search one nq by one nq + for i in range(nq): + req_list2 = [] + for j in range(len(field_names)): + field_name = field_names[j] + req = AnnSearchRequest(**{ + "data": [search_data[i]], + "anns_field": field_name, + "param": {"params": {"radius": float(mid_distances[j]), "range_filter": 9999}}, + "limit": limit // 2, + }) + req_list2.append(req) + res2 = self.hybrid_search(client, self.collection_name, reqs=req_list2, + ranker=WeightedRanker(0.5, 0.5), + limit=limit // 2, + output_fields=[self.primary_key_field_name, self.string_field_name], + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, "ids": self.primary_keys, # "limit": limit // 2, + "pk_name": self.primary_key_field_name, + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] + hit_rate = len(set(res2[0].ids).intersection(set(res1[i].ids[:limit // 2]))) / len(res2[0].ids) + # log.debug(f"hybrid search with range nq={i} hit hybrid search without rage, hit rate: {hit_rate}") + assert hit_rate >= 0.7, f"failed in nq={i}" + + @pytest.mark.tags(CaseLabel.L2) + def test_hybrid_search_with_diff_output_fields(self): + """ + target: test hybrid search with different output fields + method: create connection, collection, insert and search + expected: hybrid search successfully with different output 
fields + """ + client = self._client() + nq = 2 + limit = 100 + search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + vector_filed_names = [self.float_vector_field_name1, self.float_vector_field_name2] + req_list = [] + for i in range(len(vector_filed_names)): + req = AnnSearchRequest(**{ + "data": search_data, + "anns_field": vector_filed_names[i], + "param": {}, + "limit": limit, + }) + req_list.append(req) + + # output * fields + output_fields = ["*"] + # sparse fields cannot be output, so specify the expected output fields + expected_output_fields = [field_name for field_name in self.all_fields + if field_name not in [self.sparse_vector_field_name1, self.sparse_vector_field_name2]] + res1 = self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(0.5, 0.5), + limit=limit, output_fields=output_fields, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.primary_keys, "limit": limit, + "pk_name": self.primary_key_field_name, + "output_fields": expected_output_fields})[0] + output_fields = self.all_fields + # verify the error message when output sparse vector field + err_msg = {"err_code": 999, + "err_msg": "not allowed to retrieve raw data of field sparse_vector1"} + self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(0.5, 0.5), + limit=limit, output_fields=output_fields, + check_task=CheckTasks.err_res, + check_items=err_msg) + # output all listed fields + output_fields = expected_output_fields + res2 = self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(0.5, 0.5), + limit=limit, output_fields=output_fields, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.primary_keys, "limit": limit, + "pk_name": self.primary_key_field_name, + "output_fields": expected_output_fields})[0] + # output some fields + output_fields = [self.primary_key_field_name, 
self.string_field_name, self.float_vector_field_name1, + self.float_vector_field_name2] + res3 = self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(0.5, 0.5), + limit=limit, output_fields=output_fields, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.primary_keys, "limit": limit, + "pk_name": self.primary_key_field_name, + "output_fields": output_fields})[0] + # output with dynamic field + output_fields = [self.primary_key_field_name, self.string_field_name, self.dynamic_field_name1, + self.dynamic_field_name2] + res4 = self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(0.5, 0.5), + limit=limit, output_fields=output_fields, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.primary_keys, "limit": limit, + "pk_name": self.primary_key_field_name, + "output_fields": output_fields})[0] + + @pytest.mark.tags(CaseLabel.L1) + def test_hybrid_search_result_always_descending_order(self): + """ + target: test hybrid search result always descending order in distance + method: create connection, collection, insert and search + expected: hybrid search successfully with result always descending order in distance + """ + client = self._client() + nq = 2 + limit = 100 + # test with float vector field + vector_filed_names = [self.float_vector_field_name1, self.float_vector_field_name2] + search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + req_list = [] + for i in range(len(vector_filed_names)): + req = AnnSearchRequest(**{ + "data": search_data, + "anns_field": vector_filed_names[i], + "param": {}, + "limit": limit, + }) + req_list.append(req) + descend_metric = "IP" # here only impacts the distance verification in descending order or not + self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(0.5, 0.5), + limit=limit, + check_task=CheckTasks.check_search_results, 
+ check_items={"nq": nq, "ids": self.primary_keys, + "limit": limit, + "pk_name": self.primary_key_field_name, + "metric": descend_metric}) + + # test with sparse vector field + vector_filed_names = [self.sparse_vector_field_name1, self.sparse_vector_field_name2] + search_data = cf.gen_varchar_data(length=10, nb=nq, text_mode=True) + req_list = [] + for i in range(len(vector_filed_names)): + req = AnnSearchRequest(**{ + "data": search_data, + "anns_field": vector_filed_names[i], + "param": {}, + "limit": limit, + }) + req_list.append(req) + self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(0.5, 0.5), + limit=limit, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.primary_keys, + "limit": limit, + "pk_name": self.primary_key_field_name, + "metric": descend_metric}) + + @pytest.mark.tags(CaseLabel.L2) + # @pytest.mark.parametrize("k", [1, 60, 1000]) + # @pytest.mark.parametrize("offset", [0, 5]) + @pytest.mark.skip(reason="milvus issue #32650") + def test_hybrid_search_RRFRanker_different_k(self): + """ + target: test hybrid search normal case + method: create connection, collection, insert and search. + Note: here the result check is through comparing the score, the ids could not be compared + because the high probability of the same score, then the id is not fixed in the range of + the same score + expected: hybrid search successfully with limit(topK) + """ + client = self._client() + k = 1 + offset = 0 + # 2. 
extract vector field name + nq = 1 # TODO: the verification function only works for nq = 1 for now + vector_name_list = [self.float_vector_field_name1, self.float_vector_field_name2] + req_list = [] + search_res_dict_array = [] + for i in range(len(vector_name_list)): + search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + search_res_dict = {} + search_param = { + "data": search_data, + "anns_field": vector_name_list[i], + "param": {}, + "limit": default_limit, + } + req = AnnSearchRequest(**search_param) + req_list.append(req) + # search for get the baseline of hybrid_search + search_res = self.search(client, self.collection_name, data=search_data, + anns_field=vector_name_list[i], + limit=default_limit, offset=offset, + output_fields=[self.primary_key_field_name, self.string_field_name], + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, + "ids": self.primary_keys, + "limit": default_limit, + "pk_name": self.primary_key_field_name, + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] + ids = search_res[0].ids + for j in range(len(ids)): + search_res_dict[ids[j]] = 1 / (j + k + 1) + search_res_dict_array.append(search_res_dict) + # 4. calculate hybrid search baseline for RRFRanker + ids_answer, score_answer = cf.get_hybrid_search_base_results_rrf(search_res_dict_array) + # 5. hybrid search + hybrid_res = self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=RRFRanker(k), + limit=default_limit, + offset=offset, + output_fields=[self.primary_key_field_name, self.string_field_name], + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, + "ids": self.primary_keys, + "limit": default_limit, + "pk_name": self.primary_key_field_name, + "output_fields": [self.primary_key_field_name, + self.string_field_name]})[0] + # 6. 
compare results through the re-calculated distances + for i in range(len(score_answer[:default_limit])): + assert score_answer[i] - hybrid_res[0].distances[i] < hybrid_search_epsilon * 2, f"failed in topk={i}" + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("limit", [1, 100, 16384]) + def test_hybrid_search_different_limit_round_decimal(self, limit): + """ + target: test hybrid search with different valid limit and round decimal + method: create connection, collection, insert and search + expected: hybrid search successfully with limit(topK) + """ + # 1. initialize client + client = self._client() + # 2. extract vector field name + vector_name_list = [self.sparse_vector_field_name1, self.sparse_vector_field_name2] + # 3. prepare search params + req_list = [] + weights = [0.3, 0.7] + search_res_dict_array = [] + if limit > default_nb: + limit = default_limit + metrics = [] + for i in range(len(vector_name_list)): + search_data = cf.gen_varchar_data(length=10, nb=1, text_mode=True) + search_res_dict = {} + search_param = { + "data": search_data, + "anns_field": vector_name_list[i], + "param": {}, + "limit": limit, + } + req = AnnSearchRequest(**search_param) + req_list.append(req) + metrics.append("BM25") + # search to get the baseline of hybrid_search + search_res = self.search(client, self.collection_name, data=search_data, + anns_field=vector_name_list[i], + limit=limit, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, + "ids": self.primary_keys, + "limit": limit, + "pk_name": self.primary_key_field_name})[0] + ids = search_res[0].ids + distance_array = search_res[0].distances + for j in range(len(ids)): + search_res_dict[ids[j]] = distance_array[j] + search_res_dict_array.append(search_res_dict) + # 4. calculate hybrid search baseline + ids_answer, score_answer = cf.get_hybrid_search_base_results(search_res_dict_array, weights, metrics, 5) + # 5. 
hybrid search + hybrid_res = self.hybrid_search(client, self.collection_name, reqs=req_list, + ranker=WeightedRanker(*weights), + limit=limit, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, + "ids": self.primary_keys, + "limit": limit, + "pk_name": self.primary_key_field_name})[0] + # 6. compare results through the re-calculated distances + for i in range(len(score_answer[:limit])): + delta = math.fabs(score_answer[i] - hybrid_res[0].distances[i]) + if delta >= hybrid_search_epsilon: + # print id and distance for debug + # answer and hybrid search result + for i1 in range(len(score_answer)): + log.info("answer id: %d, distance: %f" % (ids_answer[i1], score_answer[i1])) + for i2 in range(len(hybrid_res[0].ids)): + log.info( + "hybrid search res id: %d, distance: %f" % (hybrid_res[0].ids[i2], hybrid_res[0].distances[i2])) + assert delta < hybrid_search_epsilon @pytest.mark.tags(CaseLabel.L2) @pytest.mark.parametrize("offset", [1, 5]) @@ -779,8 +1146,8 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): hybrid_res_inside = self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=rerank, limit=ct.default_limit, - output_fields=[default_primary_key_field_name, - default_string_field_name], + output_fields=[self.primary_key_field_name, + self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq})[0] req_list = [] @@ -796,21 +1163,23 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): ranker=rerank, limit=ct.default_limit, offset=offset, - output_fields=[default_primary_key_field_name, - default_string_field_name], + output_fields=[self.primary_key_field_name, + self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq})[0] hybrid_res_no_offset = self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=rerank, limit=ct.default_limit, - output_fields=[default_primary_key_field_name, - default_string_field_name], + 
output_fields=[self.primary_key_field_name, + self.string_field_name], check_task=CheckTasks.check_search_results, check_items={"nq": nq})[0] for i in range(nq): assert hybrid_res_inside[i].ids[offset:] == \ hybrid_res_outside[i].ids[:-offset] == \ hybrid_res_no_offset[i].ids[offset:] + # TODO: verify the offset working, uncomment the assertion below after #45939 fixed + # assert hybrid_res_inside[i].ids != hybrid_res_no_offset[i] @pytest.mark.tags(CaseLabel.L2) @pytest.mark.parametrize("ranker", [WeightedRanker(*[0.5, 0.5]), RRFRanker()]) @@ -830,7 +1199,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): reqs=[], ranker=ranker, limit=default_limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.err_res, check_items=err_msg) @@ -846,6 +1215,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): """ client = self._client() req_list = [] + nq = 1 search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) for field_name in [self.float_vector_field_name1, self.float_vector_field_name2]: req = AnnSearchRequest(**{ @@ -867,7 +1237,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): reqs=req_list, ranker=ranker, limit=default_limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.err_res, check_items=err_msg) @@ -884,6 +1254,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): client = self._client() req_list = [] + nq = 2 search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) for field_name in [self.float_vector_field_name1, self.float_vector_field_name2]: req = AnnSearchRequest(**{ @@ -902,12 +1273,12 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): reqs=req_list, ranker=ranker, 
limit=default_limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=CheckTasks.err_res, check_items=err_msg) @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("nq", [max_nq, max_nq + 1]) + @pytest.mark.parametrize("nq", [ct.max_nq, ct.max_nq + 1]) def test_hybrid_search_max_nq(self, nq): """ Test case: Hybrid search with valid and boundary nq values @@ -931,7 +1302,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): }) req_list.append(req) - if nq == max_nq + 1: + if nq == ct.max_nq + 1: check_task = CheckTasks.err_res check_items = {"err_code": 65535, "err_msg": "nq (number of search vector per search request) should be in range [1, 16384]"} @@ -940,12 +1311,12 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): check_items = {"nq": nq, "ids": self.primary_keys, "limit": default_limit, - "pk_name": default_primary_key_field_name, - "output_fields": [default_primary_key_field_name, default_string_field_name]} + "pk_name": self.primary_key_field_name, + "output_fields": [self.primary_key_field_name, self.string_field_name]} self.hybrid_search(client, self.collection_name, reqs=req_list, ranker=WeightedRanker(*[0.6, 0.4]), limit=default_limit, - output_fields=[default_primary_key_field_name, default_string_field_name], + output_fields=[self.primary_key_field_name, self.string_field_name], check_task=check_task, check_items=check_items) @@ -953,26 +1324,6 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base): class TestCollectionHybridSearchValid(TestcaseBase): """ Test case of search interface """ - @pytest.fixture(scope="function", params=[1, 10]) - def nq(self, request): - yield request.param - - @pytest.fixture(scope="function", params=[default_nb_medium]) - def nb(self, request): - yield request.param - - @pytest.fixture(scope="function", params=[32, 128]) - def dim(self, request): - yield request.param - - 
@pytest.fixture(scope="function", params=[False, True]) - def auto_id(self, request): - yield request.param - - @pytest.fixture(scope="function", params=[False, True]) - def _async(self, request): - yield request.param - @pytest.fixture(scope="function", params=["JACCARD", "HAMMING"]) def metrics(self, request): yield request.param @@ -1004,7 +1355,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): """ @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) + @pytest.mark.parametrize("primary_field", [ct.default_string_field_name]) def test_hybrid_search_normal(self, is_flush, primary_field, vector_data_type): """ target: test hybrid search normal case @@ -1015,7 +1366,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): nq = 2 offset = 5 # create db - db_name = cf.gen_unique_str(prefix) + db_name = cf.gen_unique_str("db") self.database_wrap.create_database(db_name) # using db and create collection self.database_wrap.using_database(db_name) @@ -1025,7 +1376,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): enable_dynamic_field = True multiple_dim_array = [dim, dim] collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, dim=dim, is_flush=is_flush, + self.init_collection_general("", True, dim=dim, is_flush=is_flush, primary_field=primary_field, enable_dynamic_field=enable_dynamic_field, multiple_dim_array=multiple_dim_array, vector_data_type=vector_data_type, @@ -1063,7 +1414,6 @@ class TestCollectionHybridSearchValid(TestcaseBase): # 5. 
search to get the baseline of hybrid_search search_res = collection_w.search([vectors_search], vector_name_list[i], single_search_param, default_limit, - default_search_exp, check_task=CheckTasks.check_search_results, check_items={"nq": 1, "ids": insert_ids, @@ -1099,7 +1449,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): self.database_wrap.drop_database(db_name) @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) + @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name]) def test_hybrid_search_different_metric_type(self, primary_field, is_flush, metric_type): """ target: test hybrid search for fields with different metric type @@ -1110,7 +1460,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): dim = 128 nq = 3 collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, dim=dim, is_flush=is_flush, is_index=False, + self.init_collection_general("", True, dim=dim, is_flush=is_flush, is_index=False, primary_field=primary_field, enable_dynamic_field=False, multiple_dim_array=[dim, dim])[0:5] # 2. 
extract vector field name @@ -1126,7 +1476,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): search_param = { "data": [[random.random() for _ in range(dim)] for _ in range(nq)], "anns_field": vector_name, - "param": {"metric_type": metric_type, "offset": 0}, + "param": {}, "limit": default_limit, "expr": "int64 > 0"} req = AnnSearchRequest(**search_param) @@ -1140,7 +1490,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): "pk_name": ct.default_int64_field_name}) @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) + @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name]) def test_hybrid_search_different_metric_type_each_field(self, primary_field, is_flush, metric_type): """ target: test hybrid search for fields with different metric type @@ -1151,7 +1501,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): dim = 91 nq = 4 collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, dim=dim, is_flush=is_flush, is_index=False, + self.init_collection_general("", True, dim=dim, is_flush=is_flush, is_index=False, primary_field=primary_field, enable_dynamic_field=False, multiple_dim_array=[dim, dim])[0:5] # 2. 
extract vector field name @@ -1169,7 +1519,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): search_param = { "data": [[random.random() for _ in range(dim)] for _ in range(nq)], "anns_field": vector_name_list[0], - "param": {"metric_type": "L2", "offset": 0}, + "param": {"metric_type": "L2"}, "limit": default_limit, "expr": "int64 > 0"} req = AnnSearchRequest(**search_param) @@ -1177,7 +1527,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): search_param = { "data": [[random.random() for _ in range(dim)] for _ in range(nq)], "anns_field": vector_name_list[1], - "param": {"metric_type": "IP", "offset": 0}, + "param": {"metric_type": "IP"}, "limit": default_limit, "expr": "int64 > 0"} req = AnnSearchRequest(**search_param) @@ -1185,557 +1535,30 @@ class TestCollectionHybridSearchValid(TestcaseBase): search_param = { "data": [[random.random() for _ in range(dim)] for _ in range(nq)], "anns_field": vector_name_list[2], - "param": {"metric_type": "COSINE", "offset": 0}, + "param": {"metric_type": "COSINE"}, "limit": default_limit, "expr": "int64 > 0"} req = AnnSearchRequest(**search_param) req_list.append(req) # 4. 
hybrid search - hybrid_search = collection_w.hybrid_search(req_list, WeightedRanker(0.1, 0.9, 1), default_limit, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name})[0] + collection_w.hybrid_search(req_list, WeightedRanker(0.1, 0.9, 1), default_limit, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, + "ids": insert_ids, + "limit": default_limit, + "pk_name": ct.default_int64_field_name}) @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) - def test_hybrid_search_WeightedRanker_different_parameters(self, primary_field, is_flush, metric_type): + def test_hybrid_search_WeightedRanker_different_parameters(self): """ target: test hybrid search for fields with different offset method: create connection, collection, insert and search expected: hybrid search successfully with limit(topK) """ - # 1. initialize collection with data - dim = 63 - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, auto_id=True, dim=dim, is_flush=is_flush, is_index=False, - primary_field=primary_field, - enable_dynamic_field=False, multiple_dim_array=[dim, dim])[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - flat_index = {"index_type": "FLAT", "params": {}, "metric_type": metric_type} - for vector_name in vector_name_list: - collection_w.create_index(vector_name, flat_index) - collection_w.load() - # 3. 
prepare search params - req_list = [] - for i in range(len(vector_name_list)): - search_param = { - "data": [[random.random() for _ in range(dim)] for _ in range(1)], - "anns_field": vector_name_list[i], - "param": {"metric_type": metric_type, "offset": i}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - # 4. hybrid search - collection_w.hybrid_search(req_list, WeightedRanker(0.2, 0.03, 0.9), default_limit, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name}) - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) - def test_hybrid_search_with_range_search(self, primary_field): - """ - target: test hybrid search with range search - method: create connection, collection, insert and search - expected: raise exception (not support yet) - """ - # 1. initialize collection with data - multiple_dim_array = [default_dim, default_dim] - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, dim=default_dim, is_index=False, - primary_field=primary_field, - multiple_dim_array=multiple_dim_array)[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - flat_index = {"index_type": "FLAT", "params": {}, "metric_type": "COSINE"} - for vector_name in vector_name_list: - collection_w.create_index(vector_name, flat_index) - collection_w.create_index(ct.default_float_vec_field_name, flat_index) - collection_w.load() - reqs_max_num = 2 - # 3. 
prepare search params - req_list = [] - for i in range(reqs_max_num): - search_param = { - "data": [[random.random() for _ in range(default_dim)] for _ in range(1)], - "anns_field": default_search_field, - "param": {"metric_type": "COSINE", "params": {"radius": 0, "range_filter": 1000}}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - weights = [random.random() for _ in range(len(req_list))] - log.info(weights) - # 4. hybrid search - collection_w.hybrid_search(req_list, WeightedRanker(*weights), default_limit, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name}) - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("k", [1, 60, 1000, 16383]) - @pytest.mark.parametrize("offset", [0, 1, 5]) - @pytest.mark.skip("https://github.com/milvus-io/milvus/issues/32650") - def test_hybrid_search_RRFRanker_different_k(self, is_flush, k, offset): - """ - target: test hybrid search normal case - method: create connection, collection, insert and search. - Note: here the result check is through comparing the score, the ids could not be compared - because the high probability of the same score, then the id is not fixed in the range of - the same score - expected: hybrid search successfully with limit(topK) - """ - # 1. initialize collection with data - dim = 200 - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, auto_id=False, dim=dim, is_flush=is_flush, - enable_dynamic_field=False, multiple_dim_array=[dim, dim])[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - # 3. 
prepare search params for each vector field - req_list = [] - search_res_dict_array = [] - for i in range(len(vector_name_list)): - vectors = [[random.random() for _ in range(dim)] for _ in range(1)] - search_res_dict = {} - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "COSINE"}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - # search for get the baseline of hybrid_search - search_res = collection_w.search(vectors[:1], vector_name_list[i], - default_search_params, default_limit, - default_search_exp, offset=0, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name})[0] - ids = search_res[0].ids - for j in range(len(ids)): - search_res_dict[ids[j]] = 1 / (j + k + 1) - search_res_dict_array.append(search_res_dict) - # 4. calculate hybrid search baseline for RRFRanker - ids_answer, score_answer = cf.get_hybrid_search_base_results_rrf(search_res_dict_array) - # 5. hybrid search - hybrid_res = collection_w.hybrid_search(req_list, RRFRanker(k), default_limit, - offset=offset, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name})[0] - # 6. 
compare results through the re-calculated distances - for i in range(len(score_answer[:default_limit])): - assert score_answer[i] - hybrid_res[0].distances[i] < hybrid_search_epsilon - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("limit", [1, 100, 16384]) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) - def test_hybrid_search_different_limit_round_decimal(self, primary_field, limit): - """ - target: test hybrid search with different valid limit and round decimal - method: create connection, collection, insert and search - expected: hybrid search successfully with limit(topK) - """ - # 1. initialize collection with data - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, primary_field=primary_field, - multiple_dim_array=[default_dim, default_dim])[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - # 3. 
prepare search params - req_list = [] - weights = [0.2, 0.3, 0.5] - search_res_dict_array = [] - if limit > default_nb: - limit = default_limit - metrics = [] - for i in range(len(vector_name_list)): - vectors = [[random.random() for _ in range(default_dim)] for _ in range(1)] - search_res_dict = {} - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "COSINE", "offset": 0}, - "limit": limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - metrics.append("COSINE") - # search to get the base line of hybrid_search - search_res = collection_w.search(vectors[:1], vector_name_list[i], - default_search_params, limit, - default_search_exp, round_decimal=5, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": limit, - "pk_name": ct.default_int64_field_name})[0] - ids = search_res[0].ids - distance_array = search_res[0].distances - for j in range(len(ids)): - search_res_dict[ids[j]] = distance_array[j] - search_res_dict_array.append(search_res_dict) - # 4. calculate hybrid search base line - ids_answer, score_answer = cf.get_hybrid_search_base_results(search_res_dict_array, weights, metrics, 5) - # 5. hybrid search - hybrid_res = collection_w.hybrid_search(req_list, WeightedRanker(*weights), limit, - round_decimal=5, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": limit, - "pk_name": ct.default_int64_field_name})[0] - # 6. 
compare results through the re-calculated distances - for i in range(len(score_answer[:limit])): - delta = math.fabs(score_answer[i] - hybrid_res[0].distances[i]) - if delta >= hybrid_search_epsilon: - # print id and distance for debug - # answer and hybrid search result - for i1 in range(len(score_answer)): - log.info("answer id: %d, distance: %f" % (ids_answer[i1], score_answer[i1])) - for i2 in range(len(hybrid_res[0].ids)): - log.info( - "hybrid search res id: %d, distance: %f" % (hybrid_res[0].ids[i2], hybrid_res[0].distances[i2])) - assert delta < hybrid_search_epsilon - - @pytest.mark.tags(CaseLabel.L1) - def test_hybrid_search_limit_out_of_range_max(self): - """ - target: test hybrid search with over maximum limit - method: create connection, collection, insert and search - expected: hybrid search successfully with limit(topK) - """ - # 1. initialize collection with data - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, multiple_dim_array=[default_dim, default_dim])[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - # 3. prepare search params - req_list = [] - weights = [0.2, 0.3, 0.5] - for i in range(len(vector_name_list)): - vectors = [[random.random() for _ in range(default_dim)] for _ in range(1)] - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "COSINE", "offset": 0}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - # 4. 
hybrid search with over maximum limit - limit = 16385 - error = {ct.err_code: 65535, ct.err_msg: "invalid max query result window, (offset+limit) " - "should be in range [1, 16384], but got %d" % limit} - collection_w.hybrid_search(req_list, WeightedRanker(*weights), limit, - check_task=CheckTasks.err_res, check_items=error) - - @pytest.mark.tags(CaseLabel.L1) - def test_hybrid_search_limit_out_of_range_min(self): - """ - target: test hybrid search with over minimum limit - method: create connection, collection, insert and search - expected: hybrid search successfully with limit(topK) - """ - # 1. initialize collection with data - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, multiple_dim_array=[default_dim, default_dim])[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - # 3. prepare search params - req_list = [] - weights = [0.2, 0.3, 0.5] - for i in range(len(vector_name_list)): - vectors = [[random.random() for _ in range(default_dim)] for _ in range(1)] - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "COSINE", "offset": 0}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - # 4. 
hybrid search with over maximum limit - limit = 0 - error = {ct.err_code: 1, ct.err_msg: "`limit` value 0 is illegal"} - collection_w.hybrid_search(req_list, WeightedRanker(*weights), limit, - check_task=CheckTasks.err_res, check_items=error) - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) - def test_hybrid_search_with_output_fields(self, nq, dim, auto_id, is_flush, enable_dynamic_field, - primary_field, vector_data_type): - """ - target: test hybrid search normal case - method: create connection, collection, insert and search - expected: hybrid search successfully with limit(topK) - """ - # 1. initialize collection with data - nq = 10 - multiple_dim_array = [dim, dim] - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_flush=is_flush, - primary_field=primary_field, - enable_dynamic_field=enable_dynamic_field, - multiple_dim_array=multiple_dim_array, - vector_data_type=vector_data_type)[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - # 3. 
prepare search params - req_list = [] - weights = [0.2, 0.3, 0.5] - metrics = [] - search_res_dict_array = [] - search_res_dict_array_nq = [] - vectors = cf.gen_vectors(nq, dim, vector_data_type) - - # get hybrid search req list - for i in range(len(vector_name_list)): - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "COSINE"}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - metrics.append("COSINE") - - # get the result of search with the same params of the following hybrid search - single_search_param = {"metric_type": "COSINE", "params": {"nprobe": 10}} - for k in range(nq): - for i in range(len(vector_name_list)): - search_res_dict = {} - search_res_dict_array = [] - vectors_search = vectors[k] - # 5. search to get the base line of hybrid_search - search_res = collection_w.search([vectors_search], vector_name_list[i], - single_search_param, default_limit, - default_search_exp, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name})[0] - ids = search_res[0].ids - distance_array = search_res[0].distances - for j in range(len(ids)): - search_res_dict[ids[j]] = distance_array[j] - search_res_dict_array.append(search_res_dict) - search_res_dict_array_nq.append(search_res_dict_array) - - # 6. calculate hybrid search base line - score_answer_nq = [] - for k in range(nq): - ids_answer, score_answer = cf.get_hybrid_search_base_results(search_res_dict_array_nq[k], weights, metrics) - score_answer_nq.append(score_answer) - # 7. 
hybrid search - output_fields = [default_int64_field_name] - hybrid_res = collection_w.hybrid_search(req_list, WeightedRanker(*weights), default_limit, - output_fields=output_fields, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name})[0] - # 8. compare results through the re-calculated distances - for k in range(len(score_answer_nq)): - for i in range(len(score_answer_nq[k][:default_limit])): - assert score_answer_nq[k][i] - hybrid_res[k].distances[i] < hybrid_search_epsilon - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) - def test_hybrid_search_with_output_fields_all_fields(self, nq, dim, auto_id, is_flush, enable_dynamic_field, - primary_field, vector_data_type): - """ - target: test hybrid search normal case - method: create connection, collection, insert and search - expected: hybrid search successfully with limit(topK) - """ - # 1. initialize collection with data - nq = 10 - multiple_dim_array = [dim, dim] - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_flush=is_flush, - primary_field=primary_field, - enable_dynamic_field=enable_dynamic_field, - multiple_dim_array=multiple_dim_array, - vector_data_type=vector_data_type)[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - # 3. 
prepare search params - req_list = [] - weights = [0.2, 0.3, 0.5] - metrics = [] - search_res_dict_array = [] - search_res_dict_array_nq = [] - vectors = cf.gen_vectors(nq, dim, vector_data_type) - - # get hybrid search req list - for i in range(len(vector_name_list)): - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "COSINE"}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - metrics.append("COSINE") - - # get the result of search with the same params of the following hybrid search - single_search_param = {"metric_type": "COSINE", "params": {"nprobe": 10}} - for k in range(nq): - for i in range(len(vector_name_list)): - search_res_dict = {} - search_res_dict_array = [] - vectors_search = vectors[k] - # 5. search to get the base line of hybrid_search - search_res = collection_w.search([vectors_search], vector_name_list[i], - single_search_param, default_limit, - default_search_exp, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name})[0] - ids = search_res[0].ids - distance_array = search_res[0].distances - for j in range(len(ids)): - search_res_dict[ids[j]] = distance_array[j] - search_res_dict_array.append(search_res_dict) - search_res_dict_array_nq.append(search_res_dict_array) - - # 6. calculate hybrid search base line - score_answer_nq = [] - for k in range(nq): - ids_answer, score_answer = cf.get_hybrid_search_base_results(search_res_dict_array_nq[k], weights, metrics) - score_answer_nq.append(score_answer) - # 7. 
hybrid search - output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name, - default_json_field_name] - output_fields = output_fields + vector_name_list - hybrid_res = collection_w.hybrid_search(req_list, WeightedRanker(*weights), default_limit, - output_fields=output_fields, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name})[0] - # 8. compare results through the re-calculated distances - for k in range(len(score_answer_nq)): - for i in range(len(score_answer_nq[k][:default_limit])): - assert score_answer_nq[k][i] - hybrid_res[k].distances[i] < hybrid_search_epsilon - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("output_fields", - [[default_search_field], [default_search_field, default_int64_field_name]]) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) - def test_hybrid_search_with_output_fields_sync_async(self, nq, primary_field, output_fields, _async): - """ - target: test hybrid search normal case - method: create connection, collection, insert and search - expected: hybrid search successfully with limit(topK) - """ - # 1. initialize collection with data - multiple_dim_array = [default_dim, default_dim] - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, dim=default_dim, - primary_field=primary_field, - multiple_dim_array=multiple_dim_array)[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - # 3. 
prepare search params - req_list = [] - weights = [0.2, 0.3, 0.5] - metrics = [] - search_res_dict_array = [] - search_res_dict_array_nq = [] - vectors = cf.gen_vectors(nq, default_dim, vector_data_type=DataType.FLOAT_VECTOR) - - # get hybrid search req list - for i in range(len(vector_name_list)): - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "COSINE"}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - metrics.append("COSINE") - - # get the result of search with the same params of the following hybrid search - single_search_param = {"metric_type": "COSINE", "params": {"nprobe": 10}} - for k in range(nq): - for i in range(len(vector_name_list)): - search_res_dict = {} - search_res_dict_array = [] - vectors_search = vectors[k] - # 5. search to get the base line of hybrid_search - search_res = collection_w.search([vectors_search], vector_name_list[i], - single_search_param, default_limit, - default_search_exp, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name, - "_async": _async})[0] - if _async: - search_res.done() - search_res = search_res.result() - ids = search_res[0].ids - distance_array = search_res[0].distances - for j in range(len(ids)): - search_res_dict[ids[j]] = distance_array[j] - search_res_dict_array.append(search_res_dict) - search_res_dict_array_nq.append(search_res_dict_array) - - # 6. calculate hybrid search base line - score_answer_nq = [] - for k in range(nq): - ids_answer, score_answer = cf.get_hybrid_search_base_results(search_res_dict_array_nq[k], weights, metrics) - score_answer_nq.append(score_answer) - # 7. 
hybrid search
-        hybrid_res = collection_w.hybrid_search(req_list, WeightedRanker(*weights), default_limit,
-                                                output_fields=output_fields, _async=_async,
-                                                check_task=CheckTasks.check_search_results,
-                                                check_items={"nq": nq,
-                                                             "ids": insert_ids,
-                                                             "limit": default_limit,
-                                                             "_async": _async,
-                                                             "pk_name": ct.default_int64_field_name})[0]
-        if _async:
-            hybrid_res.done()
-            hybrid_res = hybrid_res.result()
-        # 8. compare results through the re-calculated distances
-        for k in range(len(score_answer_nq)):
-            for i in range(len(score_answer_nq[k][:default_limit])):
-                assert score_answer_nq[k][i] - hybrid_res[k].distances[i] < hybrid_search_epsilon
+        # TODO: to be implemented
+        pass
 
+    @pytest.mark.skip(reason="skip for #45939")
     @pytest.mark.tags(CaseLabel.L2)
     @pytest.mark.parametrize("rerank", [RRFRanker(), WeightedRanker(0.1, 0.9, 1)])
     def test_hybrid_search_offset_both_inside_outside_params(self, rerank):
@@ -1749,7 +1572,7 @@ class TestCollectionHybridSearchValid(TestcaseBase):
         """
         # 1. initialize collection with data
         collection_w, _, _, insert_ids, time_stamp = \
-            self.init_collection_general(prefix, True, multiple_dim_array=[default_dim, default_dim])[0:5]
+            self.init_collection_general("", True, multiple_dim_array=[default_dim, default_dim])[0:5]
         # 2. 
extract vector field name vector_name_list = cf.extract_vector_field_name_list(collection_w) vector_name_list.append(ct.default_float_vec_field_name) @@ -1776,16 +1599,17 @@ class TestCollectionHybridSearchValid(TestcaseBase): @pytest.mark.tags(CaseLabel.L2) @pytest.mark.parametrize("limit", [1, 100, 16384]) - @pytest.mark.parametrize("primary_field", [ct.default_int64_field_name, ct.default_string_field_name]) - def test_hybrid_search_is_partition_key(self, nq, primary_field, limit, vector_data_type): + @pytest.mark.parametrize("primary_field", [ct.default_string_field_name]) + def test_hybrid_search_is_partition_key(self, primary_field, limit, vector_data_type): """ target: test hybrid search with different valid limit and round decimal method: create connection, collection, insert and search expected: hybrid search successfully with limit(topK) """ + nq = 2 # 1. initialize collection with data collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, primary_field=primary_field, + self.init_collection_general("", True, primary_field=primary_field, multiple_dim_array=[default_dim, default_dim], vector_data_type=vector_data_type, is_partition_key=ct.default_float_field_name)[0:5] @@ -1822,7 +1646,6 @@ class TestCollectionHybridSearchValid(TestcaseBase): # 5. search to get the base line of hybrid_search search_res = collection_w.search([vectors_search], vector_name_list[i], single_search_param, default_limit, - default_search_exp, check_task=CheckTasks.check_search_results, check_items={"nq": 1, "ids": insert_ids, @@ -1835,7 +1658,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): search_res_dict_array.append(search_res_dict) search_res_dict_array_nq.append(search_res_dict_array) - # 6. calculate hybrid search base line + # 6. 
calculate hybrid search baseline score_answer_nq = [] for k in range(nq): ids_answer, score_answer = cf.get_hybrid_search_base_results(search_res_dict_array_nq[k], weights, metrics) @@ -1852,77 +1675,6 @@ class TestCollectionHybridSearchValid(TestcaseBase): for i in range(len(score_answer_nq[k][:default_limit])): assert score_answer_nq[k][i] - hybrid_res[k].distances[i] < hybrid_search_epsilon - @pytest.mark.tags(CaseLabel.L1) - def test_hybrid_search_result_L2_order(self, nq): - """ - target: test hybrid search result having correct order for L2 distance - method: create connection, collection, insert and search - expected: hybrid search successfully and result order is correct - """ - # 1. initialize collection with data - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, is_index=False, - multiple_dim_array=[default_dim, default_dim])[0:5] - - # 2. create index - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - for i in range(len(vector_name_list)): - default_index = {"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}, } - collection_w.create_index(vector_name_list[i], default_index) - collection_w.load() - - # 3. prepare search params - req_list = [] - weights = [0.2, 0.3, 0.5] - for i in range(len(vector_name_list)): - vectors = [[random.random() for _ in range(default_dim)] for _ in range(nq)] - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "L2", "offset": 0}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - # 4. 
hybrid search - res = collection_w.hybrid_search(req_list, WeightedRanker(*weights), 10)[0] - is_sorted_descend = lambda lst: all(lst[i] >= lst[i + 1] for i in range(len(lst) - 1)) - for i in range(nq): - assert is_sorted_descend(res[i].distances) - - @pytest.mark.tags(CaseLabel.L1) - def test_hybrid_search_result_order(self, nq): - """ - target: test hybrid search result having correct order for cosine distance - method: create connection, collection, insert and search - expected: hybrid search successfully and result order is correct - """ - # 1. initialize collection with data - collection_w, _, _, insert_ids, time_stamp = \ - self.init_collection_general(prefix, True, multiple_dim_array=[default_dim, default_dim])[0:5] - # 2. extract vector field name - vector_name_list = cf.extract_vector_field_name_list(collection_w) - vector_name_list.append(ct.default_float_vec_field_name) - # 3. prepare search params - req_list = [] - weights = [0.2, 0.3, 0.5] - for i in range(len(vector_name_list)): - vectors = [[random.random() for _ in range(default_dim)] for _ in range(nq)] - search_param = { - "data": vectors, - "anns_field": vector_name_list[i], - "param": {"metric_type": "COSINE", "offset": 0}, - "limit": default_limit, - "expr": "int64 > 0"} - req = AnnSearchRequest(**search_param) - req_list.append(req) - # 4. hybrid search - res = collection_w.hybrid_search(req_list, WeightedRanker(*weights), 10)[0] - is_sorted_descend = lambda lst: all(lst[i] >= lst[i + 1] for i in range(len(lst) - 1)) - for i in range(nq): - assert is_sorted_descend(res[i].distances) - @pytest.mark.tags(CaseLabel.L2) def test_hybrid_search_sparse_normal(self): """ @@ -1933,7 +1685,7 @@ class TestCollectionHybridSearchValid(TestcaseBase): nb, auto_id, dim, enable_dynamic_field = 20000, False, 768, False # 1. 
init collection
         collection_w, insert_vectors, _, insert_ids = \
-            self.init_collection_general(prefix, True, nb=nb, multiple_dim_array=[dim, dim * 2],
+            self.init_collection_general("", True, nb=nb, multiple_dim_array=[dim, dim * 2],
                                          with_json=False, vector_data_type=DataType.SPARSE_FLOAT_VECTOR)[0:4]
         # 2. extract vector field name
         vector_name_list = cf.extract_vector_field_name_list(collection_w)
@@ -1954,16 +1706,16 @@ class TestCollectionHybridSearchValid(TestcaseBase):
                 "expr": "int64 > 0"}
             req = AnnSearchRequest(**search_param)
             req_list.append(req)
-            # search for get the base line of hybrid_search
+            # search to get the baseline of hybrid_search
             search_res = collection_w.search(vector, vector_name_list[i],
-                                             default_search_params, default_limit,
-                                             default_search_exp,
+                                             param={},
+                                             limit=default_limit
                                              )[0]
             ids = search_res[0].ids
             for j in range(len(ids)):
                 search_res_dict[ids[j]] = 1 / (j + k + 1)
             search_res_dict_array.append(search_res_dict)
-        # 4. calculate hybrid search base line for RRFRanker
+        # 4. calculate hybrid search baseline for RRFRanker
         ids_answer, score_answer = cf.get_hybrid_search_base_results_rrf(search_res_dict_array)
         # 5. 
hybrid search hybrid_res = collection_w.hybrid_search(req_list, RRFRanker(k), default_limit, diff --git a/tests/python_client/milvus_client_v2/test_milvus_client_search_v2_new.py b/tests/python_client/milvus_client_v2/test_milvus_client_search_v2_new.py index 43d09bd555..dfcb48d9e0 100644 --- a/tests/python_client/milvus_client_v2/test_milvus_client_search_v2_new.py +++ b/tests/python_client/milvus_client_v2/test_milvus_client_search_v2_new.py @@ -344,8 +344,8 @@ class TestMilvusClientSearchBasicV2(TestMilvusClientV2Base): ) @pytest.mark.tags(CaseLabel.L2) - # @pytest.mark.parametrize("limit, nq", zip([1, 1000, ct.max_limit], [ct.max_nq, 10, 1])) - @pytest.mark.parametrize("limit, nq", zip([ct.max_limit], [1])) + @pytest.mark.parametrize("limit, nq", zip([1, 1000, ct.max_limit], [ct.max_nq, 10, 1])) + # @pytest.mark.parametrize("limit, nq", zip([ct.max_limit], [1])) def test_search_with_different_nq_limits(self, limit, nq): """ target: test search with different nq and limit values diff --git a/tests/python_client/requirements.txt b/tests/python_client/requirements.txt index d10f27fb0f..b2cf20fbfb 100644 --- a/tests/python_client/requirements.txt +++ b/tests/python_client/requirements.txt @@ -28,8 +28,8 @@ pytest-parallel pytest-random-order # pymilvus -pymilvus==2.7.0rc72 -pymilvus[bulk_writer]==2.7.0rc72 +pymilvus==2.7.0rc75 +pymilvus[bulk_writer]==2.7.0rc75 # for protobuf protobuf>=5.29.5 diff --git a/tests/python_client/testcases/async_milvus_client/test_e2e_async.py b/tests/python_client/testcases/async_milvus_client/test_e2e_async.py index 6c8815c9c5..e7c82a6983 100644 --- a/tests/python_client/testcases/async_milvus_client/test_e2e_async.py +++ b/tests/python_client/testcases/async_milvus_client/test_e2e_async.py @@ -254,10 +254,7 @@ class TestAsyncMilvusClient(TestMilvusClientV2Base): assert r[0]['insert_count'] == step # flush - # TODO: call async flush() as https://github.com/milvus-io/pymilvus/issues/3060 fixed - # await 
self.async_milvus_client_wrap.flush(c_name) - milvus_client = self._client() - self.flush(milvus_client, c_name) + await self.async_milvus_client_wrap.flush(c_name) stats, _ = await self.async_milvus_client_wrap.get_collection_stats(c_name) assert stats["row_count"] == async_default_nb diff --git a/tests/python_client/testcases/async_milvus_client/test_index_async.py b/tests/python_client/testcases/async_milvus_client/test_index_async.py index 1a482077cf..78ab4c4c08 100644 --- a/tests/python_client/testcases/async_milvus_client/test_index_async.py +++ b/tests/python_client/testcases/async_milvus_client/test_index_async.py @@ -60,8 +60,7 @@ class TestAsyncMilvusClientIndexInvalid(TestMilvusClientV2Base): index_params = async_client.prepare_index_params()[0] index_params.add_index(field_name="vector") # 3. create index - error = {ct.err_code: 1100, ct.err_msg: f"Invalid collection name: {name}. the first character of a collection " - f"name must be an underscore or letter: invalid parameter"} + error = {ct.err_code: 1100, ct.err_msg: f"collection not found[database=default][collection={name}]"} await async_client.create_index(name, index_params, check_task=CheckTasks.err_res, check_items=error) @@ -88,8 +87,7 @@ class TestAsyncMilvusClientIndexInvalid(TestMilvusClientV2Base): index_params = async_client.prepare_index_params()[0] index_params.add_index(field_name="vector") # 3. create index - error = {ct.err_code: 1100, ct.err_msg: f"Invalid collection name: {name}. the length of a collection name " - f"must be less than 255 characters: invalid parameter"} + error = {ct.err_code: 1100, ct.err_msg: f"collection not found[database=default][collection={name}]"} await async_client.create_index(name, index_params, check_task=CheckTasks.err_res, check_items=error) @@ -117,7 +115,7 @@ class TestAsyncMilvusClientIndexInvalid(TestMilvusClientV2Base): index_params.add_index(field_name="vector") # 3. 
create index error = {ct.err_code: 100, - ct.err_msg: f"can't find collection[database=default][collection={not_existed_collection_name}]"} + ct.err_msg: f"collection not found[database=default][collection={not_existed_collection_name}]"} await async_client.create_index(not_existed_collection_name, index_params, check_task=CheckTasks.err_res, check_items=error)