From abb3aeacdf5818b8323c4b22110d29827dd28ace Mon Sep 17 00:00:00 2001
From: yanliang567 <82361606+yanliang567@users.noreply.github.com>
Date: Wed, 23 Jul 2025 22:04:54 +0800
Subject: [PATCH] test: Refactor diskann and hnsw index, and update gen data functions (#43452)

related issue #40698
1. add diskann and hnsw index test
2. update gen_row_data and gen_column_data functions

---------

Signed-off-by: yanliang567
---
 tests/python_client/base/client_v2_base.py | 12 +
 tests/python_client/check/func_check.py | 25 +-
 tests/python_client/check/param_check.py | 2 +-
 tests/python_client/common/common_func.py | 503 +++++++-----
 .../test_milvus_client_collection.py | 1 +
 .../test_milvus_client_search.py | 98 ++-
 .../test_milvus_client_search_iterator.py | 3 +-
 .../test_milvus_client_search_diskann.py | 294 -------
 .../test_milvus_client_search_v2.py | 715 +-----------------
 .../test_milvus_client_search_v2_new.py | 353 +++++++--
 .../testcases/indexes/idx_diskann.py | 95 +++
 .../testcases/indexes/idx_hnsw.py | 175 +++++
 .../testcases/indexes/test_diskann.py | 229 ++++++
 .../testcases/indexes/test_hnsw.py | 273 +++++++
 tests/python_client/testcases/test_insert.py | 13 +-
 .../testcases/test_mix_scenes.py | 38 +-
 tests/python_client/testcases/test_utility.py | 9 +-
 17 files changed, 1525 insertions(+), 1313 deletions(-)
 create mode 100644 tests/python_client/testcases/indexes/idx_diskann.py
 create mode 100644 tests/python_client/testcases/indexes/idx_hnsw.py
 create mode 100644 tests/python_client/testcases/indexes/test_diskann.py
 create mode 100644 tests/python_client/testcases/indexes/test_hnsw.py

diff --git a/tests/python_client/base/client_v2_base.py b/tests/python_client/base/client_v2_base.py
index d042e86bd3..bf49259b95 100644
--- a/tests/python_client/base/client_v2_base.py
+++ b/tests/python_client/base/client_v2_base.py
@@ -360,6 +360,18 @@ class TestMilvusClientV2Base(Base):
                                        collection_name=collection_name, **kwargs).run()
         return res, check_result
 
+    @trace()
+    def refresh_load(self, client, collection_name, timeout=None, check_task=None, check_items=None, **kwargs):
+        timeout = TIMEOUT if timeout is None else timeout
+        kwargs.update({"timeout": timeout})
+
+        func_name = sys._getframe().f_code.co_name
+        res, check = api_request([client.refresh_load, collection_name], **kwargs)
+        check_result = ResponseChecker(res, func_name, check_task,
+                                       check_items, check,
+                                       collection_name=collection_name, **kwargs).run()
+        return res, check_result
+
     @trace()
     def release_collection(self, client, collection_name, timeout=None, check_task=None, check_items=None, **kwargs):
         timeout = TIMEOUT if timeout is None else timeout
diff --git a/tests/python_client/check/func_check.py b/tests/python_client/check/func_check.py
index 404ab8fed9..cb65b82d53 100644
--- a/tests/python_client/check/func_check.py
+++ b/tests/python_client/check/func_check.py
@@ -247,17 +247,12 @@ class ResponseChecker:
             raise Exception("No expect values found in the check task")
         if check_items.get("collection_name", None) is not None:
             assert res["collection_name"] == check_items.get("collection_name")
-        if check_items.get("auto_id", False):
-            assert res["auto_id"] == check_items.get("auto_id")
-        if check_items.get("num_shards", 1):
-            assert res["num_shards"] == check_items.get("num_shards", 1)
-        if check_items.get("consistency_level", 2):
-            assert res["consistency_level"] == check_items.get("consistency_level", 2)
-        if check_items.get("enable_dynamic_field", True):
-            assert res["enable_dynamic_field"] == check_items.get("enable_dynamic_field",
True) - if check_items.get("num_partitions", 1): - assert res["num_partitions"] == check_items.get("num_partitions", 1) - if check_items.get("id_name", "id"): + assert res["auto_id"] == check_items.get("auto_id", False) + assert res["num_shards"] == check_items.get("num_shards", 1) + assert res["consistency_level"] == check_items.get("consistency_level", 0) + assert res["enable_dynamic_field"] == check_items.get("enable_dynamic_field", True) + assert res["num_partitions"] == check_items.get("num_partitions", 1) + if check_items.get("id_name", None): assert res["fields"][0]["name"] == check_items.get("id_name", "id") if check_items.get("vector_name", "vector"): vector_name_list = [] @@ -474,9 +469,9 @@ class ResponseChecker: elif check_items.get("metric", None) is not None: # verify the distances are already sorted if check_items.get("metric").upper() in ["IP", "COSINE", "BM25"]: - assert distances == sorted(distances, reverse=True) + assert pc.compare_lists_with_epsilon_ignore_dict_order(distances, sorted(distances, reverse=True)) else: - assert distances == sorted(distances, reverse=False) + assert pc.compare_lists_with_epsilon_ignore_dict_order(distances, sorted(distances, reverse=False)) if check_items.get("vector_nq") is None or check_items.get("original_vectors") is None: log.debug("skip distance check for knowhere does not return the precise distances") else: @@ -484,9 +479,9 @@ class ResponseChecker: else: pass # just check nq and topk, not specific ids need check nq_i += 1 + log.info("search_results_check: limit (topK) and " "ids searched for %d queries are correct" % len(search_res)) - return True @staticmethod @@ -600,7 +595,7 @@ class ResponseChecker: if isinstance(query_res, list): # assert pc.equal_entities_list(exp=exp_res, actual=query_res, primary_field=pk_name, with_vec=with_vec) # return True - assert pc.compare_lists_ignore_order(a=query_res, b=exp_res) + assert pc.compare_lists_with_epsilon_ignore_dict_order(a=query_res, b=exp_res) return True else: log.error(f"Query result {query_res} is not list") diff --git a/tests/python_client/check/param_check.py b/tests/python_client/check/param_check.py index 28cf436e67..90eb384194 100644 --- a/tests/python_client/check/param_check.py +++ b/tests/python_client/check/param_check.py @@ -69,7 +69,7 @@ def deep_approx_compare(x, y, epsilon=epsilon): return x == y -def compare_lists_ignore_order(a, b, epsilon=epsilon): +def compare_lists_with_epsilon_ignore_dict_order(a, b, epsilon=epsilon): """ Compares two lists of dictionaries for equality (order-insensitive) with floating-point tolerance. 
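A minimal usage sketch of the renamed comparator (not part of the patch; the import path and the module-level epsilon default are assumptions based on this file):

    from check.param_check import compare_lists_with_epsilon_ignore_dict_order

    a = [{"id": 1, "score": 0.30000001}, {"id": 2, "score": 0.5}]
    b = [{"id": 2, "score": 0.5}, {"id": 1, "score": 0.3}]
    # list order and dict key order are ignored; the tiny float difference stays within epsilon
    assert compare_lists_with_epsilon_ignore_dict_order(a, b)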
diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py index 8bd27b5823..a7e1354d86 100644 --- a/tests/python_client/common/common_func.py +++ b/tests/python_client/common/common_func.py @@ -1654,20 +1654,6 @@ def gen_default_binary_dataframe_data(nb=ct.default_nb, dim=ct.default_dim, star return df, binary_raw_values -# -# def gen_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_json=True): -# int_values = [i for i in range(start, start + nb)] -# float_values = [np.float32(i) for i in range(start, start + nb)] -# string_values = [str(i) for i in range(start, start + nb)] -# json_values = [{"number": i, "string": str(i), "bool": bool(i), "list": [j for j in range(0, i)]} -# for i in range(start, start + nb)] -# float_vec_values = gen_vectors(nb, dim) -# if with_json is False: -# data = [int_values, float_values, string_values, float_vec_values] -# else: -# data = [int_values, float_values, string_values, json_values, float_vec_values] -# return data - def gen_default_list_sparse_data(nb=ct.default_nb, dim=ct.default_dim, start=0, with_json=False): int_values = [i for i in range(start, start + nb)] @@ -1728,56 +1714,122 @@ def prepare_bulk_insert_data(schema=None, return files -def get_column_data_by_schema(nb=ct.default_nb, schema=None, skip_vectors=False, start=None): +def gen_column_data_by_schema(nb=ct.default_nb, schema=None, skip_vectors=False, start=0): + return get_column_data_by_schema(nb=nb, schema=schema, skip_vectors=skip_vectors, start=start) + + +def get_column_data_by_schema(nb=ct.default_nb, schema=None, skip_vectors=False, start=0, random_pk=False): + """ + Generates column data based on the given schema. + + Args: + nb (int): Number of rows to generate. Defaults to ct.default_nb. + schema (Schema): Collection schema. If None, uses default schema. + skip_vectors (bool): Whether to skip vector fields. Defaults to False. + start (int): Starting value for primary key fields (default: 0) + random_pk (bool, optional): Whether to generate random primary key values (default: False) + + Returns: + list: List of column data arrays matching the schema fields (excluding auto_id fields). + """ if schema is None: schema = gen_default_collection_schema() fields = schema.fields - fields_not_auto_id = [] + fields_to_gen = [] for field in fields: - if not field.auto_id: - fields_not_auto_id.append(field) + if not field.auto_id and not field.is_function_output: + fields_to_gen.append(field) data = [] - for field in fields_not_auto_id: - if field.dtype == DataType.FLOAT_VECTOR and skip_vectors is True: + for field in fields_to_gen: + if field.dtype in ct.all_vector_types and skip_vectors is True: tmp = [] else: - tmp = gen_data_by_collection_field(field, nb=nb, start=start) + tmp = gen_data_by_collection_field(field, nb=nb, start=start, random_pk=random_pk) data.append(tmp) return data -def gen_row_data_by_schema(nb=ct.default_nb, schema=None, start=None): +def gen_row_data_by_schema(nb=ct.default_nb, schema=None, start=0, random_pk=False): + """ + Generates row data based on the given schema. + + Args: + nb (int): Number of rows to generate. Defaults to ct.default_nb. + schema (Schema): Collection schema or collection info. If None, uses default schema. + start (int): Starting value for primary key fields. Defaults to 0. 
+ random_pk (bool, optional): Whether to generate random primary key values (default: False) + + Returns: + list[dict]: List of dictionaries where each dictionary represents a row, + with field names as keys and generated data as values. + + Notes: + - Skips auto_id fields and function output fields. + - For primary key fields, generates sequential values starting from 'start'. + - For non-primary fields, generates random data based on field type. + """ if schema is None: schema = gen_default_collection_schema() + # ignore auto id field and the fields in function output func_output_fields = [] - if hasattr(schema, "functions"): - functions = schema.functions + if isinstance(schema, dict): + # a dict of collection schema info is usually from client.describe_collection() + fields = schema.get('fields', []) + functions = schema.get('functions', []) for func in functions: - output_field_names = func.output_field_names + output_field_names = func.get('output_field_names', []) func_output_fields.extend(output_field_names) - func_output_fields = list(set(func_output_fields)) - fields = schema.fields - fields_needs_data = [] - for field in fields: - if field.auto_id: - continue - if field.name in func_output_fields: - continue - fields_needs_data.append(field) - data = [] - for i in range(nb): - tmp = {} - for field in fields_needs_data: - tmp[field.name] = gen_data_by_collection_field(field) - if start is not None and field.dtype == DataType.INT64: - tmp[field.name] = start - start += 1 - if field.nullable is True: - # 10% percent of data is null - if random.random() < 0.1: - tmp[field.name] = None - data.append(tmp) + func_output_fields = list(set(func_output_fields)) + + fields_needs_data = [] + for field in fields: + if field.get('auto_id', False): + continue + if field.get('name', None) in func_output_fields: + continue + fields_needs_data.append(field) + data = [] + for i in range(nb): + tmp = {} + for field in fields_needs_data: + tmp[field.get('name', None)] = gen_data_by_collection_field(field, random_pk=random_pk) + if field.get('is_primary', False) is True and field.get('type', None) == DataType.INT64: + tmp[field.get('name', None)] = start + start += 1 + if field.get('is_primary', False) is True and field.get('type', None) == DataType.VARCHAR: + tmp[field.get('name', None)] = str(start) + start += 1 + data.append(tmp) + else: + # a schema object is usually form orm schema object + fields = schema.fields + if hasattr(schema, "functions"): + functions = schema.functions + for func in functions: + output_field_names = func.output_field_names + func_output_fields.extend(output_field_names) + func_output_fields = list(set(func_output_fields)) + + fields_needs_data = [] + for field in fields: + if field.auto_id: + continue + if field.name in func_output_fields: + continue + fields_needs_data.append(field) + data = [] + for i in range(nb): + tmp = {} + for field in fields_needs_data: + tmp[field.name] = gen_data_by_collection_field(field, random_pk=random_pk) + if field.is_primary is True and field.dtype == DataType.INT64: + tmp[field.name] = start + start += 1 + if field.is_primary is True and field.dtype == DataType.VARCHAR: + tmp[field.name] = str(start) + start += 1 + data.append(tmp) return data @@ -1957,6 +2009,7 @@ def get_dense_anns_field_name_list(schema=None): anns_fields.append(item) return anns_fields + def gen_varchar_data(length: int, nb: int, text_mode=False): if text_mode: return [fake.text() for _ in range(nb)] @@ -1964,164 +2017,222 @@ def gen_varchar_data(length: int, nb: 
int, text_mode=False): return ["".join([chr(random.randint(97, 122)) for _ in range(length)]) for _ in range(nb)] -def gen_data_by_collection_field(field, nb=None, start=None): - # if nb is None, return one data, else return a list of data - nullable = field.nullable - if nullable is True: - if random.random() < 0.1: - return None - data_type = field.dtype - enable_analyzer = field.params.get("enable_analyzer", False) +def gen_data_by_collection_field(field, nb=None, start=0, random_pk=False): + """ + Generates test data for a given collection field based on its data type and properties. + + Args: + field (dict or Field): Field information, either as a dictionary (v2 client) or Field object (ORM client) + nb (int, optional): Bumber of data batch to generate. If None, returns a single value which usually used by row data generation + start (int, optional): Starting value for primary key fields (default: 0) + random_pk (bool, optional): Whether to generate random primary key values (default: False) + Returns: + Single value if nb is None, otherwise returns a list of generated values + + Notes: + - Handles various data types including primitive types, vectors, arrays and JSON + - For nullable fields, generates None values approximately 20% of the time + - Special handling for primary key fields (sequential values) + - For varchar field, use min(20, max_length) to gen data + - For vector fields, generates random vectors of specified dimension + - For array fields, generates arrays filled with random values of element type + """ + + if isinstance(field, dict): + # for v2 client, it accepts a dict of field info + nullable = field.get('nullable', False) + data_type = field.get('type', None) + enable_analyzer = field.get('params').get("enable_analyzer", False) + is_primary = field.get('is_primary', False) + else: + # for ORM client, it accepts a field object + nullable = field.nullable + data_type = field.dtype + enable_analyzer = field.params.get("enable_analyzer", False) + is_primary = field.is_primary + + # generate data according to the data type if data_type == DataType.BOOL: if nb is None: - return random.choice([True, False]) - return [random.choice([True, False]) for _ in range(nb)] - if data_type == DataType.INT8: + return random.choice([True, False]) if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [random.choice([True, False]) for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.choice([True, False]) for i in range(nb)] + elif data_type == DataType.INT8: if nb is None: - return random.randint(-128, 127) - return [random.randint(-128, 127) for _ in range(nb)] - if data_type == DataType.INT16: + return random.randint(-128, 127) if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [random.randint(-128, 127) for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.randint(-128, 127) for i in range(nb)] + elif data_type == DataType.INT16: if nb is None: - return random.randint(-32768, 32767) - return [random.randint(-32768, 32767) for _ in range(nb)] - if data_type == DataType.INT32: + return random.randint(-32768, 32767) if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [random.randint(-32768, 32767) for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 
0.4 else random.randint(-32768, 32767) for i in range(nb)] + elif data_type == DataType.INT32: if nb is None: - return random.randint(-2147483648, 2147483647) - return [random.randint(-2147483648, 2147483647) for _ in range(nb)] - if data_type == DataType.INT64: + return random.randint(-2147483648, 2147483647) if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [random.randint(-2147483648, 2147483647) for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.randint(-2147483648, 2147483647) for i in range(nb)] + elif data_type == DataType.INT64: if nb is None: - return random.randint(-9223372036854775808, 9223372036854775807) - if start is not None: - return [i for i in range(start, start+nb)] - return [random.randint(-9223372036854775808, 9223372036854775807) for _ in range(nb)] - if data_type == DataType.FLOAT: + return random.randint(-9223372036854775808, 9223372036854775807) if random.random() < 0.8 or nullable is False else None + if nullable is False: + if is_primary is True and random_pk is False: + return [i for i in range(start, start+nb)] + else: + return [random.randint(-9223372036854775808, 9223372036854775807) for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.randint(-9223372036854775808, 9223372036854775807) for i in range(nb)] + elif data_type == DataType.FLOAT: if nb is None: - return np.float32(random.random()) - return [np.float32(random.random()) for _ in range(nb)] - if data_type == DataType.DOUBLE: + return np.float32(random.random()) if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [np.float32(random.random()) for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else np.float32(random.random()) for i in range(nb)] + elif data_type == DataType.DOUBLE: if nb is None: - return np.float64(random.random()) - return [np.float64(random.random()) for _ in range(nb)] - if data_type == DataType.VARCHAR: - max_length = field.params['max_length'] + return np.float64(random.random()) if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [np.float64(random.random()) for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else np.float64(random.random()) for i in range(nb)] + elif data_type == DataType.VARCHAR: + if isinstance(field, dict): + max_length = field.get('params')['max_length'] + else: + max_length = field.params['max_length'] max_length = min(20, max_length-1) length = random.randint(0, max_length) if nb is None: - return gen_varchar_data(length=length, nb=1, text_mode=enable_analyzer)[0] - return gen_varchar_data(length=length, nb=nb, text_mode=enable_analyzer) - if data_type == DataType.JSON: + return gen_varchar_data(length=length, nb=1, text_mode=enable_analyzer)[0] if random.random() < 0.8 or nullable is False else None + if nullable is False: + if is_primary is True and random_pk is False: + return [str(i) for i in range(start, start+nb)] + else: + return gen_varchar_data(length=length, nb=nb, text_mode=enable_analyzer) + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else gen_varchar_data(length=length, nb=1, text_mode=enable_analyzer)[0] for i in range(nb)] + elif data_type == DataType.JSON: if nb is 
None: - return {"name": fake.name(), "address": fake.address(), "count": random.randint(0, 100)} - data = [{"name": str(i), "address": i, "count": random.randint(0, 100)} for i in range(nb)] - return data - if data_type == DataType.FLOAT_VECTOR: - dim = field.params['dim'] + return {"name": fake.name(), "address": fake.address(), "count": random.randint(0, 100)} if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [{"name": str(i), "address": i, "count": random.randint(0, 100)} for i in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else {"name": str(i), "address": i, "count": random.randint(0, 100)} for i in range(nb)] + elif data_type in ct.all_vector_types: + if isinstance(field, dict): + dim = ct.default_dim if data_type == DataType.SPARSE_FLOAT_VECTOR else field.get('params')['dim'] + else: + dim = ct.default_dim if data_type == DataType.SPARSE_FLOAT_VECTOR else field.params['dim'] if nb is None: - return [random.random() for i in range(dim)] - return [[random.random() for i in range(dim)] for _ in range(nb)] - if data_type == DataType.BFLOAT16_VECTOR: - dim = field.params['dim'] - if nb is None: - return RNG.uniform(size=dim).astype(bfloat16) - return [RNG.uniform(size=dim).astype(bfloat16) for _ in range(int(nb))] - # if nb is None: - # raw_vector = [random.random() for _ in range(dim)] - # bf16_vector = np.array(raw_vector, dtype=bfloat16).view(np.uint8).tolist() - # return bytes(bf16_vector) - # bf16_vectors = [] - # for i in range(nb): - # raw_vector = [random.random() for _ in range(dim)] - # bf16_vector = np.array(raw_vector, dtype=bfloat16).view(np.uint8).tolist() - # bf16_vectors.append(bytes(bf16_vector)) - # return bf16_vectors - if data_type == DataType.FLOAT16_VECTOR: - dim = field.params['dim'] - if nb is None: - return np.array([random.random() for _ in range(int(dim))], dtype=np.float16) - return [np.array([random.random() for _ in range(int(dim))], dtype=np.float16) for _ in range(int(nb))] - if data_type == DataType.INT8_VECTOR: - dim = field.params['dim'] - if nb is None: - raw_vector = [random.randint(-128, 127) for _ in range(dim)] - int8_vector = np.array(raw_vector, dtype=np.int8) - return int8_vector - raw_vectors = [[random.randint(-128, 127) for _ in range(dim)] for _ in range(nb)] - int8_vectors = [np.array(raw_vector, dtype=np.int8) for raw_vector in raw_vectors] - return int8_vectors - - if data_type == DataType.BINARY_VECTOR: - dim = field.params['dim'] - if nb is None: - raw_vector = [random.randint(0, 1) for _ in range(dim)] - binary_byte = bytes(np.packbits(raw_vector, axis=-1).tolist()) - return binary_byte - return [bytes(np.packbits([random.randint(0, 1) for _ in range(dim)], axis=-1).tolist()) for _ in range(nb)] - if data_type == DataType.SPARSE_FLOAT_VECTOR: - if nb is None: - return gen_sparse_vectors(nb=1)[0] - return gen_sparse_vectors(nb=nb) - if data_type == DataType.ARRAY: - max_capacity = field.params['max_capacity'] + return gen_vectors(1, dim, vector_data_type=data_type)[0] + if nullable is False: + return gen_vectors(nb, dim, vector_data_type=data_type) + else: + raise MilvusException(message=f"gen data failed, vector field does not support nullable") + elif data_type == DataType.ARRAY: + if isinstance(field, dict): + max_capacity = field.get('params')['max_capacity'] + else: + max_capacity = field.params['max_capacity'] element_type = field.element_type if element_type == DataType.INT8: if nb is None: - return [random.randint(-128, 
127) for _ in range(max_capacity)] - return [[random.randint(-128, 127) for _ in range(max_capacity)] for _ in range(nb)] + return [random.randint(-128, 127) for _ in range(max_capacity)] if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [[random.randint(-128, 127) for _ in range(max_capacity)] for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.randint(-128, 127) for i in range(nb)] if element_type == DataType.INT16: if nb is None: - return [random.randint(-32768, 32767) for _ in range(max_capacity)] - return [[random.randint(-32768, 32767) for _ in range(max_capacity)] for _ in range(nb)] + return [random.randint(-32768, 32767) for _ in range(max_capacity)] if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [[random.randint(-32768, 32767) for _ in range(max_capacity)] for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.randint(-32768, 32767) for i in range(nb)] if element_type == DataType.INT32: if nb is None: - return [random.randint(-2147483648, 2147483647) for _ in range(max_capacity)] - return [[random.randint(-2147483648, 2147483647) for _ in range(max_capacity)] for _ in range(nb)] + return [random.randint(-2147483648, 2147483647) for _ in range(max_capacity)] if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [[random.randint(-2147483648, 2147483647) for _ in range(max_capacity)] for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.randint(-2147483648, 2147483647) for i in range(nb)] if element_type == DataType.INT64: if nb is None: - return [random.randint(-9223372036854775808, 9223372036854775807) for _ in range(max_capacity)] - return [[random.randint(-9223372036854775808, 9223372036854775807) for _ in range(max_capacity)] for _ in range(nb)] - + return [random.randint(-9223372036854775808, 9223372036854775807) for _ in range(max_capacity)] if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [[random.randint(-9223372036854775808, 9223372036854775807) for _ in range(max_capacity)] for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.randint(-9223372036854775808, 9223372036854775807) for i in range(nb)] if element_type == DataType.BOOL: if nb is None: - return [random.choice([True, False]) for _ in range(max_capacity)] - return [[random.choice([True, False]) for _ in range(max_capacity)] for _ in range(nb)] - + return [random.choice([True, False]) for _ in range(max_capacity)] if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [[random.choice([True, False]) for _ in range(max_capacity)] for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else random.choice([True, False]) for i in range(nb)] if element_type == DataType.FLOAT: if nb is None: - return [np.float32(random.random()) for _ in range(max_capacity)] - return [[np.float32(random.random()) for _ in range(max_capacity)] for _ in range(nb)] + return [np.float32(random.random()) for _ in range(max_capacity)] if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [[np.float32(random.random()) for _ in 
range(max_capacity)] for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else np.float32(random.random()) for i in range(nb)] if element_type == DataType.DOUBLE: if nb is None: - return [np.float64(random.random()) for _ in range(max_capacity)] - return [[np.float64(random.random()) for _ in range(max_capacity)] for _ in range(nb)] - + return [np.float64(random.random()) for _ in range(max_capacity)] if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [[np.float64(random.random()) for _ in range(max_capacity)] for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else np.float64(random.random()) for i in range(nb)] if element_type == DataType.VARCHAR: - max_length = field.params['max_length'] + if isinstance(field, dict): + max_length = field.get('params')['max_length'] + else: + max_length = field.params['max_length'] max_length = min(20, max_length - 1) length = random.randint(0, max_length) if nb is None: - return ["".join([chr(random.randint(97, 122)) for _ in range(length)]) for _ in range(max_capacity)] - return [["".join([chr(random.randint(97, 122)) for _ in range(length)]) for _ in range(max_capacity)] for _ in range(nb)] + return ["".join([chr(random.randint(97, 122)) for _ in range(length)]) for _ in range(max_capacity)] if random.random() < 0.8 or nullable is False else None + if nullable is False: + return [["".join([chr(random.randint(97, 122)) for _ in range(length)]) for _ in range(max_capacity)] for _ in range(nb)] + else: + # gen 20% none data for nullable field + return [None if i % 2 == 0 and random.random() < 0.4 else "".join([chr(random.randint(97, 122)) for _ in range(length)]) for i in range(nb)] + else: + raise MilvusException(message=f"gen data failed, data type {data_type} not implemented") return None -def gen_data_by_collection_schema(schema, nb, r=0): - """ - gen random data by collection schema, regardless of primary key or auto_id - vector type only support for DataType.FLOAT_VECTOR - """ - data = [] - start_uid = r * nb - fields = schema.fields - for field in fields: - data.append(gen_data_by_collection_field(field, nb, start_uid)) - return data - - def gen_varchar_values(nb: int, length: int = 0): return ["".join([chr(random.randint(97, 122)) for _ in range(length)]) for _ in range(nb)] -def gen_values(schema: CollectionSchema, nb, start_id=0, default_values: dict = {}): +def gen_values(schema: CollectionSchema, nb, start_id=0, default_values: dict = {}, random_pk=False): """ generate default value according to the collection fields, which can replace the value of the specified field @@ -2132,11 +2243,11 @@ def gen_values(schema: CollectionSchema, nb, start_id=0, default_values: dict = if default_value is not None: data.append(default_value) elif field.auto_id is False: - data.append(gen_data_by_collection_field(field, nb, start_id)) + data.append(gen_data_by_collection_field(field, nb, start_id, random_pk=random_pk)) return data -def gen_field_values(schema: CollectionSchema, nb, start_id=0, default_values: dict = {}) -> dict: +def gen_field_values(schema: CollectionSchema, nb, start_id=0, default_values: dict = {}, random_pk=False) -> dict: """ generate default value according to the collection fields, which can replace the value of the specified field @@ -2150,7 +2261,7 @@ def gen_field_values(schema: CollectionSchema, nb, start_id=0, default_values: d if default_value is not 
None: data[field.name] = default_value elif field.auto_id is False: - data[field.name] = gen_data_by_collection_field(field, nb, start_id * nb) + data[field.name] = gen_data_by_collection_field(field, nb, start_id * nb, random_pk=random_pk) return data @@ -3406,11 +3517,30 @@ def install_milvus_operator_specific_config(namespace, milvus_mode, release_name def get_wildcard_output_field_names(collection_w, output_fields): - all_fields = [field.name for field in collection_w.schema.fields] + """ + Processes output fields with wildcard ('*') expansion for collection queries. + + Args: + collection_w (Union[dict, CollectionWrapper]): Collection information, + either as a dict (v2 client) or ORM wrapper. + output_fields (List[str]): List of requested output fields, may contain '*' wildcard. + + Returns: + List[str]: Expanded list of output fields with wildcard replaced by all available field names. + """ + if not isinstance(collection_w, dict): + # in orm, it accepts a collection wrapper + field_names = [field.name for field in collection_w.schema.fields] + else: + # in client v2, it accepts a dict of collection info + fields = collection_w.get('fields', None) + field_names = [field.get('name') for field in fields] + output_fields = output_fields.copy() if "*" in output_fields: output_fields.remove("*") - output_fields.extend(all_fields) + output_fields.extend(field_names) + return output_fields @@ -3748,3 +3878,34 @@ def gen_collection_name_by_testcase_name(module_index=1): if calling from the testcase, module_index=1 """ return inspect.stack()[module_index][3] + gen_unique_str("_") + + +def parse_fmod(x: int, y: int) -> int: + """ + Computes the floating-point remainder of x/y with the same sign as x. + + This function mimics the behavior of the C fmod() function for integer inputs, + where the result has the same sign as the dividend (x). 
+ + Args: + x (int): The dividend + y (int): The divisor + + Returns: + int: The remainder of x/y with the same sign as x + + Raises: + ValueError: If y is 0 (division by zero) + + Examples: + parse_fmod(5, 3) -> 2 + parse_fmod(-5, 3) -> -2 + parse_fmod(5, -3) -> 2 + parse_fmod(-5, -3) -> -2 + """ + if y == 0: + raise ValueError(f'[parse_fmod] Math domain error, `y` can not bt `0`') + + v = abs(x) % abs(y) + + return v if x >= 0 else -v \ No newline at end of file diff --git a/tests/python_client/milvus_client/test_milvus_client_collection.py b/tests/python_client/milvus_client/test_milvus_client_collection.py index cac44c40eb..75bf1b0ff3 100644 --- a/tests/python_client/milvus_client/test_milvus_client_collection.py +++ b/tests/python_client/milvus_client/test_milvus_client_collection.py @@ -288,6 +288,7 @@ class TestMilvusClientCollectionValid(TestMilvusClientV2Base): check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, "dim": dim, + "auto_id": auto_id, "consistency_level": 0}) index = self.list_indexes(client, collection_name)[0] assert index == ['vector'] diff --git a/tests/python_client/milvus_client/test_milvus_client_search.py b/tests/python_client/milvus_client/test_milvus_client_search.py index 3a420354c4..d9e708eb83 100644 --- a/tests/python_client/milvus_client/test_milvus_client_search.py +++ b/tests/python_client/milvus_client/test_milvus_client_search.py @@ -1782,7 +1782,7 @@ class TestMilvusClientSearchValid(TestMilvusClientV2Base): check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, "dim": default_dim, - "consistency_level": 0}) + "consistency_level": 2}) # 2. insert rng = np.random.default_rng(seed=19530) rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]), @@ -1892,7 +1892,7 @@ class TestMilvusClientSearchValid(TestMilvusClientV2Base): check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, "dim": default_dim, - "consistency_level": 0}) + "consistency_level": 2}) # 2. insert rng = np.random.default_rng(seed=19530) rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]), @@ -1991,7 +1991,7 @@ class TestMilvusClientSearchValid(TestMilvusClientV2Base): check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, "dim": default_dim, - "consistency_level": 0}) + "consistency_level": 2}) # 2. insert rng = np.random.default_rng(seed=19530) rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]), @@ -2086,7 +2086,7 @@ class TestMilvusClientSearchValid(TestMilvusClientV2Base): check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, "dim": default_dim, - "consistency_level": 0}) + "consistency_level": 2}) # 2. insert rng = np.random.default_rng(seed=19530) rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]), @@ -2181,7 +2181,7 @@ class TestMilvusClientSearchValid(TestMilvusClientV2Base): check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, "dim": default_dim, - "consistency_level": 0}) + "consistency_level": 2}) # 2. 
insert rng = np.random.default_rng(seed=19530) rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]), @@ -2276,7 +2276,7 @@ class TestMilvusClientSearchValid(TestMilvusClientV2Base): check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, "dim": default_dim, - "consistency_level": 0}) + "consistency_level": 2}) # 2. insert rng = np.random.default_rng(seed=19530) rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]), @@ -2393,42 +2393,74 @@ class TestMilvusClientSearchValid(TestMilvusClientV2Base): expected: search/query successfully """ client = self._client() - collection_name = cf.gen_collection_name_by_testcase_name() + old_name = cf.gen_collection_name_by_testcase_name() # 1. create collection - self.create_collection(client, collection_name, default_dim, consistency_level="Bounded") + self.create_collection(client, old_name, default_dim, consistency_level="Strong") collections = self.list_collections(client)[0] - assert collection_name in collections - self.describe_collection(client, collection_name, - check_task=CheckTasks.check_describe_collection_property, - check_items={"collection_name": collection_name, - "dim": default_dim, - "consistency_level": 0}) - old_name = collection_name - new_name = collection_name + "new" + assert old_name in collections + c_info = self.describe_collection(client, old_name, + check_task=CheckTasks.check_describe_collection_property, + check_items={"collection_name": old_name, + "dim": default_dim, + "consistency_level": 0})[0] + + rows = cf.gen_row_data_by_schema(nb=default_nb, schema=c_info) + self.insert(client, old_name, rows) + self.flush(client, old_name) + self.wait_for_index_ready(client, collection_name=old_name, index_name='vector') + + vectors_to_search = cf.gen_vectors(ct.default_nq, default_dim) + insert_ids = [item.get('id') for item in rows] + old_search_res = self.search(client, old_name, vectors_to_search, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, + "ids": insert_ids, + "pk_name": "id", + "limit": default_limit})[0] + old_query_res = self.query(client, old_name, filter=default_search_exp, + check_task=CheckTasks.check_query_results, + check_items={exp_res: rows, + "with_vec": True})[0] + + new_name = old_name + "new" self.rename_collection(client, old_name, new_name) - # 2. 
insert - rng = np.random.default_rng(seed=19530) - rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]), - default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)] + self.describe_collection(client, new_name, + check_task=CheckTasks.check_describe_collection_property, + check_items={"collection_name": new_name, + "dim": default_dim}) + + # search again after rename collection + new_search_res = self.search(client, new_name, vectors_to_search, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, + "ids": insert_ids, + "pk_name": "id", + "limit": default_limit})[0] + new_query_res = self.query(client, new_name, filter=default_search_exp, + check_task=CheckTasks.check_query_results, + check_items={exp_res: rows, + "with_vec": True})[0] + assert old_search_res[0].ids == new_search_res[0].ids + assert old_query_res == new_query_res + + rows = cf.gen_row_data_by_schema(nb=200, schema=c_info, start=default_nb) + error = {ct.err_code: 0, ct.err_msg: f"collection not found"} + self.insert(client, old_name, rows, + check_task=CheckTasks.err_res, + check_items=error) self.insert(client, new_name, rows) - self.flush(client, new_name) - # assert self.num_entities(client, collection_name)[0] == default_nb - # 3. search - vectors_to_search = rng.random((1, default_dim)) - insert_ids = [i for i in range(default_nb)] + new_ids = [item.get('id') for item in rows] + insert_ids.extend(new_ids) self.search(client, new_name, vectors_to_search, check_task=CheckTasks.check_search_results, check_items={"enable_milvus_client_api": True, - "nq": len(vectors_to_search), + "nq": ct.default_nq, "ids": insert_ids, - "pk_name": default_primary_key_field_name, + "pk_name": "id", "limit": default_limit}) - # 4. query - self.query(client, new_name, filter=default_search_exp, - check_task=CheckTasks.check_query_results, - check_items={exp_res: rows, - "with_vec": True, - "pk_name": default_primary_key_field_name}) + self.release_collection(client, new_name) self.drop_collection(client, new_name) diff --git a/tests/python_client/milvus_client/test_milvus_client_search_iterator.py b/tests/python_client/milvus_client/test_milvus_client_search_iterator.py index d3e8d76003..2581033260 100644 --- a/tests/python_client/milvus_client/test_milvus_client_search_iterator.py +++ b/tests/python_client/milvus_client/test_milvus_client_search_iterator.py @@ -618,6 +618,7 @@ class TestMilvusClientSearchIteratorInValid(TestMilvusClientV2Base): self.describe_collection(client, collection_name, check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, + "consistency_level": 2, "dim": default_dim}) # 2. insert rows = [{default_primary_key_field_name: i, default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]), @@ -688,7 +689,7 @@ class TestMilvusClientSearchIteratorValid(TestMilvusClientV2Base): check_task=CheckTasks.check_describe_collection_property, check_items={"collection_name": collection_name, "dim": default_dim, - "consistency_level": 0}) + "consistency_level": 2}) # 2. 
insert rows = [{default_primary_key_field_name: i, default_vector_field_name: list(cf.gen_vectors(1, default_dim)[0]), diff --git a/tests/python_client/milvus_client_v2/test_milvus_client_search_diskann.py b/tests/python_client/milvus_client_v2/test_milvus_client_search_diskann.py index 0495867465..ed2524279b 100644 --- a/tests/python_client/milvus_client_v2/test_milvus_client_search_diskann.py +++ b/tests/python_client/milvus_client_v2/test_milvus_client_search_diskann.py @@ -87,169 +87,10 @@ class TestSearchDiskann(TestcaseBase): ****************************************************************** """ - @pytest.fixture(scope="function", params=[32, 128]) - def dim(self, request): - yield request.param - - @pytest.fixture(scope="function", params=[False, True]) - def auto_id(self, request): - yield request.param - @pytest.fixture(scope="function", params=[False, True]) def _async(self, request): yield request.param - @pytest.fixture(scope="function", params=[True, False]) - def enable_dynamic_field(self, request): - yield request.param - - @pytest.mark.tags(CaseLabel.L2) - def test_search_with_diskann_index(self, _async): - """ - target: test delete after creating index - method: 1.create collection , insert data, primary_field is int field - 2.create diskann index , then load - 3.search - expected: search successfully - """ - # 1. initialize with data - dim = 100 - auto_id = False - enable_dynamic_field = True - nb = 2000 - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id, - nb=nb, dim=dim, is_index=False, - enable_dynamic_field=enable_dynamic_field)[0:4] - - # 2. create index - default_index = {"index_type": "DISKANN", - "metric_type": "L2", "params": {}} - collection_w.create_index( - ct.default_float_vec_field_name, default_index) - collection_w.load() - - default_search_params = { - "metric_type": "L2", "params": {"search_list": 30}} - vectors = [[random.random() for _ in range(dim)] - for _ in range(default_nq)] - output_fields = [default_int64_field_name, - default_float_field_name, default_string_field_name] - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, default_limit, - default_search_exp, - output_fields=output_fields, - _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name, - "_async": _async} - ) - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("search_list", [20, 200]) - def test_search_with_limit_20(self, _async, search_list): - """ - target: test delete after creating index - method: 1.create collection , insert data, primary_field is int field - 2.create diskann index , then load - 3.search - expected: search successfully - """ - limit = 20 - # 1. initialize with data - enable_dynamic_field = True - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, is_index=False, - enable_dynamic_field=enable_dynamic_field)[0:4] - - # 2. 
create index - default_index = {"index_type": "DISKANN", "metric_type": "L2", "params": {}} - collection_w.create_index(ct.default_float_vec_field_name, default_index) - collection_w.load() - - search_params = {"metric_type": "L2", "params": {"search_list": search_list}} - output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name] - collection_w.search(vectors[:default_nq], default_search_field, - search_params, limit, default_search_exp, - output_fields=output_fields, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": insert_ids, - "limit": limit, - "_async": _async, - "pk_name": ct.default_int64_field_name}) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_invalid_params_with_diskann_B(self): - """ - target: test delete after creating index - method: 1.create collection , insert data, primary_field is int field - 2.create diskann index - 3.search with invalid params, [k, 200] when k <= 20 - expected: search report an error - """ - # 1. initialize with data - dim = 100 - limit = 20 - auto_id = True - collection_w, _, _, insert_ids = \ - self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_index=False)[0:4] - # 2. create index - default_index = {"index_type": "DISKANN", "metric_type": "L2", "params": {}} - collection_w.create_index(ct.default_float_vec_field_name, default_index) - collection_w.load() - default_search_params = {"metric_type": "L2", "params": {"search_list": limit-1}} - vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)] - output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name] - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, limit, - default_search_exp, - output_fields=output_fields, - check_task=CheckTasks.err_res, - check_items={"err_code": 999, - "err_msg": f"should be larger than k({limit})"}) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_with_diskann_with_string_pk(self): - """ - target: test delete after creating index - method: 1.create collection , insert data, primary_field is string field - 2.create diskann index - 3.search with invalid metric type - expected: search successfully - """ - # 1. initialize with data - dim = 128 - enable_dynamic_field = True - collection_w, _, _, insert_ids = \ - self.init_collection_general(prefix, True, auto_id=False, dim=dim, is_index=False, - primary_field=ct.default_string_field_name, - enable_dynamic_field=enable_dynamic_field)[0:4] - # 2. 
create index - default_index = {"index_type": "DISKANN", - "metric_type": "L2", "params": {}} - collection_w.create_index( - ct.default_float_vec_field_name, default_index) - collection_w.load() - search_list = 20 - default_search_params = {"metric_type": "L2", - "params": {"search_list": search_list}} - vectors = [[random.random() for _ in range(dim)] - for _ in range(default_nq)] - output_fields = [default_int64_field_name, - default_float_field_name, default_string_field_name] - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, default_limit, - default_search_exp, - output_fields=output_fields, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": insert_ids, - "limit": default_limit, - "pk_name": ct.default_int64_field_name} - ) - @pytest.mark.tags(CaseLabel.L2) def test_search_with_delete_data(self, _async): """ @@ -300,57 +141,6 @@ class TestSearchDiskann(TestcaseBase): "pk_name": ct.default_int64_field_name} ) - @pytest.mark.tags(CaseLabel.L2) - def test_search_with_diskann_and_more_index(self, _async): - """ - target: test delete after creating index - method: 1.create collection , insert data - 2.create more index ,then load - 3.delete half data, search - expected: assert index and deleted id not in search result - """ - # 1. initialize with data - dim = 64 - auto_id = False - enable_dynamic_field = True - collection_w, _, _, ids = \ - self.init_collection_general(prefix, True, auto_id=auto_id, dim=dim, is_index=False, - enable_dynamic_field=enable_dynamic_field, language="French")[0:4] - # 2. create index - default_index = {"index_type": "DISKANN", - "metric_type": "COSINE", "params": {}} - collection_w.create_index(ct.default_float_vec_field_name, default_index, index_name=index_name1) - if not enable_dynamic_field: - index_params_one = {} - collection_w.create_index("float", index_params_one, index_name="a") - index_param_two = {} - collection_w.create_index("varchar", index_param_two, index_name="b") - - collection_w.load() - tmp_expr = f'{ct.default_int64_field_name} in {[0]}' - - expr = f'{ct.default_int64_field_name} in {ids[:half_nb]}' - - # delete half of data - del_res = collection_w.delete(expr)[0] - assert del_res.delete_count == half_nb - - collection_w.delete(tmp_expr) - default_search_params = {"metric_type": "COSINE", "params": {"search_list": 30}} - vectors = [[random.random() for _ in range(dim)] for _ in range(default_nq)] - output_fields = [default_int64_field_name, default_float_field_name, default_string_field_name] - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, default_limit, - default_search_exp, - output_fields=output_fields, - _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": ids, - "limit": default_limit, - "_async": _async, - "pk_name": ct.default_int64_field_name}) - @pytest.mark.tags(CaseLabel.L1) def test_search_with_scalar_field(self, _async): """ @@ -396,87 +186,3 @@ class TestSearchDiskann(TestcaseBase): "limit": limit, "_async": _async, "pk_name": ct.default_int64_field_name}) - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("limit", [10, 100, 1000]) - def test_search_diskann_search_list_equal_to_limit(self, limit, _async): - """ - target: test search diskann index when search_list equal to limit - method: 1.create collection , insert data, primary_field is int field - 2.create diskann index , then load - 3.search - expected: search successfully - """ - # 1. 
initialize with data - dim = 77 - auto_id = False - enable_dynamic_field = False - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id, - dim=dim, is_index=False, - enable_dynamic_field=enable_dynamic_field)[0:4] - - # 2. create index - default_index = {"index_type": "DISKANN", - "metric_type": "L2", "params": {}} - collection_w.create_index( - ct.default_float_vec_field_name, default_index) - collection_w.load() - - search_params = {"metric_type": "L2", "params": {"search_list": limit}} - vectors = [[random.random() for _ in range(dim)] - for _ in range(default_nq)] - output_fields = [default_int64_field_name, - default_float_field_name, default_string_field_name] - collection_w.search(vectors[:default_nq], default_search_field, - search_params, limit, - default_search_exp, - output_fields=output_fields, - _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": insert_ids, - "limit": limit, - "_async": _async, - "pk_name": ct.default_int64_field_name} - ) - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.skip(reason="issue #23672") - def test_search_diskann_search_list_up_to_min(self, _async): - """ - target: test search diskann index when search_list up to min - method: 1.create collection , insert data, primary_field is int field - 2.create diskann index , then load - 3.search - expected: search successfully - """ - # 1. initialize with data - dim = 100 - auto_id = True - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id, - dim=dim, is_index=False)[0:4] - - # 2. create index - default_index = {"index_type": "DISKANN", - "metric_type": "L2", "params": {}} - collection_w.create_index( - ct.default_float_vec_field_name, default_index) - collection_w.load() - - search_params = {"metric_type": "L2", - "params": {"k": 200, "search_list": 201}} - search_vectors = [[random.random() for _ in range(dim)] - for _ in range(default_nq)] - output_fields = [default_int64_field_name, - default_float_field_name, default_string_field_name] - collection_w.search(search_vectors[:default_nq], default_search_field, - search_params, default_limit, - default_search_exp, - output_fields=output_fields, - _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": insert_ids, - "limit": default_limit, - "_async": _async, - "pk_name": ct.default_int64_field_name}) diff --git a/tests/python_client/milvus_client_v2/test_milvus_client_search_v2.py b/tests/python_client/milvus_client_v2/test_milvus_client_search_v2.py index 5d54cd209f..3a6580b95d 100644 --- a/tests/python_client/milvus_client_v2/test_milvus_client_search_v2.py +++ b/tests/python_client/milvus_client_v2/test_milvus_client_search_v2.py @@ -141,43 +141,6 @@ class TestCollectionSearch(TestcaseBase): # The following are valid base cases ****************************************************************** """ - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("M", [4, 64]) - @pytest.mark.parametrize("efConstruction", [8, 512]) - @pytest.mark.parametrize("limit", [1, 10, 3000]) - def test_search_HNSW_index_with_min_ef(self, M, efConstruction, limit, _async): - """ - target: test search HNSW index with min ef - method: connect milvus, create collection , insert, create index, load and search - expected: search successfully - """ - dim = M * 4 - ef = limit - auto_id = True - enable_dynamic_field = True - self._connect() - collection_w, _, _, insert_ids, time_stamp = \ - 
self.init_collection_general(prefix, True, 5000, partition_num=1, - auto_id=auto_id, dim=dim, is_index=False, - enable_dynamic_field=enable_dynamic_field)[0:5] - HNSW_index_params = {"M": M, "efConstruction": efConstruction} - HNSW_index = {"index_type": "HNSW", - "params": HNSW_index_params, "metric_type": "L2"} - collection_w.create_index("float_vector", HNSW_index) - collection_w.load() - search_param = {"metric_type": "L2", "params": {"ef": ef}} - vectors = [[random.random() for _ in range(dim)] - for _ in range(default_nq)] - collection_w.search(vectors[:default_nq], default_search_field, - search_param, limit, - default_search_exp, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": insert_ids, - "limit": limit, - "_async": _async}) - @pytest.mark.tags(CaseLabel.L1) def test_search_with_expression(self, null_data_percent): """ @@ -871,497 +834,6 @@ class TestCollectionSearch(TestcaseBase): "limit": 1}) assert search_res[0].ids == [_id] - @pytest.mark.tags(CaseLabel.L2) - def test_search_with_output_fields_empty(self, nq, _async): - """ - target: test search with output fields - method: search with empty output_field - expected: search success - """ - # 1. initialize with data - nb = 1500 - dim = 32 - auto_id = True - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, nb, - auto_id=auto_id, - dim=dim)[0:4] - # 2. search - vectors = [[random.random() for _ in range(dim)] for _ in range(nq)] - collection_w.search(vectors[:nq], default_search_field, - default_search_params, default_limit, - default_search_exp, _async=_async, - output_fields=[], - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": default_limit, - "_async": _async, - "output_fields": []}) - - @pytest.mark.tags(CaseLabel.L1) - def test_search_with_output_field(self, _async): - """ - target: test search with output fields - method: search with one output_field - expected: search success - """ - # 1. initialize with data - auto_id = False - enable_dynamic_field = False - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, - auto_id=auto_id, - enable_dynamic_field=enable_dynamic_field)[0:4] - # 2. search - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, default_limit, - default_search_exp, _async=_async, - output_fields=[default_int64_field_name], - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": insert_ids, - "limit": default_limit, - "_async": _async, - "output_fields": [default_int64_field_name]}) - - @pytest.mark.tags(CaseLabel.L1) - def test_search_with_output_vector_field(self, _async): - """ - target: test search with output fields - method: search with one output_field - expected: search success - """ - # 1. initialize with data - auto_id = True - enable_dynamic_field = False - collection_w, _, _, insert_ids = \ - self.init_collection_general(prefix, True, auto_id=auto_id, enable_dynamic_field=enable_dynamic_field)[0:4] - # 2. 
search - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, default_limit, - default_search_exp, _async=_async, - output_fields=[field_name], - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, "ids": insert_ids, - "limit": default_limit, "_async": _async, - "output_fields": [field_name]})[0] - - @pytest.mark.tags(CaseLabel.L2) - def test_search_with_output_fields(self, _async): - """ - target: test search with output fields - method: search with multiple output_field - expected: search success - """ - # 1. initialize with data - nb = 2000 - dim = 64 - auto_id = False - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, nb, - is_all_data_type=True, - auto_id=auto_id, - dim=dim)[0:4] - # 2. search - vectors = [[random.random() for _ in range(dim)] for _ in range(nq)] - output_fields = [default_int64_field_name, default_float_field_name] - collection_w.search(vectors[:nq], default_search_field, - default_search_params, default_limit, - default_search_exp, _async=_async, - output_fields=output_fields, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": default_limit, - "_async": _async, - "output_fields": output_fields}) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_output_array_field(self, enable_dynamic_field): - """ - target: test search output array field - method: create connection, collection, insert and search - expected: search successfully - """ - # 1. create a collection - auto_id = True - schema = cf.gen_array_collection_schema(auto_id=auto_id) - collection_w = self.init_collection_wrap(schema=schema) - - # 2. insert data - if enable_dynamic_field: - data = cf.gen_row_data_by_schema(schema=schema) - else: - data = cf.gen_array_dataframe_data(auto_id=auto_id) - - collection_w.insert(data) - - # 3. create index and load - collection_w.create_index(default_search_field) - collection_w.load() - - # 4. search output array field, check - output_fields = [ct.default_int64_field_name, ct.default_int32_array_field_name, - ct.default_float_array_field_name] - collection_w.search(vectors[:default_nq], default_search_field, {}, default_limit, - output_fields=output_fields, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "limit": default_limit, - "output_fields": output_fields}) - - @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("index", ct.all_index_types[:8]) - @pytest.mark.parametrize("metrics", ct.dense_metrics) - @pytest.mark.parametrize("limit", [200]) - def test_search_output_field_vector_after_different_index_metrics(self, index, metrics, limit): - """ - target: test search with output vector field after different index - method: 1. create a collection and insert data - 2. create index and load - 3. search with output field vector - 4. check the result vectors should be equal to the inserted - expected: search success - """ - collection_w, _vectors = self.init_collection_general(prefix, True, is_index=False)[:2] - - # 2. create index and load - params = cf.get_index_params_params(index) - default_index = {"index_type": index, "params": params, "metric_type": metrics} - collection_w.create_index(field_name, default_index) - collection_w.load() - - # 3. 
search with output field vector - search_params = cf.gen_search_param(index, metrics) - for search_param in search_params: - if index == "HNSW": - limit = search_param["params"]["ef"] - if limit > max_limit: - limit = default_nb - if index == "DISKANN": - limit = search_param["params"]["search_list"] - collection_w.search(vectors[:1], default_search_field, - search_param, limit, default_search_exp, - output_fields=[field_name], - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": limit, - "original_entities": _vectors[0], - "output_fields": [field_name]}) - - @pytest.mark.tags(CaseLabel.L1) - @pytest.mark.parametrize("metrics", ct.binary_metrics[:2]) - @pytest.mark.parametrize("index", ["BIN_FLAT", "BIN_IVF_FLAT"]) - def test_search_output_field_vector_after_binary_index(self, metrics, index): - """ - target: test search with output vector field after binary index - method: 1. create a collection and insert data - 2. create index and load - 3. search with output field vector - 4. check the result vectors should be equal to the inserted - expected: search success - """ - # 1. create a collection and insert data - collection_w = self.init_collection_general(prefix, is_binary=True, is_index=False)[0] - data = cf.gen_default_binary_dataframe_data()[0] - collection_w.insert(data) - - # 2. create index and load - params = {"M": 48, "efConstruction": 500} if index == "HNSW" else {"nlist": 128} - default_index = {"index_type": index, "metric_type": metrics, "params": params} - collection_w.create_index(binary_field_name, default_index) - collection_w.load() - - # 3. search with output field vector - search_params = cf.gen_search_param(index, metrics) - binary_vectors = cf.gen_binary_vectors(1, default_dim)[1] - for search_param in search_params: - res = collection_w.search(binary_vectors, binary_field_name, - search_param, 2, default_search_exp, - output_fields=[binary_field_name])[0] - - # 4. check the result vectors should be equal to the inserted - assert res[0][0].entity.binary_vector == data[binary_field_name][res[0][0].id] - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("metrics", ct.structure_metrics) - @pytest.mark.parametrize("index", ["BIN_FLAT"]) - def test_search_output_field_vector_after_structure_metrics(self, metrics, index): - """ - target: test search with output vector field after binary index - method: 1. create a collection and insert data - 2. create index and load - 3. search with output field vector - 4. check the result vectors should be equal to the inserted - expected: search success - """ - dim = 8 - # 1. create a collection and insert data - collection_w = self.init_collection_general(prefix, dim=dim, is_binary=True, is_index=False)[0] - data = cf.gen_default_binary_dataframe_data(dim=dim)[0] - collection_w.insert(data) - - # 2. create index and load - default_index = {"index_type": index, "metric_type": metrics, "params": {"nlist": 128}} - collection_w.create_index(binary_field_name, default_index) - collection_w.load() - - # 3. search with output field vector - search_params = {"metric_type": metrics, "params": {"nprobe": 10}} - binary_vectors = cf.gen_binary_vectors(ct.default_nq, dim)[1] - res = collection_w.search(binary_vectors, binary_field_name, - search_params, 2, default_search_exp, - output_fields=[binary_field_name])[0] - - # 4. 
check the result vectors should be equal to the inserted - assert res[0][0].entity.binary_vector == data[binary_field_name][res[0][0].id] - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("dim", [32, 77, 768]) - def test_search_output_field_vector_with_different_dim(self, dim): - """ - target: test search with output vector field after binary index - method: 1. create a collection and insert data - 2. create index and load - 3. search with output field vector - 4. check the result vectors should be equal to the inserted - expected: search success - """ - # 1. create a collection and insert data - collection_w, _vectors = self.init_collection_general(prefix, True, dim=dim)[:2] - - # 2. search with output field vector - vectors = cf.gen_vectors(default_nq, dim=dim) - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, default_limit, default_search_exp, - output_fields=[field_name], - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "limit": default_limit, - "original_entities": _vectors[0], - "output_fields": [field_name]}) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_output_vector_field_and_scalar_field(self, enable_dynamic_field): - """ - target: test search with output vector field and scalar field - method: 1. initialize a collection - 2. search with output field vector - 3. check no field missing - expected: search success - """ - # 1. initialize a collection - collection_w, _vectors = self.init_collection_general(prefix, True, - enable_dynamic_field=enable_dynamic_field)[:2] - - # search with output field vector - output_fields = [default_float_field_name, default_string_field_name, - default_json_field_name, default_search_field] - original_entities = [] - if enable_dynamic_field: - entities = [] - for vector in _vectors[0]: - entities.append({default_int64_field_name: vector[default_int64_field_name], - default_float_field_name: vector[default_float_field_name], - default_string_field_name: vector[default_string_field_name], - default_json_field_name: vector[default_json_field_name], - default_search_field: vector[default_search_field]}) - original_entities.append(pd.DataFrame(entities)) - else: - original_entities = _vectors - collection_w.search(vectors[:1], default_search_field, - default_search_params, default_limit, default_search_exp, - output_fields=output_fields, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": default_limit, - "pk_name": default_int64_field_name, - "original_entities": original_entities[0], - "output_fields": output_fields}) - if enable_dynamic_field: - collection_w.search(vectors[:1], default_search_field, - default_search_params, default_limit, default_search_exp, - output_fields=["$meta", default_search_field], - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": default_limit, - "pk_name": default_int64_field_name, - "original_entities": original_entities[0], - "output_fields": output_fields}) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_output_vector_field_and_pk_field(self, enable_dynamic_field): - """ - target: test search with output vector field and pk field - method: 1. initialize a collection - 2. search with output field vector - 3. check no field missing - expected: search success - """ - # 1. initialize a collection - collection_w = self.init_collection_general(prefix, True, - enable_dynamic_field=enable_dynamic_field)[0] - - # 2. 
search with output field vector - output_fields = [default_int64_field_name, default_string_field_name, default_search_field] - collection_w.search(vectors[:1], default_search_field, - default_search_params, default_limit, default_search_exp, - output_fields=output_fields, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": default_limit, - "output_fields": output_fields}) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_output_field_vector_with_partition(self): - """ - target: test search with output vector field - method: 1. create a collection and insert data - 2. create index and load - 3. search with output field vector - 4. check the result vectors should be equal to the inserted - expected: search success - """ - # 1. create a collection and insert data - collection_w = self.init_collection_general(prefix, is_index=False)[0] - partition_w = self.init_partition_wrap(collection_w) - data = cf.gen_default_dataframe_data() - partition_w.insert(data) - - # 2. create index and load - collection_w.create_index(field_name, default_index_params) - collection_w.load() - - # 3. search with output field vector - partition_w.search(vectors[:1], default_search_field, - default_search_params, default_limit, default_search_exp, - output_fields=[field_name], - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": default_limit, - "original_entities": data, - "output_fields": [field_name]}) - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("wildcard_output_fields", [["*"], ["*", default_int64_field_name], - ["*", default_search_field]]) - def test_search_with_output_field_wildcard(self, wildcard_output_fields, _async): - """ - target: test search with output fields using wildcard - method: search with one output_field (wildcard) - expected: search success - """ - # 1. initialize with data - auto_id = True - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, - auto_id=auto_id)[0:4] - # 2. search - output_fields = cf.get_wildcard_output_field_names(collection_w, wildcard_output_fields) - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, default_limit, - default_search_exp, _async=_async, - output_fields=wildcard_output_fields, - check_task=CheckTasks.check_search_results, - check_items={"nq": default_nq, - "ids": insert_ids, - "pk_name": ct.default_int64_field_name, - "limit": default_limit, - "_async": _async, - "output_fields": output_fields}) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_with_invalid_output_fields(self): - """ - target: test search with output fields using wildcard - method: search with one output_field (wildcard) - expected: search success - """ - # 1. initialize with data - invalid_output_fields = [["%"], [""], ["-"]] - auto_id = False - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, auto_id=auto_id)[0:4] - # 2. 
search - for field in invalid_output_fields: - error1 = {ct.err_code: 999, ct.err_msg: "field %s not exist" % field[0]} - error2 = {ct.err_code: 999, ct.err_msg: "`output_fields` value %s is illegal" % field} - error = error2 if field == [""] else error1 - collection_w.search(vectors[:default_nq], default_search_field, - default_search_params, default_limit, - default_search_exp, - output_fields=field, - check_task=CheckTasks.err_res, check_items=error) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_multi_collections(self, nq, _async): - """ - target: test search multi collections of L2 - method: add vectors into 10 collections, and search - expected: search status ok, the length of result - """ - nb = 1000 - dim = 64 - auto_id = True - self._connect() - collection_num = 10 - for i in range(collection_num): - # 1. initialize with data - log.info("test_search_multi_collections: search round %d" % (i + 1)) - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, nb, - auto_id=auto_id, - dim=dim)[0:4] - # 2. search - vectors = [[random.random() for _ in range(dim)] for _ in range(nq)] - log.info("test_search_multi_collections: searching %s entities (nq = %s) from collection %s" % - (default_limit, nq, collection_w.name)) - collection_w.search(vectors[:nq], default_search_field, - default_search_params, default_limit, - default_search_exp, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "pk_name": ct.default_int64_field_name, - "limit": default_limit, - "_async": _async}) - - @pytest.mark.tags(CaseLabel.L2) - def test_search_concurrent_multi_threads(self, nq, _async, null_data_percent): - """ - target: test concurrent search with multi-processes - method: search with 10 processes, each process uses dependent connection - expected: status ok and the returned vectors should be query_records - """ - # 1. initialize with data - nb = 3000 - dim = 64 - auto_id = False - enable_dynamic_field = False - threads_num = 10 - threads = [] - collection_w, _, _, insert_ids = \ - self.init_collection_general(prefix, True, nb, auto_id=auto_id, dim=dim, - enable_dynamic_field=enable_dynamic_field, - nullable_fields={ct.default_string_field_name: null_data_percent})[0:4] - - def search(collection_w): - vectors = [[random.random() for _ in range(dim)] - for _ in range(nq)] - collection_w.search(vectors[:nq], default_search_field, - default_search_params, default_limit, - default_search_exp, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": default_limit, - "_async": _async}) - - # 2. search with multi-processes - log.info("test_search_concurrent_multi_threads: searching with %s processes" % threads_num) - for i in range(threads_num): - t = threading.Thread(target=search, args=(collection_w,)) - threads.append(t) - t.start() - time.sleep(0.2) - for t in threads: - t.join() - @pytest.mark.tags(CaseLabel.L2) @pytest.mark.skip(reason="issue 37113") def test_search_concurrent_two_collections_nullable(self, nq, _async): @@ -1565,192 +1037,6 @@ class TestCollectionSearch(TestcaseBase): "limit": default_limit, }) - @pytest.mark.tags(CaseLabel.L1) - def test_search_with_consistency_bounded(self, nq, _async): - """ - target: test search with different consistency level - method: 1. create a collection - 2. insert data - 3. 
search with consistency_level is "bounded" - expected: searched successfully - """ - limit = 1000 - nb_old = 500 - dim = 64 - auto_id = True - enable_dynamic_field = False - collection_w, _, _, insert_ids = \ - self.init_collection_general(prefix, True, nb_old, auto_id=auto_id, - dim=dim, enable_dynamic_field=enable_dynamic_field)[0:4] - # 2. search for original data after load - vectors = [[random.random() for _ in range(dim)] for _ in range(nq)] - collection_w.search(vectors[:nq], default_search_field, - default_search_params, limit, - default_search_exp, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": nb_old, - "_async": _async, - }) - - kwargs = {} - consistency_level = kwargs.get( - "consistency_level", CONSISTENCY_BOUNDED) - kwargs.update({"consistency_level": consistency_level}) - - nb_new = 400 - _, _, _, insert_ids_new, _ = cf.insert_data(collection_w, nb_new, - auto_id=auto_id, dim=dim, - insert_offset=nb_old, - enable_dynamic_field=enable_dynamic_field) - insert_ids.extend(insert_ids_new) - - collection_w.search(vectors[:nq], default_search_field, - default_search_params, limit, - default_search_exp, _async=_async, - **kwargs, - ) - - @pytest.mark.tags(CaseLabel.L1) - def test_search_with_consistency_strong(self, nq, _async): - """ - target: test search with different consistency level - method: 1. create a collection - 2. insert data - 3. search with consistency_level is "Strong" - expected: searched successfully - """ - limit = 1000 - nb_old = 500 - dim = 64 - auto_id = False - enable_dynamic_field = False - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, nb_old, - auto_id=auto_id, dim=dim, - enable_dynamic_field=enable_dynamic_field)[0:4] - # 2. search for original data after load - vectors = [[random.random() for _ in range(dim)] for _ in range(nq)] - collection_w.search(vectors[:nq], default_search_field, - default_search_params, limit, - default_search_exp, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": nb_old, - "_async": _async}) - - nb_new = 400 - _, _, _, insert_ids_new, _ = cf.insert_data(collection_w, nb_new, - auto_id=auto_id, dim=dim, - insert_offset=nb_old, - enable_dynamic_field=enable_dynamic_field) - insert_ids.extend(insert_ids_new) - kwargs = {} - consistency_level = kwargs.get("consistency_level", CONSISTENCY_STRONG) - kwargs.update({"consistency_level": consistency_level}) - - collection_w.search(vectors[:nq], default_search_field, - default_search_params, limit, - default_search_exp, _async=_async, - **kwargs, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": nb_old + nb_new, - "_async": _async}) - - @pytest.mark.tags(CaseLabel.L1) - def test_search_with_consistency_eventually(self, nq, _async): - """ - target: test search with different consistency level - method: 1. create a collection - 2. insert data - 3. search with consistency_level is "eventually" - expected: searched successfully - """ - limit = 1000 - nb_old = 500 - dim = 64 - auto_id = True - enable_dynamic_field = True - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, nb_old, - auto_id=auto_id, dim=dim, - enable_dynamic_field=enable_dynamic_field)[0:4] - # 2. 
search for original data after load - vectors = [[random.random() for _ in range(dim)] for _ in range(nq)] - collection_w.search(vectors[:nq], default_search_field, - default_search_params, limit, - default_search_exp, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": nb_old, - "_async": _async}) - nb_new = 400 - _, _, _, insert_ids_new, _ = cf.insert_data(collection_w, nb_new, - auto_id=auto_id, dim=dim, - insert_offset=nb_old, - enable_dynamic_field=enable_dynamic_field) - insert_ids.extend(insert_ids_new) - kwargs = {} - consistency_level = kwargs.get( - "consistency_level", CONSISTENCY_EVENTUALLY) - kwargs.update({"consistency_level": consistency_level}) - collection_w.search(vectors[:nq], default_search_field, - default_search_params, limit, - default_search_exp, _async=_async, - **kwargs) - - @pytest.mark.tags(CaseLabel.L1) - def test_search_with_consistency_session(self, nq, _async): - """ - target: test search with different consistency level - method: 1. create a collection - 2. insert data - 3. search with consistency_level is "session" - expected: searched successfully - """ - limit = 1000 - nb_old = 500 - dim = 64 - auto_id = False - enable_dynamic_field = True - collection_w, _, _, insert_ids = self.init_collection_general(prefix, True, nb_old, - auto_id=auto_id, dim=dim, - enable_dynamic_field=enable_dynamic_field)[0:4] - # 2. search for original data after load - vectors = [[random.random() for _ in range(dim)] for _ in range(nq)] - collection_w.search(vectors[:nq], default_search_field, - default_search_params, limit, - default_search_exp, _async=_async, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": nb_old, - "_async": _async}) - - kwargs = {} - consistency_level = kwargs.get( - "consistency_level", CONSISTENCY_SESSION) - kwargs.update({"consistency_level": consistency_level}) - - nb_new = 400 - _, _, _, insert_ids_new, _ = cf.insert_data(collection_w, nb_new, - auto_id=auto_id, dim=dim, - insert_offset=nb_old, - enable_dynamic_field=enable_dynamic_field) - insert_ids.extend(insert_ids_new) - collection_w.search(vectors[:nq], default_search_field, - default_search_params, limit, - default_search_exp, _async=_async, - **kwargs, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "ids": insert_ids, - "limit": nb_old + nb_new, - "_async": _async}) - @pytest.mark.tags(CaseLabel.L1) def test_search_ignore_growing(self, nq, _async): """ @@ -2161,3 +1447,4 @@ class TestCollectionSearch(TestcaseBase): "invalid parameter"}) + \ No newline at end of file diff --git a/tests/python_client/milvus_client_v2/test_milvus_client_search_v2_new.py b/tests/python_client/milvus_client_v2/test_milvus_client_search_v2_new.py index e769b747bb..3be5bd8e48 100644 --- a/tests/python_client/milvus_client_v2/test_milvus_client_search_v2_new.py +++ b/tests/python_client/milvus_client_v2/test_milvus_client_search_v2_new.py @@ -1,4 +1,6 @@ import logging +import time + import numpy as np from common.constants import * from utils.util_pymilvus import * @@ -47,13 +49,13 @@ class TestMilvusClientSearchBasicV2(TestMilvusClientV2Base): self.collection_name = "TestMilvusClientSearchV2" + cf.gen_unique_str("_") self.partition_names = ["partition_1", "partition_2"] self.pk_field_name = ct.default_primary_field_name - self.float_vector_field_name = "float_vector" + self.float_vector_field_name = ct.default_float_vec_field_name self.bfloat16_vector_field_name = 
"bfloat16_vector" self.sparse_vector_field_name = "sparse_vector" self.binary_vector_field_name = "binary_vector" - self.float_vector_dim = 128 - self.bf16_vector_dim = 200 - self.binary_vector_dim = 256 + self.float_vector_dim = 36 + self.bf16_vector_dim = 35 + self.binary_vector_dim = 32 self.float_vector_metric = "COSINE" self.bf16_vector_metric = "L2" self.sparse_vector_metric = "IP" @@ -346,7 +348,8 @@ class TestMilvusClientSearchBasicV2(TestMilvusClientV2Base): ) @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("limit, nq", zip([1, 1000, ct.max_limit], [ct.max_nq, 10, 1])) + # @pytest.mark.parametrize("limit, nq", zip([1, 1000, ct.max_limit], [ct.max_nq, 10, 1])) + @pytest.mark.parametrize("limit, nq", zip([ct.max_limit], [1])) def test_search_with_different_nq_limits(self, limit, nq): """ target: test search with different nq and limit values @@ -360,7 +363,7 @@ class TestMilvusClientSearchBasicV2(TestMilvusClientV2Base): # Generate vectors to search vectors_to_search = cf.gen_vectors(nq, self.float_vector_dim) - search_params = {"metric_type": self.float_vector_metric, "params": {"nprobe": 100}} + search_params = {"metric_type": self.float_vector_metric, "params": {"nprobe": 128}} # search with limit search_res, _ = self.search( @@ -453,6 +456,73 @@ class TestMilvusClientSearchBasicV2(TestMilvusClientV2Base): "pk_name": self.pk_field_name } ) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("wildcard_output_fields", [["*"], ["*", ct.default_primary_field_name], + ["*", ct.default_float_vec_field_name]]) + def test_search_partition_with_output_fields(self, wildcard_output_fields): + """ + target: test partition search with output fields + method: 1. connect to milvus + 2. partition search on an existing collection with output fields + expected: search successfully with output fields + """ + client = self._client() + collection_name = self.collection_name + collection_info = self.describe_collection(client, collection_name)[0] + fields = collection_info.get('fields', None) + field_names = [field.get('name') for field in fields] + partition_name = self.partition_names[0] + + # Generate vectors to search + vectors_to_search = cf.gen_vectors(default_nq, self.float_vector_dim) + search_params = {"metric_type": self.float_vector_metric, "params": {"nprobe": 100}} + + # search with output fields + output_fields = cf.get_wildcard_output_field_names(collection_info, wildcard_output_fields) + search_res, _ = self.search( + client, + collection_name, + vectors_to_search[:default_nq], + partition_names=[partition_name], + anns_field=self.float_vector_field_name, + search_params=search_params, + limit=default_limit, + output_fields=["*"], + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": default_nq, + "limit": default_limit, + "output_fields": field_names.extend([self.dyna_filed_name1, self.dyna_filed_name2])}) + + @pytest.mark.tags(CaseLabel.L2) + def test_search_with_invalid_output_fields(self): + """ + target: test search with output fields using wildcard + method: search with one output_field (wildcard) + expected: search success + """ + client = self._client() + collection_name = self.collection_name + collection_info = self.describe_collection(client, collection_name)[0] + fields = collection_info.get('fields', None) + field_names = [field.get('name') for field in fields] + partition_name = self.partition_names[0] + + # Generate vectors to search + vectors_to_search = cf.gen_vectors(default_nq, 
self.float_vector_dim) + search_params = {} + invalid_output_fields = [["%"], [""], ["-"], ["non_exist_field"]] + for field in invalid_output_fields: + error1 = {ct.err_code: 999, ct.err_msg: "field %s not exist" % field[0]} + error2 = {ct.err_code: 999, ct.err_msg: "`output_fields` value %s is illegal" % field} + error = error2 if field == [""] else error1 + self.search(client, collection_name, vectors_to_search[:default_nq], + anns_field=self.float_vector_field_name, + search_params=search_params, + limit=default_limit, + output_fields=field, + check_task=CheckTasks.err_res, check_items=error) @pytest.mark.tags(CaseLabel.L2) def test_search_with_more_than_max_limit(self): @@ -727,19 +797,12 @@ class TestSearchV2Independent(TestMilvusClientV2Base): schema = self.create_schema(client)[0] schema.add_field(ct.default_primary_field_name, DataType.INT64, is_primary=True, auto_id=False) schema.add_field(ct.default_float_vec_field_name, DataType.FLOAT_VECTOR, dim=ct.default_dim) - schema.add_field(ct.default_float_field_name, DataType.FLOAT) - schema.add_field(ct.default_string_field_name, DataType.VARCHAR, max_length=256) + schema.add_field(ct.default_float_field_name, DataType.FLOAT, nullable=True) + schema.add_field(ct.default_string_field_name, DataType.VARCHAR, max_length=256, nullable=True) self.create_collection(client, collection_name, schema=schema) # insert data - data = [] - for i in range(default_nb): - data.append({ - ct.default_primary_field_name: i, - ct.default_float_vec_field_name: cf.gen_vectors(1, ct.default_dim)[0], - ct.default_float_field_name: i * 1.0, - ct.default_string_field_name: str(i) - }) + data = cf.gen_row_data_by_schema(schema=schema, nb=default_nb) self.insert(client, collection_name, data) # create index with metric cosine @@ -1006,7 +1069,7 @@ class TestSearchV2Independent(TestMilvusClientV2Base): # search in the collection vectors_to_search = cf.gen_vectors(1, ct.default_dim) - limit = 1000 + limit = 100 search_params = {} search_res1, _ = self.search( client, @@ -1101,6 +1164,7 @@ class TestSearchV2Independent(TestMilvusClientV2Base): # release the partition again and load the collection self.release_partitions(client, collection_name, [to_be_released_partition]) self.load_collection(client, collection_name) + self.refresh_load(client, collection_name) # workaround for #43386, remove this line after it was fixed # search again search_res5, _ = self.search( @@ -1271,7 +1335,8 @@ class TestSearchV2Independent(TestMilvusClientV2Base): search_params=search_params, limit=ct.default_limit, output_fields=["*"], check_task=CheckTasks.check_search_results, - check_items={"nq": ct.default_nq, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, "limit": ct.default_limit}) # disable mmap self.release_collection(client, collection_name) @@ -1283,7 +1348,8 @@ class TestSearchV2Independent(TestMilvusClientV2Base): search_params=search_params, limit=ct.default_limit, output_fields=["*"], check_task=CheckTasks.check_search_results, - check_items={"nq": ct.default_nq, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, "limit": ct.default_limit}) @pytest.mark.tags(CaseLabel.L2) @@ -1345,7 +1411,8 @@ class TestSearchV2Independent(TestMilvusClientV2Base): search_params=search_params, limit=ct.default_limit, output_fields=output_fields, check_task=CheckTasks.check_search_results, - check_items={"nq": ct.default_nq, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, "limit": ct.default_limit}) # disable mmap 
self.release_collection(client, collection_name)
@@ -1357,26 +1424,31 @@ class TestSearchV2Independent(TestMilvusClientV2Base):
                     search_params=search_params, limit=ct.default_limit,
                     output_fields=output_fields,
                     check_task=CheckTasks.check_search_results,
-                    check_items={"nq": ct.default_nq,
+                    check_items={"enable_milvus_client_api": True,
+                                 "nq": ct.default_nq,
                                  "limit": ct.default_limit})
 
     @pytest.mark.tags(CaseLabel.L2)
     @pytest.mark.parametrize("num_shards", [-256, 0, ct.max_shards_num // 2, ct.max_shards_num])
-    def test_search_with_non_default_shard_nums(self, num_shards):
+    def test_search_with_non_default_shard_nums(self, num_shards):
         """
         Test search functionality with non-default shard numbers.
 
-        This test verifies that:
-        1. Collections are created with default shard numbers when num_shards <= 0
-        2. Collections are created with specified shard numbers when num_shards > 0
-        3. Search operations work correctly with different shard configurations
+        This test verifies that search operations work correctly when collections are created with:
+        - Negative shard numbers (should use default)
+        - Zero shards (should use default)
+        - Half of max shards
+        - Max shards
 
-        The test follows these steps:
-        1. Creates a collection with specified shard numbers
+        The test performs the following steps:
+        1. Creates a collection with specified shard number
         2. Inserts test data
-        3. Builds an index
-        4. Performs a search operation
-        5. Validates the results
+        3. Builds index
+        4. Loads collection
+        5. Executes search and verifies results
+
+        @param num_shards: Number of shards to test (parameterized)
+        @tags: L2
         """
         client = self._client()
         collection_name = cf.gen_collection_name_by_testcase_name()
@@ -1412,36 +1484,193 @@ class TestSearchV2Independent(TestMilvusClientV2Base):
         self.search(client, collection_name, vectors, anns_field="vector",
                     search_params=search_params, limit=ct.default_limit,
                     check_task=CheckTasks.check_search_results,
-                    check_items={"nq": ct.default_nq,
+                    check_items={"enable_milvus_client_api": True,
+                                 "nq": ct.default_nq,
                                  "limit": ct.default_limit})
 
     @pytest.mark.tags(CaseLabel.L2)
-    def test_search_HNSW_index_with_redundant_param(self):
+    @pytest.mark.parametrize('vector_dtype', ct.all_dense_vector_types)
+    @pytest.mark.parametrize('index', ct.all_index_types[:8])
+    def test_search_output_field_vector_with_dense_vector_and_index(self, vector_dtype, index):
         """
-        Test search functionality with HNSW index and redundant parameters.
+        Test search with output vector field after different index types.
 
-        This test verifies that:
-        1. HNSW index can be created with redundant parameters
-        2. Search operations work correctly with redundant parameters
-        3. Redundant parameters are ignored
+        Steps:
+        1. Create a collection with specified schema and insert test data
+        2. Build index (with error handling for unsupported index types)
+        3. Load collection and perform search operations with:
+           - All output fields ("*")
+           - Explicitly specified all fields
+           - Subset of fields
+        4. Verify search results match expected output fields
 
-        The test performs following steps:
-        1. Creates a collection with float vectors
-        2. Inserts test data
-        3. Creates HNSW index with redundant parameters
-        4. Performs a search operation
-        5. 
Validates the results + Parameters: + - vector_dtype: Type of vector data (all supported dense vector types) + - index: Index type (first 8 supported index types) + + Expected: + - Successful search operations with correct output fields returned + - Proper error when attempting unsupported index combinations """ - dim = 16 - index = "HNSW" + + metrics = 'COSINE' client = self._client() collection_name = cf.gen_collection_name_by_testcase_name() + dim = 32 schema = self.create_schema(client)[0] schema.add_field('id', DataType.INT64, is_primary=True, auto_id=False) - schema.add_field('vector', DataType.FLOAT_VECTOR, dim=dim) + schema.add_field('vector', vector_dtype, dim=dim) + schema.add_field('float_array', DataType.ARRAY, element_type=DataType.FLOAT, max_capacity=200) + schema.add_field('json_field', DataType.JSON, max_length=200) + schema.add_field('string_field', DataType.VARCHAR, max_length=200) self.create_collection(client, collection_name, schema=schema) - # insert + # Insert data in 3 batches with unique primary keys using a loop + insert_times = 3 + random_vectors = list(cf.gen_vectors(ct.default_nb*insert_times, dim, vector_data_type=vector_dtype)) \ + if vector_dtype == DataType.FLOAT_VECTOR \ + else cf.gen_vectors(ct.default_nb*insert_times, dim, vector_data_type=vector_dtype) + for j in range(insert_times): + start_pk = j * ct.default_nb + rows = [{ + "id": i + start_pk, + "vector": random_vectors[i + start_pk], + "float_array": [random.random() for _ in range(10)], + "json_field": {"name": "abook", "words": i}, + "string_field": "Hello, Milvus!" + } for i in range(ct.default_nb)] + self.insert(client, collection_name, rows) + self.flush(client, collection_name) + + # build index + index_params, _ = self.prepare_index_params(client) + index_params.add_index(field_name='vector', index_type=index, + metric_type=metrics, + params=cf.get_index_params_params(index_type=index)) + if vector_dtype == DataType.INT8_VECTOR and index != 'HNSW': + # INT8_Vector only supports HNSW index for now + error = {"err_code": 999, "err_msg": f"data type Int8Vector can't build with this index {index}"} + self.create_index(client, collection_name, index_params=index_params, + check_task=CheckTasks.err_res, check_items=error) + else: + self.create_index(client, collection_name, index_params=index_params) + + # load the collection with index + assert self.wait_for_index_ready(client, collection_name, default_vector_field_name, timeout=120) + self.load_collection(client, collection_name) + + # search with output field vector + search_params = {} + vectors = random_vectors[:ct.default_nq] + # search output all fields + self.search(client, collection_name, vectors, anns_field="vector", + search_params=search_params, limit=ct.default_limit, + output_fields=["*"], + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, + "limit": ct.default_limit, + "output_fields": ["id", "vector", "float_array", "json_field", "string_field"]}) + # search output specify all fields + self.search(client, collection_name, vectors, anns_field="vector", + search_params=search_params, limit=ct.default_limit, + output_fields=["id", "vector", "float_array", "json_field", "string_field"], + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, + "limit": ct.default_limit, + "output_fields": ["id", "vector", "float_array", "json_field", "string_field"]}) + # search output specify some fields + 
self.search(client, collection_name, vectors, anns_field="vector", + search_params=search_params, limit=ct.default_limit, + output_fields=["id", "vector", "json_field"], + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, + "limit": ct.default_limit, + "output_fields": ["id", "vector", "json_field"]}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize('index', ct.binary_supported_index_types) + def test_search_with_output_fields_vector_with_binary_vector_and_index(self, index): + """ + Test search functionality with output fields for binary vector type and specified index. + + This test case verifies that: + 1. A collection with binary vector field can be created and data inserted + 2. Index can be built on the binary vector field + 3. Search operation with output fields (including vector field) works correctly + 4. Results contain expected output fields (id and vector) + + Parameters: + index: The index type to test with (parametrized via pytest.mark.parametrize) + + The test performs following steps: + - Creates collection with binary vector field + - Inserts test data in batches + - Builds specified index type + - Performs search with output fields + - Validates search results contain expected fields + """ + vector_dtype = DataType.BINARY_VECTOR + client = self._client() + dim = 32 + collection_name = cf.gen_collection_name_by_testcase_name() + schema, _ = self.create_schema(client) + schema.add_field("id", datatype=DataType.INT64, is_primary=True, auto_id=False) + schema.add_field("vector", datatype=vector_dtype, dim=dim) + self.create_collection(client, collection_name, schema=schema) + + # Insert data in 3 batches with unique primary keys using a loop + insert_times = 3 + random_vectors = list(cf.gen_vectors(ct.default_nb * insert_times, dim, vector_data_type=vector_dtype)) \ + if vector_dtype == DataType.FLOAT_VECTOR \ + else cf.gen_vectors(ct.default_nb * insert_times, dim, vector_data_type=vector_dtype) + for j in range(insert_times): + start_pk = j * ct.default_nb + rows = [{ + "id": i + start_pk, + "vector": random_vectors[i + start_pk] + } for i in range(ct.default_nb)] + self.insert(client, collection_name, rows) + self.flush(client, collection_name) + + # build index + index_params, _ = self.prepare_index_params(client) + index_params.add_index(field_name='vector', index_type=index, + metric_type='JACCARD', + params=cf.get_index_params_params(index_type=index)) + self.create_index(client, collection_name, index_params=index_params) + + # load the collection with index + assert self.wait_for_index_ready(client, collection_name, 'vector', timeout=120) + self.load_collection(client, collection_name) + + # search with output field vector + search_params = {} + vectors = random_vectors[:ct.default_nq] + self.search(client, collection_name, vectors, anns_field="vector", + search_params=search_params, limit=ct.default_limit, + output_fields=["*"], + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": ct.default_nq, + "limit": ct.default_limit, + "output_fields": ["id", "vector"]}) + + @pytest.mark.tags(CaseLabel.L2) + def test_search_with_output_fields_empty(self): + """ + target: test search with output fields + method: search with empty output_field + expected: search success + """ + client = self._client() + collection_name = cf.gen_collection_name_by_testcase_name() + dim = 32 + # create collection with fast mode + self.create_collection(client, 
collection_name, dimension=dim)
+        # insert data
         data = []
         for i in range(ct.default_nb):
             data.append({
@@ -1450,24 +1679,20 @@ class TestSearchV2Independent(TestMilvusClientV2Base):
             })
         self.insert(client, collection_name, data)
         self.flush(client, collection_name)
-        # create index
-        index_params = self.prepare_index_params(client)[0]
-        params = cf.get_index_params_params(index)
-        params["nlist"] = 100  # nlist is redundant parameter
-        index_params.add_index(field_name='vector', index_type=index,
-                               metric_type='COSINE', params=params)
-        self.create_index(client, collection_name, index_params=index_params)
-        self.wait_for_index_ready(client, collection_name, index_name='vector')
-        index_info = self.describe_index(client, collection_name, index_name='vector')
-        assert index_info[0]["nlist"] == '100'
-        # load
-        self.load_collection(client, collection_name)
-        # search
-        vectors = cf.gen_vectors(ct.default_nq, dim)
+        # search with empty output fields
         search_params = {}
+        vectors = cf.gen_vectors(ct.default_nq, dim)
         self.search(client, collection_name, vectors, anns_field="vector",
                     search_params=search_params, limit=ct.default_limit,
+                    output_fields=[],
                     check_task=CheckTasks.check_search_results,
-                    check_items={"nq": ct.default_nq,
+                    check_items={"enable_milvus_client_api": True,
+                                 "nq": ct.default_nq,
+                                 "limit": ct.default_limit})
+        self.search(client, collection_name, vectors, anns_field="vector",
+                    search_params=search_params, limit=ct.default_limit,
+                    output_fields=None,
+                    check_task=CheckTasks.check_search_results,
+                    check_items={"enable_milvus_client_api": True,
+                                 "nq": ct.default_nq,
                                  "limit": ct.default_limit})
- 
\ No newline at end of file
diff --git a/tests/python_client/testcases/indexes/idx_diskann.py b/tests/python_client/testcases/indexes/idx_diskann.py
new file mode 100644
index 0000000000..f6d882996f
--- /dev/null
+++ b/tests/python_client/testcases/indexes/idx_diskann.py
@@ -0,0 +1,95 @@
+from pymilvus import DataType
+from common import common_type as ct
+
+success = "success"
+
+
+class DISKANN:
+    supported_vector_types = [
+        DataType.FLOAT_VECTOR,
+        DataType.FLOAT16_VECTOR,
+        DataType.BFLOAT16_VECTOR
+    ]
+
+    supported_metrics = ['L2', 'IP', 'COSINE']
+
+    build_params = [
+        # search_list_size
+        # Type: Integer Range: [1, int_max]
+        # Default value: 100
+        {"description": "Minimum Boundary Test", "params": {"search_list_size": 1}, "expected": success},
+        {"description": "Large Value Test", "params": {"search_list_size": 10000}, "expected": success},
+        {"description": "Out of Range Test - Negative", "params": {"search_list_size": -1}, "expected": success},
+        {"description": "String Type Test", "params": {"search_list_size": "100"}, "expected": success},
+        {"description": "Float Type Test", "params": {"search_list_size": 100.0}, "expected": success},
+        {"description": "Boolean Type Test", "params": {"search_list_size": True}, "expected": success},
+        {"description": "None Type Test", "params": {"search_list_size": None}, "expected": success},
+        # search_cache_budget_gb_ratio
+        # Type: Float Range: [0.0, 0.3)
+        # Default value: 0.10
+        # TODO: run the minimum boundary test after issue #43176 is fixed
+        # {"description": "Minimum Boundary Test", "params": {"search_cache_budget_gb_ratio": 0.0}, "expected": success},
+        {"description": "Maximum Boundary Test", "params": {"search_cache_budget_gb_ratio": 0.3}, "expected": success},
+        {"description": "Default value Test", "params": {"search_cache_budget_gb_ratio": 0.1}, "expected": success},
+        {"description": "Out of Range Test - Negative", "params": {"search_cache_budget_gb_ratio": -0.1}, "expected": success},
"params": {"search_cache_budget_gb_ratio": -0.1}, "expected": success}, + {"description": "Out of Range Test - Too Large", "params": {"search_cache_budget_gb_ratio": 0.31}, "expected": success}, + {"description": "String Type Test", "params": {"search_cache_budget_gb_ratio": "0.2"}, "expected": success}, + {"description": "Boolean Type Test", "params": {"search_cache_budget_gb_ratio": True}, "expected": success}, + {"description": "None Type Test", "params": {"search_cache_budget_gb_ratio": None}, "expected": success}, + # pq_code_budget_gb_ratio + # Type: Float Range: (0.0, 0.25] + # Default value: 0.125 + {"description": "Minimum Boundary Test", "params": {"pq_code_budget_gb_ratio": 0.0001}, "expected": success}, + {"description": "Maximum Boundary Test", "params": {"pq_code_budget_gb_ratio": 0.25}, "expected": success}, + {"description": "Default value Test", "params": {"pq_code_budget_gb_ratio": 0.125}, "expected": success}, + {"description": "Out of Range Test - Negative", "params": {"pq_code_budget_gb_ratio": -0.1}, "expected": success}, + {"description": "Out of Range Test - Too Large", "params": {"pq_code_budget_gb_ratio": 0.26}, "expected": success}, + {"description": "String Type Test", "params": {"pq_code_budget_gb_ratio": "0.1"}, "expected": success}, + {"description": "Boolean Type Test", "params": {"pq_code_budget_gb_ratio": True}, "expected": success}, + {"description": "None Type Test", "params": {"pq_code_budget_gb_ratio": None}, "expected": success}, + # max_degree + # Type: Integer Range: [1, 512] + # Default value: 56 + {"description": "Minimum Boundary Test", "params": {"max_degree": 1}, "expected": success}, + {"description": "Maximum Boundary Test", "params": {"max_degree": 512}, "expected": success}, + {"description": "Default value Test", "params": {"max_degree": 56}, "expected": success}, + {"description": "Large Value Test", "params": {"max_degree": 128}, "expected": success}, + {"description": "Out of Range Test - Negative", "params": {"max_degree": -1}, "expected": success}, + {"description": "String Type Test", "params": {"max_degree": "32"}, "expected": success}, + {"description": "Float Type Test", "params": {"max_degree": 32.0}, "expected": success}, + {"description": "Boolean Type Test", "params": {"max_degree": True}, "expected": success}, + {"description": "None Type Test", "params": {"max_degree": None}, "expected": success}, + # 组合参数 + {"description": "Optimal Performance Combination Test", "params": {"search_list_size": 100, "beamwidth": 10, "search_cache_budget_gb_ratio": 0.5, "pq_code_budget_gb_ratio": 0.5}, "expected": success}, + {"description": "empty dict params", "params": {}, "expected": success}, + {"description": "not_defined_param in the dict params", "params": {"search_list_size": 100, "not_defined_param": "nothing"}, "expected": success}, + + ] + + search_params = [ + # beam_width_ratio + # Type: Float Range: [1, max(128 / CPU number, 16)] + # Default value: 4.0 + {"description": "Minimum Boundary Test", "params": {"beam_width_ratio": 1.0}, "expected": success}, + {"description": "Maximum Boundary Test", "params": {"beam_width_ratio": 16.0}, "expected": success}, + {"description": "Default value Test", "params": {"beam_width_ratio": 4.0}, "expected": success}, + {"description": "Out of Range Test - Negative", "params": {"beam_width_ratio": -0.1}, "expected": success}, + {"description": "Out of Range Test - Too Large", "params": {"beam_width_ratio": 17.0}, "expected": success}, + {"description": "String Type Test", "params": 
{"beam_width_ratio": "2.0"}, "expected": success}, + {"description": "Boolean Type Test", "params": {"beam_width_ratio": True}, "expected": success}, + {"description": "None Type Test", "params": {"beam_width_ratio": None}, "expected": success}, + # search_list_size + # Type: Integer Range: [1, int_max] + # Default value: 100 + {"description": "Minimum Boundary Test", "params": {"search_list_size": 1}, "expected": {"err_code": 999, "err_msg": "search_list_size(1) should be larger than k(10)"}}, + {"description": "Large Value Test", "params": {"search_list_size": 1000}, "expected": success}, + {"description": "Default value Test", "params": {"search_list_size": 100}, "expected": success}, + {"description": "Out of Range Test - Negative", "params": {"search_list_size": -1}, "expected": {"err_code": 999, "err_msg": "param 'search_list_size' (-1) should be in range [1, 2147483647]"}}, + {"description": "String Type Test", "params": {"search_list_size": "100"}, "expected": success}, + {"description": "Float Type Test", "params": {"search_list_size": 100.0}, "expected": {"err_code": 999, "err_msg": "Type conflict in json: param 'search_list_size' (100.0) should be integer"}}, + {"description": "Boolean Type Test", "params": {"search_list_size": True}, "expected": {"err_code": 999, "err_msg": "Type conflict in json: param 'search_list_size' (true) should be integer"}}, + {"description": "None Type Test", "params": {"search_list_size": None}, "expected": {"err_code": 999, "err_msg": "Type conflict in json: param 'search_list_size' (null) should be integer"}}, + # mix params + {"description": "mix params", "params": {"search_list_size": 100, "beam_width_ratio": 0.5}, "expected": success}, + {"description": "mix params", "params": {}, "expected": success}, + ] \ No newline at end of file diff --git a/tests/python_client/testcases/indexes/idx_hnsw.py b/tests/python_client/testcases/indexes/idx_hnsw.py new file mode 100644 index 0000000000..5f0d8f498f --- /dev/null +++ b/tests/python_client/testcases/indexes/idx_hnsw.py @@ -0,0 +1,175 @@ +from pymilvus import DataType +from common import common_type as ct + +success = "success" + +class HNSW: + supported_vector_types = [ + DataType.FLOAT_VECTOR, + DataType.FLOAT16_VECTOR, + DataType.BFLOAT16_VECTOR, + DataType.INT8_VECTOR + ] + + supported_metrics = ['L2', 'IP', 'COSINE'] + + build_params = [ + # M params test + { + "description": "Minimum Boundary Test", + "params": {"M": 2}, + "expected": success + }, + { + "description": "Maximum Boundary Test", + "params": {"M": 2048}, + "expected": success + }, + { + "description": "Out of Range Test - Negative", + "params": {"M": -1}, + "expected": {"err_code": 999, "err_msg": "param 'M' (-1) should be in range [2, 2048]"} + }, + { + "description": "Out of Range Test - Too Large", + "params": {"M": 2049}, + "expected": {"err_code": 999, "err_msg": "param 'M' (2049) should be in range [2, 2048]"} + }, + { + "description": "String Type Test will ignore the wrong type", + "params": {"M": "16"}, + "expected": success + }, + { + "description": "Float Type Test", + "params": {"M": 16.0}, + "expected": {"err_code": 999, "err_msg": "wrong data type in json"} + }, + { + "description": "Boolean Type Test", + "params": {"M": True}, + "expected": {"err_code": 999, "err_msg": "invalid integer value, key: 'M', value: 'True': invalid parameter"} + }, + { + "description": "None Type Test, use default value", + "params": {"M": None}, + "expected": success + }, + { + "description": "List Type Test", + "params": {"M": [16]}, + 
"expected": {"err_code": 999, "err_msg": "invalid integer value, key: 'M', value: '[16]': invalid parameter"} + }, + # efConstruction params test + { + "description": "Minimum Boundary Test", + "params": {"efConstruction": 1}, + "expected": success + }, + { + "description": "Large Value Test", + "params": {"efConstruction": 10000}, + "expected": success + }, + { + "description": "Out of Range Test - Negative", + "params": {"efConstruction": -1}, + "expected": {"err_code": 999, "err_msg": "param 'efConstruction' (-1) should be in range [1, 2147483647]"} + }, + { + "description": "String Type Test will ignore the wrong type", + "params": {"efConstruction": "100"}, + "expected": success + }, + { + "description": "Float Type Test", + "params": {"efConstruction": 100.0}, + "expected": {"err_code": 999, "err_msg": "wrong data type in json"} + }, + { + "description": "Boolean Type Test", + "params": {"efConstruction": True}, + "expected": {"err_code": 999, "err_msg": "invalid integer value, key: 'efConstruction', value: 'True': invalid parameter"} + }, + { + "description": "None Type Test, use default value", + "params": {"efConstruction": None}, + "expected": success + }, + { + "description": "List Type Test", + "params": {"efConstruction": [100]}, + "expected": {"err_code": 999, "err_msg": "invalid integer value, key: 'efConstruction', value: '[100]': invalid parameter"} + }, + # combination params test + { + "description": "Optimal Performance Combination Test", + "params": {"M": 16, "efConstruction": 200}, + "expected": success + }, + { + "description": "empty dict params", + "params": {}, + "expected": success + }, + { + "description": "not_defined_param in the dict params", + "params": {"M": 16, "efConstruction": 200, "not_defined_param": "nothing"}, + "expected": success + }, + ] + + search_params = [ + # ef params test + { + "description": "Minimum Boundary Test", + "params": {"ef": 1}, + "expected": {"err_code": 999, "err_msg": "ef(1) should be larger than k(10)"} # assume default limit=10 + }, + { + "description": "Large Value Test", + "params": {"ef": 10000}, + "expected": success + }, + { + "description": "Out of Range Test - Negative", + "params": {"ef": -1}, + "expected": {"err_code": 999, "err_msg": "param 'ef' (-1) should be in range [1, 2147483647]"} + }, + { + "description": "String Type Test, not check data type", + "params": {"ef": "32"}, + "expected": success + }, + { + "description": "Float Type Test", + "params": {"ef": 32.0}, + "expected": {"err_code": 999, "err_msg": "Type conflict in json: param 'ef' (32.0) should be integer"} + }, + { + "description": "Boolean Type Test", + "params": {"ef": True}, + "expected": {"err_code": 999, "err_msg": "Type conflict in json: param 'ef' (true) should be integer"} + }, + { + "description": "None Type Test", + "params": {"ef": None}, + "expected": {"err_code": 999, "err_msg": "Type conflict in json: param 'ef' (null) should be integer"} + }, + { + "description": "List Type Test", + "params": {"ef": [32]}, + "expected": {"err_code": 999, "err_msg": "param 'ef' ([32]) should be integer"} + }, + # combination params test + { + "description": "Optimal Performance Combination Test", + "params": {"ef": 64}, + "expected": success + }, + { + "description": "empty dict params", + "params": {}, + "expected": success + }, + ] \ No newline at end of file diff --git a/tests/python_client/testcases/indexes/test_diskann.py b/tests/python_client/testcases/indexes/test_diskann.py new file mode 100644 index 0000000000..2da095d67a --- /dev/null +++ 
b/tests/python_client/testcases/indexes/test_diskann.py @@ -0,0 +1,229 @@ +import logging +from utils.util_pymilvus import * +from common.common_type import CaseLabel, CheckTasks +from common import common_type as ct +from common import common_func as cf +from base.client_v2_base import TestMilvusClientV2Base +import pytest +from idx_diskann import DISKANN + +index_type = "DISKANN" +success = "success" +pk_field_name = 'id' +vector_field_name = 'vector' +dim = ct.default_dim +default_nb = 2000 +default_build_params = {"search_list_size": 100, "beamwidth": 10, "pq_code_budget_gb": 1.0, "num_threads": 8, "max_degree": 64, "indexing_list_size": 100, "build_dram_budget_gb": 2.0, "search_dram_budget_gb": 1.0} +default_search_params = {"search_list_size": 100, "beamwidth": 10, "search_dram_budget_gb": 1.0} + + +class TestDiskannBuildParams(TestMilvusClientV2Base): + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("params", DISKANN.build_params) + def test_diskann_build_params(self, params): + client = self._client() + collection_name = cf.gen_collection_name_by_testcase_name() + schema, _ = self.create_schema(client) + schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False) + schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim) + self.create_collection(client, collection_name, schema=schema) + insert_times = 2 + random_vectors = list(cf.gen_vectors(default_nb * insert_times, dim, vector_data_type=DataType.FLOAT_VECTOR)) + for j in range(insert_times): + start_pk = j * default_nb + rows = [{ + pk_field_name: i + start_pk, + vector_field_name: random_vectors[i + start_pk] + } for i in range(default_nb)] + self.insert(client, collection_name, rows) + self.flush(client, collection_name) + build_params = params.get("params", None) + index_params = self.prepare_index_params(client)[0] + index_params.add_index(field_name=vector_field_name, + metric_type=cf.get_default_metric_for_vector_type(vector_type=DataType.FLOAT_VECTOR), + index_type=index_type, + params=build_params) + if params.get("expected", None) != success: + self.create_index(client, collection_name, index_params, + check_task=CheckTasks.err_res, + check_items=params.get("expected")) + else: + self.create_index(client, collection_name, index_params) + self.wait_for_index_ready(client, collection_name, index_name=vector_field_name) + self.load_collection(client, collection_name) + nq = 2 + search_vectors = cf.gen_vectors(nq, dim=dim, vector_data_type=DataType.FLOAT_VECTOR) + self.search(client, collection_name, search_vectors, + search_params=default_search_params, + limit=ct.default_limit, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": nq, + "limit": ct.default_limit, + "pk_name": pk_field_name}) + idx_info = client.describe_index(collection_name, vector_field_name) + if build_params is not None: + for key, value in build_params.items(): + if value is not None: + assert key in idx_info.keys() + assert str(value) == idx_info[key] + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("vector_data_type", ct.all_vector_types) + def test_diskann_on_all_vector_types(self, vector_data_type): + client = self._client() + collection_name = cf.gen_collection_name_by_testcase_name() + schema, _ = self.create_schema(client) + schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False) + if vector_data_type == DataType.SPARSE_FLOAT_VECTOR: + schema.add_field(vector_field_name, 
datatype=vector_data_type) + else: + schema.add_field(vector_field_name, datatype=vector_data_type, dim=dim) + self.create_collection(client, collection_name, schema=schema) + insert_times = 2 + random_vectors = list(cf.gen_vectors(default_nb*insert_times, default_dim, vector_data_type=vector_data_type)) \ + if vector_data_type == DataType.FLOAT_VECTOR \ + else cf.gen_vectors(default_nb*insert_times, default_dim, vector_data_type=vector_data_type) + for j in range(insert_times): + start_pk = j * default_nb + rows = [{ + pk_field_name: i + start_pk, + vector_field_name: random_vectors[i + start_pk] + } for i in range(default_nb)] + self.insert(client, collection_name, rows) + self.flush(client, collection_name) + index_params = self.prepare_index_params(client)[0] + metric_type = cf.get_default_metric_for_vector_type(vector_data_type) + index_params.add_index(field_name=vector_field_name, + metric_type=metric_type, + index_type=index_type, + **default_build_params) + if vector_data_type not in DISKANN.supported_vector_types: + self.create_index(client, collection_name, index_params, + check_task=CheckTasks.err_res, + check_items={"err_code": 999, + "err_msg": f"can't build with this index DISKANN: invalid parameter"}) + else: + self.create_index(client, collection_name, index_params) + self.wait_for_index_ready(client, collection_name, index_name=vector_field_name) + self.load_collection(client, collection_name) + nq = 2 + search_vectors = cf.gen_vectors(nq, dim=dim, vector_data_type=vector_data_type) + self.search(client, collection_name, search_vectors, + search_params=default_search_params, + limit=ct.default_limit, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": nq, + "limit": ct.default_limit, + "pk_name": pk_field_name}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("metric", DISKANN.supported_metrics) + def test_diskann_on_all_metrics(self, metric): + client = self._client() + collection_name = cf.gen_collection_name_by_testcase_name() + schema, _ = self.create_schema(client) + schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False) + schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim) + self.create_collection(client, collection_name, schema=schema) + insert_times = 2 + random_vectors = list(cf.gen_vectors(default_nb*insert_times, default_dim, vector_data_type=DataType.FLOAT_VECTOR)) + for j in range(insert_times): + start_pk = j * default_nb + rows = [{ + pk_field_name: i + start_pk, + vector_field_name: random_vectors[i + start_pk] + } for i in range(default_nb)] + self.insert(client, collection_name, rows) + self.flush(client, collection_name) + index_params = self.prepare_index_params(client)[0] + index_params.add_index(field_name=vector_field_name, + metric_type=metric, + index_type=index_type, + **default_build_params) + self.create_index(client, collection_name, index_params) + self.wait_for_index_ready(client, collection_name, index_name=vector_field_name) + self.load_collection(client, collection_name) + nq = 2 + search_vectors = cf.gen_vectors(nq, dim=dim, vector_data_type=DataType.FLOAT_VECTOR) + self.search(client, collection_name, search_vectors, + search_params=default_search_params, + limit=ct.default_limit, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": nq, + "limit": ct.default_limit, + "pk_name": pk_field_name}) + + +@pytest.mark.xdist_group("TestDiskannSearchParams") +class 
TestDiskannSearchParams(TestMilvusClientV2Base): + def setup_class(self): + super().setup_class(self) + self.collection_name = "TestDiskannSearchParams" + cf.gen_unique_str("_") + self.float_vector_field_name = vector_field_name + self.float_vector_dim = dim + self.primary_keys = [] + self.enable_dynamic_field = False + self.datas = [] + + @pytest.fixture(scope="class", autouse=True) + def prepare_collection(self, request): + client = self._client() + collection_schema = self.create_schema(client)[0] + collection_schema.add_field(pk_field_name, DataType.INT64, is_primary=True, auto_id=False) + collection_schema.add_field(self.float_vector_field_name, DataType.FLOAT_VECTOR, dim=128) + self.create_collection(client, self.collection_name, schema=collection_schema, + enable_dynamic_field=self.enable_dynamic_field, force_teardown=False) + insert_times = 2 + float_vectors = cf.gen_vectors(default_nb * insert_times, dim=self.float_vector_dim, + vector_data_type=DataType.FLOAT_VECTOR) + for j in range(insert_times): + rows = [] + for i in range(default_nb): + pk = i + j * default_nb + row = { + pk_field_name: pk, + self.float_vector_field_name: list(float_vectors[pk]) + } + self.datas.append(row) + rows.append(row) + self.insert(client, self.collection_name, data=rows) + self.primary_keys.extend([i + j * default_nb for i in range(default_nb)]) + self.flush(client, self.collection_name) + index_params = self.prepare_index_params(client)[0] + index_params.add_index(field_name=self.float_vector_field_name, + metric_type="COSINE", + index_type=index_type, + params=default_build_params) + self.create_index(client, self.collection_name, index_params=index_params) + self.wait_for_index_ready(client, self.collection_name, index_name=self.float_vector_field_name) + self.load_collection(client, self.collection_name) + def teardown(): + self.drop_collection(self._client(), self.collection_name) + request.addfinalizer(teardown) + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("params", DISKANN.search_params) + def test_diskann_search_params(self, params): + client = self._client() + collection_name = self.collection_name + nq = 2 + search_vectors = cf.gen_vectors(nq, dim=self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + search_params = params.get("params", None) + if params.get("expected", None) != success: + self.search(client, collection_name, search_vectors, + search_params=search_params, + limit=ct.default_limit, + check_task=CheckTasks.err_res, + check_items=params.get("expected")) + else: + self.search(client, collection_name, search_vectors, + search_params=search_params, + limit=ct.default_limit, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": nq, + "limit": ct.default_limit, + "pk_name": pk_field_name}) \ No newline at end of file diff --git a/tests/python_client/testcases/indexes/test_hnsw.py b/tests/python_client/testcases/indexes/test_hnsw.py new file mode 100644 index 0000000000..9012cb1c85 --- /dev/null +++ b/tests/python_client/testcases/indexes/test_hnsw.py @@ -0,0 +1,273 @@ +import logging +from utils.util_pymilvus import * +from common.common_type import CaseLabel, CheckTasks +from common import common_type as ct +from common import common_func as cf +from base.client_v2_base import TestMilvusClientV2Base +import pytest +from idx_hnsw import HNSW + +index_type = "HNSW" +success = "success" +pk_field_name = 'id' +vector_field_name = 'vector' +dim = ct.default_dim +default_nb = 2000 +default_build_params = 
{"M": 16, "efConstruction": 200} +default_search_params = {"ef": 64} + + +class TestHnswBuildParams(TestMilvusClientV2Base): + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("params", HNSW.build_params) + def test_hnsw_build_params(self, params): + """ + Test the build params of HNSW index + """ + client = self._client() + collection_name = cf.gen_collection_name_by_testcase_name() + schema, _ = self.create_schema(client) + schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False) + schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim) + self.create_collection(client, collection_name, schema=schema) + + # Insert data in 2 batches with unique primary keys + insert_times = 2 + random_vectors = list(cf.gen_vectors(default_nb * insert_times, dim, vector_data_type=DataType.FLOAT_VECTOR)) + for j in range(insert_times): + start_pk = j * default_nb + rows = [{ + pk_field_name: i + start_pk, + vector_field_name: random_vectors[i + start_pk] + } for i in range(default_nb)] + self.insert(client, collection_name, rows) + self.flush(client, collection_name) + + # create index + build_params = params.get("params", None) + index_params = self.prepare_index_params(client)[0] + index_params.add_index(field_name=vector_field_name, + metric_type=cf.get_default_metric_for_vector_type(vector_type=DataType.FLOAT_VECTOR), + index_type=index_type, + params=build_params) + # build index + if params.get("expected", None) != success: + self.create_index(client, collection_name, index_params, + check_task=CheckTasks.err_res, + check_items=params.get("expected")) + else: + self.create_index(client, collection_name, index_params) + self.wait_for_index_ready(client, collection_name, index_name=vector_field_name) + + # load collection + self.load_collection(client, collection_name) + + # search + nq = 2 + search_vectors = cf.gen_vectors(nq, dim=dim, vector_data_type=DataType.FLOAT_VECTOR) + self.search(client, collection_name, search_vectors, + search_params=default_search_params, + limit=ct.default_limit, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": nq, + "limit": ct.default_limit, + "pk_name": pk_field_name}) + + # verify the index params are persisted + idx_info = client.describe_index(collection_name, vector_field_name) + if build_params is not None: + for key, value in build_params.items(): + if value is not None: + assert key in idx_info.keys() + assert str(value) in idx_info.values() + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("vector_data_type", ct.all_vector_types) + def test_hnsw_on_all_vector_types(self, vector_data_type): + """ + Test HNSW index on all the vector types and metrics + """ + client = self._client() + collection_name = cf.gen_collection_name_by_testcase_name() + schema, _ = self.create_schema(client) + schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False) + if vector_data_type == DataType.SPARSE_FLOAT_VECTOR: + schema.add_field(vector_field_name, datatype=vector_data_type) + else: + schema.add_field(vector_field_name, datatype=vector_data_type, dim=dim) + self.create_collection(client, collection_name, schema=schema) + + # Insert data in 2 batches with unique primary keys + insert_times = 2 + random_vectors = list(cf.gen_vectors(default_nb*insert_times, dim, vector_data_type=vector_data_type)) \ + if vector_data_type == DataType.FLOAT_VECTOR \ + else cf.gen_vectors(default_nb*insert_times, dim, 
vector_data_type=vector_data_type) + for j in range(insert_times): + start_pk = j * default_nb + rows = [{ + pk_field_name: i + start_pk, + vector_field_name: random_vectors[i + start_pk] + } for i in range(default_nb)] + self.insert(client, collection_name, rows) + self.flush(client, collection_name) + + # create index + index_params = self.prepare_index_params(client)[0] + metric_type = cf.get_default_metric_for_vector_type(vector_data_type) + index_params.add_index(field_name=vector_field_name, + metric_type=metric_type, + index_type=index_type, + M=16, + efConstruction=200) + if vector_data_type not in HNSW.supported_vector_types: + self.create_index(client, collection_name, index_params, + check_task=CheckTasks.err_res, + check_items={"err_code": 999, + "err_msg": f"can't build with this index HNSW: invalid parameter"}) + else: + self.create_index(client, collection_name, index_params) + self.wait_for_index_ready(client, collection_name, index_name=vector_field_name) + # load collection + self.load_collection(client, collection_name) + # search + nq = 2 + search_vectors = cf.gen_vectors(nq, dim=dim, vector_data_type=vector_data_type) + self.search(client, collection_name, search_vectors, + search_params=default_search_params, + limit=ct.default_limit, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": nq, + "limit": ct.default_limit, + "pk_name": pk_field_name}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("metric", HNSW.supported_metrics) + def test_hnsw_on_all_metrics(self, metric): + """ + Test HNSW index build and search on all supported metrics + """ + client = self._client() + collection_name = cf.gen_collection_name_by_testcase_name() + schema, _ = self.create_schema(client) + schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False) + schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim) + self.create_collection(client, collection_name, schema=schema) + + # insert data + insert_times = 2 + random_vectors = list(cf.gen_vectors(default_nb*insert_times, dim, vector_data_type=DataType.FLOAT_VECTOR)) + for j in range(insert_times): + start_pk = j * default_nb + rows = [{ + pk_field_name: i + start_pk, + vector_field_name: random_vectors[i + start_pk] + } for i in range(default_nb)] + self.insert(client, collection_name, rows) + self.flush(client, collection_name) + + # create index + index_params = self.prepare_index_params(client)[0] + index_params.add_index(field_name=vector_field_name, + metric_type=metric, + index_type=index_type, + M=16, + efConstruction=200) + self.create_index(client, collection_name, index_params) + self.wait_for_index_ready(client, collection_name, index_name=vector_field_name) + # load collection + self.load_collection(client, collection_name) + # search + nq = 2 + search_vectors = cf.gen_vectors(nq, dim=dim, vector_data_type=DataType.FLOAT_VECTOR) + self.search(client, collection_name, search_vectors, + search_params=default_search_params, + limit=ct.default_limit, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": nq, + "limit": ct.default_limit, + "pk_name": pk_field_name})
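Aside on the `ef` knob that HNSW.search_params exercises: `ef` caps the candidate queue HNSW keeps while walking the graph, and the server rejects any value smaller than the requested topk, which is why idx_hnsw.py expects `ef=1` to fail against the default limit of 10. Below is a minimal sketch of how such a parameter reaches the server through the v2 client; the endpoint, collection name and field name are assumptions for illustration only, not part of this change:

    from pymilvus import MilvusClient
    import random

    client = MilvusClient("http://localhost:19530")        # assumed local deployment
    queries = [[random.random() for _ in range(128)] for _ in range(2)]
    # ef must be >= limit; ef=64 with limit=10 mirrors the "Optimal Performance" case
    res = client.search("hnsw_demo", data=queries, limit=10,
                        anns_field="vector",
                        search_params={"params": {"ef": 64}})
    print(len(res), len(res[0]))                           # one result list per query vector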
+ + +@pytest.mark.xdist_group("TestHnswSearchParams") +class TestHnswSearchParams(TestMilvusClientV2Base): + """Test the search params of HNSW index""" + + def setup_class(self): + super().setup_class(self) + self.collection_name = "TestHnswSearchParams" + cf.gen_unique_str("_") + self.float_vector_field_name = vector_field_name + self.float_vector_dim = dim + self.primary_keys = [] + self.enable_dynamic_field = False + self.datas = [] + + @pytest.fixture(scope="class", autouse=True) + def prepare_collection(self, request): + """ + Initialize collection before test class runs + """ + client = self._client() + collection_schema = self.create_schema(client)[0] + collection_schema.add_field(pk_field_name, DataType.INT64, is_primary=True, auto_id=False) + collection_schema.add_field(self.float_vector_field_name, DataType.FLOAT_VECTOR, dim=self.float_vector_dim) + self.create_collection(client, self.collection_name, schema=collection_schema, + enable_dynamic_field=self.enable_dynamic_field, force_teardown=False) + insert_times = 2 + float_vectors = cf.gen_vectors(default_nb * insert_times, dim=self.float_vector_dim, + vector_data_type=DataType.FLOAT_VECTOR) + for j in range(insert_times): + rows = [] + for i in range(default_nb): + pk = i + j * default_nb + row = { + pk_field_name: pk, + self.float_vector_field_name: list(float_vectors[pk]) + } + self.datas.append(row) + rows.append(row) + self.insert(client, self.collection_name, data=rows) + self.primary_keys.extend([i + j * default_nb for i in range(default_nb)]) + self.flush(client, self.collection_name) + # Create HNSW index + index_params = self.prepare_index_params(client)[0] + index_params.add_index(field_name=self.float_vector_field_name, + metric_type="COSINE", + index_type=index_type, + params=default_build_params) + self.create_index(client, self.collection_name, index_params=index_params) + self.wait_for_index_ready(client, self.collection_name, index_name=self.float_vector_field_name) + self.load_collection(client, self.collection_name) + + def teardown(): + self.drop_collection(self._client(), self.collection_name) + request.addfinalizer(teardown) + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("params", HNSW.search_params) + def test_hnsw_search_params(self, params): + """ + Test the search params of HNSW index + """ + client = self._client() + collection_name = self.collection_name + nq = 2 + search_vectors = cf.gen_vectors(nq, dim=self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR) + search_params = params.get("params", None) + if params.get("expected", None) != success: + self.search(client, collection_name, search_vectors, + search_params=search_params, + limit=ct.default_limit, + check_task=CheckTasks.err_res, + check_items=params.get("expected")) + else: + self.search(client, collection_name, search_vectors, + search_params=search_params, + limit=ct.default_limit, + check_task=CheckTasks.check_search_results, + check_items={"enable_milvus_client_api": True, + "nq": nq, + "limit": ct.default_limit, + "pk_name": pk_field_name}) \ No newline at end of file diff --git a/tests/python_client/testcases/test_insert.py b/tests/python_client/testcases/test_insert.py index beb2a79aa6..c99497c8c0 100644 --- a/tests/python_client/testcases/test_insert.py +++ b/tests/python_client/testcases/test_insert.py @@ -527,7 +527,7 @@ class TestInsertOperation(TestcaseBase): expected: error raised """ collection_w = self.init_collection_wrap(name=cf.gen_unique_str(prefix)) - nb = 1 + nb = 10 data = [] fields = collection_w.schema.fields for field in fields: @@ -747,12 +747,9 @@ class TestInsertOperation(TestcaseBase): c_name = cf.gen_unique_str(prefix) schema = cf.gen_default_collection_schema(primary_field=pk_field, auto_id=True) collection_w = self.init_collection_wrap(name=c_name, schema=schema) - data = [] nb = 100 - for field in
collection_w.schema.fields: - field_data = cf.gen_data_by_collection_field(field, nb=nb) - if field.name != pk_field: - data.append(field_data) + data = cf.gen_column_data_by_schema(nb=nb, schema=collection_w.schema) + collection_w.insert(data=data) assert collection_w.num_entities == nb @@ -1246,7 +1243,7 @@ class TestInsertInvalid(TestcaseBase): primary_field=primary_field, is_index=False, is_all_data_type=True, with_json=True)[0] nb = 100 - data = cf.gen_data_by_collection_schema(collection_w.schema, nb=nb) + data = cf.gen_column_data_by_schema(schema=collection_w.schema, nb=nb) for dirty_i in [0, nb // 2, nb - 1]: # check the dirty data at first, middle and last log.debug(f"dirty_i: {dirty_i}") for i in range(len(data)): @@ -2194,7 +2191,7 @@ class TestUpsertInvalid(TestcaseBase): primary_field=primary_field, is_index=False, is_all_data_type=True, with_json=True)[0] nb = 100 - data = cf.gen_data_by_collection_schema(collection_w.schema, nb=nb) + data = cf.gen_column_data_by_schema(schema=collection_w.schema, nb=nb) for dirty_i in [0, nb // 2, nb - 1]: # check the dirty data at first, middle and last log.debug(f"dirty_i: {dirty_i}") for i in range(len(data)): diff --git a/tests/python_client/testcases/test_mix_scenes.py b/tests/python_client/testcases/test_mix_scenes.py index a7e3b76f35..531a06e8a8 100644 --- a/tests/python_client/testcases/test_mix_scenes.py +++ b/tests/python_client/testcases/test_mix_scenes.py @@ -128,7 +128,7 @@ class TestNoIndexDQLExpr(TestCaseClassBase): """ # the total number of inserted data that matches the expression expr_count = len([i for i in self.insert_data.get(expr_field, []) if - eval('math.fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) + eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) # query res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) @@ -359,7 +359,7 @@ class TestHybridIndexDQLExpr(TestCaseClassBase): """ # the total number of inserted data that matches the expression expr_count = len([i for i in self.insert_data.get(expr_field, []) if - eval('math.fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) + eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) # query res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) @@ -696,7 +696,7 @@ class TestInvertedIndexDQLExpr(TestCaseClassBase): """ # the total number of inserted data that matches the expression expr_count = len([i for i in self.insert_data.get(expr_field, []) if - eval('math.fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) + eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) # query res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) @@ -1022,7 +1022,7 @@ class TestBitmapIndexDQLExpr(TestCaseClassBase): """ # the total number of inserted data that matches the expression expr_count = len([i for i in self.insert_data.get(expr_field, []) if - eval('math.fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) + eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) # query res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) @@ -1438,7 +1438,7 @@ class TestBitmapIndexOffsetCache(TestCaseClassBase): """ # the total number of inserted data that matches the expression expr_count = len([i for i in self.insert_data.get(expr_field, []) if - eval('math.fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) + 
eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) # query res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=['*']) @@ -1796,7 +1796,7 @@ class TestBitmapIndexMmap(TestCaseClassBase): """ # the total number of inserted data that matches the expression expr_count = len([i for i in self.insert_data.get(expr_field, []) if - eval('math.fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) + eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))]) # query res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) @@ -2519,7 +2519,6 @@ class TestGroupSearch(TestCaseClassBase): output_fields=[DataType.VARCHAR.name], check_task=CheckTasks.check_search_results, check_items={"nq": ct.default_nq, "limit": ct.default_limit})[0] - print(res) for i in range(ct.default_nq): group_values = [] for l in range(ct.default_limit): @@ -2542,6 +2541,31 @@ class TestGroupSearch(TestCaseClassBase): check_task=CheckTasks.check_search_results, check_items={"nq": ct.default_nq, "limit": ct.default_limit}) + @pytest.mark.tags(CaseLabel.L2) + def test_hybrid_search_group_by_empty_results(self): + """ + verify hybrid search group by works when the group-by results are empty + """ + # 1. prepare search params + req_list = [] + for i in range(len(self.vector_fields)): + search_param = { + "data": cf.gen_vectors(ct.default_nq, dim=self.dims[i], + vector_data_type=cf.get_field_dtype_by_field_name(self.collection_wrap, + self.vector_fields[i])), + "anns_field": self.vector_fields[i], + "param": {}, + "limit": ct.default_limit, + "expr": f"{self.primary_field} < 0"} # make sure empty results are returned + req = AnnSearchRequest(**search_param) + req_list.append(req) + # 2. hybrid search group by with empty results + self.collection_wrap.hybrid_search(req_list, WeightedRanker(0.1, 0.9, 0.2, 0.3), ct.default_limit, + group_by_field=DataType.VARCHAR.name, + output_fields=[DataType.VARCHAR.name], + check_task=CheckTasks.check_search_results, + check_items={"nq": ct.default_nq, "limit": 0}) + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("support_field", [DataType.INT8.name, DataType.INT64.name, DataType.BOOL.name, DataType.VARCHAR.name]) diff --git a/tests/python_client/testcases/test_utility.py b/tests/python_client/testcases/test_utility.py index a1f1a5184b..8c774a5702 100644 --- a/tests/python_client/testcases/test_utility.py +++ b/tests/python_client/testcases/test_utility.py @@ -1387,8 +1387,8 @@ class TestUtilityAdvanced(TestcaseBase): ) for _ in range(segment_num): - # insert random pks, ***start=None will generate random data*** - data = cf.gen_values(self.collection_wrap.schema, nb=nb, start_id=None) + # insert random pks + data = cf.gen_values(self.collection_wrap.schema, nb=nb, random_pk=True) self.collection_wrap.insert(data) self.collection_wrap.flush() @@ -1443,15 +1443,14 @@ class TestUtilityAdvanced(TestcaseBase): self.build_multi_index(index_params=DefaultVectorIndexParams.IVF_SQ8(ct.default_float_vec_field_name)) self.collection_wrap.load() - # insert random pks, ***start=None will generate random data*** - data = cf.gen_values(self.collection_wrap.schema, nb=nb, random_pk=True) + # insert random pks + data = cf.gen_values(self.collection_wrap.schema, nb=nb, random_pk=True) self.collection_wrap.insert(data) # get_query_segment_info and verify results res_sealed, _ = self.utility_wrap.get_query_segment_info(collection_name) assert len(res_sealed) == 0 -
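Note on the sealed-segment assertion above: rows that have been inserted but not yet flushed live only in growing segments, so utility.get_query_segment_info is expected to return an empty list at that point. A minimal standalone sketch of the call follows; the connection details and collection name are assumptions for illustration, only the utility API itself comes from the test:

    from pymilvus import connections, utility

    connections.connect(host="localhost", port="19530")    # assumed local deployment
    # one entry per sealed segment currently served by the query nodes;
    # an unflushed insert is not listed here, hence the assertion of length 0
    segments = utility.get_query_segment_info("demo_collection")
    print(len(segments))
    for seg in segments:
        print(seg)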
@pytest.mark.tags(CaseLabel.L1) def test_get_sealed_query_segment_info_after_create_index(self): """