import random

import pytest
import numpy as np
from base.client_v2_base import TestMilvusClientV2Base
from utils.util_log import test_log as log
from common import common_func as cf
from common import common_type as ct
from common.common_type import CaseLabel, CheckTasks
from utils.util_pymilvus import *

prefix = "client_insert"
epsilon = ct.epsilon
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
default_search_exp = "id >= 0"
exp_res = "exp_res"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invaild_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
perfix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_primary_key_field_name = "id"
default_vector_field_name = "vector"
default_dynamic_field_name = "field_new"
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_int32_array_field_name = ct.default_int32_array_field_name
default_string_array_field_name = ct.default_string_array_field_name
default_int32_field_name = ct.default_int32_field_name
default_int32_value = ct.default_int32_value


class TestMilvusClientUpsertInvalid(TestMilvusClientV2Base):
    """ Test cases of the upsert interface with invalid parameters """

    @pytest.fixture(scope="function", params=[False, True])
    def auto_id(self, request):
        yield request.param

    @pytest.fixture(scope="function", params=["COSINE", "L2"])
    def metric_type(self, request):
        yield request.param

    """
    ******************************************************************
    #  The following are invalid base cases
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_upsert_column_data(self):
        """
        target: test upsert with column-based data
        method: create connection and collection, then upsert column-based data
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim)
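        # Note: MilvusClient APIs take row-based data (a dict, or a list of dicts,
        # one per entity); the column-based layout built below (one list per field)
        # is the ORM-style format, so upsert is expected to reject it.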
        # 2. upsert
        vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nb)]
        data = [[i for i in range(default_nb)], vectors]
        error = {ct.err_code: 999,
                 ct.err_msg: "The Input data type is inconsistent with defined schema, please check it."}
        self.upsert(client, collection_name, data,
                    check_task=CheckTasks.err_res, check_items=error)
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_empty_collection_name(self):
        """
        target: test upsert with an empty collection name
        method: upsert rows while collection_name is an empty string
        expected: raise exception
        """
        client = self._client()
        collection_name = ""
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        error = {ct.err_code: 1, ct.err_msg: f"`collection_name` value {collection_name} is illegal"}
        self.upsert(client, collection_name, rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("collection_name", ["12-s", "12 s", "(mn)", "中文", "%$#"])
    def test_milvus_client_upsert_invalid_collection_name(self, collection_name):
        """
        target: test upsert with an invalid collection name
        method: upsert rows while collection_name contains illegal characters
        expected: raise exception
        """
        client = self._client()
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
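        # Per the server-side naming rule quoted in the expected error, collection
        # names must start with a letter or underscore, so every parametrized name
        # above should be rejected.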
        error = {ct.err_code: 1100, ct.err_msg: f"Invalid collection name: {collection_name}. the first character of a "
                                                f"collection name must be an underscore or letter: invalid parameter"}
        self.upsert(client, collection_name, rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_collection_name_over_max_length(self):
        """
        target: test upsert with a collection name over the max length
        method: upsert rows while collection_name is longer than 255 characters
        expected: raise exception
        """
        client = self._client()
        collection_name = "a".join("a" for i in range(256))
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        error = {ct.err_code: 1100,
                 ct.err_msg: "the length of a collection name must be less than 255 characters"}
        self.upsert(client, collection_name, rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_not_exist_collection_name(self):
        """
        target: test upsert into a non-existent collection
        method: upsert rows while collection_name does not exist
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_unique_str("insert_not_exist")
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        error = {ct.err_code: 100,
                 ct.err_msg: f"can't find collection[database=default][collection={collection_name}]"}
        self.upsert(client, collection_name, rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("data", ["12-s", "12 s", "(mn)", "中文", "%$#", " "])
    def test_milvus_client_upsert_data_invalid_type(self, data):
        """
        target: test upsert with data of an invalid type
        method: upsert a plain string instead of a dict or list of dicts
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert
        error = {ct.err_code: 1, ct.err_msg: "wrong type of argument 'data',expected 'Dict' or list of 'Dict'"}
        self.upsert(client, collection_name, data,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_data_empty(self):
        """
        target: test upsert with empty data
        method: upsert an empty string as data
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert
        error = {ct.err_code: 1, ct.err_msg: "wrong type of argument 'data',expected 'Dict' or list of 'Dict'"}
        self.upsert(client, collection_name, data="",
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_data_vector_field_missing(self):
        """
        target: test upsert with the vector field missing from the data
        method: upsert rows that omit the vector field
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
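        # The rows below omit the vector field entirely; since the field is neither
        # nullable nor given a default_value, the batch should be rejected.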
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i,
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(10)]
        error = {ct.err_code: 1,
                 ct.err_msg: "Insert missed an field `vector` to collection without set nullable==true or set default_value"}
        self.upsert(client, collection_name, data=rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_data_id_field_missing(self):
        """
        target: test upsert with the primary key field missing from the data
        method: upsert rows that omit the id field
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(20)]
        error = {ct.err_code: 1,
                 ct.err_msg: "Insert missed an field `id` to collection without set nullable==true or set default_value"}
        self.upsert(client, collection_name, data=rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_data_extra_field(self):
        """
        target: test upsert with an extra field beyond the schema
        method: upsert rows with an extra field while enable_dynamic_field is False
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        dim = 32
        self.create_collection(client, collection_name, dim, enable_dynamic_field=False)
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(10)]
        error = {ct.err_code: 1,
                 ct.err_msg: "Attempt to insert an unexpected field `float` to collection without enabling dynamic field"}
        self.upsert(client, collection_name, data=rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_data_dim_not_match(self):
        """
        target: test upsert with a vector dim that does not match the schema
        method: upsert rows whose vectors have default_dim + 1 dimensions
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim)
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim + 1))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        error = {ct.err_code: 65536, ct.err_msg: f"of float data should divide the dim({default_dim})"}
        self.upsert(client, collection_name, data=rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_not_matched_data(self):
        """
        target: test upsert with data that does not match the defined schema
        method: upsert string values into an int64 primary key field
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim)
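        # The default fast-create schema uses an int64 primary key named "id", so
        # the string ids below should fail schema validation.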
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: str(i), default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        error = {ct.err_code: 1,
                 ct.err_msg: "The Input data type is inconsistent with defined schema, {id} field should be a int64"}
        self.upsert(client, collection_name, data=rows,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("partition_name", ["12 s", "(mn)", "中文", "%$#", " "])
    def test_milvus_client_upsert_invalid_partition_name(self, partition_name):
        """
        target: test upsert with an invalid partition name
        method: upsert rows while partition_name contains illegal characters
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim)
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        error = {ct.err_code: 65535, ct.err_msg: f"Invalid partition name: {partition_name}"}
        if partition_name == " ":
            error = {ct.err_code: 1, ct.err_msg: "Invalid partition name: . Partition name should not be empty."}
        self.upsert(client, collection_name, data=rows, partition_name=partition_name,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_not_exist_partition_name(self):
        """
        target: test upsert into a non-existent partition
        method: upsert rows while partition_name does not exist
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim)
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        partition_name = cf.gen_unique_str("partition_not_exist")
        error = {ct.err_code: 200, ct.err_msg: f"partition not found[partition={partition_name}]"}
        self.upsert(client, collection_name, data=rows, partition_name=partition_name,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_upsert_collection_partition_not_match(self):
        """
        target: test upsert into a partition that belongs to another collection
        method: create a partition in another collection, then upsert to it through this collection
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        another_collection_name = cf.gen_unique_str(prefix + "another")
        partition_name = cf.gen_unique_str("partition")
        # 1. create collection
        self.create_collection(client, collection_name, default_dim)
        self.create_collection(client, another_collection_name, default_dim)
        self.create_partition(client, another_collection_name, partition_name)
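        # The partition exists only in another_collection_name, so routing the
        # upsert through collection_name should report "partition not found".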
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        error = {ct.err_code: 200, ct.err_msg: f"partition not found[partition={partition_name}]"}
        self.upsert(client, collection_name, data=rows, partition_name=partition_name,
                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("nullable", [True, False])
    def test_milvus_client_insert_array_element_null(self, nullable):
        """
        target: test insert with a null element inside an array field
        method: create collection with an int64-array field (nullable parametrized),
                then insert rows whose array contains a None element
        expected: raise exception
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        dim = 5
        # 1. create collection
        nullable_field_name = "nullable_field"
        schema = self.create_schema(client, enable_dynamic_field=False)[0]
        schema.add_field(default_primary_key_field_name, DataType.VARCHAR, max_length=64, is_primary=True,
                         auto_id=False)
        schema.add_field(default_vector_field_name, DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field(nullable_field_name, DataType.ARRAY, element_type=DataType.INT64, max_capacity=12,
                         max_length=64, nullable=nullable)
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(default_vector_field_name, metric_type="COSINE")
        self.create_collection(client, collection_name, dimension=dim, schema=schema, index_params=index_params)
        # 2. insert
        vectors = cf.gen_vectors(default_nb, dim)
        rows = [{default_primary_key_field_name: str(i), default_vector_field_name: vectors[i],
                 nullable_field_name: [None, 2, 3]} for i in range(default_nb)]
        error = {ct.err_code: 1,
                 ct.err_msg: "The Input data type is inconsistent with defined schema, {nullable_field} field "
                             "should be a array, but got a {} instead."}
        self.insert(client, collection_name, rows,
                    check_task=CheckTasks.err_res, check_items=error)


class TestMilvusClientUpsertValid(TestMilvusClientV2Base):
    """ Test cases of the upsert interface with valid parameters """

    @pytest.fixture(scope="function", params=[False, True])
    def auto_id(self, request):
        yield request.param

    @pytest.fixture(scope="function", params=["COSINE", "L2"])
    def metric_type(self, request):
        yield request.param

    """
    ******************************************************************
    #  The following are valid base cases
    ******************************************************************
    """

    @pytest.mark.tags(CaseLabel.L0)
    def test_milvus_client_upsert_default(self):
        """
        target: test upsert (high level api) normal case
        method: create connection and collection, then upsert, search and query
        expected: upsert/search/query successfully
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        collections = self.list_collections(client)[0]
        assert collection_name in collections
        self.describe_collection(client, collection_name,
                                 check_task=CheckTasks.check_describe_collection_property,
                                 check_items={"collection_name": collection_name,
                                              "dim": default_dim,
                                              "consistency_level": 0})
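        # Upsert into an empty collection behaves like a plain insert: every primary
        # key is new, so upsert_count should equal the number of rows sent.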
        # 2. upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        results = self.upsert(client, collection_name, rows)[0]
        assert results['upsert_count'] == default_nb
        # 3. search
        vectors_to_search = rng.random((1, default_dim))
        insert_ids = [i for i in range(default_nb)]
        self.search(client, collection_name, vectors_to_search,
                    check_task=CheckTasks.check_search_results,
                    check_items={"enable_milvus_client_api": True,
                                 "nq": len(vectors_to_search),
                                 "ids": insert_ids,
                                 "limit": default_limit,
                                 "pk_name": default_primary_key_field_name})
        # 4. query
        self.query(client, collection_name, filter=default_search_exp,
                   check_task=CheckTasks.check_query_results,
                   check_items={exp_res: rows,
                                "with_vec": True,
                                "pk_name": default_primary_key_field_name})
        self.release_collection(client, collection_name)
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_upsert_empty_data(self):
        """
        target: test upsert with an empty row list
        method: create connection and collection, upsert [], then search
        expected: upsert_count is 0 and search returns no results
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert
        rows = []
        results = self.upsert(client, collection_name, rows)[0]
        assert results['upsert_count'] == 0
        # 3. search
        rng = np.random.default_rng(seed=19530)
        vectors_to_search = rng.random((1, default_dim))
        self.search(client, collection_name, vectors_to_search,
                    check_task=CheckTasks.check_search_results,
                    check_items={"enable_milvus_client_api": True,
                                 "nq": len(vectors_to_search),
                                 "ids": [],
                                 "pk_name": default_primary_key_field_name,
                                 "limit": 0})
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_upsert_partition(self):
        """
        target: test upsert into the default and a user-created partition
        method: create collection and partition, upsert the same rows into each, then search
        expected: upsert and search successfully
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        partition_name = cf.gen_unique_str(prefix)
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. create partition
        self.create_partition(client, collection_name, partition_name)
        partitions = self.list_partitions(client, collection_name)[0]
        assert partition_name in partitions
        index = self.list_indexes(client, collection_name)[0]
        assert index == ['vector']
        # load_state = self.get_load_state(collection_name)[0]
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        # 3. upsert to default partition
        results = self.upsert(client, collection_name, rows, partition_name=partitions[0])[0]
        assert results['upsert_count'] == default_nb
        # 4. upsert to non-default partition
        results = self.upsert(client, collection_name, rows, partition_name=partition_name)[0]
        assert results['upsert_count'] == default_nb
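        # Note: upsert with partition_name is scoped to that partition, so upserting
        # the same keys into a second partition does not replace the copies in the
        # first; duplicate ids may exist across partitions, and the search below only
        # checks that the expected ids come back.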
        # 5. search
        vectors_to_search = rng.random((1, default_dim))
        insert_ids = [i for i in range(default_nb)]
        self.search(client, collection_name, vectors_to_search,
                    check_task=CheckTasks.check_search_results,
                    check_items={"enable_milvus_client_api": True,
                                 "nq": len(vectors_to_search),
                                 "ids": insert_ids,
                                 "limit": default_limit,
                                 "pk_name": default_primary_key_field_name})
        # partition_number = self.get_partition_stats(client, collection_name, "_default")[0]
        # assert partition_number == default_nb
        # partition_number = self.get_partition_stats(client, collection_name, partition_name)[0]
        # assert partition_number[0]['value'] == 0
        if self.has_partition(client, collection_name, partition_name)[0]:
            self.release_partitions(client, collection_name, partition_name)
            self.drop_partition(client, collection_name, partition_name)
        if self.has_collection(client, collection_name)[0]:
            self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_insert_upsert(self):
        """
        target: test insert followed by upsert into the same partition
        method: create collection and partition, insert rows, then upsert rows carrying a new dynamic field
        expected: insert/upsert/search successfully
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        partition_name = cf.gen_unique_str(prefix)
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. create partition
        self.create_partition(client, collection_name, partition_name)
        partitions = self.list_partitions(client, collection_name)[0]
        assert partition_name in partitions
        index = self.list_indexes(client, collection_name)[0]
        assert index == ['vector']
        # load_state = self.get_load_state(collection_name)[0]
        # 3. insert and upsert
        rng = np.random.default_rng(seed=19530)
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
        results = self.insert(client, collection_name, rows, partition_name=partition_name)[0]
        assert results['insert_count'] == default_nb
        rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
                 default_float_field_name: i * 1.0, "new_diff_str_field": str(i)} for i in range(default_nb)]
        results = self.upsert(client, collection_name, rows, partition_name=partition_name)[0]
        assert results['upsert_count'] == default_nb
        # 4. search
        vectors_to_search = rng.random((1, default_dim))
        insert_ids = [i for i in range(default_nb)]
        self.search(client, collection_name, vectors_to_search,
                    check_task=CheckTasks.check_search_results,
                    check_items={"enable_milvus_client_api": True,
                                 "nq": len(vectors_to_search),
                                 "ids": insert_ids,
                                 "limit": default_limit,
                                 "pk_name": default_primary_key_field_name})
        if self.has_partition(client, collection_name, partition_name)[0]:
            self.release_partitions(client, collection_name, partition_name)
            self.drop_partition(client, collection_name, partition_name)
        if self.has_collection(client, collection_name)[0]:
            self.drop_collection(client, collection_name)


class TestMilvusClientUpsertDedup(TestMilvusClientV2Base):
    """Test cases for upsert deduplication functionality"""

    @pytest.fixture(scope="function", params=["COSINE", "L2"])
    def metric_type(self, request):
        yield request.param
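
    # When a single upsert batch contains duplicate primary keys, only the last
    # occurrence of each key is expected to survive, so upsert_count should report
    # the number of unique keys rather than the raw batch size. The expected
    # reduction is equivalent to {row[pk]: row for row in batch}, since later dict
    # assignments overwrite earlier ones.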

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_dedup_int64_pk(self):
        """
        target: test upsert with duplicate int64 primary keys in same batch
        method: 1. create collection with int64 primary key
                2. upsert data with duplicate primary keys [1, 2, 3, 2, 1]
                3. query to verify only last occurrence is kept
        expected: only 3 unique records exist, with data from last occurrence
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert data with duplicate PKs: [1, 2, 3, 2, 1]
        # Expected: keep last occurrence -> [3, 2, 1] at indices [2, 3, 4]
        rng = np.random.default_rng(seed=19530)
        rows = [
            {default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: 1.0, default_string_field_name: "str_1_first"},
            {default_primary_key_field_name: 2, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: 2.0, default_string_field_name: "str_2_first"},
            {default_primary_key_field_name: 3, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: 3.0, default_string_field_name: "str_3"},
            {default_primary_key_field_name: 2, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: 2.1, default_string_field_name: "str_2_last"},
            {default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: 1.1, default_string_field_name: "str_1_last"},
        ]
        results = self.upsert(client, collection_name, rows)[0]
        # After deduplication, should only have 3 records
        assert results['upsert_count'] == 3
        # 3. query to verify deduplication - should have only 3 unique records
        query_results = self.query(client, collection_name, filter="id >= 0")[0]
        assert len(query_results) == 3
        # Verify that last occurrence data is kept
        id_to_data = {item['id']: item for item in query_results}
        assert 1 in id_to_data
        assert 2 in id_to_data
        assert 3 in id_to_data
        # Check that data from last occurrence is preserved
        assert id_to_data[1]['float'] == 1.1
        assert id_to_data[1]['varchar'] == "str_1_last"
        assert id_to_data[2]['float'] == 2.1
        assert id_to_data[2]['varchar'] == "str_2_last"
        assert id_to_data[3]['float'] == 3.0
        assert id_to_data[3]['varchar'] == "str_3"
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_dedup_varchar_pk(self):
        """
        target: test upsert with duplicate varchar primary keys in same batch
        method: 1. create collection with varchar primary key
                2. upsert data with duplicate primary keys ["a", "b", "c", "b", "a"]
                3. query to verify only last occurrence is kept
        expected: only 3 unique records exist, with data from last occurrence
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection with varchar primary key
        schema = self.create_schema(client, enable_dynamic_field=True)[0]
        schema.add_field("id", DataType.VARCHAR, max_length=64, is_primary=True, auto_id=False)
        schema.add_field(default_vector_field_name, DataType.FLOAT_VECTOR, dim=default_dim)
        schema.add_field("age", DataType.INT64)
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(default_vector_field_name, metric_type="COSINE")
        self.create_collection(client, collection_name, default_dim, schema=schema,
                               index_params=index_params, consistency_level="Strong")
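        # Same last-occurrence rule as the int64 case: the duplicate keys "a" and
        # "b" below should collapse to their final rows (age 11 and 21).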
        # 2. upsert data with duplicate PKs: ["a", "b", "c", "b", "a"]
        # Expected: keep last occurrence -> ["c", "b", "a"] at indices [2, 3, 4]
        rng = np.random.default_rng(seed=19530)
        rows = [
            {"id": "a", default_vector_field_name: list(rng.random((1, default_dim))[0]), "age": 10},
            {"id": "b", default_vector_field_name: list(rng.random((1, default_dim))[0]), "age": 20},
            {"id": "c", default_vector_field_name: list(rng.random((1, default_dim))[0]), "age": 30},
            {"id": "b", default_vector_field_name: list(rng.random((1, default_dim))[0]), "age": 21},
            {"id": "a", default_vector_field_name: list(rng.random((1, default_dim))[0]), "age": 11},
        ]
        results = self.upsert(client, collection_name, rows)[0]
        # After deduplication, should only have 3 records
        assert results['upsert_count'] == 3
        # 3. query to verify deduplication
        query_results = self.query(client, collection_name, filter='id in ["a", "b", "c"]')[0]
        assert len(query_results) == 3
        # Verify that last occurrence data is kept
        id_to_data = {item['id']: item for item in query_results}
        assert "a" in id_to_data
        assert "b" in id_to_data
        assert "c" in id_to_data
        # Check that data from last occurrence is preserved
        assert id_to_data["a"]["age"] == 11
        assert id_to_data["b"]["age"] == 21
        assert id_to_data["c"]["age"] == 30
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_dedup_all_duplicates(self):
        """
        target: test upsert when all records have same primary key
        method: 1. create collection
                2. upsert 5 records with same primary key
                3. query to verify only 1 record exists
        expected: only 1 record exists with data from last occurrence
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert data where all have same PK (id=1)
        rng = np.random.default_rng(seed=19530)
        rows = [
            {default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: i * 1.0, default_string_field_name: f"version_{i}"}
            for i in range(5)
        ]
        results = self.upsert(client, collection_name, rows)[0]
        # After deduplication, should only have 1 record
        assert results['upsert_count'] == 1
        # 3. query to verify only 1 record exists
        query_results = self.query(client, collection_name, filter="id == 1")[0]
        assert len(query_results) == 1
        # Verify it's the last occurrence (i=4)
        assert query_results[0]['float'] == 4.0
        assert query_results[0]['varchar'] == "version_4"
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_dedup_no_duplicates(self):
        """
        target: test upsert with no duplicate primary keys
        method: 1. create collection
                2. upsert data with unique primary keys
                3. query to verify all records exist
        expected: all records exist as-is
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert data with unique PKs
        rng = np.random.default_rng(seed=19530)
        nb = 10
        rows = [
            {default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: i * 1.0, default_string_field_name: str(i)}
            for i in range(nb)
        ]
        results = self.upsert(client, collection_name, rows)[0]
        # No deduplication should occur
        assert results['upsert_count'] == nb
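        # With all-unique keys the dedup pass is a no-op: the reported count and the
        # queried row count should both equal nb.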
        # 3. query to verify all records exist
        query_results = self.query(client, collection_name, filter="id >= 0")[0]
        assert len(query_results) == nb
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L2)
    def test_milvus_client_upsert_dedup_large_batch(self):
        """
        target: test upsert deduplication with large batch
        method: 1. create collection
                2. upsert large batch with 50% duplicate primary keys
                3. query to verify correct number of records
        expected: only unique records exist
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert large batch where each ID appears twice
        rng = np.random.default_rng(seed=19530)
        nb = 500
        unique_ids = nb // 2  # 250 unique IDs
        rows = []
        for i in range(nb):
            pk = i % unique_ids  # This creates duplicates: 0,1,2...249,0,1,2...249
            rows.append({
                default_primary_key_field_name: pk,
                default_vector_field_name: list(rng.random((1, default_dim))[0]),
                default_float_field_name: float(i),  # Different value for each row
                default_string_field_name: f"batch_{i}"
            })
        results = self.upsert(client, collection_name, rows)[0]
        # After deduplication, should only have unique_ids records
        assert results['upsert_count'] == unique_ids
        # 3. query to verify correct number of records
        query_results = self.query(client, collection_name, filter="id >= 0", limit=1000)[0]
        assert len(query_results) == unique_ids
        # Verify that last occurrence is kept (should have higher float values)
        for item in query_results:
            pk = item['id']
            # Last occurrence of pk is at index (pk + unique_ids)
            expected_float = float(pk + unique_ids)
            assert item['float'] == expected_float
            assert item['varchar'] == f"batch_{pk + unique_ids}"
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_dedup_with_partition(self):
        """
        target: test upsert deduplication works correctly with partitions
        method: 1. create collection with partition
                2. upsert data with duplicates to specific partition
                3. query to verify deduplication in partition
        expected: deduplication works within partition
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        partition_name = cf.gen_unique_str("partition")
        # 1. create collection and partition
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        self.create_partition(client, collection_name, partition_name)
        # 2. upsert data with duplicates to partition
        rng = np.random.default_rng(seed=19530)
        rows = [
            {default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: 1.0, default_string_field_name: "first"},
            {default_primary_key_field_name: 2, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: 2.0, default_string_field_name: "unique"},
            {default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
             default_float_field_name: 1.1, default_string_field_name: "last"},
        ]
        results = self.upsert(client, collection_name, rows, partition_name=partition_name)[0]
        assert results['upsert_count'] == 2
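        # Restrict the query to the target partition to confirm both the dedup and
        # the partition routing: only the two unique rows should be present there.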
        # 3. query partition to verify deduplication
        query_results = self.query(client, collection_name, filter="id >= 0",
                                   partition_names=[partition_name])[0]
        assert len(query_results) == 2
        # Verify correct data
        id_to_data = {item['id']: item for item in query_results}
        assert id_to_data[1]['float'] == 1.1
        assert id_to_data[1]['varchar'] == "last"
        assert id_to_data[2]['float'] == 2.0
        assert id_to_data[2]['varchar'] == "unique"
        self.drop_collection(client, collection_name)

    @pytest.mark.tags(CaseLabel.L1)
    def test_milvus_client_upsert_dedup_with_vectors(self):
        """
        target: test upsert deduplication preserves correct vector data
        method: 1. create collection
                2. upsert data with duplicate PKs but different vectors
                3. query to verify correct vector is preserved
        expected: vector from last occurrence is preserved
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        # 1. create collection
        self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
        # 2. upsert data with duplicate PK=1 but different vectors
        # Create distinctly different vectors for easy verification
        first_vector = [1.0] * default_dim  # All 1.0
        last_vector = [2.0] * default_dim  # All 2.0
        rows = [
            {default_primary_key_field_name: 1, default_vector_field_name: first_vector,
             default_float_field_name: 1.0, default_string_field_name: "first"},
            {default_primary_key_field_name: 2, default_vector_field_name: [0.5] * default_dim,
             default_float_field_name: 2.0, default_string_field_name: "unique"},
            {default_primary_key_field_name: 1, default_vector_field_name: last_vector,
             default_float_field_name: 1.1, default_string_field_name: "last"},
        ]
        results = self.upsert(client, collection_name, rows)[0]
        assert results['upsert_count'] == 2
        # 3. query to get vector data
        query_results = self.query(client, collection_name, filter="id == 1",
                                   output_fields=["id", "vector", "float", "varchar"])[0]
        assert len(query_results) == 1
        # Verify it's the last occurrence with last_vector
        result = query_results[0]
        assert result['float'] == 1.1
        assert result['varchar'] == "last"
        # Vector should be last_vector (all 2.0)
        assert all(abs(v - 2.0) < 0.001 for v in result['vector'])
        self.drop_collection(client, collection_name)