# milvus/tests/python_client/milvus_client/test_milvus_client_upsert.py
"""
Integration tests for the MilvusClient upsert interface.

Covers invalid and valid upsert requests, plus end-to-end verification of
primary-key deduplication within a single upsert batch (#45249, issue
#44320): when a batch contains duplicate primary keys, the server keeps
only the last occurrence of each key. The server-side change adds a
DeduplicateFieldData function supporting both Int64 and VarChar primary
keys and applies it in upsertTask.PreExecute; TestMilvusClientUpsertDedup
below verifies that behavior through the public API.
"""

import pytest
import numpy as np
from base.client_v2_base import TestMilvusClientV2Base
from utils.util_log import test_log as log
from common import common_func as cf
from common import common_type as ct
from common.common_type import CaseLabel, CheckTasks
from utils.util_pymilvus import *
prefix = "client_insert"
epsilon = ct.epsilon
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
default_search_exp = "id >= 0"
exp_res = "exp_res"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invalid_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
prefix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_primary_key_field_name = "id"
default_vector_field_name = "vector"
default_dynamic_field_name = "field_new"
default_float_field_name = ct.default_float_field_name
default_bool_field_name = ct.default_bool_field_name
default_string_field_name = ct.default_string_field_name
default_int32_array_field_name = ct.default_int32_array_field_name
default_string_array_field_name = ct.default_string_array_field_name
default_int32_field_name = ct.default_int32_field_name
default_int32_value = ct.default_int32_value
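
# Reference model for the server-side upsert deduplication exercised by the
# dedup tests below: within a single upsert batch the LAST occurrence of each
# primary key wins. This is a test-side Python sketch of that contract, not
# the server's actual Go implementation (DeduplicateFieldData); the helper
# name is ours and nothing in this file depends on it. A dict comprehension
# reproduces the semantics because later assignments to the same key
# overwrite earlier ones.
def expected_rows_after_dedup(rows, pk_field=default_primary_key_field_name):
    """Return the rows that should survive keep-last deduplication."""
    return list({row[pk_field]: row for row in rows}.values())
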
class TestMilvusClientUpsertInvalid(TestMilvusClientV2Base):
    """Test cases for invalid upsert requests"""
@pytest.fixture(scope="function", params=[False, True])
def auto_id(self, request):
yield request.param
@pytest.fixture(scope="function", params=["COSINE", "L2"])
def metric_type(self, request):
yield request.param
"""
******************************************************************
# The following are invalid base cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_upsert_column_data(self):
"""
target: test insert column data
method: create connection, collection, insert and search
expected: raise error
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim)
# 2. insert
vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_nb)]
data = [[i for i in range(default_nb)], vectors]
error = {ct.err_code: 999,
ct.err_msg: "The Input data type is inconsistent with defined schema, please check it."}
self.upsert(client, collection_name, data,
check_task=CheckTasks.err_res, check_items=error)
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_empty_collection_name(self):
"""
target: test high level api: client.create_collection
method: create collection with invalid primary field
expected: Raise exception
"""
client = self._client()
collection_name = ""
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
error = {ct.err_code: 1, ct.err_msg: f"`collection_name` value {collection_name} is illegal"}
self.upsert(client, collection_name, rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("collection_name", ["12-s", "12 s", "(mn)", "中文", "%$#"])
def test_milvus_client_upsert_invalid_collection_name(self, collection_name):
"""
target: test high level api: client.create_collection
method: create collection with invalid primary field
expected: Raise exception
"""
client = self._client()
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
error = {ct.err_code: 1100, ct.err_msg: f"Invalid collection name: {collection_name}. the first character of a "
f"collection name must be an underscore or letter: invalid parameter"}
self.upsert(client, collection_name, rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_collection_name_over_max_length(self):
"""
target: test high level api: client.create_collection
method: create collection with invalid primary field
expected: Raise exception
"""
client = self._client()
collection_name = "a".join("a" for i in range(256))
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
error = {ct.err_code: 1100, ct.err_msg: f"the length of a collection name must be less than 255 characters"}
self.upsert(client, collection_name, rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_not_exist_collection_name(self):
"""
target: test high level api: client.create_collection
method: create collection with invalid primary field
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_unique_str("insert_not_exist")
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
error = {ct.err_code: 100, ct.err_msg: f"can't find collection[database=default][collection={collection_name}]"}
self.upsert(client, collection_name, rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("data", ["12-s", "12 s", "(mn)", "中文", "%$#", " "])
def test_milvus_client_upsert_data_invalid_type(self, data):
"""
target: test high level api: client.create_collection
method: create collection with invalid primary field
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. insert
error = {ct.err_code: 1, ct.err_msg: f"wrong type of argument 'data',expected 'Dict' or list of 'Dict'"}
self.upsert(client, collection_name, data,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_data_empty(self):
"""
target: test high level api: client.create_collection
method: create collection with invalid primary field
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. insert
error = {ct.err_code: 1, ct.err_msg: f"wrong type of argument 'data',expected 'Dict' or list of 'Dict'"}
self.upsert(client, collection_name, data="",
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_data_vector_field_missing(self):
"""
target: test high level api: client.create_collection
method: create collection with invalid primary field
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i,
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(10)]
error = {ct.err_code: 1,
ct.err_msg: "Insert missed an field `vector` to collection without set nullable==true or set default_value"}
self.upsert(client, collection_name, data=rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_data_id_field_missing(self):
"""
target: test high level api: client.create_collection
method: create collection with invalid primary field
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [{default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(20)]
error = {ct.err_code: 1,
ct.err_msg: f"Insert missed an field `id` to collection without set nullable==true or set default_value"}
self.upsert(client, collection_name, data=rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_data_extra_field(self):
"""
target: test milvus client: insert extra field than schema
method: insert extra field than schema when enable_dynamic_field is False
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
dim = 32
self.create_collection(client, collection_name, dim, enable_dynamic_field=False)
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(10)]
error = {ct.err_code: 1,
ct.err_msg: f"Attempt to insert an unexpected field `float` to collection without enabling dynamic field"}
self.upsert(client, collection_name, data=rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_data_dim_not_match(self):
"""
target: test milvus client: insert extra field than schema
method: insert extra field than schema when enable_dynamic_field is False
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim)
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [
{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim + 1))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
error = {ct.err_code: 65536, ct.err_msg: f"of float data should divide the dim({default_dim})"}
self.upsert(client, collection_name, data=rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_not_matched_data(self):
"""
target: test milvus client: insert not matched data then defined
method: insert string to int primary field
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim)
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [
{default_primary_key_field_name: str(i), default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
error = {ct.err_code: 1,
ct.err_msg: "The Input data type is inconsistent with defined schema, {id} field should be a int64"}
self.upsert(client, collection_name, data=rows,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("partition_name", ["12 s", "(mn)", "中文", "%$#", " "])
def test_milvus_client_upsert_invalid_partition_name(self, partition_name):
"""
target: test milvus client: insert extra field than schema
method: insert extra field than schema when enable_dynamic_field is False
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim)
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
error = {ct.err_code: 65535, ct.err_msg: f"Invalid partition name: {partition_name}"}
if partition_name == " ":
error = {ct.err_code: 1, ct.err_msg: f"Invalid partition name: . Partition name should not be empty."}
self.upsert(client, collection_name, data=rows, partition_name=partition_name,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_not_exist_partition_name(self):
"""
target: test milvus client: insert extra field than schema
method: insert extra field than schema when enable_dynamic_field is False
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim)
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
partition_name = cf.gen_unique_str("partition_not_exist")
error = {ct.err_code: 200, ct.err_msg: f"partition not found[partition={partition_name}]"}
self.upsert(client, collection_name, data=rows, partition_name=partition_name,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_upsert_collection_partition_not_match(self):
"""
target: test milvus client: insert extra field than schema
method: insert extra field than schema when enable_dynamic_field is False
expected: Raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
another_collection_name = cf.gen_unique_str(prefix + "another")
partition_name = cf.gen_unique_str("partition")
# 1. create collection
self.create_collection(client, collection_name, default_dim)
self.create_collection(client, another_collection_name, default_dim)
self.create_partition(client, another_collection_name, partition_name)
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
error = {ct.err_code: 200, ct.err_msg: f"partition not found[partition={partition_name}]"}
self.upsert(client, collection_name, data=rows, partition_name=partition_name,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("nullable", [True, False])
def test_milvus_client_insert_array_element_null(self, nullable):
"""
target: test search with null expression on each key of json
method: create connection, collection, insert and search
expected: raise exception
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
dim = 5
# 1. create collection
nullable_field_name = "nullable_field"
schema = self.create_schema(client, enable_dynamic_field=False)[0]
schema.add_field(default_primary_key_field_name, DataType.VARCHAR, max_length=64, is_primary=True,
auto_id=False)
schema.add_field(default_vector_field_name, DataType.FLOAT_VECTOR, dim=dim)
schema.add_field(nullable_field_name, DataType.ARRAY, element_type=DataType.INT64, max_capacity=12,
max_length=64, nullable=nullable)
index_params = self.prepare_index_params(client)[0]
index_params.add_index(default_vector_field_name, metric_type="COSINE")
self.create_collection(client, collection_name, dimension=dim, schema=schema, index_params=index_params)
# 2. insert
vectors = cf.gen_vectors(default_nb, dim)
rows = [{default_primary_key_field_name: str(i), default_vector_field_name: vectors[i],
nullable_field_name: [None, 2, 3]} for i in range(default_nb)]
error = {ct.err_code: 1,
ct.err_msg: "The Input data type is inconsistent with defined schema, {nullable_field} field "
"should be a array, but got a {<class 'list'>} instead."}
self.insert(client, collection_name, rows,
check_task=CheckTasks.err_res,
check_items=error)


class TestMilvusClientUpsertValid(TestMilvusClientV2Base):
    """Test cases for valid upsert requests"""
@pytest.fixture(scope="function", params=[False, True])
def auto_id(self, request):
yield request.param
@pytest.fixture(scope="function", params=["COSINE", "L2"])
def metric_type(self, request):
yield request.param
"""
******************************************************************
# The following are valid base cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
def test_milvus_client_upsert_default(self):
"""
target: test search (high level api) normal case
method: create connection, collection, insert and search
expected: search/query successfully
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
collections = self.list_collections(client)[0]
assert collection_name in collections
self.describe_collection(client, collection_name,
check_task=CheckTasks.check_describe_collection_property,
check_items={"collection_name": collection_name,
"dim": default_dim,
"consistency_level": 0})
# 2. insert
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
results = self.upsert(client, collection_name, rows)[0]
assert results['upsert_count'] == default_nb
# 3. search
vectors_to_search = rng.random((1, default_dim))
insert_ids = [i for i in range(default_nb)]
self.search(client, collection_name, vectors_to_search,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": len(vectors_to_search),
"ids": insert_ids,
"limit": default_limit,
"pk_name": default_primary_key_field_name})
# 4. query
self.query(client, collection_name, filter=default_search_exp,
check_task=CheckTasks.check_query_results,
check_items={exp_res: rows,
"with_vec": True,
"pk_name": default_primary_key_field_name})
self.release_collection(client, collection_name)
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_upsert_empty_data(self):
"""
target: test search (high level api) normal case
method: create connection, collection, insert and search
expected: search/query successfully
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. insert
rows = []
results = self.upsert(client, collection_name, rows)[0]
assert results['upsert_count'] == 0
# 3. search
rng = np.random.default_rng(seed=19530)
vectors_to_search = rng.random((1, default_dim))
self.search(client, collection_name, vectors_to_search,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": len(vectors_to_search),
"ids": [],
"pk_name": default_primary_key_field_name,
"limit": 0})
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_upsert_partition(self):
"""
target: test fast create collection normal case
method: create collection
expected: create collection with default schema, index, and load successfully
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
partition_name = cf.gen_unique_str(prefix)
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. create partition
self.create_partition(client, collection_name, partition_name)
partitions = self.list_partitions(client, collection_name)[0]
assert partition_name in partitions
index = self.list_indexes(client, collection_name)[0]
assert index == ['vector']
# load_state = self.get_load_state(collection_name)[0]
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
# 3. upsert to default partition
results = self.upsert(client, collection_name, rows, partition_name=partitions[0])[0]
assert results['upsert_count'] == default_nb
# 4. upsert to non-default partition
results = self.upsert(client, collection_name, rows, partition_name=partition_name)[0]
assert results['upsert_count'] == default_nb
# 5. search
vectors_to_search = rng.random((1, default_dim))
insert_ids = [i for i in range(default_nb)]
self.search(client, collection_name, vectors_to_search,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": len(vectors_to_search),
"ids": insert_ids,
"limit": default_limit,
"pk_name": default_primary_key_field_name})
# partition_number = self.get_partition_stats(client, collection_name, "_default")[0]
# assert partition_number == default_nb
# partition_number = self.get_partition_stats(client, collection_name, partition_name)[0]
# assert partition_number[0]['value'] == 0
if self.has_partition(client, collection_name, partition_name)[0]:
self.release_partitions(client, collection_name, partition_name)
self.drop_partition(client, collection_name, partition_name)
if self.has_collection(client, collection_name)[0]:
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_insert_upsert(self):
"""
target: test fast create collection normal case
method: create collection
expected: create collection with default schema, index, and load successfully
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
partition_name = cf.gen_unique_str(prefix)
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. create partition
self.create_partition(client, collection_name, partition_name)
partitions = self.list_partitions(client, collection_name)[0]
assert partition_name in partitions
index = self.list_indexes(client, collection_name)[0]
assert index == ['vector']
# load_state = self.get_load_state(collection_name)[0]
# 3. insert and upsert
rng = np.random.default_rng(seed=19530)
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
results = self.insert(client, collection_name, rows, partition_name=partition_name)[0]
assert results['insert_count'] == default_nb
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, "new_diff_str_field": str(i)} for i in range(default_nb)]
results = self.upsert(client, collection_name, rows, partition_name=partition_name)[0]
assert results['upsert_count'] == default_nb
        # 4. search
vectors_to_search = rng.random((1, default_dim))
insert_ids = [i for i in range(default_nb)]
self.search(client, collection_name, vectors_to_search,
check_task=CheckTasks.check_search_results,
check_items={"enable_milvus_client_api": True,
"nq": len(vectors_to_search),
"ids": insert_ids,
"limit": default_limit,
"pk_name": default_primary_key_field_name})
if self.has_partition(client, collection_name, partition_name)[0]:
self.release_partitions(client, collection_name, partition_name)
self.drop_partition(client, collection_name, partition_name)
if self.has_collection(client, collection_name)[0]:
self.drop_collection(client, collection_name)
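
# Column-oriented sketch of the deduplication applied on the server side
# (a rough Python analogue of the DeduplicateFieldData function introduced
# in #45249, not the actual Go implementation): record the last row index
# seen for each primary key, then gather the surviving rows in their
# original positions. Works for both int64 and varchar keys; the helper
# name is ours and the tests below do not call it.
def dedup_columns_keep_last(pks, *columns):
    """Keep-last dedup over column-major data (illustrative sketch)."""
    last_index = {}
    for i, pk in enumerate(pks):
        last_index[pk] = i  # later occurrences overwrite earlier ones
    keep = sorted(last_index.values())
    deduped_pks = [pks[i] for i in keep]
    deduped_cols = [[col[i] for i in keep] for col in columns]
    return deduped_pks, deduped_cols

# Example: dedup_columns_keep_last([1, 2, 3, 2, 1], ["a", "b", "c", "B", "A"])
# returns ([3, 2, 1], [["c", "B", "A"]]), matching the [3, 2, 1] expectation
# in test_milvus_client_upsert_dedup_int64_pk below.
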
class TestMilvusClientUpsertDedup(TestMilvusClientV2Base):
"""Test case for upsert deduplication functionality"""
@pytest.fixture(scope="function", params=["COSINE", "L2"])
def metric_type(self, request):
yield request.param
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_dedup_int64_pk(self):
"""
target: test upsert with duplicate int64 primary keys in same batch
method:
1. create collection with int64 primary key
2. upsert data with duplicate primary keys [1, 2, 3, 2, 1]
3. query to verify only last occurrence is kept
expected: only 3 unique records exist, with data from last occurrence
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. upsert data with duplicate PKs: [1, 2, 3, 2, 1]
# Expected: keep last occurrence -> [3, 2, 1] at indices [2, 3, 4]
rng = np.random.default_rng(seed=19530)
rows = [
{default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: 1.0, default_string_field_name: "str_1_first"},
{default_primary_key_field_name: 2, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: 2.0, default_string_field_name: "str_2_first"},
{default_primary_key_field_name: 3, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: 3.0, default_string_field_name: "str_3"},
{default_primary_key_field_name: 2, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: 2.1, default_string_field_name: "str_2_last"},
{default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: 1.1, default_string_field_name: "str_1_last"},
]
results = self.upsert(client, collection_name, rows)[0]
# After deduplication, should only have 3 records
assert results['upsert_count'] == 3
# 3. query to verify deduplication - should have only 3 unique records
query_results = self.query(client, collection_name, filter="id >= 0")[0]
assert len(query_results) == 3
# Verify that last occurrence data is kept
id_to_data = {item['id']: item for item in query_results}
assert 1 in id_to_data
assert 2 in id_to_data
assert 3 in id_to_data
# Check that data from last occurrence is preserved
assert id_to_data[1]['float'] == 1.1
assert id_to_data[1]['varchar'] == "str_1_last"
assert id_to_data[2]['float'] == 2.1
assert id_to_data[2]['varchar'] == "str_2_last"
assert id_to_data[3]['float'] == 3.0
assert id_to_data[3]['varchar'] == "str_3"
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_dedup_varchar_pk(self):
"""
target: test upsert with duplicate varchar primary keys in same batch
method:
1. create collection with varchar primary key
2. upsert data with duplicate primary keys ["a", "b", "c", "b", "a"]
3. query to verify only last occurrence is kept
expected: only 3 unique records exist, with data from last occurrence
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection with varchar primary key
schema = self.create_schema(client, enable_dynamic_field=True)[0]
schema.add_field("id", DataType.VARCHAR, max_length=64, is_primary=True, auto_id=False)
schema.add_field(default_vector_field_name, DataType.FLOAT_VECTOR, dim=default_dim)
schema.add_field("age", DataType.INT64)
index_params = self.prepare_index_params(client)[0]
index_params.add_index(default_vector_field_name, metric_type="COSINE")
self.create_collection(client, collection_name, default_dim, schema=schema,
index_params=index_params, consistency_level="Strong")
# 2. upsert data with duplicate PKs: ["a", "b", "c", "b", "a"]
# Expected: keep last occurrence -> ["c", "b", "a"] at indices [2, 3, 4]
rng = np.random.default_rng(seed=19530)
rows = [
{"id": "a", default_vector_field_name: list(rng.random((1, default_dim))[0]),
"age": 10},
{"id": "b", default_vector_field_name: list(rng.random((1, default_dim))[0]),
"age": 20},
{"id": "c", default_vector_field_name: list(rng.random((1, default_dim))[0]),
"age": 30},
{"id": "b", default_vector_field_name: list(rng.random((1, default_dim))[0]),
"age": 21},
{"id": "a", default_vector_field_name: list(rng.random((1, default_dim))[0]),
"age": 11},
]
results = self.upsert(client, collection_name, rows)[0]
# After deduplication, should only have 3 records
assert results['upsert_count'] == 3
# 3. query to verify deduplication
query_results = self.query(client, collection_name, filter='id in ["a", "b", "c"]')[0]
assert len(query_results) == 3
# Verify that last occurrence data is kept
id_to_data = {item['id']: item for item in query_results}
assert "a" in id_to_data
assert "b" in id_to_data
assert "c" in id_to_data
# Check that data from last occurrence is preserved
assert id_to_data["a"]["age"] == 11
assert id_to_data["b"]["age"] == 21
assert id_to_data["c"]["age"] == 30
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_dedup_all_duplicates(self):
"""
target: test upsert when all records have same primary key
method:
1. create collection
2. upsert 5 records with same primary key
3. query to verify only 1 record exists
expected: only 1 record exists with data from last occurrence
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. upsert data where all have same PK (id=1)
rng = np.random.default_rng(seed=19530)
rows = [
{default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: f"version_{i}"}
for i in range(5)
]
results = self.upsert(client, collection_name, rows)[0]
# After deduplication, should only have 1 record
assert results['upsert_count'] == 1
# 3. query to verify only 1 record exists
query_results = self.query(client, collection_name, filter="id == 1")[0]
assert len(query_results) == 1
# Verify it's the last occurrence (i=4)
assert query_results[0]['float'] == 4.0
assert query_results[0]['varchar'] == "version_4"
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_dedup_no_duplicates(self):
"""
target: test upsert with no duplicate primary keys
method:
1. create collection
2. upsert data with unique primary keys
3. query to verify all records exist
expected: all records exist as-is
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. upsert data with unique PKs
rng = np.random.default_rng(seed=19530)
nb = 10
rows = [
{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: i * 1.0, default_string_field_name: str(i)}
for i in range(nb)
]
results = self.upsert(client, collection_name, rows)[0]
# No deduplication should occur
assert results['upsert_count'] == nb
# 3. query to verify all records exist
        query_results = self.query(client, collection_name, filter="id >= 0")[0]
assert len(query_results) == nb
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L2)
def test_milvus_client_upsert_dedup_large_batch(self):
"""
target: test upsert deduplication with large batch
method:
1. create collection
2. upsert large batch with 50% duplicate primary keys
3. query to verify correct number of records
expected: only unique records exist
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. upsert large batch where each ID appears twice
rng = np.random.default_rng(seed=19530)
nb = 500
unique_ids = nb // 2 # 250 unique IDs
rows = []
for i in range(nb):
pk = i % unique_ids # This creates duplicates: 0,1,2...249,0,1,2...249
rows.append({
default_primary_key_field_name: pk,
default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: float(i), # Different value for each row
default_string_field_name: f"batch_{i}"
})
results = self.upsert(client, collection_name, rows)[0]
# After deduplication, should only have unique_ids records
assert results['upsert_count'] == unique_ids
# 3. query to verify correct number of records
        query_results = self.query(client, collection_name, filter="id >= 0", limit=1000)[0]
assert len(query_results) == unique_ids
# Verify that last occurrence is kept (should have higher float values)
for item in query_results:
pk = item['id']
# Last occurrence of pk is at index (pk + unique_ids)
expected_float = float(pk + unique_ids)
assert item['float'] == expected_float
assert item['varchar'] == f"batch_{pk + unique_ids}"
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_dedup_with_partition(self):
"""
target: test upsert deduplication works correctly with partitions
method:
1. create collection with partition
2. upsert data with duplicates to specific partition
3. query to verify deduplication in partition
expected: deduplication works within partition
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
partition_name = cf.gen_unique_str("partition")
# 1. create collection and partition
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
self.create_partition(client, collection_name, partition_name)
# 2. upsert data with duplicates to partition
rng = np.random.default_rng(seed=19530)
rows = [
{default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: 1.0, default_string_field_name: "first"},
{default_primary_key_field_name: 2, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: 2.0, default_string_field_name: "unique"},
{default_primary_key_field_name: 1, default_vector_field_name: list(rng.random((1, default_dim))[0]),
default_float_field_name: 1.1, default_string_field_name: "last"},
]
results = self.upsert(client, collection_name, rows, partition_name=partition_name)[0]
assert results['upsert_count'] == 2
# 3. query partition to verify deduplication
query_results = self.query(client, collection_name, filter="id >= 0",
partition_names=[partition_name])[0]
assert len(query_results) == 2
# Verify correct data
id_to_data = {item['id']: item for item in query_results}
assert id_to_data[1]['float'] == 1.1
assert id_to_data[1]['varchar'] == "last"
assert id_to_data[2]['float'] == 2.0
assert id_to_data[2]['varchar'] == "unique"
self.drop_collection(client, collection_name)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_upsert_dedup_with_vectors(self):
"""
target: test upsert deduplication preserves correct vector data
method:
1. create collection
2. upsert data with duplicate PKs but different vectors
3. search to verify correct vector is preserved
expected: vector from last occurrence is preserved
"""
client = self._client()
collection_name = cf.gen_collection_name_by_testcase_name()
# 1. create collection
self.create_collection(client, collection_name, default_dim, consistency_level="Strong")
# 2. upsert data with duplicate PK=1 but different vectors
# Create distinctly different vectors for easy verification
first_vector = [1.0] * default_dim # All 1.0
last_vector = [2.0] * default_dim # All 2.0
rows = [
{default_primary_key_field_name: 1, default_vector_field_name: first_vector,
default_float_field_name: 1.0, default_string_field_name: "first"},
{default_primary_key_field_name: 2, default_vector_field_name: [0.5] * default_dim,
default_float_field_name: 2.0, default_string_field_name: "unique"},
{default_primary_key_field_name: 1, default_vector_field_name: last_vector,
default_float_field_name: 1.1, default_string_field_name: "last"},
]
results = self.upsert(client, collection_name, rows)[0]
assert results['upsert_count'] == 2
# 3. query to get vector data
query_results = self.query(client, collection_name, filter="id == 1",
output_fields=["id", "vector", "float", "varchar"])[0]
assert len(query_results) == 1
# Verify it's the last occurrence with last_vector
result = query_results[0]
assert result['float'] == 1.1
assert result['varchar'] == "last"
# Vector should be last_vector (all 2.0)
assert all(abs(v - 2.0) < 0.001 for v in result['vector'])
self.drop_collection(client, collection_name)