diff --git a/tests/python_client/.gitignore b/tests/python_client/.gitignore index e65f537928..481f6ed47b 100644 --- a/tests/python_client/.gitignore +++ b/tests/python_client/.gitignore @@ -7,7 +7,7 @@ .idea *.html *.hdf5 - +*.npy .python-version __pycache__ .vscode diff --git a/tests/python_client/base/utility_wrapper.py b/tests/python_client/base/utility_wrapper.py index be431d124e..5c93b063bf 100644 --- a/tests/python_client/base/utility_wrapper.py +++ b/tests/python_client/base/utility_wrapper.py @@ -6,10 +6,9 @@ import sys sys.path.append("..") from check.func_check import ResponseChecker from utils.api_request import api_request -from common.common_type import BulkLoadStates +from pymilvus import BulkInsertState from pymilvus.orm.role import Role - - +from utils.util_log import test_log as log TIMEOUT = 20 @@ -19,68 +18,169 @@ class ApiUtilityWrapper: ut = utility role = None - def bulk_load(self, collection_name, partition_name="", row_based=True, files="", timeout=None, + def bulk_insert(self, collection_name, is_row_based=True, files="", partition_name=None, timeout=None, using="default", check_task=None, check_items=None, **kwargs): + working_tasks = self.get_bulk_insert_working_list() + log.info(f"before bulk load, there are {len(working_tasks)} working tasks") + log.info(f"files to load: {files}") func_name = sys._getframe().f_code.co_name - res, is_succ = api_request([self.ut.bulk_load, collection_name, partition_name, row_based, - files, timeout, using], **kwargs) + res, is_succ = api_request([self.ut.bulk_insert, collection_name, is_row_based, + files, partition_name, timeout, using], **kwargs) check_result = ResponseChecker(res, func_name, check_task, check_items, is_succ, collection_name=collection_name, using=using).run() + time.sleep(1) + working_tasks = self.get_bulk_insert_working_list() + log.info(f"after bulk load, there are {len(working_tasks)} working tasks") return res, check_result - def get_bulk_load_state(self, task_id, timeout=None, using="default", check_task=None, check_items=None, **kwargs): + def get_bulk_insert_state(self, task_id, timeout=None, using="default", check_task=None, check_items=None, **kwargs): func_name = sys._getframe().f_code.co_name - res, is_succ = api_request([self.ut.get_bulk_load_state, task_id, timeout, using], **kwargs) + res, is_succ = api_request([self.ut.get_bulk_insert_state, task_id, timeout, using], **kwargs) check_result = ResponseChecker(res, func_name, check_task, check_items, is_succ, task_id=task_id, using=using).run() return res, check_result - def wait_for_bulk_load_tasks_completed(self, task_ids, target_state=BulkLoadStates.BulkLoadPersisted, + def list_bulk_insert_tasks(self, limit=0, collection_name=None, timeout=None, using="default", check_task=None, check_items=None, **kwargs): + func_name = sys._getframe().f_code.co_name + res, is_succ = api_request([self.ut.list_bulk_insert_tasks, limit, collection_name, timeout, using], **kwargs) + check_result = ResponseChecker(res, func_name, check_task, check_items, is_succ, + limit=limit, collection_name=collection_name, using=using).run() + return res, check_result + + def get_bulk_insert_pending_list(self): + tasks = {} + for task in self.ut.list_bulk_insert_tasks(): + if task.state == BulkInsertState.ImportPending: + tasks[task.task_id] = task + return tasks + + def get_bulk_insert_working_list(self): + tasks = {} + for task in self.ut.list_bulk_insert_tasks(): + if task.state in [BulkInsertState.ImportStarted]: + tasks[task.task_id] = task + return tasks + + def 
list_all_bulk_insert_tasks(self, limit=0): + tasks, _ = self.list_bulk_insert_tasks(limit=limit) + pending = 0 + started = 0 + persisted = 0 + completed = 0 + failed = 0 + failed_and_cleaned = 0 + unknown = 0 + for task in tasks: + print(task) + if task.state == BulkInsertState.ImportPending: + pending = pending + 1 + elif task.state == BulkInsertState.ImportStarted: + started = started + 1 + elif task.state == BulkInsertState.ImportPersisted: + persisted = persisted + 1 + elif task.state == BulkInsertState.ImportCompleted: + completed = completed + 1 + elif task.state == BulkInsertState.ImportFailed: + failed = failed + 1 + elif task.state == BulkInsertState.ImportFailedAndCleaned: + failed_and_cleaned = failed_and_cleaned + 1 + else: + unknown = unknown + 1 + + log.info("There are", len(tasks), "bulkload tasks.", pending, "pending,", started, "started,", persisted, + "persisted,", completed, "completed,", failed, "failed", failed_and_cleaned, "failed_and_cleaned", + unknown, "unknown") + + def wait_for_bulk_insert_tasks_completed(self, task_ids, target_state=BulkInsertState.ImportCompleted, timeout=None, using="default", **kwargs): start = time.time() - successes = {} - fails = {} + tasks_state_distribution = { + "success": set(), + "failed": set(), + "in_progress": set() + } + tasks_state = {} if timeout is not None: - task_timeout = timeout / len(task_ids) + task_timeout = timeout else: task_timeout = TIMEOUT - while (len(successes) + len(fails)) < len(task_ids): - in_progress = {} - time.sleep(0.1) + start = time.time() + end = time.time() + log.info(f"wait bulk load timeout is {task_timeout}") + pending_tasks = self.get_bulk_insert_pending_list() + log.info(f"before waiting, there are {len(pending_tasks)} pending tasks") + while len(tasks_state_distribution["success"])+len(tasks_state_distribution["failed"]) < len(task_ids) and end-start <= task_timeout: + time.sleep(2) + for task_id in task_ids: - if successes.get(task_id, None) is not None or fails.get(task_id, None) is not None: + if task_id in tasks_state_distribution["success"] or task_id in tasks_state_distribution["failed"]: continue else: - state, _ = self.get_bulk_load_state(task_id, task_timeout, using, **kwargs) - if target_state == BulkLoadStates.BulkLoadDataQueryable: - if state.data_queryable is True: - successes[task_id] = True - else: - in_progress[task_id] = False - elif target_state == BulkLoadStates.BulkLoadDataIndexed: - if state.data_indexed is True: - successes[task_id] = True - else: - in_progress[task_id] = False - else: - if state.state_name == target_state: - successes[task_id] = state - elif state.state_name == BulkLoadStates.BulkLoadFailed: - fails[task_id] = state - else: - in_progress[task_id] = state - end = time.time() - if timeout is not None: - if end - start > timeout: - in_progress.update(fails) - in_progress.update(successes) - return False, in_progress + state, _ = self.get_bulk_insert_state(task_id, task_timeout, using, **kwargs) + tasks_state[task_id] = state - if len(fails) == 0: - return True, successes + if target_state == BulkInsertState.ImportPersisted: + if state.state in [BulkInsertState.ImportPersisted, BulkInsertState.ImportCompleted]: + if task_id in tasks_state_distribution["in_progress"]: + tasks_state_distribution["in_progress"].remove(task_id) + tasks_state_distribution["success"].add(task_id) + elif state.state in [BulkInsertState.ImportPending, BulkInsertState.ImportStarted]: + tasks_state_distribution["in_progress"].add(task_id) + else: + 
tasks_state_distribution["failed"].add(task_id) + + if target_state == BulkInsertState.ImportCompleted: + if state.state in [BulkInsertState.ImportCompleted]: + if task_id in tasks_state_distribution["in_progress"]: + tasks_state_distribution["in_progress"].remove(task_id) + tasks_state_distribution["success"].add(task_id) + elif state.state in [BulkInsertState.ImportPending, BulkInsertState.ImportStarted, BulkInsertState.ImportPersisted]: + tasks_state_distribution["in_progress"].add(task_id) + else: + tasks_state_distribution["failed"].add(task_id) + + end = time.time() + pending_tasks = self.get_bulk_insert_pending_list() + log.info(f"after waiting, there are {len(pending_tasks)} pending tasks") + log.info(f"task state distribution: {tasks_state_distribution}") + log.debug(tasks_state) + if len(tasks_state_distribution["success"]) == len(task_ids): + log.info(f"wait for bulk load tasks completed successfully, cost time: {end-start}") + return True, tasks_state else: - fails.update(successes) - return False, fails + log.info(f"wait for bulk load tasks completed failed, cost time: {end-start}") + return False, tasks_state + + def wait_all_pending_tasks_finished(self): + task_states_map = {} + all_tasks, _ = self.list_bulk_insert_tasks() + # log.info(f"all tasks: {all_tasks}") + for task in all_tasks: + if task.state in [BulkInsertState.ImportStarted, BulkInsertState.ImportPersisted]: + task_states_map[task.task_id] = task.state + + log.info(f"current tasks states: {task_states_map}") + pending_tasks = self.get_bulk_insert_pending_list() + working_tasks = self.get_bulk_insert_working_list() + log.info(f"in the start, there are {len(working_tasks)} working tasks, {working_tasks} {len(pending_tasks)} pending tasks, {pending_tasks}") + time_cnt = 0 + pending_task_ids = set() + while len(pending_tasks) > 0: + time.sleep(5) + time_cnt += 5 + pending_tasks = self.get_bulk_insert_pending_list() + working_tasks = self.get_bulk_insert_working_list() + cur_pending_task_ids = [] + for task_id in pending_tasks.keys(): + cur_pending_task_ids.append(task_id) + pending_task_ids.add(task_id) + log.info(f"after {time_cnt}, there are {len(working_tasks)} working tasks, {len(pending_tasks)} pending tasks") + log.debug(f"total pending tasks: {pending_task_ids} current pending tasks: {cur_pending_task_ids}") + log.info(f"after {time_cnt}, all pending tasks are finished") + all_tasks, _ = self.list_bulk_insert_tasks() + for task in all_tasks: + if task.task_id in pending_task_ids: + log.info(f"task {task.task_id} state transfer from pending to {task.state_name}") def get_query_segment_info(self, collection_name, timeout=None, using="default", check_task=None, check_items=None): timeout = TIMEOUT if timeout is None else timeout diff --git a/tests/python_client/bulk_load/bulk_load_data.py b/tests/python_client/bulk_insert/bulk_insert_data.py similarity index 86% rename from tests/python_client/bulk_load/bulk_load_data.py rename to tests/python_client/bulk_insert/bulk_insert_data.py index 7717c2ef7a..8ff1682b28 100644 --- a/tests/python_client/bulk_load/bulk_load_data.py +++ b/tests/python_client/bulk_insert/bulk_insert_data.py @@ -1,16 +1,14 @@ import time import os +import pathlib import numpy as np import random from sklearn import preprocessing from common.common_func import gen_unique_str from minio_comm import copy_files_to_minio +from utils.util_log import test_log as log -# TODO: remove hardcode with input configurations -minio = "minio_address:port" # minio service and port -bucket_name = 
"milvus-bulk-load" # bucket name of milvus is using - -data_source = "/tmp/bulk_load_data" +data_source = "/tmp/bulk_insert_data" BINARY = "binary" FLOAT = "float" @@ -23,6 +21,7 @@ class DataField: string_field = "string_scalar" bool_field = "bool_scalar" float_field = "float_scalar" + double_field = "double_scalar" class DataErrorType: @@ -35,8 +34,8 @@ class DataErrorType: str_on_vector_field = "str_on_vector_field" -def gen_file_prefix(row_based=True, auto_id=True, prefix=""): - if row_based: +def gen_file_prefix(is_row_based=True, auto_id=True, prefix=""): + if is_row_based: if auto_id: return f"{prefix}_row_auto" else: @@ -244,8 +243,8 @@ def gen_column_base_json_file(col_file, str_pk, data_fields, float_vect, f.write("\n") -def gen_vectors_in_numpy_file(dir, float_vector, rows, dim, force=False): - file_name = f"{DataField.vec_field}.npy" +def gen_vectors_in_numpy_file(dir, data_field, float_vector, rows, dim, force=False): + file_name = f"{data_field}.npy" file = f'{dir}/{file_name}' if not os.path.exists(file) or force: @@ -257,6 +256,23 @@ def gen_vectors_in_numpy_file(dir, float_vector, rows, dim, force=False): else: vectors = gen_binary_vectors(rows, (dim // 8)) arr = np.array(vectors) + # print(f"file_name: {file_name} data type: {arr.dtype}") + log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}") + np.save(file, arr) + return file_name + + +def gen_string_in_numpy_file(dir, data_field, rows, start=0, force=False): + file_name = f"{data_field}.npy" + file = f"{dir}/{file_name}" + if not os.path.exists(file) or force: + # non vector columns + data = [] + if rows > 0: + data = [gen_unique_str(str(i)) for i in range(start, rows+start)] + arr = np.array(data) + # print(f"file_name: {file_name} data type: {arr.dtype}") + log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}") np.save(file, arr) return file_name @@ -267,19 +283,24 @@ def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False): if not os.path.exists(file) or force: # non vector columns data = [] + # arr = np.array([]) if rows > 0: if data_field == DataField.float_field: - data = [random.random() for _ in range(rows)] + data = [np.float32(random.random()) for _ in range(rows)] + elif data_field == DataField.double_field: + data = [np.float64(random.random()) for _ in range(rows)] elif data_field == DataField.pk_field: data = [i for i in range(start, start + rows)] elif data_field == DataField.int_field: data = [random.randint(-999999, 9999999) for _ in range(rows)] - arr = np.array(data) - np.save(file, arr) + # print(f"file_name: {file_name} data type: {arr.dtype}") + arr = np.array(data) + log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}") + np.save(file, arr) return file_name -def gen_file_name(row_based, rows, dim, auto_id, str_pk, +def gen_file_name(is_row_based, rows, dim, auto_id, str_pk, float_vector, data_fields, file_num, file_type, err_type): row_suffix = entity_suffix(rows) field_suffix = "" @@ -297,7 +318,7 @@ def gen_file_name(row_based, rows, dim, auto_id, str_pk, pk = "" if str_pk: pk = "str_pk_" - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id, prefix=err_type) + prefix = gen_file_prefix(is_row_based=is_row_based, auto_id=auto_id, prefix=err_type) file_name = f"{prefix}_{pk}{vt}{field_suffix}{dim}d_{row_suffix}_{file_num}{file_type}" return file_name @@ -312,7 +333,7 @@ def gen_subfolder(root, dim, rows, file_num): return subfolder -def gen_json_files(row_based, rows, dim, 
auto_id, str_pk, +def gen_json_files(is_row_based, rows, dim, auto_id, str_pk, float_vector, data_fields, file_nums, multi_folder, file_type, err_type, force, **kwargs): # gen json files @@ -322,7 +343,7 @@ def gen_json_files(row_based, rows, dim, auto_id, str_pk, if not auto_id and DataField.pk_field not in data_fields: data_fields.append(DataField.pk_field) for i in range(file_nums): - file_name = gen_file_name(row_based=row_based, rows=rows, dim=dim, + file_name = gen_file_name(is_row_based=is_row_based, rows=rows, dim=dim, auto_id=auto_id, str_pk=str_pk, float_vector=float_vector, data_fields=data_fields, file_num=i, file_type=file_type, err_type=err_type) file = f"{data_source}/{file_name}" @@ -330,7 +351,7 @@ def gen_json_files(row_based, rows, dim, auto_id, str_pk, subfolder = gen_subfolder(root=data_source, dim=dim, rows=rows, file_num=i) file = f"{data_source}/{subfolder}/{file_name}" if not os.path.exists(file) or force: - if row_based: + if is_row_based: gen_row_based_json_file(row_file=file, str_pk=str_pk, float_vect=float_vector, data_fields=data_fields, rows=rows, dim=dim, start_uid=start_uid, err_type=err_type, **kwargs) @@ -346,7 +367,7 @@ def gen_json_files(row_based, rows, dim, auto_id, str_pk, return files -def gen_npy_files(float_vector, rows, dim, data_fields, file_nums=1, force=False): +def gen_npy_files(float_vector, rows, dim, data_fields, file_nums=1, err_type="", force=False): # gen numpy files files = [] start_uid = 0 @@ -354,8 +375,10 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_nums=1, force=False # gen the numpy file without subfolders if only one set of files for data_field in data_fields: if data_field == DataField.vec_field: - file_name = gen_vectors_in_numpy_file(dir=data_source, float_vector=float_vector, + file_name = gen_vectors_in_numpy_file(dir=data_source, data_field=data_field, float_vector=float_vector, rows=rows, dim=dim, force=force) + elif data_field == DataField.string_field: # string field for numpy not supported yet at 2022-10-17 + file_name = gen_string_in_numpy_file(dir=data_source, data_field=data_field, rows=rows, force=force) else: file_name = gen_int_or_float_in_numpy_file(dir=data_source, data_field=data_field, rows=rows, force=force) @@ -365,8 +388,8 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_nums=1, force=False subfolder = gen_subfolder(root=data_source, dim=dim, rows=rows, file_num=i) dir = f"{data_source}/{subfolder}" for data_field in data_fields: - if data_field == DataField.vec_field: - file_name = gen_vectors_in_numpy_file(dir=dir, float_vector=float_vector, rows=rows, dim=dim, force=force) + if DataField.vec_field in data_field: + file_name = gen_vectors_in_numpy_file(dir=dir, data_field=data_field, float_vector=float_vector, rows=rows, dim=dim, force=force) else: file_name = gen_int_or_float_in_numpy_file(dir=dir, data_field=data_field, rows=rows, start=start_uid, force=force) files.append(f"{subfolder}/{file_name}") @@ -374,15 +397,21 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_nums=1, force=False return files -def prepare_bulk_load_json_files(row_based=True, rows=100, dim=128, +def prepare_bulk_insert_json_files(minio_endpoint="", bucket_name="milvus-bucket", is_row_based=True, rows=100, dim=128, auto_id=True, str_pk=False, float_vector=True, data_fields=[], file_nums=1, multi_folder=False, file_type=".json", err_type="", force=False, **kwargs): """ Generate files based on the params in json format and copy them to minio - :param row_based: indicate the file(s) to 
be generated is row based or not - :type row_based: boolean + :param minio_endpoint: the minio_endpoint of minio + :type minio_endpoint: str + + :param bucket_name: the bucket name of Milvus + :type bucket_name: str + + :param is_row_based: indicate the file(s) to be generated is row based or not + :type is_row_based: boolean :param rows: the number entities to be generated in the file(s) :type rows: int @@ -427,16 +456,16 @@ def prepare_bulk_load_json_files(row_based=True, rows=100, dim=128, :return list file names list """ - files = gen_json_files(row_based=row_based, rows=rows, dim=dim, + files = gen_json_files(is_row_based=is_row_based, rows=rows, dim=dim, auto_id=auto_id, str_pk=str_pk, float_vector=float_vector, data_fields=data_fields, file_nums=file_nums, multi_folder=multi_folder, file_type=file_type, err_type=err_type, force=force, **kwargs) - copy_files_to_minio(host=minio, r_source=data_source, files=files, bucket_name=bucket_name, force=force) + copy_files_to_minio(host=minio_endpoint, r_source=data_source, files=files, bucket_name=bucket_name, force=force) return files -def prepare_bulk_load_numpy_files(rows, dim, data_fields=[DataField.vec_field], +def prepare_bulk_insert_numpy_files(minio_endpoint="", bucket_name="milvus-bucket", rows=100, dim=128, data_fields=[DataField.vec_field], float_vector=True, file_nums=1, force=False): """ Generate column based files based on params in numpy format and copy them to the minio @@ -471,6 +500,6 @@ def prepare_bulk_load_numpy_files(rows, dim, data_fields=[DataField.vec_field], data_fields=data_fields, file_nums=file_nums, force=force) - copy_files_to_minio(host=minio, r_source=data_source, files=files, bucket_name=bucket_name, force=force) + copy_files_to_minio(host=minio_endpoint, r_source=data_source, files=files, bucket_name=bucket_name, force=force) return files diff --git a/tests/python_client/bulk_load/minio_comm.py b/tests/python_client/bulk_insert/minio_comm.py similarity index 100% rename from tests/python_client/bulk_load/minio_comm.py rename to tests/python_client/bulk_insert/minio_comm.py diff --git a/tests/python_client/bulk_insert/test_bulk_insert.py b/tests/python_client/bulk_insert/test_bulk_insert.py new file mode 100644 index 0000000000..43a17b550a --- /dev/null +++ b/tests/python_client/bulk_insert/test_bulk_insert.py @@ -0,0 +1,2435 @@ +import logging +import time +import pytest +import random +from pathlib import Path +from base.client_base import TestcaseBase +from common import common_func as cf +from common import common_type as ct +from common.milvus_sys import MilvusSys +from common.common_type import CaseLabel, CheckTasks +from utils.util_k8s import ( + get_pod_ip_name_pairs, + get_milvus_instance_name, +) +from utils.util_log import test_log as log +from bulk_insert_data import ( + prepare_bulk_insert_json_files, + prepare_bulk_insert_numpy_files, + DataField as df, + DataErrorType, +) + + +default_vec_only_fields = [df.vec_field] +default_multi_fields = [ + df.vec_field, + df.int_field, + df.string_field, + df.bool_field, + df.float_field, +] +default_vec_n_int_fields = [df.vec_field, df.int_field] + + +milvus_ns = "chaos-testing" +base_dir = "/tmp/bulk_insert_data" + + +def entity_suffix(entities): + if entities // 1000000 > 0: + suffix = f"{entities // 1000000}m" + elif entities // 1000 > 0: + suffix = f"{entities // 1000}k" + else: + suffix = f"{entities}" + return suffix + + +class TestcaseBaseBulkInsert(TestcaseBase): + + @pytest.fixture(scope="function", autouse=True) + def init_minio_client(self, 
host): + Path("/tmp/bulk_insert_data").mkdir(parents=True, exist_ok=True) + self._connect() + self.instance_name = get_milvus_instance_name(milvus_ns, host) + minio_ip_pod_pair = get_pod_ip_name_pairs( + milvus_ns, f"release={self.instance_name}, app=minio" + ) + ms = MilvusSys() + minio_ip = list(minio_ip_pod_pair.keys())[0] + minio_port = "9000" + self.minio_endpoint = f"{minio_ip}:{minio_port}" + self.bucket_name = ms.index_nodes[0]["infos"]["system_configurations"][ + "minio_bucket_name" + ] + + def teardown_method(self, method): + log.info(("*" * 35) + " teardown " + ("*" * 35)) + log.info("[teardown_method] Start teardown test case %s..." % method.__name__) + + +class TestBulkInsert(TestcaseBaseBulkInsert): + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [8]) # 8, 128 + @pytest.mark.parametrize("entities", [100]) # 100, 1000 + def test_float_vector_only(self, is_row_based, auto_id, dim, entities): + """ + collection: auto_id, customized_id + collection schema: [pk, float_vector] + Steps: + 1. create collection + 2. import data + 3. verify the data entities equal the import data + 4. load the collection + 5. verify search successfully + 6. verify query successfully + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_only_fields, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=None, + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, _ = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert success + + num_entities = self.collection_wrap.num_entities + log.info(f" collection entities: {num_entities}") + assert num_entities == entities + + # verify imported data is available for search + self.collection_wrap.load() + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + log.info( + f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}" + ) + nq = 2 + topk = 2 + search_data = cf.gen_vectors(nq, dim) + search_params = {"metric_type": "L2", "params": {"nprobe": 2}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=topk, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "limit": topk}, + ) + for hits in res: + ids = hits.ids + results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") + assert len(results) == len(ids) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("dim", [8]) # 8 + @pytest.mark.parametrize("entities", [100]) # 100 + def test_str_pk_float_vector_only(self, is_row_based, dim, entities): + """ + collection schema: [str_pk, float_vector] + Steps: + 1. create collection + 2. import data + 3. 
verify the data entities equal the import data + 4. load the collection + 5. verify search successfully + 6. verify query successfully + """ + auto_id = False # no auto id for string_pk schema + string_pk = True + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + str_pk=string_pk, + data_fields=default_vec_only_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_string_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + completed, _ = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{completed} in {tt}") + assert completed + + num_entities = self.collection_wrap.num_entities + log.info(f" collection entities: {num_entities}") + assert num_entities == entities + + # verify imported data is available for search + self.collection_wrap.load() + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + log.info( + f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}" + ) + nq = 3 + topk = 2 + search_data = cf.gen_vectors(nq, dim) + search_params = {"metric_type": "L2", "params": {"nprobe": 2}} + time.sleep(5) + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=topk, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "limit": topk}, + ) + for hits in res: + ids = hits.ids + expr = f"{df.pk_field} in {ids}" + expr = expr.replace("'", '"') + results, _ = self.collection_wrap.query(expr=expr) + assert len(results) == len(ids) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [4]) + @pytest.mark.parametrize("entities", [3000]) + def test_partition_float_vector_int_scalar( + self, is_row_based, auto_id, dim, entities + ): + """ + collection: customized partitions + collection schema: [pk, float_vectors, int_scalar] + 1. create collection and a partition + 2. build index and load partition + 3. import data into the partition + 4. verify num entities + 5. verify index status + 6. 
verify search and query + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_n_int_fields, + file_nums=1, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # create a partition + p_name = cf.gen_unique_str("bulk_insert") + m_partition, _ = self.collection_wrap.create_partition(partition_name=p_name) + # build index before bulk insert + index_params = { + "index_type": "IVF_SQ8", + "params": {"nlist": 128}, + "metric_type": "L2", + } + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + # load before bulk insert + self.collection_wrap.load(partition_names=[p_name]) + + # import data into the partition + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=p_name, + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, state = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert success + + assert m_partition.num_entities == entities + assert self.collection_wrap.num_entities == entities + log.debug(state) + res, _ = self.utility_wrap.index_building_progress(c_name) + exp_res = {"total_rows": entities, "indexed_rows": entities} + assert res == exp_res + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + log.info( + f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}" + ) + + nq = 10 + topk = 5 + search_data = cf.gen_vectors(nq, dim) + search_params = {"metric_type": "L2", "params": {"nprobe": 16}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=topk, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "limit": topk}, + ) + for hits in res: + ids = hits.ids + results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") + assert len(results) == len(ids) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [16]) + @pytest.mark.parametrize("entities", [2000]) + def test_binary_vector_only(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, binary_vector] + Steps: + 1. create collection + 2. create index and load collection + 3. import data + 4. verify build status + 5. verify the data entities + 6. load collection + 7. verify search successfully + 6. 
verify query successfully + """ + float_vec = False + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + float_vector=float_vec, + data_fields=default_vec_only_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_binary_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # build index before bulk insert + binary_index_params = { + "index_type": "BIN_IVF_FLAT", + "metric_type": "JACCARD", + "params": {"nlist": 64}, + } + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=binary_index_params + ) + # load collection + self.collection_wrap.load() + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, _ = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert success + res, _ = self.utility_wrap.index_building_progress(c_name) + exp_res = {'total_rows': entities, 'indexed_rows': entities} + assert res == exp_res + + # verify num entities + assert self.collection_wrap.num_entities == entities + # verify search and query + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + search_data = cf.gen_binary_vectors(1, dim)[1] + search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=1, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, "limit": 1}, + ) + for hits in res: + ids = hits.ids + results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") + assert len(results) == len(ids) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize( + "fields_num_in_file", ["equal", "more", "less"] + ) # "equal", "more", "less" + @pytest.mark.parametrize("dim", [16]) + @pytest.mark.parametrize("entities", [500]) + def test_float_vector_multi_scalars( + self, is_row_based, auto_id, fields_num_in_file, dim, entities + ): + """ + collection schema: [pk, float_vector, + float_scalar, int_scalar, string_scalar, bool_scalar] + Steps: + 1. create collection + 2. create index and load collection + 3. import data + 4. verify the data entities + 5. verify index status + 6. 
verify search and query + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_multi_fields, + force=True, + ) + additional_field = "int_scalar_add" + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + if fields_num_in_file == "more": + fields.pop() + elif fields_num_in_file == "less": + fields.append(cf.gen_int32_field(name=additional_field)) + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # build index before bulk insert + # build index + index_params = { + "index_type": "HNSW", + "params": {"M": 8, "efConstruction": 100}, + "metric_type": "L2", + } + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + # load collection + self.collection_wrap.load() + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + if fields_num_in_file == "less": + assert not success + if is_row_based: + failed_reason = ( + f"JSON row validator: field {additional_field} missed at the row 0" + ) + else: + failed_reason = "is not equal to other fields" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + else: + assert success + + num_entities = self.collection_wrap.num_entities + log.info(f" collection entities: {num_entities}") + assert num_entities == entities + + # verify no index + res, _ = self.collection_wrap.has_index() + assert res is True + # verify search and query + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + nq = 3 + topk = 10 + search_data = cf.gen_vectors(nq, dim) + search_params = {"metric_type": "IP", "params": {"ef": 64}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=topk, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "limit": topk}, + ) + for hits in res: + ids = hits.ids + results, _ = self.collection_wrap.query( + expr=f"{df.pk_field} in {ids}", + output_fields=[df.pk_field, df.int_field], + ) + assert len(results) == len(ids) + if not auto_id: + for i in range(len(results)): + assert results[i].get(df.int_field, 0) == results[i].get( + df.pk_field, 1 + ) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("create_index_before_bulk_insert", [True, False]) + @pytest.mark.parametrize("loaded_before_bulk_insert", [True, False]) + def test_load_before_or_after_bulk_insert(self, loaded_before_bulk_insert, create_index_before_bulk_insert): + """ + collection schema: [pk, float_vector] + Steps: + 1. create collection + 2. create index and load collection or not + 3. import data + 4. load collection or not + 5. verify the data entities + 5. verify the index status + 6. 
verify search and query + """ + if loaded_before_bulk_insert and not create_index_before_bulk_insert: + pytest.skip("can not load collection if index not created") + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=True, + rows=500, + dim=16, + auto_id=True, + data_fields=[df.pk_field, df.vec_field], + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=16), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=True) + self.collection_wrap.init_collection(c_name, schema=schema) + # build index + index_params = { + "index_type": "HNSW", + "params": {"M": 8, "efConstruction": 100}, + "metric_type": "L2", + } + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + if loaded_before_bulk_insert: + # load collection + self.collection_wrap.load() + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=True, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert success + if not loaded_before_bulk_insert: + # load collection + self.collection_wrap.load() + + num_entities = self.collection_wrap.num_entities + log.info(f"collection entities: {num_entities}") + assert num_entities == 500 + # verify no index + res, _ = self.utility_wrap.index_building_progress(c_name) + exp_res = {'total_rows': num_entities, 'indexed_rows': num_entities} + assert res == exp_res + # verify search and query + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + nq = 3 + topk = 10 + search_data = cf.gen_vectors(nq, 16) + search_params = {"metric_type": "IP", "params": {"ef": 64}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=topk, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "limit": topk}, + ) + for hits in res: + ids = hits.ids + expr = f"{df.pk_field} in {ids}" + expr = expr.replace("'", '"') + results, _ = self.collection_wrap.query(expr=expr) + assert len(results) == len(ids) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize( + "fields_num_in_file", ["equal", "more", "less"] + ) # "equal", "more", "less" + @pytest.mark.parametrize("dim", [16]) # 1024 + @pytest.mark.parametrize("entities", [500]) # 5000 + def test_string_pk_float_vector_multi_scalars( + self, is_row_based, fields_num_in_file, dim, entities + ): + """ + collection schema: [str_pk, float_vector, + float_scalar, int_scalar, string_scalar, bool_scalar] + Steps: + 1. create collection with string primary key + 2. create index and load collection + 3. import data + 4. verify the data entities + 5. verify index status + 6. 
verify search and query + """ + string_pk = True + auto_id = False + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + str_pk=string_pk, + data_fields=default_multi_fields, + ) + additional_field = "int_scalar_add" + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_string_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + if fields_num_in_file == "more": + fields.pop() + elif fields_num_in_file == "less": + fields.append(cf.gen_int32_field(name=additional_field)) + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # build index + index_params = { + "index_type": "HNSW", + "params": {"M": 8, "efConstruction": 100}, + "metric_type": "L2", + } + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + # load collection + self.collection_wrap.load() + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + if fields_num_in_file == "less": + assert not success # TODO: check error msg + if is_row_based: + failed_reason = ( + f"JSON row validator: field {additional_field} missed at the row 0" + ) + else: + failed_reason = "is not equal to other fields" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + else: + assert success + log.info(f" collection entities: {self.collection_wrap.num_entities}") + assert self.collection_wrap.num_entities == entities + # verify no index + res, _ = self.collection_wrap.has_index() + assert res is True + # verify search and query + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + search_data = cf.gen_vectors(1, dim) + search_params = {"metric_type": "L2", "params": {"nprobe": 2}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=1, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, "limit": 1}, + ) + for hits in res: + ids = hits.ids + expr = f"{df.pk_field} in {ids}" + expr = expr.replace("'", '"') + results, _ = self.collection_wrap.query(expr=expr) + assert len(results) == len(ids) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [pytest.param(True, marks=pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/19499")), False]) # True, False + @pytest.mark.parametrize("auto_id", [True, False]) # True, False + @pytest.mark.parametrize("dim", [16]) # 16 + @pytest.mark.parametrize("entities", [100]) # 3000 + @pytest.mark.parametrize("file_nums", [32]) # 10 + @pytest.mark.parametrize("multi_folder", [True, False]) # True, False + def test_float_vector_from_multi_files( + self, is_row_based, auto_id, dim, entities, file_nums, multi_folder + ): + """ + collection: auto_id + collection schema: [pk, float_vector, + float_scalar, 
int_scalar, string_scalar, bool_scalar] + Steps: + 1. create collection + 2. build index and load collection + 3. import data from multiple files + 4. verify the data entities + 5. verify index status + 6. verify search successfully + 7. verify query successfully + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_multi_fields, + file_nums=file_nums, + multi_folder=multi_folder, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # build index + index_params = ct.default_index + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + # load collection + self.collection_wrap.load() + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + if not is_row_based: + assert not success + failed_reason = "is duplicated" # "the field xxx is duplicated" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + else: + assert success + num_entities = self.collection_wrap.num_entities + log.info(f" collection entities: {num_entities}") + assert num_entities == entities * file_nums + + # verify index built + res, _ = self.utility_wrap.index_building_progress(c_name) + exp_res = {'total_rows': entities * file_nums, 'indexed_rows': entities * file_nums} + assert res == exp_res + + # verify search and query + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + nq = 5 + topk = 1 + search_data = cf.gen_vectors(nq, dim) + search_params = ct.default_search_params + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=topk, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "limit": topk}, + ) + for hits in res: + ids = hits.ids + results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") + assert len(results) == len(ids) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("multi_fields", [True, False]) + @pytest.mark.parametrize("dim", [15]) + @pytest.mark.parametrize("entities", [200]) + def test_float_vector_from_numpy_file( + self, is_row_based, auto_id, multi_fields, dim, entities + ): + """ + collection schema 1: [pk, float_vector] + schema 2: [pk, float_vector, int_scalar, string_scalar, float_scalar, bool_scalar] + data file: .npy files + Steps: + 1. create collection + 2. import data + 3. if is_row_based: verify import failed + 4. 
if column_based: + 4.1 verify the data entities equal the import data + 4.2 verify search and query successfully + """ + data_fields = [df.vec_field] + np_files = prepare_bulk_insert_numpy_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + rows=entities, + dim=dim, + data_fields=data_fields, + force=True, + ) + if not multi_fields: + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + if not auto_id: + scalar_fields = [df.pk_field] + else: + scalar_fields = None + else: + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + if not auto_id: + scalar_fields = [ + df.pk_field, + df.float_field, + df.int_field, + df.string_field, + df.bool_field, + ] + else: + scalar_fields = [ + df.int_field, + df.string_field, + df.bool_field, + df.float_field, + ] + + files = np_files + if scalar_fields is not None: + json_files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + dim=dim, + auto_id=auto_id, + rows=entities, + data_fields=scalar_fields, + force=True, + ) + files = np_files + json_files + + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + + if is_row_based: + assert not success + failed_reason1 = "unsupported file type for row-based mode" + failed_reason2 = ( + f"JSON row validator: field {df.vec_field} missed at the row 0" + ) + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason1 in state.infos.get( + "failed_reason", "" + ) or failed_reason2 in state.infos.get("failed_reason", "") + else: + assert success + log.info(f" collection entities: {self.collection_wrap.num_entities}") + assert self.collection_wrap.num_entities == entities + # create index and load + index_params = ct.default_index + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + self.collection_wrap.load() + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + log.info( + f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}" + ) + # verify imported data is available for search + nq = 2 + topk = 5 + search_data = cf.gen_vectors(nq, dim) + search_params = {"metric_type": "L2", "params": {"nprobe": 2}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=topk, + check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "limit": topk}, + ) + for hits in res: + ids = hits.ids + results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") + assert len(results) == len(ids) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("dim", [8]) + 
@pytest.mark.parametrize("entities", [10]) + def test_data_type_float_on_int_pk(self, is_row_based, dim, entities): + """ + collection schema: [pk, float_vector, + float_scalar, int_scalar, string_scalar, bool_scalar] + data files: json file that one of entities has float on int pk + Steps: + 1. create collection + 2. import data + 3. verify the data entities + 4. verify query successfully + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=False, + data_fields=default_multi_fields, + err_type=DataErrorType.float_on_int_pk, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + # TODO: add string pk + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=False) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + assert success + assert self.collection_wrap.num_entities == entities + index_params = ct.default_index + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + self.collection_wrap.load() + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + # the pk value was automatically convert to int from float + res, _ = self.collection_wrap.query( + expr=f"{df.pk_field} in [3]", output_fields=[df.pk_field] + ) + assert [{df.pk_field: 3}] == res + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [8]) + @pytest.mark.parametrize("entities", [10]) + def test_data_type_int_on_float_scalar(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vector, + float_scalar, int_scalar, string_scalar, bool_scalar] + data files: json file that one of entities has int on float scalar + Steps: + 1. create collection + 2. import data + 3. verify the data entities + 4. 
verify query successfully + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_multi_fields, + err_type=DataErrorType.int_on_float_scalar, + force=True, + ) + + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_float_field(name=df.float_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + assert success + assert self.collection_wrap.num_entities == entities + + index_params = ct.default_index + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + self.collection_wrap.load() + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + search_data = cf.gen_vectors(1, dim) + search_params = {"metric_type": "L2", "params": {"nprobe": 2}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=1, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, "limit": 1}, + ) + uids = res[0].ids + res, _ = self.collection_wrap.query( + expr=f"{df.pk_field} in {uids}", output_fields=[df.float_field] + ) + assert isinstance(res[0].get(df.float_field, 1), float) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("auto_id", [True]) + @pytest.mark.parametrize("dim", [128]) # 128 + @pytest.mark.parametrize("entities", [1000]) # 1000 + def test_with_all_field_numpy(self, auto_id, dim, entities): + """ + collection schema 1: [pk, int64, float64, string float_vector] + data file: vectors.npy and uid.npy, + Steps: + 1. create collection + 2. import data + 3. 
verify + """ + data_fields = [df.pk_field, df.int_field, df.float_field, df.double_field, df.vec_field] + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_int64_field(name=df.int_field), + cf.gen_float_field(name=df.float_field), + cf.gen_double_field(name=df.double_field), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + files = prepare_bulk_insert_numpy_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + rows=entities, + dim=dim, + data_fields=data_fields, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=False, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert success + num_entities = self.collection_wrap.num_entities + log.info(f" collection entities: {num_entities}") + assert num_entities == entities + # verify imported data is available for search + index_params = ct.default_index + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + self.collection_wrap.load() + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + # log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") + search_data = cf.gen_vectors(1, dim) + search_params = {"metric_type": "L2", "params": {"nprobe": 2}} + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=1, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, "limit": 1}, + ) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [6]) + @pytest.mark.parametrize("entities", [2000]) + @pytest.mark.parametrize("file_nums", [10]) + def test_multi_numpy_files_from_diff_folders( + self, auto_id, dim, entities, file_nums + ): + """ + collection schema 1: [pk, float_vector] + data file: .npy files in different folders + Steps: + 1. create collection, create index and load + 2. import data + 3. 
verify that import numpy files in a loop + """ + is_row_based = False # numpy files supports only column based + + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_int64_field(name=df.int_field), + cf.gen_float_field(name=df.float_field), + cf.gen_double_field(name=df.double_field), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields) + self.collection_wrap.init_collection(c_name, schema=schema) + # build index + index_params = ct.default_index + self.collection_wrap.create_index( + field_name=df.vec_field, index_params=index_params + ) + # load collection + self.collection_wrap.load() + data_fields = [f.name for f in fields if not f.to_dict().get("auto_id", False)] + task_ids = [] + for i in range(file_nums): + files = prepare_bulk_insert_numpy_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + rows=entities, + dim=dim, + data_fields=data_fields, + file_nums=1, + force=True, + ) + task_id, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + task_ids.append(task_id[0]) + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + + assert success + log.info(f" collection entities: {self.collection_wrap.num_entities}") + assert self.collection_wrap.num_entities == entities * file_nums + + # verify search and query + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + search_data = cf.gen_vectors(1, dim) + search_params = ct.default_search_params + res, _ = self.collection_wrap.search( + search_data, + df.vec_field, + param=search_params, + limit=1, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, "limit": 1}, + ) + + # TODO: not supported yet + def test_from_customize_bucket(self): + pass + + +class TestBulkInsertInvalidParams(TestcaseBaseBulkInsert): + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + def test_non_existing_file(self, is_row_based): + """ + collection: either auto_id or not + collection schema: not existing file(s) + Steps: + 1. create collection + 3. import data, but the data file(s) not exists + 4. 
verify import failed with errors + """ + files = ["not_existing.json"] + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=ct.default_dim), + ] + schema = cf.gen_collection_schema(fields=fields) + self.collection_wrap.init_collection(c_name, schema=schema) + + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=None, + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + assert not success + failed_reason = f"the file {files[0]} is empty" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + def test_empty_json_file(self, is_row_based, auto_id): + """ + collection schema: [pk, float_vector] + data file: empty file + Steps: + 1. create collection + 2. import data, but the data file(s) is empty + 3. verify import fail if column based + 4. verify import successfully if row based + """ + # set 0 entities + entities = 0 + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=ct.default_dim, + auto_id=auto_id, + data_fields=default_vec_only_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=ct.default_dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=None, + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + assert not success + failed_reason = "JSON parse: row count is 0" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [8]) # 8 + @pytest.mark.parametrize("entities", [100]) # 100 + @pytest.mark.xfail(reason="issue https://github.com/milvus-io/milvus/issues/19658") + def test_wrong_file_type(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vector] + data files: wrong data type + Steps: + 1. create collection + 2. import data + 3. 
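The success/failure checks in these cases all go through wait_for_bulk_insert_tasks_completed, which is essentially a poll of each task's state until a terminal state or a timeout is reached. A stripped-down sketch with plain pymilvus calls (timeout and poll interval are arbitrary choices, and an already-open connection is assumed):

import time
from pymilvus import utility, BulkInsertState

def wait_for_task(task_id, timeout=90, interval=2):
    # Poll until the task reaches a terminal state or the timeout expires.
    deadline = time.time() + timeout
    while time.time() < deadline:
        state = utility.get_bulk_insert_state(task_id)
        if state.state == BulkInsertState.ImportCompleted:
            return True, state
        if state.state in (BulkInsertState.ImportFailed, BulkInsertState.ImportFailedAndCleaned):
            return False, state
        time.sleep(interval)
    return False, None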
verify import failed with errors + """ + if is_row_based: + if auto_id: + file_type = ".npy" + else: + file_type = "" # TODO + else: + if auto_id: + file_type = ".csv" + else: + file_type = ".txt" + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_only_fields, + file_type=file_type, + ) + + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + log.info(schema) + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=None, + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert not success + failed_reason = "unsupported file type" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [8]) + @pytest.mark.parametrize("entities", [100]) + def test_wrong_row_based_values(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vector] + data files: wrong row based values + Steps: + 1. create collection + 3. import data with wrong row based value + 4. 
verify import failed with errors + """ + # set the wrong row based params + wrong_row_based = not is_row_based + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=wrong_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_only_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=None, + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + assert not success + if is_row_based: + value = df.vec_field # if auto_id else df.pk_field + failed_reason = f"JSON parse: invalid row-based JSON format, the key {value} is not found" + else: + failed_reason = "JSON parse: row count is 0" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [8]) # 8 + @pytest.mark.parametrize("entities", [100]) # 100 + def test_wrong_pk_field_name(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vector] + data files: wrong primary key field name + Steps: + 1. create collection with a dismatch_uid as pk + 2. import data + 3. verify import data successfully if collection with auto_id + 4. 
verify import error if collection with auto_id=False + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_only_fields, + ) + dismatch_pk_field = "dismatch_pk" + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=dismatch_pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=None, + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + if auto_id: + assert success + else: + assert not success + if is_row_based: + failed_reason = f"field {dismatch_pk_field} missed at the row 0" + else: + failed_reason = f"import error: field {dismatch_pk_field} row count 0 is not equal to other fields" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [8]) # 8 + @pytest.mark.parametrize("entities", [100]) # 100 + def test_wrong_vector_field_name(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vector] + Steps: + 1. create collection with a dismatch_uid as pk + 2. import data + 3. verify import data successfully if collection with auto_id + 4. 
verify import error if collection with auto_id=False + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_only_fields, + ) + dismatch_vec_field = "dismatched_vectors" + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=dismatch_vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=None, + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + + assert not success + if is_row_based: + failed_reason = f"field {dismatch_vec_field} missed at the row 0" + else: + if auto_id: + failed_reason = f"JSON column consumer: row count is 0" + else: + failed_reason = f"import error: field {dismatch_vec_field} row count 0 is not equal to other fields" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [4]) + @pytest.mark.parametrize("entities", [200]) + def test_wrong_scalar_field_name(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vectors, int_scalar] + data file: with dismatched int scalar + 1. create collection + 2. import data that one scalar field name is dismatched + 3. 
verify that import fails with errors + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_n_int_fields, + ) + dismatch_scalar_field = "dismatched_scalar" + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=dismatch_scalar_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name="", + is_row_based=is_row_based, + files=files, + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert not success + if is_row_based: + failed_reason = f"field {dismatch_scalar_field} missed at the row 0" + else: + failed_reason = f"import error: field {dismatch_scalar_field} row count 0 is not equal to other fields" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [4]) + @pytest.mark.parametrize("entities", [200]) + def test_wrong_dim_in_schema(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vectors, int_scalar] + data file: with wrong dim of vectors + 1. import data the collection + 2. 
verify that import fails with errors + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_n_int_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + wrong_dim = dim + 1 + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=wrong_dim), + cf.gen_int32_field(name=df.int_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + assert not success + failed_reason = f"array size {dim} doesn't equal to vector dimension {wrong_dim} of field vectors at the row " + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("dim", [4]) + @pytest.mark.parametrize("entities", [200]) + def test_non_existing_collection(self, is_row_based, dim, entities): + """ + collection: not create collection + collection schema: [pk, float_vectors, int_scalar] + 1. import data into a non existing collection + 2. verify that import fails with errors + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + data_fields=default_vec_n_int_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + # import data into a non existing collection + err_msg = f"can't find collection: {c_name}" + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + is_row_based=is_row_based, + files=files, + check_task=CheckTasks.err_res, + check_items={"err_code": 1, "err_msg": err_msg}, + ) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("dim", [4]) + @pytest.mark.parametrize("entities", [200]) + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/19553") + def test_non_existing_partition(self, is_row_based, dim, entities): + """ + collection: create a collection + collection schema: [pk, float_vectors, int_scalar] + 1. import data into a non existing partition + 2. 
verify that import fails with errors + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + data_fields=default_vec_n_int_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + ] + schema = cf.gen_collection_schema(fields=fields) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data into a non existing partition + p_name = "non_existing" + err_msg = f" partition {p_name} does not exist" + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, + partition_name=p_name, + is_row_based=is_row_based, + files=files, + check_task=CheckTasks.err_res, + check_items={"err_code": 11, "err_msg": err_msg}, + ) + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [4]) + @pytest.mark.parametrize("entities", [1000]) + @pytest.mark.parametrize("position", [0, 500, 999]) # the index of wrong dim entity + def test_wrong_dim_in_one_entities_of_file( + self, is_row_based, auto_id, dim, entities, position + ): + """ + collection schema: [pk, float_vectors, int_scalar] + data file: one of entities has wrong dim data + 1. import data the collection + 2. verify that import fails with errors + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_vec_n_int_fields, + err_type=DataErrorType.one_entity_wrong_dim, + wrong_position=position, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + assert not success + failed_reason = ( + f"doesn't equal to vector dimension {dim} of field vectors at the row" + ) + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [16]) + @pytest.mark.parametrize("entities", [300]) + @pytest.mark.parametrize("file_nums", [10]) # max task nums 32? need improve + def test_float_vector_one_of_files_fail( + self, is_row_based, auto_id, dim, entities, file_nums + ): + """ + collection schema: [pk, float_vectors, int_scalar], one of entities has wrong dim data + data files: multi files, and there are errors in one of files + 1. 
import data 11 files(10 correct and 1 with errors) into the collection + 2. verify that import fails with errors and no data imported + """ + correct_files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_multi_fields, + file_nums=file_nums, + force=True, + ) + + # append a file that has errors + dismatch_dim = dim + 1 + err_files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dismatch_dim, + auto_id=auto_id, + data_fields=default_multi_fields, + file_nums=1, + ) + files = correct_files + err_files + random.shuffle(files) # mix up the file order + + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert not success + if is_row_based: + # all correct files shall be imported successfully + assert self.collection_wrap.num_entities == entities * file_nums + else: + assert self.collection_wrap.num_entities == 0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [128]) # 128 + @pytest.mark.parametrize("entities", [1000]) # 1000 + def test_wrong_dim_in_numpy(self, auto_id, dim, entities): + """ + collection schema 1: [pk, float_vector] + data file: .npy file with wrong dim + Steps: + 1. create collection + 2. import data + 3. 
verify failed with errors + """ + data_fields = [df.vec_field] + if not auto_id: + data_fields.append(df.pk_field) + files = prepare_bulk_insert_numpy_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + rows=entities, + dim=dim, + data_fields=data_fields, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + wrong_dim = dim + 1 + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=wrong_dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=False, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + + assert not success + failed_reason = f"Numpy parse: illegal row width {dim} for field {df.vec_field} dimension {wrong_dim}" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("auto_id", [False]) + @pytest.mark.parametrize("dim", [15]) + @pytest.mark.parametrize("entities", [100]) + @pytest.mark.xfail(reason="https://github.com/milvus-io/milvus/issues/18992") + def test_wrong_field_name_in_numpy(self, auto_id, dim, entities): + """ + collection schema 1: [pk, float_vector] + data file: .npy file + Steps: + 1. create collection + 2. import data + 3. if is_row_based: verify import failed + 4. 
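A note on how the wrong-dimension numpy case can be reproduced outside the helpers: the vector file's row width simply has to disagree with the dimension declared in the collection schema. Illustrative sketch (file name and sizes are arbitrary):

import numpy as np

declared_dim, rows = 128, 1000
# Write vectors one element wider than the schema declares.
np.save("vectors.npy", np.random.rand(rows, declared_dim + 1).astype(np.float32))
# Importing this file into a field declared with dim=declared_dim is expected to be
# rejected with an "illegal row width ... dimension ..." style error, as asserted above.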
if column_based: + 4.1 verify the data entities equal the import data + 4.2 verify search and query successfully + """ + data_fields = [df.vec_field] + if not auto_id: + data_fields.append(df.pk_field) + files = prepare_bulk_insert_numpy_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + rows=entities, + dim=dim, + data_fields=data_fields, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + wrong_vec_field = f"wrong_{df.vec_field}" + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=wrong_vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + log.info(schema) + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=False, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + + assert not success + failed_reason = f"Numpy parse: the field {df.vec_field} doesn't exist" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [16]) # 128 + @pytest.mark.parametrize("entities", [100]) # 1000 + def test_duplicate_numpy_files(self, auto_id, dim, entities): + """ + collection schema 1: [pk, float_vector] + data file: .npy files + Steps: + 1. create collection + 2. import data with duplicate npy files + 3. verify fail to import with errors + """ + data_fields = [df.vec_field] + if not auto_id: + data_fields.append(df.pk_field) + files = prepare_bulk_insert_numpy_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + rows=entities, + dim=dim, + data_fields=data_fields, + ) + files += files + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=False, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + assert not success + failed_reason = "duplicate file" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("dim", [8]) + @pytest.mark.parametrize("entities", [10]) + def test_data_type_string_on_int_pk(self, is_row_based, dim, entities): + """ + collection schema: default multi scalars + data file: json file with one of entities has string on int pk + Steps: + 1. create collection + 2. 
import data with is_row_based=False + 3. verify import failed + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=False, + data_fields=default_multi_fields, + err_type=DataErrorType.str_on_int_pk, + force=True, + ) + + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + # TODO: add string pk + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=False) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + assert not success + failed_reason = f"illegal numeric value" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [8]) + @pytest.mark.parametrize("entities", [10]) + def test_data_type_typo_on_bool(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vector, + float_scalar, int_scalar, string_scalar, bool_scalar] + data files: json file that one of entities has typo on boolean field + Steps: + 1. create collection + 2. import data + 3. 
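The scalar type-error cases here boil down to handing the parser a JSON value whose type does not match the schema. An illustrative row-based payload with the two shapes exercised, a string where an int64 primary key is expected and a misspelled boolean (field names and the {"rows": [...]} layout are assumptions for illustration, not taken from the generator helpers):

import json

bad_rows = {
    "rows": [
        # string where an int64 primary key is expected
        {"uid": "not-an-int", "vectors": [0.1, 0.2], "bool_scalar": True},
        # misspelled boolean literal, serialized here as a string
        {"uid": 2, "vectors": [0.3, 0.4], "bool_scalar": "treu"},
    ]
}
print(json.dumps(bad_rows, indent=2))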
verify import failed with errors + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=False, + data_fields=default_multi_fields, + err_type=DataErrorType.typo_on_bool, + scalars=default_multi_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + # TODO: add string pk + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_float_field(name=df.float_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + assert not success + failed_reason1 = "illegal value" + failed_reason2 = "invalid character" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason1 in state.infos.get( + "failed_reason", "" + ) or failed_reason2 in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + # + # assert success + # assert self.collection_wrap.num_entities == entities + # + # self.collection_wrap.load() + # + # # the pk value was automatically convert to int from float + # res, _ = self.collection_wrap.query(expr=f"{float_field} in [1.0]", output_fields=[float_field]) + # assert res[0].get(float_field, 0) == 1.0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [6]) + @pytest.mark.parametrize("entities", [10]) + @pytest.mark.parametrize("file_nums", [2]) + def test_multi_numpy_files_from_diff_folders_in_one_request( + self, auto_id, dim, entities, file_nums + ): + """ + collection schema 1: [pk, float_vector] + data file: .npy files in different folders + Steps: + 1. create collection + 2. import data + 3. 
fail to import data with errors + """ + is_row_based = False # numpy files supports only column based + data_fields = [df.vec_field] + if not auto_id: + data_fields.append(df.pk_field) + files = prepare_bulk_insert_numpy_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + rows=entities, + dim=dim, + data_fields=data_fields, + file_nums=file_nums, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + + t0 = time.time() + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + tt = time.time() - t0 + log.info(f"bulk insert state:{success} in {tt}") + + assert not success + failed_reason = "duplicate file" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("dim", [9]) + @pytest.mark.parametrize("entities", [10]) + def test_data_type_str_on_float_scalar(self, is_row_based, auto_id, dim, entities): + """ + collection schema: [pk, float_vector, + float_scalar, int_scalar, string_scalar, bool_scalar] + data files: json file that entities has string data on float scalars + Steps: + 1. create collection + 2. import data + 3. 
verify import failed with errors + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + data_fields=default_multi_fields, + err_type=DataErrorType.str_on_float_scalar, + scalars=default_multi_fields, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_float_field(name=df.float_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + assert not success + failed_reason = "illegal numeric value" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("is_row_based", [True, False]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("float_vector", [True, False]) + @pytest.mark.parametrize("dim", [8]) + @pytest.mark.parametrize("entities", [500]) + def test_data_type_str_on_vector_fields( + self, is_row_based, auto_id, float_vector, dim, entities + ): + """ + collection schema: [pk, float_vector, + float_scalar, int_scalar, string_scalar, bool_scalar] + data files: json file that entities has string data on vectors + Steps: + 1. create collection + 2. import data + 3. 
verify import failed with errors + """ + files = prepare_bulk_insert_json_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + is_row_based=is_row_based, + rows=entities, + dim=dim, + auto_id=auto_id, + float_vector=float_vector, + data_fields=default_multi_fields, + err_type=DataErrorType.str_on_vector_field, + wrong_position=entities // 2, + scalars=default_multi_fields, + force=True, + ) + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=df.vec_field, dim=dim), + cf.gen_int32_field(name=df.int_field), + cf.gen_float_field(name=df.float_field), + cf.gen_string_field(name=df.string_field), + cf.gen_bool_field(name=df.bool_field), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + # import data + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=is_row_based, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=90 + ) + log.info(f"bulk insert state:{success}") + assert not success + failed_reason = "illegal numeric value" + if not float_vector: + failed_reason = f"doesn't equal to vector dimension {dim} of field vectors" + for state in states.values(): + assert state.state_name in ["Failed", "Failed and cleaned"] + assert failed_reason in state.infos.get("failed_reason", "") + assert self.collection_wrap.num_entities == 0 + + +@pytest.mark.skip() +class TestBulkInsertAdvanced(TestcaseBaseBulkInsert): + + @pytest.mark.tags(CaseLabel.L3) + @pytest.mark.parametrize("auto_id", [True]) + @pytest.mark.parametrize("dim", [128]) # 128 + @pytest.mark.parametrize( + "entities", [50000, 500000, 1000000] + ) # 1m*3; 50k*20; 2m*3, 500k*4 + def test_float_vector_from_multi_numpy_files(self, auto_id, dim, entities): + """ + collection schema 1: [pk, float_vector] + data file: .npy files + Steps: + 1. create collection + 2. import data + 3. 
if column_based: + 4.1 verify the data entities equal the import data + 4.2 verify search and query successfully + """ + # NOTE: 128d_1m --> 977MB + suffix = entity_suffix(entities) + vec_field = f"vectors_{dim}d_{suffix}" + self._connect() + c_name = cf.gen_unique_str("bulk_insert") + fields = [ + cf.gen_int64_field(name=df.pk_field, is_primary=True), + cf.gen_float_vec_field(name=vec_field, dim=dim), + ] + schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) + self.collection_wrap.init_collection(c_name, schema=schema) + data_fields = [df.pk_field, vec_field] + # import data + file_nums = 3 + files = prepare_bulk_insert_numpy_files( + minio_endpoint=self.minio_endpoint, + bucket_name=self.bucket_name, + rows=entities, + dim=dim, + data_fields=data_fields, + file_nums=file_nums, + force=True, + ) + log.info(f"files:{files}") + for i in range(file_nums): + files = [ + f"{dim}d_{suffix}_{i}/{vec_field}.npy" + ] # npy file name shall be the vector field name + if not auto_id: + files.append(f"{dim}d_{suffix}_{i}/{df.pk_field}.npy") + t0 = time.time() + check_flag = True + for file in files: + file_size = Path(f"{base_dir}/{file}").stat().st_size / 1024 / 1024 + if file_size >= 1024: + check_flag = False + break + + task_ids, _ = self.utility_wrap.bulk_insert( + collection_name=c_name, is_row_based=False, files=files + ) + logging.info(f"bulk insert task ids:{task_ids}") + success, states = self.utility_wrap.wait_for_bulk_insert_tasks_completed( + task_ids=task_ids, timeout=180 + ) + tt = time.time() - t0 + log.info( + f"auto_id:{auto_id}, bulk insert{suffix}-{i} state:{success} in {tt}" + ) + assert success is check_flag + + # TODO: assert num entities + if success: + t0 = time.time() + num_entities = self.collection_wrap.num_entities + tt = time.time() - t0 + log.info(f" collection entities: {num_entities} in {tt}") + assert num_entities == entities * file_nums + + # verify imported data is available for search + self.collection_wrap.load() + log.info(f"wait for load finished and be ready for search") + time.sleep(5) + loaded_segs = len(self.utility_wrap.get_query_segment_info(c_name)[0]) + log.info(f"query seg info: {loaded_segs} segs loaded.") + search_data = cf.gen_vectors(1, dim) + search_params = {"metric_type": "L2", "params": {"nprobe": 2}} + res, _ = self.collection_wrap.search( + search_data, + vec_field, + param=search_params, + limit=1, + check_task=CheckTasks.check_search_results, + check_items={"nq": 1, "limit": 1}, + ) diff --git a/tests/python_client/bulk_load/test_bulk_load.py b/tests/python_client/bulk_load/test_bulk_load.py deleted file mode 100644 index 0987d3e404..0000000000 --- a/tests/python_client/bulk_load/test_bulk_load.py +++ /dev/null @@ -1,1905 +0,0 @@ -import logging -import time -import pytest -import random -from base.client_base import TestcaseBase -from common import common_func as cf -from common import common_type as ct -from common.common_type import CaseLabel, CheckTasks, BulkLoadStates -from utils.util_log import test_log as log -from bulk_load_data import prepare_bulk_load_json_files, prepare_bulk_load_numpy_files,\ - DataField as df, DataErrorType - - -default_vec_only_fields = [df.vec_field] -default_multi_fields = [df.vec_field, df.int_field, df.string_field, - df.bool_field, df.float_field] -default_vec_n_int_fields = [df.vec_field, df.int_field] - - -def entity_suffix(entities): - if entities // 1000000 > 0: - suffix = f"{entities // 1000000}m" - elif entities // 1000 > 0: - suffix = f"{entities // 1000}k" - else: - suffix = 
f"{entities}" - return suffix - - -class TestBulkLoad(TestcaseBase): - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) # 8, 128 - @pytest.mark.parametrize("entities", [100]) # 100, 1000 - def test_float_vector_only(self, row_based, auto_id, dim, entities): - """ - collection: auto_id, customized_id - collection schema: [pk, float_vector] - Steps: - 1. create collection - 2. import data - 3. verify the data entities equal the import data - 4. load the collection - 5. verify search successfully - 6. verify query successfully - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_only_fields, force=True) - self._connect() - c_name = cf.gen_unique_str("bulkload") - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, _ = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert success - - num_entities = self.collection_wrap.num_entities - log.info(f" collection entities: {num_entities}") - assert num_entities == entities - - # verify imported data is available for search - self.collection_wrap.load() - log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - nq = 2 - topk = 2 - search_data = cf.gen_vectors(nq, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=topk, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "limit": topk}) - for hits in res: - ids = hits.ids - results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") - assert len(results) == len(ids) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("dim", [8]) # 8 - @pytest.mark.parametrize("entities", [100]) # 100 - def test_str_pk_float_vector_only(self, row_based, dim, entities): - """ - collection schema: [str_pk, float_vector] - Steps: - 1. create collection - 2. import data - 3. verify the data entities equal the import data - 4. load the collection - 5. verify search successfully - 6. 
verify query successfully - """ - auto_id = False # no auto id for string_pk schema - string_pk = True - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, str_pk=string_pk, - data_fields=default_vec_only_fields) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_string_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - completed, _ = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{completed} in {tt}") - assert completed - - num_entities = self.collection_wrap.num_entities - log.info(f" collection entities: {num_entities}") - assert num_entities == entities - - # verify imported data is available for search - self.collection_wrap.load() - log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - nq = 3 - topk = 2 - search_data = cf.gen_vectors(nq, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=topk, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "limit": topk}) - for hits in res: - ids = hits.ids - expr = f"{df.pk_field} in {ids}" - expr = expr.replace("'", "\"") - results, _ = self.collection_wrap.query(expr=expr) - assert len(results) == len(ids) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [3000]) - def test_partition_float_vector_int_scalar(self, row_based, auto_id, dim, entities): - """ - collection: customized partitions - collection schema: [pk, float_vectors, int_scalar] - 1. create collection and a partition - 2. build index and load partition - 3. import data into the partition - 4. verify num entities - 5. verify index status - 6. 
verify search and query - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_n_int_fields, file_nums=1) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # create a partition - p_name = cf.gen_unique_str() - m_partition, _ = self.collection_wrap.create_partition(partition_name=p_name) - # build index before bulk load - index_params = {"index_type": "IVF_SQ8", "params": {"nlist": 128}, "metric_type": "L2"} - self.collection_wrap.create_index(field_name=df.vec_field, index_params=index_params) - # load before bulk load - self.collection_wrap.load(partition_names=[p_name]) - - # import data into the partition - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name=p_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, state = self.utility_wrap.\ - wait_for_bulk_load_tasks_completed(task_ids=task_ids, - target_state=BulkLoadStates.BulkLoadDataQueryable, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert success - - assert m_partition.num_entities == entities - assert self.collection_wrap.num_entities == entities - - res, _ = self.utility_wrap.index_building_progress(c_name) - exp_res = {'total_rows': entities, 'indexed_rows': entities} - assert res == exp_res - - log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - nq = 10 - topk = 5 - search_data = cf.gen_vectors(nq, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 16}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=topk, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "limit": topk}) - for hits in res: - ids = hits.ids - results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") - assert len(results) == len(ids) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [16]) - @pytest.mark.parametrize("entities", [2000]) - @pytest.mark.xfail(reason="issue #16890") - def test_binary_vector_only(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, binary_vector] - Steps: - 1. create collection - 2. build collection - 3. import data - 4. verify build status - 5. verify the data entities - 6. load collection - 7. verify search successfully - 6. 
verify query successfully - """ - float_vec = False - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, float_vector=float_vec, - data_fields=default_vec_only_fields) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_binary_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # build index before bulk load - binary_index_params = {"index_type": "BIN_IVF_FLAT", "metric_type": "JACCARD", "params": {"nlist": 64}} - - self.collection_wrap.create_index(field_name=df.vec_field, index_params=binary_index_params) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - # TODO: Update to BulkLoadDataIndexed when issue #16889 fixed - success, _ = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - target_state=BulkLoadStates.BulkLoadDataIndexed, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert success - - # TODO: verify build index after #16890 fixed - # res, _ = self.utility_wrap.index_building_progress(c_name) - # exp_res = {'total_rows': entities, 'indexed_rows': entities} - # assert res == exp_res - - # verify num entities - assert self.collection_wrap.num_entities == entities - - # load collection - self.collection_wrap.load() - - # verify search and query - search_data = cf.gen_binary_vectors(1, dim)[1] - search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - for hits in res: - ids = hits.ids - results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") - assert len(results) == len(ids) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("fields_num_in_file", ["equal", "more", "less"]) # "equal", "more", "less" - @pytest.mark.parametrize("dim", [16]) - @pytest.mark.parametrize("entities", [500]) - def test_float_vector_multi_scalars(self, row_based, auto_id, fields_num_in_file, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - Steps: - 1. create collection - 2. load collection - 3. import data - 4. verify the data entities - 5. verify index status - 6. verify search and query - 6. build index - 7. release collection and reload - 7. verify search successfully - 6. 
verify query successfully - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_multi_fields, force=True) - additional_field = "int_scalar_add" - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field)] - if fields_num_in_file == "more": - fields.pop() - elif fields_num_in_file == "less": - fields.append(cf.gen_int32_field(name=additional_field)) - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # load collection - self.collection_wrap.load() - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - target_state=BulkLoadStates.BulkLoadPersisted, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - if fields_num_in_file == "less": - assert not success - if row_based: - failed_reason = f"JSON row validator: field {additional_field} missed at the row 0" - else: - failed_reason = "is not equal to other fields" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - else: - assert success - - num_entities = self.collection_wrap.num_entities - log.info(f" collection entities: {num_entities}") - assert num_entities == entities - - # verify no index - res, _ = self.collection_wrap.has_index() - assert res is False - # verify search and query - nq = 3 - topk = 10 - search_data = cf.gen_vectors(nq, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=topk, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "limit": topk}) - for hits in res: - ids = hits.ids - results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}", - output_fields=[df.pk_field, df.int_field]) - assert len(results) == len(ids) - if not auto_id: - for i in range(len(results)): - assert results[i].get(df.int_field, 0) == results[i].get(df.pk_field, 1) - - # build index - index_params = {"index_type": "HNSW", "params": {"M": 8, "efConstruction": 100}, "metric_type": "IP"} - self.collection_wrap.create_index(field_name=df.vec_field, index_params=index_params) - - # release collection and reload - self.collection_wrap.release() - self.collection_wrap.load() - - # verify index built - res, _ = self.collection_wrap.has_index() - assert res is True - - # search and query - search_params = {"params": {"ef": 64}, "metric_type": "IP"} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=topk, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "limit": topk}) - for hits in res: - ids = hits.ids - results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}", - output_fields=[df.pk_field, df.int_field]) - assert len(results) == len(ids) - if not auto_id: - for i in range(len(results)): - assert results[i].get(df.int_field, 0) == results[i].get(df.pk_field, 1) - - @pytest.mark.tags(CaseLabel.L3) 
- @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("fields_num_in_file", ["equal", "more", "less"]) # "equal", "more", "less" - @pytest.mark.parametrize("dim", [16]) # 1024 - @pytest.mark.parametrize("entities", [500]) # 5000 - def test_string_pk_float_vector_multi_scalars(self, row_based, fields_num_in_file, dim, entities): - """ - collection schema: [str_pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - Steps: - 1. create collection with string primary key - 2. load collection - 3. import data - 4. verify the data entities - 5. verify index status - 6. verify search and query - 6. build index - 7. release collection and reload - 7. verify search successfully - 6. verify query successfully - """ - string_pk = True - auto_id = False - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, str_pk=string_pk, - data_fields=default_multi_fields) - additional_field = "int_scalar_add" - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_string_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field)] - if fields_num_in_file == "more": - fields.pop() - elif fields_num_in_file == "less": - fields.append(cf.gen_int32_field(name=additional_field)) - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # load collection - self.collection_wrap.load() - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - target_state=BulkLoadStates.BulkLoadPersisted, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - if fields_num_in_file == "less": - assert not success # TODO: check error msg - if row_based: - failed_reason = f"JSON row validator: field {additional_field} missed at the row 0" - else: - failed_reason = "is not equal to other fields" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - else: - assert success - - # TODO: assert num entities - log.info(f" collection entities: {self.collection_wrap.num_entities}") - assert self.collection_wrap.num_entities == entities - - # verify no index - res, _ = self.collection_wrap.has_index() - assert res is False - # verify search and query - search_data = cf.gen_vectors(1, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - for hits in res: - ids = hits.ids - expr = f"{df.pk_field} in {ids}" - expr = expr.replace("'", "\"") - results, _ = self.collection_wrap.query(expr=expr) - assert len(results) == len(ids) - - # build index - index_params = {"index_type": "HNSW", "params": {"M": 8, "efConstruction": 100}, "metric_type": "IP"} - self.collection_wrap.create_index(field_name=df.vec_field, index_params=index_params) - - # release collection and reload - self.collection_wrap.release() - self.collection_wrap.load() - - # verify index built - res, _ = 
self.collection_wrap.has_index() - assert res is True - - # search and query - search_params = {"params": {"ef": 64}, "metric_type": "IP"} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - for hits in res: - ids = hits.ids - expr = f"{df.pk_field} in {ids}" - expr = expr.replace("'", "\"") - results, _ = self.collection_wrap.query(expr=expr) - assert len(results) == len(ids) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) # True, False - @pytest.mark.parametrize("auto_id", [True, False]) # True, False - @pytest.mark.parametrize("dim", [16]) # 16 - @pytest.mark.parametrize("entities", [100]) # 3000 - @pytest.mark.parametrize("file_nums", [32]) # 10 - @pytest.mark.parametrize("multi_folder", [True, False]) # True, False - @pytest.mark.xfail(reason="issue #17600") - # TODO: reason="BulkloadIndexed cannot be reached for issue #16889") - def test_float_vector_from_multi_files(self, row_based, auto_id, dim, entities, file_nums, multi_folder): - """ - collection: auto_id - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - Steps: - 1. create collection - 2. build index and load collection - 3. import data from multiple files - 4. verify the data entities - 5. verify index status - 6. verify search successfully - 7. verify query successfully - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_multi_fields, - file_nums=file_nums, multi_folder=multi_folder, force=True) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # build index - index_params = ct.default_index - self.collection_wrap.create_index(field_name=df.vec_field, index_params=index_params) - # load collection - self.collection_wrap.load() - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - # TODO: update to BulkLoadDataIndexed after issue #16889 fixed - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - target_state=BulkLoadStates.BulkLoadPersisted, - timeout=300) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - if not row_based: - assert not success - failed_reason = "is duplicated" # "the field xxx is duplicated" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - else: - assert success - num_entities = self.collection_wrap.num_entities - log.info(f" collection entities: {num_entities}") - assert num_entities == entities * file_nums - - # verify index built - # res, _ = self.utility_wrap.index_building_progress(c_name) - # exp_res = {'total_rows': entities * file_nums, 'indexed_rows': entities * file_nums} - # assert res == exp_res - - # verify search and query - nq = 5 - topk = 1 - search_data = cf.gen_vectors(nq, dim) - search_params = ct.default_search_params - res, 
_ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=topk, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "limit": topk}) - for hits in res: - ids = hits.ids - results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") - assert len(results) == len(ids) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("multi_fields", [True, False]) - @pytest.mark.parametrize("dim", [15]) - @pytest.mark.parametrize("entities", [200]) - # TODO: string data shall be re-generated - def test_float_vector_from_numpy_file(self, row_based, auto_id, multi_fields, dim, entities): - """ - collection schema 1: [pk, float_vector] - schema 2: [pk, float_vector, int_scalar, string_scalar, float_scalar, bool_scalar] - data file: .npy files - Steps: - 1. create collection - 2. import data - 3. if row_based: verify import failed - 4. if column_based: - 4.1 verify the data entities equal the import data - 4.2 verify search and query successfully - """ - data_fields = [df.vec_field] - np_files = prepare_bulk_load_numpy_files(rows=entities, dim=dim, data_fields=data_fields, - force=True) - if not multi_fields: - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - if not auto_id: - scalar_fields = [df.pk_field] - else: - scalar_fields = None - else: - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - if not auto_id: - scalar_fields = [df.pk_field, df.float_field, df.int_field, df.string_field, df.bool_field] - else: - scalar_fields = [df.int_field, df.string_field, df.bool_field, df.float_field] - - files = np_files - if scalar_fields is not None: - json_files = prepare_bulk_load_json_files(row_based=row_based, dim=dim, - auto_id=auto_id, rows=entities, - data_fields=scalar_fields, force=True) - files = np_files + json_files - - self._connect() - c_name = cf.gen_unique_str() - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - - if row_based: - assert not success - failed_reason1 = "unsupported file type for row-based mode" - failed_reason2 = f"JSON row validator: field {df.vec_field} missed at the row 0" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason1 in state.infos.get("failed_reason", "") or \ - failed_reason2 in state.infos.get("failed_reason", "") - else: - assert success - log.info(f" collection entities: {self.collection_wrap.num_entities}") - assert self.collection_wrap.num_entities == entities - - # verify imported data is available for search - self.collection_wrap.load() - log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - nq = 2 - topk = 5 - search_data = cf.gen_vectors(nq, dim) - search_params = {"metric_type": "L2", "params": 
{"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=topk, - check_task=CheckTasks.check_search_results, - check_items={"nq": nq, - "limit": topk}) - for hits in res: - ids = hits.ids - results, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {ids}") - assert len(results) == len(ids) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [10]) - def test_data_type_float_on_int_pk(self, row_based, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - data files: json file that one of entities has float on int pk - Steps: - 1. create collection - 2. import data - 3. verify the data entities - 4. verify query successfully - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=False, - data_fields=default_multi_fields, - err_type=DataErrorType.float_on_int_pk, force=True) - self._connect() - c_name = cf.gen_unique_str() - # TODO: add string pk - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=False) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert success - assert self.collection_wrap.num_entities == entities - - self.collection_wrap.load() - - # the pk value was automatically convert to int from float - res, _ = self.collection_wrap.query(expr=f"{df.pk_field} in [3]", output_fields=[df.pk_field]) - assert [{df.pk_field: 3}] == res - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [10]) - def test_data_type_int_on_float_scalar(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - data files: json file that one of entities has int on float scalar - Steps: - 1. create collection - 2. import data - 3. verify the data entities - 4. 
verify query successfully - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_multi_fields, - err_type=DataErrorType.int_on_float_scalar, force=True) - - self._connect() - c_name = cf.gen_unique_str() - # TODO: add string pk - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_float_field(name=df.float_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert success - assert self.collection_wrap.num_entities == entities - - self.collection_wrap.load() - - # it was automatically converted from int to float - search_data = cf.gen_vectors(1, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - uids = res[0].ids - res, _ = self.collection_wrap.query(expr=f"{df.pk_field} in {uids}", output_fields=[df.float_field]) - assert isinstance(res[0].get(df.float_field, 1), float) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [128]) # 128 - @pytest.mark.parametrize("entities", [1000]) # 1000 - @pytest.mark.parametrize("with_int_field", [True, False]) - @pytest.mark.xfail(reason="issue #17600") - def test_with_uid_n_int_numpy(self, auto_id, dim, entities, with_int_field): - """ - collection schema 1: [pk, float_vector] - data file: vectors.npy and uid.npy - Steps: - 1. create collection - 2. import data - 3. 
verify failed with errors - """ - data_fields = [df.vec_field] - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - if not auto_id: - data_fields.append(df.pk_field) - if with_int_field: - data_fields.append(df.int_field) - fields.append(cf.gen_int64_field(name=df.int_field)) - files = prepare_bulk_load_numpy_files(rows=entities, dim=dim, - data_fields=data_fields, - force=True) - self._connect() - c_name = cf.gen_unique_str() - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=False, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert success - num_entities = self.collection_wrap.num_entities - log.info(f" collection entities: {num_entities}") - assert num_entities == entities - - # verify imported data is available for search - self.collection_wrap.load() - # log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - search_data = cf.gen_vectors(1, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [6]) - @pytest.mark.parametrize("entities", [2000]) - @pytest.mark.parametrize("file_nums", [10]) - @pytest.mark.xfail(reason="issue #17597") - def test_multi_numpy_files_from_diff_folders(self, auto_id, dim, entities, file_nums): - """ - collection schema 1: [pk, float_vector] - data file: .npy files in different folders - Steps: - 1. create collection - 2. import data - 3. 
verify that import numpy files in a loop - """ - row_based = False # numpy files supports only column based - - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # build index - index_params = ct.default_index - self.collection_wrap.create_index(field_name=df.vec_field, index_params=index_params) - # load collection - self.collection_wrap.load() - - data_fields = [df.vec_field] - if not auto_id: - data_fields.append(df.pk_field) - task_ids = [] - for i in range(file_nums): - files = prepare_bulk_load_numpy_files(rows=entities, dim=dim, - data_fields=data_fields, - file_nums=1, force=True) - task_id, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - task_ids.append(task_id[0]) - success, states = self.utility_wrap.\ - wait_for_bulk_load_tasks_completed(task_ids=task_ids, - target_state=BulkLoadStates.BulkLoadDataQueryable, - timeout=30) - log.info(f"bulk load state:{success}") - - assert success - log.info(f" collection entities: {self.collection_wrap.num_entities}") - assert self.collection_wrap.num_entities == entities * file_nums - - # verify search and query - search_data = cf.gen_vectors(1, dim) - search_params = ct.default_search_params - res, _ = self.collection_wrap.search(search_data, df.vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - - # TODO: not supported yet - def test_from_customize_bucket(self): - pass - - -class TestBulkLoadInvalidParams(TestcaseBase): - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - def test_non_existing_file(self, row_based): - """ - collection: either auto_id or not - collection schema: not existing file(s) - Steps: - 1. create collection - 3. import data, but the data file(s) not exists - 4. verify import failed with errors - """ - files = ["not_existing.json"] - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=ct.default_dim)] - schema = cf.gen_collection_schema(fields=fields) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - assert not success - failed_reason = f"the file {files[0]} is empty" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - def test_empty_json_file(self, row_based, auto_id): - """ - collection schema: [pk, float_vector] - data file: empty file - Steps: - 1. create collection - 2. import data, but the data file(s) is empty - 3. verify import fail if column based - 4. 
verify import successfully if row based - """ - # set 0 entities - entities = 0 - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=ct.default_dim, auto_id=auto_id, - data_fields=default_vec_only_fields) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=ct.default_dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - assert not success - failed_reason = "JSON parse: row count is 0" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) # 8 - @pytest.mark.parametrize("entities", [100]) # 100 - def test_wrong_file_type(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector] - data files: wrong data type - Steps: - 1. create collection - 2. import data - 3. verify import failed with errors - """ - if row_based: - if auto_id: - file_type = ".csv" - else: - file_type = "" - else: - if auto_id: - file_type = ".npy" - else: - file_type = ".txt" - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_only_fields, - file_type=file_type) - - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert not success - failed_reason = "unsupported file type" - if not row_based and auto_id: - failed_reason = "Numpy parse: npy: not a valid NumPy file format" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [100]) - def test_wrong_row_based_values(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector] - data files: wrong row based values - Steps: - 1. create collection - 3. import data with wrong row based value - 4. 
verify import failed with errors - """ - # set the wrong row based params - wrong_row_based = not row_based - files = prepare_bulk_load_json_files(row_based=wrong_row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_only_fields) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - assert not success - if row_based: - value = df.vec_field # if auto_id else df.pk_field - failed_reason = f"JSON parse: invalid row-based JSON format, the key {value} is not found" - else: - failed_reason = "JSON parse: row count is 0" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) # 8 - @pytest.mark.parametrize("entities", [100]) # 100 - def test_wrong_pk_field_name(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector] - data files: wrong primary key field name - Steps: - 1. create collection with a dismatch_uid as pk - 2. import data - 3. verify import data successfully if collection with auto_id - 4. verify import error if collection with auto_id=False - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_only_fields) - dismatch_pk_field = "dismatch_pk" - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=dismatch_pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - if auto_id: - assert success - else: - assert not success - if row_based: - failed_reason = f"field {dismatch_pk_field} missed at the row 0" - else: - failed_reason = f"import error: field {dismatch_pk_field} row count 0 is not equal to other fields" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) # 8 - @pytest.mark.parametrize("entities", [100]) # 100 - def test_wrong_vector_field_name(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector] - Steps: - 1. create collection with a dismatch_uid as pk - 2. import data - 3. 
verify import data successfully if collection with auto_id - 4. verify import error if collection with auto_id=False - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_only_fields) - dismatch_vec_field = "dismatched_vectors" - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=dismatch_vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - - assert not success - if row_based: - failed_reason = f"field {dismatch_vec_field} missed at the row 0" - else: - if auto_id: - failed_reason = f"JSON column consumer: row count is 0" - else: - failed_reason = f"import error: field {dismatch_vec_field} row count 0 is not equal to other fields" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [200]) - def test_wrong_scalar_field_name(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vectors, int_scalar] - data file: with dismatched int scalar - 1. create collection - 2. import data that one scalar field name is dismatched - 3. 
verify that import fails with errors - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_n_int_fields) - dismatch_scalar_field = "dismatched_scalar" - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=dismatch_scalar_field)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name="", - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert not success - if row_based: - failed_reason = f"field {dismatch_scalar_field} missed at the row 0" - else: - failed_reason = f"import error: field {dismatch_scalar_field} row count 0 is not equal to other fields" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [200]) - def test_wrong_dim_in_schema(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vectors, int_scalar] - data file: with wrong dim of vectors - 1. import data the collection - 2. verify that import fails with errors - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_n_int_fields) - self._connect() - c_name = cf.gen_unique_str() - wrong_dim = dim + 1 - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=wrong_dim), - cf.gen_int32_field(name=df.int_field)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert not success - failed_reason = f"array size {dim} doesn't equal to vector dimension {wrong_dim} of field vectors at the row " - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [200]) - def test_non_existing_collection(self, row_based, dim, entities): - """ - collection: not create collection - collection schema: [pk, float_vectors, int_scalar] - 1. import data into a non existing collection - 2. 
verify that import fails with errors - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, data_fields=default_vec_n_int_fields) - self._connect() - c_name = cf.gen_unique_str() - # import data into a non existing collection - err_msg = f"can't find collection: {c_name}" - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files, - check_task=CheckTasks.err_res, - check_items={"err_code": 1, - "err_msg": err_msg} - ) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [200]) - def test_non_existing_partition(self, row_based, dim, entities): - """ - collection: create a collection - collection schema: [pk, float_vectors, int_scalar] - 1. import data into a non existing partition - 2. verify that import fails with errors - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, data_fields=default_vec_n_int_fields) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field)] - schema = cf.gen_collection_schema(fields=fields) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data into a non existing partition - p_name = "non_existing" - err_msg = f" partition {p_name} does not exist" - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name=p_name, - row_based=row_based, - files=files, - check_task=CheckTasks.err_res, - check_items={"err_code": 11, - "err_msg": err_msg} - ) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [1000]) - @pytest.mark.parametrize("position", [0, 500, 999]) # the index of wrong dim entity - def test_wrong_dim_in_one_entities_of_file(self, row_based, auto_id, dim, entities, position): - """ - collection schema: [pk, float_vectors, int_scalar] - data file: one of entities has wrong dim data - 1. import data the collection - 2. 
verify that import fails with errors - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_vec_n_int_fields, - err_type=DataErrorType.one_entity_wrong_dim, - wrong_position=position, force=True) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert not success - failed_reason = f"doesn't equal to vector dimension {dim} of field vectors at the row" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [16]) - @pytest.mark.parametrize("entities", [300]) - @pytest.mark.parametrize("file_nums", [10]) # max task nums 32? need improve - @pytest.mark.xfail(reason="not all correct data file imported successfully, issue #16923") - def test_float_vector_one_of_files_fail(self, row_based, auto_id, dim, entities, file_nums): - """ - collection schema: [pk, float_vectors, int_scalar], one of entities has wrong dim data - data files: multi files, and there are errors in one of files - 1. import data 11 files(10 correct and 1 with errors) into the collection - 2. 
verify that import fails with errors and no data imported - """ - correct_files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_multi_fields, - file_nums=file_nums, force=True) - - # append a file that has errors - dismatch_dim = dim + 1 - err_files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dismatch_dim, auto_id=auto_id, - data_fields=default_multi_fields, file_nums=1) - files = correct_files + err_files - random.shuffle(files) # mix up the file order - - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=300) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert not success - if row_based: - # all correct files shall be imported successfully - assert self.collection_wrap.num_entities == entities * file_nums - else: - assert self.collection_wrap.num_entities == 0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [128]) # 128 - @pytest.mark.parametrize("entities", [1000]) # 1000 - def test_wrong_dim_in_numpy(self, auto_id, dim, entities): - """ - collection schema 1: [pk, float_vector] - data file: .npy file with wrong dim - Steps: - 1. create collection - 2. import data - 3. verify failed with errors - """ - data_fields = [df.vec_field] - if not auto_id: - data_fields.append(df.pk_field) - files = prepare_bulk_load_numpy_files(rows=entities, dim=dim, - data_fields=data_fields, - force=True) - self._connect() - c_name = cf.gen_unique_str() - wrong_dim = dim + 1 - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=wrong_dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=False, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - - assert not success - failed_reason = f"Numpy parse: illegal row width {dim} for field {df.vec_field} dimension {wrong_dim}" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [15]) - @pytest.mark.parametrize("entities", [100]) - def test_wrong_field_name_in_numpy(self, auto_id, dim, entities): - """ - collection schema 1: [pk, float_vector] - data file: .npy file - Steps: - 1. create collection - 2. 
import data - 3. if row_based: verify import failed - 4. if column_based: - 4.1 verify the data entities equal the import data - 4.2 verify search and query successfully - """ - data_fields = [df.vec_field] - if not auto_id: - data_fields.append(df.pk_field) - files = prepare_bulk_load_numpy_files(rows=entities, dim=dim, - data_fields=data_fields, - force=True) - self._connect() - c_name = cf.gen_unique_str() - wrong_vec_field = f"wrong_{df.vec_field}" - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=wrong_vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=False, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - - assert not success - failed_reason = f"Numpy parse: the field {df.vec_field} doesn't exist" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [16]) # 128 - @pytest.mark.parametrize("entities", [100]) # 1000 - def test_duplicate_numpy_files(self, auto_id, dim, entities): - """ - collection schema 1: [pk, float_vector] - data file: .npy files - Steps: - 1. create collection - 2. import data with duplicate npy files - 3. verify fail to import with errors - """ - data_fields = [df.vec_field] - if not auto_id: - data_fields.append(df.pk_field) - files = prepare_bulk_load_numpy_files(rows=entities, dim=dim, - data_fields=data_fields) - files += files - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=False, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert not success - failed_reason = "duplicate file" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [10]) - def test_data_type_string_on_int_pk(self, row_based, dim, entities): - """ - collection schema: default multi scalars - data file: json file with one of entities has string on int pk - Steps: - 1. create collection - 2. import data with row_based=False - 3. 
verify import failed - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=False, - data_fields=default_multi_fields, - err_type=DataErrorType.str_on_int_pk, force=True) - - self._connect() - c_name = cf.gen_unique_str() - # TODO: add string pk - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=False) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert not success - failed_reason = f"illegal numeric value" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [10]) - def test_data_type_typo_on_bool(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - data files: json file that one of entities has typo on boolean field - Steps: - 1. create collection - 2. import data - 3. verify import failed with errors - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=False, - data_fields=default_multi_fields, - err_type=DataErrorType.typo_on_bool, - scalars=default_multi_fields) - self._connect() - c_name = cf.gen_unique_str() - # TODO: add string pk - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_float_field(name=df.float_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert not success - failed_reason1 = "illegal value" - failed_reason2 = "invalid character" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason1 in state.infos.get("failed_reason", "") or \ - failed_reason2 in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - # - # assert success - # assert self.collection_wrap.num_entities == entities - # - # self.collection_wrap.load() - # - # # the pk value was automatically convert to int from float - # res, _ = self.collection_wrap.query(expr=f"{float_field} in [1.0]", output_fields=[float_field]) - # assert res[0].get(float_field, 0) == 1.0 - - @pytest.mark.tags(CaseLabel.L3) 
- @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [6]) - @pytest.mark.parametrize("entities", [10]) - @pytest.mark.parametrize("file_nums", [2]) - def test_multi_numpy_files_from_diff_folders_in_one_request(self, auto_id, dim, entities, file_nums): - """ - collection schema 1: [pk, float_vector] - data file: .npy files in different folders - Steps: - 1. create collection - 2. import data - 3. fail to import data with errors - """ - row_based = False # numpy files supports only column based - data_fields = [df.vec_field] - if not auto_id: - data_fields.append(df.pk_field) - files = prepare_bulk_load_numpy_files(rows=entities, dim=dim, - data_fields=data_fields, - file_nums=file_nums, force=True) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - success, states = self.utility_wrap. \ - wait_for_bulk_load_tasks_completed(task_ids=task_ids, - target_state=BulkLoadStates.BulkLoadPersisted, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - - assert not success - failed_reason = "duplicate file" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [9]) - @pytest.mark.parametrize("entities", [10]) - def test_data_type_str_on_float_scalar(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - data files: json file that entities has string data on float scalars - Steps: - 1. create collection - 2. import data - 3. 
verify import failed with errors - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, - data_fields=default_multi_fields, - err_type=DataErrorType.str_on_float_scalar, - scalars=default_multi_fields) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_float_field(name=df.float_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert not success - failed_reason = "illegal numeric value" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("float_vector", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [500]) - def test_data_type_str_on_vector_fields(self, row_based, auto_id, float_vector, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - data files: json file that entities has string data on vectors - Steps: - 1. create collection - 2. import data - 3. 
verify import failed with errors - """ - files = prepare_bulk_load_json_files(row_based=row_based, rows=entities, - dim=dim, auto_id=auto_id, float_vector=float_vector, - data_fields=default_multi_fields, - err_type=DataErrorType.str_on_vector_field, - wrong_position=entities // 2, - scalars=default_multi_fields, force=True) - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True), - cf.gen_float_vec_field(name=df.vec_field, dim=dim), - cf.gen_int32_field(name=df.int_field), - cf.gen_float_field(name=df.float_field), - cf.gen_string_field(name=df.string_field), - cf.gen_bool_field(name=df.bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert not success - failed_reason = "illegal numeric value" - if not float_vector: - failed_reason = f"doesn't equal to vector dimension {dim} of field vectors" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - assert self.collection_wrap.num_entities == 0 - - -@pytest.mark.skip() -class TestBulkLoadAdvanced(TestcaseBase): - - def setup_class(self): - log.info("[setup_import] Start setup class...") - log.info("copy data files to minio") - - def teardown_class(self): - log.info("[teardown_import] Start teardown class...") - log.info("clean up data files in minio") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("auto_id", [True]) - @pytest.mark.parametrize("dim", [128]) # 128 - @pytest.mark.parametrize("entities", [50000, 500000, 1000000]) # 1m*3; 50k*20; 2m*3, 500k*4 - def test_float_vector_from_multi_numpy_files(self, auto_id, dim, entities): - """ - collection schema 1: [pk, float_vector] - data file: .npy files - Steps: - 1. create collection - 2. import data - 3. 
if column_based:
-          4.1 verify the data entities equal the import data
-          4.2 verify search and query successfully
-        """
-        suffix = entity_suffix(entities)
-        vec_field = f"vectors_{dim}d_{suffix}"
-        self._connect()
-        c_name = cf.gen_unique_str()
-        fields = [cf.gen_int64_field(name=df.pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=vec_field, dim=dim)]
-        schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-
-        # import data
-        file_nums = 3
-        for i in range(file_nums):
-            files = [f"{dim}d_{suffix}_{i}/{vec_field}.npy"]  # npy file name shall be the vector field name
-            if not auto_id:
-                files.append(f"{dim}d_{suffix}_{i}/{df.pk_field}.npy")
-            t0 = time.time()
-            task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                      row_based=False,
-                                                      files=files)
-            logging.info(f"bulk load task ids:{task_ids}")
-            success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids,
-                                                                                   timeout=180)
-            tt = time.time() - t0
-            log.info(f"auto_id:{auto_id}, bulk load{suffix}-{i} state:{success} in {tt}")
-            assert success
-
-        # TODO: assert num entities
-        t0 = time.time()
-        num_entities = self.collection_wrap.num_entities
-        tt = time.time() - t0
-        log.info(f" collection entities: {num_entities} in {tt}")
-        assert num_entities == entities * file_nums
-
-        # verify imported data is available for search
-        self.collection_wrap.load()
-        loaded_segs = len(self.utility_wrap.get_query_segment_info(c_name)[0])
-        log.info(f"query seg info: {loaded_segs} segs loaded.")
-        search_data = cf.gen_vectors(1, dim)
-        search_params = {"metric_type": "L2", "params": {"nprobe": 2}}
-        res, _ = self.collection_wrap.search(search_data, vec_field,
-                                             param=search_params, limit=1,
-                                             check_task=CheckTasks.check_search_results,
-                                             check_items={"nq": 1,
-                                                          "limit": 1})
-        # self.collection_wrap.query(expr=f"id in {ids}")
-
-    """Validate data consistency and availability during import"""
diff --git a/tests/python_client/chaos/chaos_commons.py b/tests/python_client/chaos/chaos_commons.py
index 4d99b0cf2c..4d38be28d5 100644
--- a/tests/python_client/chaos/chaos_commons.py
+++ b/tests/python_client/chaos/chaos_commons.py
@@ -1,5 +1,6 @@
 import os
 import threading
+import time
 import glob
 from chaos import constants
 from yaml import full_load
@@ -68,8 +69,21 @@ def get_chaos_yamls():
     return glob.glob(constants.TESTS_CONFIG_LOCATION + constants.ALL_CHAOS_YAMLS)
 
 
-def reconnect(connections, alias='default'):
+def reconnect(connections, alias='default', timeout=360):
     """trying to connect by connection alias"""
+    is_connected = False
+    start = time.time()
+    end = time.time()
+    while not is_connected and end - start < timeout:
+        try:
+            connections.connect(alias)
+            is_connected = True
+        except Exception as e:
+            log.debug(f"fail to connect, error: {str(e)}")
+            time.sleep(10)
+            end = time.time()
+    if not is_connected:
+        log.info(f"failed to reconnect after {timeout} seconds")
     return connections.connect(alias)
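A minimal usage sketch of the timeout-bounded reconnect helper above; the import alias follows the `from chaos import chaos_commons as cc` style used in the chaos tests, and the 360 second timeout mirrors the default added here. This is illustrative only and not part of the diff.

from pymilvus import connections
from chaos import chaos_commons as cc  # import alias as used by the chaos tests

# after a chaos experiment disrupts the connection, retry until the client is usable again
cc.reconnect(connections, alias="default", timeout=360)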
diff --git a/tests/python_client/chaos/checker.py b/tests/python_client/chaos/checker.py
index 340e52ca1e..2e8e7e729d 100644
--- a/tests/python_client/chaos/checker.py
+++ b/tests/python_client/chaos/checker.py
@@ -26,7 +26,7 @@ class Op(Enum):
     compact = 'compact'
     drop = 'drop'
     load_balance = 'load_balance'
-    bulk_load = 'bulk_load'
+    bulk_insert = 'bulk_insert'
     unknown = 'unknown'


@@ -540,7 +540,7 @@ class LoadBalanceChecker(Checker):
         sleep(constants.WAIT_PER_OP / 10)


-class BulkLoadChecker(Checker):
+class BulkInsertChecker(Checker):
     """check bulk load operations in a dependent thread"""

     def __init__(self, collection_name=None, files=[]):
@@ -550,25 +550,25 @@ class BulkLoadChecker(Checker):
         self.utility_wrap = ApiUtilityWrapper()
         self.schema = cf.gen_default_collection_schema()
         self.files = files
-        self.row_based = True
+        self.is_row_based = True
         self.recheck_failed_task = False
         self.failed_tasks = []
         self.c_name = None

-    def update(self, files=None, schema=None, row_based=None):
+    def update(self, files=None, schema=None, is_row_based=None):
         if files is not None:
             self.files = files
         if schema is not None:
             self.schema = schema
-        if row_based is not None:
-            self.row_based = row_based
+        if is_row_based is not None:
+            self.is_row_based = is_row_based

     @trace()
-    def bulk_load(self):
-        task_ids, result = self.utility_wrap.bulk_load(collection_name=self.c_name,
-                                                       row_based=self.row_based,
+    def bulk_insert(self):
+        task_ids, result = self.utility_wrap.bulk_insert(collection_name=self.c_name,
+                                                         is_row_based=self.is_row_based,
                                                        files=self.files)
-        completed, result = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, timeout=30)
+        completed, result = self.utility_wrap.wait_for_bulk_insert_tasks_completed(task_ids=task_ids, timeout=60)
         return task_ids, completed

     @exception_handler()
@@ -580,7 +580,7 @@ class BulkLoadChecker(Checker):
         self.c_name = cf.gen_unique_str("BulkLoadChecker_")
         self.c_wrap.init_collection(name=self.c_name, schema=self.schema)
         # import data
-        task_ids, completed = self.bulk_load()
+        task_ids, completed = self.bulk_insert()
         if not completed:
             self.failed_tasks.append(self.c_name)
         return task_ids, completed
diff --git a/tests/python_client/chaos/test_chaos_bulk_load.py b/tests/python_client/chaos/test_chaos_bulk_insert.py
similarity index 81%
rename from tests/python_client/chaos/test_chaos_bulk_load.py
rename to tests/python_client/chaos/test_chaos_bulk_insert.py
index 40d1781c0e..3456644698 100644
--- a/tests/python_client/chaos/test_chaos_bulk_load.py
+++ b/tests/python_client/chaos/test_chaos_bulk_insert.py
@@ -9,7 +9,7 @@ from time import sleep
 from pathlib import Path
 from minio import Minio
 from pymilvus import connections
-from chaos.checker import (InsertFlushChecker, SearchChecker, QueryChecker, BulkLoadChecker, Op)
+from chaos.checker import (InsertFlushChecker, SearchChecker, QueryChecker, BulkInsertChecker, Op)
 from common.cus_resource_opts import CustomResourceOperations as CusResource
 from common.milvus_sys import MilvusSys
 from utils.util_log import test_log as log
@@ -19,8 +19,8 @@ from chaos import chaos_commons as cc
 from common.common_type import CaseLabel
 from common import common_func as cf
 from chaos import constants
-# from bulk_load.bulk_load_data import gen_file_name
-from bulk_load.minio_comm import copy_files_to_minio
+# from bulk_insert.bulk_insert_data import gen_file_name
+from bulk_insert.minio_comm import copy_files_to_minio
 from delayed_assert import expect, assert_expectations


@@ -86,17 +86,17 @@ class TestChaos(TestChaosBase):
         checkers = {
             # Op.insert: InsertFlushChecker(collection_name=c_name),
             # Op.search: SearchChecker(collection_name=c_name, replica_number=2),
-            Op.bulk_load: BulkLoadChecker()
+            Op.bulk_insert: BulkInsertChecker()
             # Op.query: QueryChecker(collection_name=c_name, replica_number=2)
         }
         self.health_checkers = checkers

     @pytest.fixture(scope="function", autouse=True)
-    def prepare_bulk_load(self, nb=1000, row_based=True):
-        if Op.bulk_load not in self.health_checkers:
-            log.info("bulk_load checker is not in health checkers, skip prepare bulk load")
+    def 
prepare_bulk_insert(self, nb=1000, is_row_based=True): + if Op.bulk_insert not in self.health_checkers: + log.info("bulk_insert checker is not in health checkers, skip prepare bulk load") return - log.info("bulk_load checker is in health checkers, prepare data firstly") + log.info("bulk_insert checker is in health checkers, prepare data firstly") release_name = self.instance_name minio_ip_pod_pair = get_pod_ip_name_pairs("chaos-testing", f"release={release_name}, app=minio") ms = MilvusSys() @@ -105,27 +105,27 @@ class TestChaos(TestChaosBase): minio_endpoint = f"{minio_ip}:{minio_port}" bucket_name = ms.index_nodes[0]["infos"]["system_configurations"]["minio_bucket_name"] schema = cf.gen_default_collection_schema() - data = cf.gen_default_list_data_for_bulk_load(nb=nb) + data = cf.gen_default_list_data_for_bulk_insert(nb=nb) fields_name = [field.name for field in schema.fields] - if not row_based: + if not is_row_based: data_dict = dict(zip(fields_name, data)) - if row_based: + if is_row_based: entities = [] for i in range(nb): entity_value = [field_values[i] for field_values in data] entity = dict(zip(fields_name, entity_value)) entities.append(entity) data_dict = {"rows": entities} - file_name = "bulk_load_data_source.json" + file_name = "/tmp/ci_logs/bulk_insert_data_source.json" files = [file_name] #TODO: npy file type is not supported so far log.info("generate bulk load file") with open(file_name, "w") as f: - f.write(json.dumps(data_dict)) + f.write(json.dumps(data_dict, indent=4)) log.info("upload file to minio") client = Minio(minio_endpoint, access_key="minioadmin", secret_key="minioadmin", secure=False) client.fput_object(bucket_name, file_name, file_name) - self.health_checkers[Op.bulk_load].update(schema=schema, files=files, row_based=row_based) + self.health_checkers[Op.bulk_insert].update(schema=schema, files=files, is_row_based=is_row_based) log.info("prepare data for bulk load done") def teardown(self): @@ -139,24 +139,23 @@ class TestChaos(TestChaosBase): log.info(f'Alive threads: {threading.enumerate()}') @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("target_component", ["minio"]) # "minio", "proxy", "rootcoord", "datacoord", "datanode", "etcd" - @pytest.mark.parametrize("chaos_type", ["pod_kill"]) # "pod_kill", "pod_failure" - def test_bulk_load(self, chaos_type, target_component): + def test_bulk_insert(self, chaos_type, target_component): # start the monitor threads to check the milvus ops log.info("*********************Chaos Test Start**********************") log.info(connections.get_connection_addr('default')) release_name = self.instance_name cc.start_monitor_threads(self.health_checkers) - chaos_config = cc.gen_experiment_config(f"{str(Path(__file__).absolute().parent)}/chaos_objects/{chaos_type}/chaos_{target_component}_{chaos_type}.yaml") - chaos_config['metadata']['name'] = f"test-bulk-load-{int(time.time())}" + chaos_config = cc.gen_experiment_config( + f"{str(Path(__file__).absolute().parent)}/chaos_objects/{chaos_type.replace('-', '_')}/chaos_{target_component}_{chaos_type.replace('-', '_')}.yaml") + chaos_config['metadata']['name'] = f"test-{target_component}-{chaos_type.replace('_','-')}-{int(time.time())}" kind = chaos_config['kind'] meta_name = chaos_config.get('metadata', None).get('name', None) update_key_value(chaos_config, "release", release_name) update_key_value(chaos_config, "app.kubernetes.io/instance", release_name) self._chaos_config = chaos_config # cache the chaos config for tear down log.info(f"chaos_config: {chaos_config}") - # 
wait 20s - sleep(constants.WAIT_PER_OP * 10) + # wait 120s + sleep(constants.WAIT_PER_OP * 12) # assert statistic:all ops 100% succ log.info("******1st assert before chaos: ") assert_statistic(self.health_checkers) @@ -170,15 +169,17 @@ class TestChaos(TestChaosBase): sleep(constants.WAIT_PER_OP * 10) # reset counting cc.reset_counting(self.health_checkers) - # wait 120s - sleep(constants.CHAOS_DURATION) + # wait 240s + sleep(constants.WAIT_PER_OP * 24) log.info(f'Alive threads: {threading.enumerate()}') # assert statistic log.info("******2nd assert after chaos injected: ") - assert_statistic(self.health_checkers, - expectations={ - Op.bulk_load: constants.FAIL, - }) + for op, checker in self.health_checkers.items(): + checker.check_result() + # assert_statistic(self.health_checkers, + # expectations={ + # Op.bulk_insert: constants.FAIL, + # }) # delete chaos chaos_res.delete(meta_name) log.info("chaos deleted") @@ -191,13 +192,14 @@ class TestChaos(TestChaosBase): log.info("all pods are ready") # reconnect if needed sleep(constants.WAIT_PER_OP * 2) + log.info("reconnect to milvus") cc.reconnect(connections, alias='default') # recheck failed tasks in third assert - self.health_checkers[Op.bulk_load].recheck_failed_task = True + self.health_checkers[Op.bulk_insert].recheck_failed_task = True # reset counting again cc.reset_counting(self.health_checkers) - # wait 50s (varies by feature) - sleep(constants.WAIT_PER_OP * 10) + # wait 240s (varies by feature) + sleep(constants.WAIT_PER_OP * 24) # assert statistic: all ops success again log.info("******3rd assert after chaos deleted: ") assert_statistic(self.health_checkers) diff --git a/tests/python_client/chaos/test_load_with_checker.py b/tests/python_client/chaos/test_load_with_checker.py index 50d11d7c16..419a38b42b 100644 --- a/tests/python_client/chaos/test_load_with_checker.py +++ b/tests/python_client/chaos/test_load_with_checker.py @@ -14,7 +14,7 @@ from chaos.checker import (CreateChecker, CompactChecker, DropChecker, LoadBalanceChecker, - BulkLoadChecker, + BulkInsertChecker, Op) from common.cus_resource_opts import CustomResourceOperations as CusResource from common.milvus_sys import MilvusSys @@ -65,17 +65,17 @@ class TestChaos(TestChaosBase): # Op.compact: CompactChecker(collection_name=c_name), # Op.index: IndexChecker(), # Op.drop: DropChecker(), - # Op.bulk_load: BulkLoadChecker(), + # Op.bulk_insert: BulkInsertChecker(), Op.load_balance: LoadBalanceChecker() } self.health_checkers = checkers - self.prepare_bulk_load() + self.prepare_bulk_insert() - def prepare_bulk_load(self, nb=30000, row_based=True): - if Op.bulk_load not in self.health_checkers: - log.info("bulk_load checker is not in health checkers, skip prepare bulk load") + def prepare_bulk_insert(self, nb=30000, row_based=True): + if Op.bulk_insert not in self.health_checkers: + log.info("bulk_insert checker is not in health checkers, skip prepare bulk insert") return - log.info("bulk_load checker is in health checkers, prepare data firstly") + log.info("bulk_insert checker is in health checkers, prepare data firstly") release_name = self.instance_name minio_ip_pod_pair = get_pod_ip_name_pairs("chaos-testing", f"release={release_name}, app=minio") ms = MilvusSys() @@ -84,7 +84,7 @@ class TestChaos(TestChaosBase): minio_endpoint = f"{minio_ip}:{minio_port}" bucket_name = ms.index_nodes[0]["infos"]["system_configurations"]["minio_bucket_name"] schema = cf.gen_default_collection_schema() - data = cf.gen_default_list_data_for_bulk_load(nb=nb) + data = 
cf.gen_default_list_data_for_bulk_insert(nb=nb) fields_name = [field.name for field in schema.fields] if not row_based: data_dict = dict(zip(fields_name, data)) @@ -95,17 +95,17 @@ class TestChaos(TestChaosBase): entity = dict(zip(fields_name, entity_value)) entities.append(entity) data_dict = {"rows": entities} - file_name = "bulk_load_data_source.json" + file_name = "bulk_insert_data_source.json" files = [file_name] #TODO: npy file type is not supported so far - log.info("generate bulk load file") + log.info("generate bulk insert file") with open(file_name, "w") as f: f.write(json.dumps(data_dict)) log.info("upload file to minio") client = Minio(minio_endpoint, access_key="minioadmin", secret_key="minioadmin", secure=False) client.fput_object(bucket_name, file_name, file_name) - self.health_checkers[Op.bulk_load].update(schema=schema, files=files, row_based=row_based) - log.info("prepare data for bulk load done") + self.health_checkers[Op.bulk_insert].update(schema=schema, files=files, row_based=row_based) + log.info("prepare data for bulk insert done") def teardown(self): chaos_res = CusResource(kind=self._chaos_config['kind'], diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py index 4e73df986b..605ede2525 100644 --- a/tests/python_client/common/common_func.py +++ b/tests/python_client/common/common_func.py @@ -319,7 +319,7 @@ def gen_default_list_data(nb=ct.default_nb, dim=ct.default_dim): return data -def gen_default_list_data_for_bulk_load(nb=ct.default_nb, dim=ct.default_dim): +def gen_default_list_data_for_bulk_insert(nb=ct.default_nb, dim=ct.default_dim): int_values = [i for i in range(nb)] float_values = [float(i) for i in range(nb)] string_values = [str(i) for i in range(nb)] diff --git a/tests/python_client/config/log_config.py b/tests/python_client/config/log_config.py index e832b06cde..8707af4a08 100644 --- a/tests/python_client/config/log_config.py +++ b/tests/python_client/config/log_config.py @@ -1,5 +1,5 @@ import os - +import datetime class LogConfig: def __init__(self): @@ -16,7 +16,8 @@ class LogConfig: log_path = os.environ[var] return str(log_path) except Exception as e: - log_path = "/tmp/ci_logs/" + # now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + log_path = f"/tmp/ci_logs" print("[get_env_variable] failed to get environment variables : %s, use default path : %s" % (str(e), log_path)) return log_path diff --git a/tests/python_client/requirements.txt b/tests/python_client/requirements.txt index 758e184774..667a2f0ee3 100644 --- a/tests/python_client/requirements.txt +++ b/tests/python_client/requirements.txt @@ -9,7 +9,7 @@ allure-pytest==2.7.0 pytest-print==0.2.1 pytest-level==0.1.1 pytest-xdist==2.5.0 -pymilvus==2.2.0.dev45 +pymilvus==2.2.0.dev49 pytest-rerunfailures==9.1.1 git+https://github.com/Projectplace/pytest-tags ndg-httpsclient diff --git a/tests/python_client/testcases/test_import.py b/tests/python_client/testcases/test_import.py deleted file mode 100644 index 26bd7b3e39..0000000000 --- a/tests/python_client/testcases/test_import.py +++ /dev/null @@ -1,1493 +0,0 @@ -import logging -import time -from time import sleep -import pytest -import random -from base.client_base import TestcaseBase -from common import common_func as cf -from common import common_type as ct -from common.common_type import CaseLabel, CheckTasks -from utils.util_log import test_log as log -# from minio import Minio - - -vec_field = "vectors" -pk_field = "uid" -float_field = "float_scalar" -int_field = "int_scalar" 
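For context on the JSON sources that the prepare_bulk_insert fixtures earlier in this patch write to MinIO: a row-based file wraps one dict per entity under a "rows" key, while a column-based file keeps one array per field. A rough standalone sketch of the two layouts, with toy field names that are illustrative only:

import json

# toy column data: two fields, three entities (field names are only an example)
fields_name = ["uid", "float_scalar"]
data = [[1, 2, 3], [0.1, 0.2, 0.3]]

# column-based layout: one array per field
column_based = dict(zip(fields_name, data))
# -> {"uid": [1, 2, 3], "float_scalar": [0.1, 0.2, 0.3]}

# row-based layout: a "rows" list holding one dict per entity
rows = [dict(zip(fields_name, [values[i] for values in data])) for i in range(len(data[0]))]
row_based = {"rows": rows}
# -> {"rows": [{"uid": 1, "float_scalar": 0.1}, ...]}

print(json.dumps(column_based))
print(json.dumps(row_based, indent=4))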
-bool_field = "bool_scalar" -string_field = "string_scalar" - - -def gen_file_prefix(row_based=True, auto_id=True, prefix=""): - if row_based: - if auto_id: - return f"{prefix}row_auto" - else: - return f"{prefix}row_cust" - else: - if auto_id: - return f"{prefix}col_auto" - else: - return f"{prefix}col_cust" - - -class TestImport(TestcaseBase): - - def setup_class(self): - log.info("[setup_import] Start setup class...") - # TODO: copy data files to minio - log.info("copy data files to minio") - - def teardown_class(self): - log.info("[teardown_import] Start teardown class...") - # TODO: clean up data or not is a question - log.info("clean up data files in minio") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8, 128]) # 8, 128 - @pytest.mark.parametrize("entities", [100, 1000]) # 100, 1000 - def test_float_vector_only(self, row_based, auto_id, dim, entities): - """ - collection: auto_id, customized_id - collection schema: [pk, float_vector] - Steps: - 1. create collection - 2. import data - 3. verify the data entities equal the import data - 4. load the collection - 5. verify search successfully - 6. verify query successfully - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id) - files = [f"{prefix}_float_vectors_only_{dim}d_{entities}.json"] - self._connect() - c_name = cf.gen_unique_str(prefix) - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - completed, _ = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{completed} in {tt}") - assert completed - - # TODO: assert num entities - log.info(f" collection entities: {self.collection_wrap.num_entities}") - assert self.collection_wrap.num_entities == entities - - # verify imported data is available for search - self.collection_wrap.load() - log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - search_data = cf.gen_vectors(1, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - # self.collection_wrap.query(expr=f"id in {ids}") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [10000]) - def test_partition_float_vector_int_scalar(self, row_based, auto_id, dim, entities): - """ - collection: customized partitions - collection schema: [pk, float_vectors, int_scalar] - 1. create collection and a partition - 2. build index and load partition - 3. import data into the partition - 4. verify num entities - 5. verify index status - 6. 
verify search and query - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id) - files = [f"{prefix}_float_vectors_int_scalar_{dim}d_{entities}.json"] - self._connect() - c_name = cf.gen_unique_str(prefix) - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim), - cf.gen_int32_field(name="int_scalar")] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # create a partition - p_name = cf.gen_unique_str() - m_partition, _ = self.collection_wrap.create_partition(partition_name=p_name) - # build index before bulk load - index_params = {"index_type": "IVF_SQ8", "params": {"nlist": 128}, "metric_type": "L2"} - self.collection_wrap.create_index(field_name=vec_field, index_params=index_params) - # load before bulk load - self.collection_wrap.load(partition_names=[p_name]) - - # import data into the partition - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name=p_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, _ = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert success - - assert m_partition.num_entities == entities - assert self.collection_wrap.num_entities == entities - - log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - # sleep 10s for issue #16607 - sleep(10) - log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - - search_data = cf.gen_vectors(1, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 16}} - res, _ = self.collection_wrap.search(search_data, vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True]) - @pytest.mark.parametrize("auto_id", [True]) - @pytest.mark.parametrize("dim", [16]) - @pytest.mark.parametrize("entities", [10]) - def test_binary_vector_only(self, row_based, auto_id, dim, entities): - """ - collection: auto_id - collection schema: [pk, binary_vector] - Steps: - 1. create collection - 2. build collection - 3. import data - 4. verify build status - 5. verify the data entities - 6. load collection - 7. verify search successfully - 6. 
verify query successfully - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id) - files = [f"{prefix}_binary_vectors_only_{dim}d_{entities}.json"] - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_binary_vec_field(name=vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # build index before bulk load - binary_index_params = {"index_type": "BIN_IVF_FLAT", "metric_type": "JACCARD", "params": {"nlist": 64}} - - self.collection_wrap.create_index(field_name=vec_field, index_params=binary_index_params) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - completed, _ = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{completed} in {tt}") - assert completed - - # verify build index status - sleep(3) - # TODO: verify build index after index_building_progress() refactor - # res, _ = self.utility_wrap.index_building_progress(c_name) - # exp_res = {'total_rows': entities, 'indexed_rows': entities} - # assert res == exp_res - - # TODO: verify num entities - assert self.collection_wrap.num_entities == entities - - # load collection - self.collection_wrap.load() - - # verify search and query - search_data = cf.gen_binary_vectors(1, dim)[1] - search_params = {"metric_type": "JACCARD", "params": {"nprobe": 10}} - res, _ = self.collection_wrap.search(search_data, vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("fields_num_in_file", ["more", "equal", "less"]) # "equal", "more", "less" - @pytest.mark.parametrize("dim", [1024]) # 1024 - @pytest.mark.parametrize("entities", [5000]) # 5000 - def test_float_vector_multi_scalars(self, row_based, auto_id, fields_num_in_file, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - Steps: - 1. create collection - 2. load collection - 3. import data - 4. verify the data entities - 5. verify index status - 6. verify search and query - 6. build index - 7. release collection and reload - 7. verify search successfully - 6. 
verify query successfully - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id) - files = [f"{prefix}_float_vectors_multi_scalars_{dim}d_{entities}.json"] - additional_field = "int_scalar_add" - self._connect() - c_name = cf.gen_unique_str(prefix) - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim), - cf.gen_int32_field(name="int_scalar"), - # TODO: string is not supported, exception when collection.load - # cf.gen_string_field(name="string_scalar") - cf.gen_bool_field(name="bool_scalar") - ] - if fields_num_in_file == "more": - fields.pop() - elif fields_num_in_file == "less": - fields.append(cf.gen_int32_field(name=additional_field)) - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # load collection - self.collection_wrap.load() - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - if fields_num_in_file == "less": - assert not success # TODO: check error msg - if row_based: - failed_reason = f"JSON row validator: field {additional_field} missed at the row 0" - else: - failed_reason = "is not equal to other fields" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - else: - assert success - - # TODO: assert num entities - log.info(f" collection entities: {self.collection_wrap.num_entities}") - assert self.collection_wrap.num_entities == entities - - # verify no index - res, _ = self.collection_wrap.has_index() - assert res is False - # verify search and query - search_data = cf.gen_vectors(1, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - - # self.collection_wrap.query(expr=f"id in {ids}") - - # build index - index_params = {"index_type": "HNSW", "params": {"M": 8, "efConstruction": 100}, "metric_type": "IP"} - self.collection_wrap.create_index(field_name=vec_field, index_params=index_params) - - # release collection and reload - self.collection_wrap.release() - self.collection_wrap.load() - - # verify index built - res, _ = self.collection_wrap.has_index() - assert res is True - - # search and query - search_params = {"params": {"ef": 64}, "metric_type": "IP"} - res, _ = self.collection_wrap.search(search_data, vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [16]) # 16 - @pytest.mark.parametrize("entities", [3000]) # 3000 - @pytest.mark.parametrize("file_nums", [10]) # 10, max task nums 32? 
need improve - @pytest.mark.parametrize("multi_folder", [True, False]) - def test_float_vector_from_multi_files(self, row_based, auto_id, dim, entities, file_nums, multi_folder): - """ - collection: auto_id - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - Steps: - 1. create collection - 2. build index and load collection - 3. import data from multiple files - 4. verify the data entities - 5. verify index status - 6. verify search successfully - 7. verify query successfully - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id) - files = [] - if not multi_folder: - for i in range(file_nums): - files.append(f"{prefix}_float_vectors_multi_scalars_{dim}d_{entities}_{i}.json") - else: - # sub_folder index 20 to 29 - for i in range(20, 30): - files.append(f"/sub{i}/{prefix}_float_vectors_multi_scalars_{dim}d_{entities}_{i}.json") - self._connect() - c_name = cf.gen_unique_str(prefix) - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim), - cf.gen_int32_field(name="int_scalar"), - # TODO: string is not supported, exception when collection.load - # cf.gen_string_field(name="string_scalar") - cf.gen_bool_field(name="bool_scalar") - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # build index - index_params = ct.default_index - self.collection_wrap.create_index(field_name=vec_field, index_params=index_params) - # load collection - self.collection_wrap.load() - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=300) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - if not row_based: - assert not success - failed_reason = "is duplicated" # "the field xxx is duplicated" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - else: - assert success - log.info(f" collection entities: {self.collection_wrap.num_entities}") - assert self.collection_wrap.num_entities == entities * file_nums - - # verify index built - sleep(10) # TODO: need improve to smart wait for building completed - # res, _ = self.utility_wrap.index_building_progress(c_name) - # exp_res = {'total_rows': entities * file_nums, 'indexed_rows': entities * file_nums} - # assert res == exp_res - - # verify search and query - search_data = cf.gen_vectors(1, dim) - search_params = ct.default_search_params - res, _ = self.collection_wrap.search(search_data, vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - - # self.collection_wrap.query(expr=f"id in {ids}") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("multi_fields", [True, False]) - @pytest.mark.parametrize("dim", [128]) # 128 - @pytest.mark.parametrize("entities", [1000]) # 1000 - def test_float_vector_from_npy_file(self, row_based, auto_id, multi_fields, dim, entities): - """ - collection schema 1: [pk, float_vector] - schema 2: [pk, float_vector, int_scalar, string_scalar, float_scalar, bool_scalar] - data file: .npy 
files - Steps: - 1. create collection - 2. import data - 3. if row_based: verify import failed - 4. if column_based: - 4.1 verify the data entities equal the import data - 4.2 verify search and query successfully - """ - vec_field = f"vectors_{dim}d_{entities}" - self._connect() - c_name = cf.gen_unique_str() - if not multi_fields: - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim)] - else: - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim), - cf.gen_int32_field(name="int_scalar"), - # TODO: string is not supported, exception when collection.load - # cf.gen_string_field(name="string_scalar") - cf.gen_bool_field(name="bool_scalar") - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - files = [f"{vec_field}.npy"] # npy file name shall be the vector field name - if not multi_fields: - if not auto_id: - files.append(f"col_uid_only_{dim}d_{entities}.json") - files.reverse() - else: - if not auto_id: - files.append(f"col_uid_multi_scalars_{dim}d_{entities}.json") - else: - files.append(f"col_multi_scalars_{dim}d_{entities}.json") - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - - if row_based: - assert not success - failed_reason1 = "unsupported file type for row-based mode" - if auto_id: - failed_reason2 = f"invalid row-based JSON format, the key {int_field} is not found" - else: - failed_reason2 = f"invalid row-based JSON format, the key {pk_field} is not found" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason1 in state.infos.get("failed_reason", "") or \ - failed_reason2 in state.infos.get("failed_reason", "") - else: - assert success - # TODO: assert num entities - log.info(f" collection entities: {self.collection_wrap.num_entities}") - assert self.collection_wrap.num_entities == entities - - # verify imported data is available for search - self.collection_wrap.load() - log.info(f"query seg info: {self.utility_wrap.get_query_segment_info(c_name)[0]}") - search_data = cf.gen_vectors(1, dim) - search_params = {"metric_type": "L2", "params": {"nprobe": 2}} - res, _ = self.collection_wrap.search(search_data, vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - # self.collection_wrap.query(expr=f"id in {ids}") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [10]) - def test_data_type_float_on_int_pk(self, row_based, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - data files: json file that one of entities has float on int pk - Steps: - 1. create collection - 2. import data - 3. verify the data entities - 4. 
verify query successfully - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=False, prefix="float_on_int_pk_") - files = [f"{prefix}_float_vectors_multi_scalars_{dim}d_{entities}_0.json"] - self._connect() - c_name = cf.gen_unique_str(prefix) - # TODO: add string pk - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim), - cf.gen_int32_field(name=int_field), - # TODO: string is not supported, exception when collection.load - # cf.gen_string_field(name="string_scalar") - cf.gen_bool_field(name=bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=False) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert success - assert self.collection_wrap.num_entities == entities - - self.collection_wrap.load() - - # the pk value was automatically convert to int from float - res, _ = self.collection_wrap.query(expr=f"{pk_field} in [3]", output_fields=[pk_field]) - assert [{pk_field: 3}] == res - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [10]) - def test_data_type_int_on_float_scalar(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector, - float_scalar, int_scalar, string_scalar, bool_scalar] - data files: json file that one of entities has int on float scalar - Steps: - 1. create collection - 2. import data - 3. verify the data entities - 4. verify query successfully - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id, prefix="int_on_float_scalar_") - files = [f"{prefix}_float_vectors_multi_scalars_{dim}d_{entities}_0.json"] - self._connect() - c_name = cf.gen_unique_str(prefix) - # TODO: add string pk - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim), - cf.gen_int32_field(name=int_field), - cf.gen_float_field(name=float_field), - # TODO: string is not supported, exception when collection.load - # cf.gen_string_field(name=string_field) - cf.gen_bool_field(name=bool_field) - ] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert success - assert self.collection_wrap.num_entities == entities - - self.collection_wrap.load() - - # the pk value was automatically convert to int from float - res, _ = self.collection_wrap.query(expr=f"{float_field} in [1.0]", output_fields=[float_field]) - assert res[0].get(float_field, 0) == 1.0 - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [16]) # 128 - @pytest.mark.parametrize("entities", [100]) # 1000 - @pytest.mark.parametrize("file_nums", [32]) # 32, max task nums 32? 
need improve - @pytest.mark.skip(season="redesign after issue #16698 fixed") - def test_multi_numpy_files_from_multi_folders(self, auto_id, dim, entities, file_nums): - """ - collection schema 1: [pk, float_vector] - data file: .npy files - Steps: - 1. create collection - 2. import data - 3. if row_based: verify import failed - 4. if column_based: - 4.1 verify the data entities equal the import data - 4.2 verify search and query successfully - """ - vec_field = f"vectors_{dim}d_{entities}" - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # build index - index_params = ct.default_index - self.collection_wrap.create_index(field_name=vec_field, index_params=index_params) - # load collection - self.collection_wrap.load() - # import data - for i in range(file_nums): - files = [f"/{i}/{vec_field}.npy"] # npy file name shall be the vector field name - if not auto_id: - files.append(f"/{i}/{pk_field}.npy") - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=False, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - - assert success - log.info(f" collection entities: {self.collection_wrap.num_entities}") - assert self.collection_wrap.num_entities == entities * file_nums - - # verify search and query - sleep(10) - search_data = cf.gen_vectors(1, dim) - search_params = ct.default_search_params - res, _ = self.collection_wrap.search(search_data, vec_field, - param=search_params, limit=1, - check_task=CheckTasks.check_search_results, - check_items={"nq": 1, - "limit": 1}) - - # TODO: not supported yet - def test_from_customize_bucket(self): - pass - -# @pytest.mark.tags(CaseLabel.L3) -# @pytest.mark.parametrize("row_based", [True, False]) -# @pytest.mark.parametrize("auto_id", [True, False]) -# def test_auto_id_binary_vector_string_scalar(self, row_based, auto_id): -# """ -# collection: -# collection schema: [pk, binary_vector, string_scalar] -# 1. create collection -# 2. insert some data -# 3. import data -# 4. verify data entities -# 5. build index -# 6. load collection -# 7. 
verify search and query -# """ -# pass -# -# @pytest.mark.tags(CaseLabel.L3) -# def test_custom_id_float_vector_string_primary(self): -# """ -# collection: custom_id -# collection schema: float vectors and string primary key -# """ -# pass -# -# @pytest.mark.tags(CaseLabel.L3) -# def test_custom_id_float_partition_vector_string_primary(self): -# """ -# collection: custom_id and custom partition -# collection schema: float vectors and string primary key -# """ -# pass -# -# @pytest.mark.tags(CaseLabel.L3) -# def test_custom_id_binary_vector_int_primary_from_bucket(self): -# """ -# collection: custom_id -# collection schema: binary vectors and int primary key -# import from a particular bucket -# """ -# pass -# -# @pytest.mark.tags(CaseLabel.L3) -# def test_custom_id_binary_vector_string_primary_multi_scalars_twice(self): -# """ -# collection: custom_id -# collection schema: binary vectors, string primary key and multiple scalars -# import twice -# """ -# pass -# -# @pytest.mark.tags(CaseLabel.L3) -# def test_custom_id_float_vector_int_primary_multi_scalars_twice(self): -# """ -# collection: custom_id -# collection schema: float vectors, int primary key and multiple scalars -# import twice -# """ -# pass -# -# -# class TestColumnBasedImport(TestcaseBase): -# @pytest.mark.tags(CaseLabel.L3) -# def test_auto_id_float_vector(self): -# """ -# collection: auto_id -# collection schema: [auto_id, float vector] -# Steps: -# 1. create collection -# 2. import column based data file -# 3. verify the data entities equal the import data -# 4. load the collection -# 5. verify search successfully -# 6. verify query successfully -# """ -# pass - - -class TestImportInvalidParams(TestcaseBase): - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - def test_non_existing_file(self, row_based): - """ - collection: either auto_id or not - collection schema: not existing file(s) - Steps: - 1. create collection - 3. import data, but the data file(s) not exists - 4. verify import failed with errors - """ - files = ["not_existing.json"] - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=ct.default_dim)] - schema = cf.gen_collection_schema(fields=fields) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - assert not success - failed_reason = "minio file manage cannot be found" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - def test_empty_json_file(self, row_based, auto_id): - """ - collection: either auto_id or not - collection schema: [pk, float_vector] - Steps: - 1. create collection - 2. import data, but the data file(s) is empty - 3. verify import fail if column based - 4. 
verify import successfully if row based - """ - # set the wrong row based params - files = ["empty.json"] - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=ct.default_dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - if row_based: - assert success - else: - assert not success - failed_reason = "JSON column consumer: row count is 0" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [128]) # 8 - @pytest.mark.parametrize("entities", [100]) # 100 - def test_wrong_file_type(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector] - data files: wrong data type - Steps: - 1. create collection - 2. import data - 3. verify import failed with errors - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id, prefix="err_file_type_") - if row_based: - if auto_id: - data_type = ".csv" - else: - data_type = "" - else: - if auto_id: - data_type = ".npy" - else: - data_type = ".txt" - files = [f"{prefix}_float_vectors_only_{dim}d_{entities}{data_type}"] - self._connect() - c_name = cf.gen_unique_str(prefix) - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert not success - failed_reason = "unsupported file type" - if not row_based and auto_id: - failed_reason = "Numpy parse: npy: not a valid NumPy file format" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) - @pytest.mark.parametrize("entities", [100]) - def test_wrong_row_based_values(self, row_based, auto_id, dim, entities): - """ - collection: either auto_id or not - import data: not existing file(s) - Steps: - 1. create collection - 3. import data, but the data file(s) not exists - 4. 
verify import failed with errors - """ - # set the wrong row based params - prefix = gen_file_prefix(row_based=not row_based) - files = [f"{prefix}_float_vectors_only_{dim}d_{entities}.json"] - self._connect() - c_name = cf.gen_unique_str() - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - assert not success - if row_based: - failed_reason = "invalid row-based JSON format, the key vectors is not found" - else: - failed_reason = "JSON column consumer: row count is 0" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) # 8 - @pytest.mark.parametrize("entities", [100]) # 100 - def test_wrong_pk_field_name(self, row_based, auto_id, dim, entities): - """ - collection: auto_id, customized_id - import data: [pk, float_vector] - Steps: - 1. create collection with a dismatch_uid as pk - 2. import data - 3. verify import data successfully if collection with auto_id - 4. verify import error if collection with auto_id=False - """ - prefix = gen_file_prefix(row_based, auto_id) - files = [f"{prefix}_float_vectors_only_{dim}d_{entities}.json"] - pk_field = "dismatch_pk" - self._connect() - c_name = cf.gen_unique_str(prefix) - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - if auto_id: - assert success - else: - assert not success - if row_based: - failed_reason = f"field {pk_field} missed at the row 0" - else: - # TODO: improve the failed msg: issue #16722 - failed_reason = f"import error: field {pk_field} row count 0 is not equal to other fields" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [8]) # 8 - @pytest.mark.parametrize("entities", [100]) # 100 - def test_wrong_vector_field_name(self, row_based, auto_id, dim, entities): - """ - collection schema: [pk, float_vector] - Steps: - 1. create collection with a dismatch_uid as pk - 2. import data - 3. verify import data successfully if collection with auto_id - 4. 
verify import error if collection with auto_id=False - """ - prefix = gen_file_prefix(row_based, auto_id) - files = [f"{prefix}_float_vectors_only_{dim}d_{entities}.json"] - vec_field = "dismatched_vectors" - self._connect() - c_name = cf.gen_unique_str(prefix) - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name='', - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - - assert not success - if row_based: - failed_reason = f"field {vec_field} missed at the row 0" - else: - if auto_id: - failed_reason = f"JSON column consumer: row count is 0" - else: - # TODO: improve the failed msg: issue #16722 - failed_reason = f"import error: field {vec_field} row count 0 is not equal to other fields 100" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [10000]) - def test_wrong_scalar_field_name(self, row_based, auto_id, dim, entities): - """ - collection: customized partitions - collection schema: [pk, float_vectors, int_scalar] - 1. create collection - 2. import data that one scalar field name is dismatched - 3. 
verify that import fails with errors - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id) - files = [f"{prefix}_float_vectors_int_scalar_{dim}d_{entities}.json"] - scalar_field = "dismatched_scalar" - self._connect() - c_name = cf.gen_unique_str(prefix) - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=dim), - cf.gen_int32_field(name=scalar_field)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - - # import data - t0 = time.time() - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - partition_name="", - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - tt = time.time() - t0 - log.info(f"bulk load state:{success} in {tt}") - assert not success - if row_based: - failed_reason = f"field {scalar_field} missed at the row 0" - else: - # TODO: improve the failed msg: issue #16722 - failed_reason = f"import error: field {scalar_field} row count 0 is not equal to other fields 100" - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("auto_id", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [10000]) - def test_wrong_dim_in_schema(self, row_based, auto_id, dim, entities): - """ - collection: create a collection with a dim that dismatch with json file - collection schema: [pk, float_vectors, int_scalar] - 1. import data the collection - 2. verify that import fails with errors - """ - prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id) - files = [f"{prefix}_float_vectors_int_scalar_{dim}d_{entities}.json"] - self._connect() - c_name = cf.gen_unique_str(prefix) - wrong_dim = dim + 1 - fields = [cf.gen_int64_field(name=pk_field, is_primary=True), - cf.gen_float_vec_field(name=vec_field, dim=wrong_dim), - cf.gen_int32_field(name=int_field)] - schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id) - self.collection_wrap.init_collection(c_name, schema=schema) - # import data - task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name, - row_based=row_based, - files=files) - logging.info(f"bulk load task ids:{task_ids}") - success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed( - task_ids=task_ids, - timeout=30) - log.info(f"bulk load state:{success}") - assert not success - failed_reason = f"array size {dim} doesn't equal to vector dimension {wrong_dim} of field vectors at the row " - for state in states.values(): - assert state.state_name == "BulkLoadFailed" - assert failed_reason in state.infos.get("failed_reason", "") - - @pytest.mark.tags(CaseLabel.L3) - @pytest.mark.parametrize("row_based", [True, False]) - @pytest.mark.parametrize("dim", [4]) - @pytest.mark.parametrize("entities", [10000]) - def test_non_existing_collection(self, row_based, dim, entities): - """ - collection: not create collection - collection schema: [pk, float_vectors, int_scalar] - 1. import data into a non existing collection - 2. 
verify that import fails with errors
-        """
-        prefix = gen_file_prefix(row_based=row_based)
-        files = [f"{prefix}_float_vectors_int_scalar_{dim}d_{entities}.json"]
-        self._connect()
-        c_name = cf.gen_unique_str(prefix)
-        # import data into a non existing collection
-        err_msg = f"can't find collection: {c_name}"
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  row_based=row_based,
-                                                  files=files,
-                                                  check_task=CheckTasks.err_res,
-                                                  check_items={"err_code": 1,
-                                                               "err_msg": err_msg}
-                                                  )
-
-    @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("row_based", [True, False])
-    @pytest.mark.parametrize("dim", [4])
-    @pytest.mark.parametrize("entities", [10000])
-    def test_non_existing_partition(self, row_based, dim, entities):
-        """
-        collection: create a collection
-        collection schema: [pk, float_vectors, int_scalar]
-        1. import data into a non existing partition
-        2. verify that import fails with errors
-        """
-        prefix = gen_file_prefix(row_based=row_based)
-        files = [f"{prefix}_float_vectors_int_scalar_{dim}d_{entities}.json"]
-        self._connect()
-        c_name = cf.gen_unique_str(prefix)
-        fields = [cf.gen_int64_field(name=pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=vec_field, dim=dim),
-                  cf.gen_int32_field(name=int_field)]
-        schema = cf.gen_collection_schema(fields=fields)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-        # import data into a non existing partition
-        p_name = "non_existing"
-        err_msg = f" partition {p_name} does not exist"
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  partition_name=p_name,
-                                                  row_based=row_based,
-                                                  files=files,
-                                                  check_task=CheckTasks.err_res,
-                                                  check_items={"err_code": 11,
-                                                               "err_msg": err_msg}
-                                                  )
-
-    @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("row_based", [True, False])
-    @pytest.mark.parametrize("auto_id", [True, False])
-    @pytest.mark.parametrize("dim", [4])
-    @pytest.mark.parametrize("entities", [10000])
-    @pytest.mark.parametrize("position", ["first", "middle", "end"])
-    def test_wrong_dim_in_one_entities_of_file(self, row_based, auto_id, dim, entities, position):
-        """
-        collection: create a collection
-        collection schema: [pk, float_vectors, int_scalar], one of entities has wrong dim data
-        1. import data the collection
-        2. verify that import fails with errors
-        """
-        prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id, prefix=f"err_{position}_dim_")
-        files = [f"{prefix}_float_vectors_int_scalar_{dim}d_{entities}.json"]
-        self._connect()
-        c_name = cf.gen_unique_str(prefix)
-        fields = [cf.gen_int64_field(name=pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=vec_field, dim=dim),
-                  cf.gen_int32_field(name=int_field)]
-        schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-        # import data
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  row_based=row_based,
-                                                  files=files)
-        logging.info(f"bulk load task ids:{task_ids}")
-        success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(
-            task_ids=task_ids,
-            timeout=30)
-        log.info(f"bulk load state:{success}")
-        assert not success
-        failed_reason = f"doesn't equal to vector dimension {dim} of field vectors at the row"
-        for state in states.values():
-            assert state.state_name == "BulkLoadFailed"
-            assert failed_reason in state.infos.get("failed_reason", "")
-        assert self.collection_wrap.num_entities == 0
-
-    @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("row_based", [True, False])
-    @pytest.mark.parametrize("auto_id", [True, False])
-    @pytest.mark.parametrize("dim", [16])  # 16
-    @pytest.mark.parametrize("entities", [3000])  # 3000
-    @pytest.mark.parametrize("file_nums", [10])  # max task nums 32? need improve
-    def test_float_vector_one_of_files_fail(self, row_based, auto_id, dim, entities, file_nums):
-        """
-        collection schema: [pk, float_vectors, int_scalar], one of entities has wrong dim data
-        data files: multi files, and there are errors in one of files
-        1. import data 11 files(10 correct and 1 with errors) into the collection
-        2. verify that import fails with errors and no data imported
-        """
-        prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id)
-        files = []
-        for i in range(file_nums):
-            files.append(f"{prefix}_float_vectors_multi_scalars_{dim}d_{entities}_{i}.json")
-        # append a file that has errors
-        files.append(f"err_{prefix}_float_vectors_multi_scalars_{dim}d_{entities}_101.json")
-        random.shuffle(files)  # mix up the file order
-
-        self._connect()
-        c_name = cf.gen_unique_str(prefix)
-        fields = [cf.gen_int64_field(name=pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=vec_field, dim=dim),
-                  cf.gen_int32_field(name="int_scalar"),
-                  # TODO: string is not supported, exception when collection.load
-                  # cf.gen_string_field(name="string_scalar")
-                  cf.gen_bool_field(name="bool_scalar")
-                  ]
-        schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-
-        # import data
-        t0 = time.time()
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  partition_name='',
-                                                  row_based=row_based,
-                                                  files=files)
-        logging.info(f"bulk load task ids:{task_ids}")
-        success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids,
-                                                                               timeout=300)
-        tt = time.time() - t0
-        log.info(f"bulk load state:{success} in {tt}")
-        assert not success
-        if row_based:
-            # all correct files shall be imported successfully
-            assert self.collection_wrap.num_entities == entities * file_nums
-        else:
-            # TODO: Update assert after #16707 fixed
-            assert self.collection_wrap.num_entities == 0
-
-    @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("auto_id", [True, False])
-    @pytest.mark.parametrize("same_field", [True, False])
-    @pytest.mark.parametrize("dim", [128])  # 128
-    @pytest.mark.parametrize("entities", [1000])  # 1000
-    @pytest.mark.xfail(reason="issue #16698")
-    def test_float_vector_from_multi_npy_files(self, auto_id, same_field, dim, entities):
-        """
-        collection schema 1: [pk, float_vector]
-        data file: .npy files
-        Steps:
-        1. create collection
-        2. import data with row_based=False from multiple .npy files
-        3. verify import failed with errors
-        """
-        vec_field = f"vectors_{dim}d_{entities}_0"
-        self._connect()
-        c_name = cf.gen_unique_str()
-        fields = [cf.gen_int64_field(name=pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=vec_field, dim=dim)]
-        if not same_field:
-            fields.append(cf.gen_float_field(name=f"vectors_{dim}d_{entities}_1"))
-        schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-
-        # import data
-        files = [f"{vec_field}.npy", f"{vec_field}.npy"]
-        if not same_field:
-            files = [f"{vec_field}.npy", f"vectors_{dim}d_{entities}_1.npy"]
-        if not auto_id:
-            files.append(f"col_uid_only_{dim}d_{entities}.json")
-
-        # import data
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  row_based=False,
-                                                  files=files)
-        logging.info(f"bulk load task ids:{task_ids}")
-        success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(
-            task_ids=task_ids,
-            timeout=30)
-        log.info(f"bulk load state:{success}")
-        assert not success
-        failed_reason = f"Numpy parse: illegal data type"
-        for state in states.values():
-            assert state.state_name == "BulkLoadFailed"
-            assert failed_reason in state.infos.get("failed_reason", "")
-        assert self.collection_wrap.num_entities == 0
-
-    @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("auto_id", [True, False])
-    @pytest.mark.parametrize("dim", [128])  # 128
-    @pytest.mark.parametrize("entities", [1000])  # 1000
-    def test_wrong_dim_in_numpy(self, auto_id, dim, entities):
-        """
-        collection schema 1: [pk, float_vector]
-        data file: .npy file
-        Steps:
-        1. create collection
-        2. import data
-        3. if row_based: verify import failed
-        4. if column_based:
-        4.1 verify the data entities equal the import data
-        4.2 verify search and query successfully
-        """
-        vec_field = f"vectors_{dim}d_{entities}"
-        self._connect()
-        c_name = cf.gen_unique_str()
-        wrong_dim = dim + 1
-        fields = [cf.gen_int64_field(name=pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=vec_field, dim=wrong_dim)]
-        schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-
-        # import data
-        files = [f"{vec_field}.npy"]  # npy file name shall be the vector field name
-        if not auto_id:
-            files.append(f"col_uid_only_{dim}d_{entities}.json")
-        t0 = time.time()
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  row_based=False,
-                                                  files=files)
-        logging.info(f"bulk load task ids:{task_ids}")
-        success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids,
-                                                                               timeout=30)
-        tt = time.time() - t0
-        log.info(f"bulk load state:{success} in {tt}")
-
-        assert not success
-        failed_reason = f"Numpy parse: illegal row width {dim} for field {vec_field} dimension {wrong_dim}"
-        for state in states.values():
-            assert state.state_name == "BulkLoadFailed"
-            assert failed_reason in state.infos.get("failed_reason", "")
-        assert self.collection_wrap.num_entities == 0
-
-    @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("auto_id", [True, False])
-    @pytest.mark.parametrize("dim", [128])  # 128
-    @pytest.mark.parametrize("entities", [1000])  # 1000
-    def test_wrong_field_name_in_numpy(self, auto_id, dim, entities):
-        """
-        collection schema 1: [pk, float_vector]
-        data file: .npy file
-        Steps:
-        1. create collection
-        2. import data
-        3. if row_based: verify import failed
-        4. if column_based:
-        4.1 verify the data entities equal the import data
-        4.2 verify search and query successfully
-        """
-        vec_field = f"vectors_{dim}d_{entities}"
-        self._connect()
-        c_name = cf.gen_unique_str()
-        wrong_vec_field = f"wrong_{vec_field}"
-        fields = [cf.gen_int64_field(name=pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=wrong_vec_field, dim=dim)]
-        schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-
-        # import data
-        files = [f"{vec_field}.npy"]  # npy file name shall be the vector field name
-        if not auto_id:
-            files.append(f"col_uid_only_{dim}d_{entities}.json")
-        t0 = time.time()
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  row_based=False,
-                                                  files=files)
-        logging.info(f"bulk load task ids:{task_ids}")
-        success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(task_ids=task_ids,
-                                                                               timeout=30)
-        tt = time.time() - t0
-        log.info(f"bulk load state:{success} in {tt}")
-
-        assert not success
-        failed_reason = f"Numpy parse: the field {vec_field} doesn't exist"
-        for state in states.values():
-            assert state.state_name == "BulkLoadFailed"
-            assert failed_reason in state.infos.get("failed_reason", "")
-        assert self.collection_wrap.num_entities == 0
-
-    @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("row_based", [True, False])
-    @pytest.mark.parametrize("dim", [8])
-    @pytest.mark.parametrize("entities", [10])
-    def test_data_type_string_on_int_pk(self, row_based, dim, entities):
-        """
-        collection schema: [pk, float_vectors, int_scalar], one of entities has wrong dim data
-        data file: json file with one of entities has string on int pk
-        Steps:
-        1. create collection
-        2. import data with row_based=False
-        3. verify import failed
-        """
-        err_string_on_pk = "iamstring"
-        prefix = gen_file_prefix(row_based=row_based, auto_id=False, prefix="err_str_on_int_pk_")
-        files = [f"{prefix}_float_vectors_multi_scalars_{dim}d_{entities}_0.json"]
-        self._connect()
-        c_name = cf.gen_unique_str(prefix)
-        # TODO: add string pk
-        fields = [cf.gen_int64_field(name=pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=vec_field, dim=dim),
-                  cf.gen_int32_field(name=int_field),
-                  # TODO: string is not supported, exception when collection.load
-                  # cf.gen_string_field(name="string_scalar")
-                  cf.gen_bool_field(name=bool_field)
-                  ]
-        schema = cf.gen_collection_schema(fields=fields, auto_id=False)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-        # import data
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  row_based=row_based,
-                                                  files=files)
-        logging.info(f"bulk load task ids:{task_ids}")
-        success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(
-            task_ids=task_ids,
-            timeout=30)
-        log.info(f"bulk load state:{success}")
-        assert not success
-        failed_reason = f"illegal numeric value {err_string_on_pk} at the row"
-        for state in states.values():
-            assert state.state_name == "BulkLoadFailed"
-            assert failed_reason in state.infos.get("failed_reason", "")
-        assert self.collection_wrap.num_entities == 0
-
-    @pytest.mark.tags(CaseLabel.L3)
-    @pytest.mark.parametrize("row_based", [True, False])
-    @pytest.mark.parametrize("auto_id", [True, False])
-    @pytest.mark.parametrize("dim", [8])
-    @pytest.mark.parametrize("entities", [10])
-    def test_data_type_int_on_float_scalar(self, row_based, auto_id, dim, entities):
-        """
-        collection schema: [pk, float_vector,
-                            float_scalar, int_scalar, string_scalar, bool_scalar]
-        data files: json file that one of entities has typo on boolean field
-        Steps:
-        1. create collection
-        2. import data
-        3. verify import failed with errors
-        """
-        prefix = gen_file_prefix(row_based=row_based, auto_id=auto_id, prefix="err_typo_on_bool_")
-        files = [f"{prefix}_float_vectors_multi_scalars_{dim}d_{entities}_0.json"]
-        self._connect()
-        c_name = cf.gen_unique_str(prefix)
-        # TODO: add string pk
-        fields = [cf.gen_int64_field(name=pk_field, is_primary=True),
-                  cf.gen_float_vec_field(name=vec_field, dim=dim),
-                  cf.gen_int32_field(name=int_field),
-                  cf.gen_float_field(name=float_field),
-                  # TODO: string is not supported, exception when collection.load
-                  # cf.gen_string_field(name=string_field)
-                  cf.gen_bool_field(name=bool_field)
-                  ]
-        schema = cf.gen_collection_schema(fields=fields, auto_id=auto_id)
-        self.collection_wrap.init_collection(c_name, schema=schema)
-        # import data
-        task_ids, _ = self.utility_wrap.bulk_load(collection_name=c_name,
-                                                  row_based=row_based,
-                                                  files=files)
-        logging.info(f"bulk load task ids:{task_ids}")
-        success, states = self.utility_wrap.wait_for_bulk_load_tasks_completed(
-            task_ids=task_ids,
-            timeout=30)
-        log.info(f"bulk load state:{success}")
-        assert not success
-        failed_reason1 = "illegal value"
-        failed_reason2 = "invalid character"
-        for state in states.values():
-            assert state.state_name == "BulkLoadFailed"
-            assert failed_reason1 in state.infos.get("failed_reason", "") or \
-                   failed_reason2 in state.infos.get("failed_reason", "")
-        assert self.collection_wrap.num_entities == 0
-
-        #
-        # assert success
-        # assert self.collection_wrap.num_entities == entities
-        #
-        # self.collection_wrap.load()
-        #
-        # # the pk value was automatically convert to int from float
-        # res, _ = self.collection_wrap.query(expr=f"{float_field} in [1.0]", output_fields=[float_field])
-        # assert res[0].get(float_field, 0) == 1.0
-
-
-        # TODO: string data on float field
-
-
-# class TestImportAdvanced(TestcaseBase):
-#
-#     def setup_class(self):
-#         log.info("[setup_import] Start setup class...")
-#         log.info("copy data files to minio")
-#
-#     def teardown_class(self):
-#         log.info("[teardown_import] Start teardown class...")
-#         log.info("clean up data files in minio")
-#
-#     """Validate data consistency and availability during import"""
-#     @pytest.mark.tags(CaseLabel.L3)
-#     def test_default(self):
-#         pass
-
diff --git a/tests/python_client/testcases/test_utility.py b/tests/python_client/testcases/test_utility.py
index 3eaea4bf7f..965cd5d087 100644
--- a/tests/python_client/testcases/test_utility.py
+++ b/tests/python_client/testcases/test_utility.py
@@ -308,6 +308,7 @@ class TestUtilityParams(TestcaseBase):
         self.utility_wrap.drop_collection(c_name)
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_left_vector_invalid_type(self, get_invalid_vector_dict):
         """
         target: test calculated distance with invalid vectors
@@ -324,6 +325,7 @@ class TestUtilityParams(TestcaseBase):
                                                     "is illegal".format(invalid_vector)})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_left_vector_invalid_value(self, get_invalid_vector_dict):
         """
         target: test calculated distance with invalid vectors
@@ -340,6 +342,7 @@ class TestUtilityParams(TestcaseBase):
                                                     "is illegal".format(invalid_vector)})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_right_vector_invalid_type(self, get_invalid_vector_dict):
         """
         target: test calculated distance with invalid vectors
@@ -358,6 +361,7 @@ class TestUtilityParams(TestcaseBase):
                                                     "is illegal".format(invalid_vector)})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_right_vector_invalid_value(self, get_invalid_vector_dict):
         """
         target: test calculated distance with invalid vectors
@@ -376,6 +380,7 @@ class TestUtilityParams(TestcaseBase):
                                                     "is illegal".format(invalid_vector)})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_invalid_metric_type(self, get_support_metric_field, get_invalid_metric_type):
         """
         target: test calculated distance with invalid metric
@@ -397,6 +402,7 @@ class TestUtilityParams(TestcaseBase):
                                                     "is illegal".format(metric)})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_invalid_metric_value(self, get_support_metric_field, get_invalid_metric_value):
         """
         target: test calculated distance with invalid metric
@@ -418,6 +424,7 @@ class TestUtilityParams(TestcaseBase):
                                                     "float vector".format(metric)})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_not_support_metric(self, get_support_metric_field, get_not_support_metric):
         """
         target: test calculated distance with invalid metric
@@ -439,6 +446,7 @@ class TestUtilityParams(TestcaseBase):
                                                     "float vector".format(metric)})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_invalid_using(self, get_support_metric_field):
         """
         target: test calculated distance with invalid using
@@ -459,6 +467,7 @@ class TestUtilityParams(TestcaseBase):
                                       "err_msg": "should create connect"})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_not_match_dim(self):
         """
         target: test calculated distance with invalid vectors
@@ -478,6 +487,7 @@ class TestUtilityParams(TestcaseBase):
                                       "vectors with different dimension"})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_collection_before_load(self, get_support_metric_field):
         """
         target: test calculated distance when entities is not ready
@@ -1010,6 +1020,7 @@ class TestUtilityBase(TestcaseBase):
             sleep(1)
 
     @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_default(self):
         """
         target: test calculated distance with default params
@@ -1030,6 +1041,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "vectors_r": vectors_r})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
    def test_calc_distance_default_sqrt(self, metric_field, metric):
         """
         target: test calculated distance with default param
@@ -1052,6 +1064,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "metric": metric})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_default_metric(self, sqrt):
         """
         target: test calculated distance with default param
@@ -1074,6 +1087,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "sqrt": sqrt})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_binary_metric(self, metric_field, metric_binary):
         """
         target: test calculate distance with binary vectors
@@ -1099,6 +1113,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "metric": metric_binary})
 
     @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_from_collection_ids(self, metric_field, metric, sqrt):
         """
         target: test calculated distance from collection entities
@@ -1130,6 +1145,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "sqrt": sqrt})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_from_collections(self, metric_field, metric, sqrt):
         """
         target: test calculated distance between entities from collections
@@ -1160,6 +1176,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "sqrt": sqrt})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_left_vector_and_collection_ids(self, metric_field, metric, sqrt):
         """
         target: test calculated distance from collection entities
@@ -1190,6 +1207,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "sqrt": sqrt})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_right_vector_and_collection_ids(self, metric_field, metric, sqrt):
         """
         target: test calculated distance from collection entities
@@ -1218,6 +1236,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "sqrt": sqrt})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_from_partition_ids(self, metric_field, metric, sqrt):
         """
         target: test calculated distance from one partition entities
@@ -1252,6 +1271,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "sqrt": sqrt})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_from_partitions(self, metric_field, metric, sqrt):
         """
         target: test calculated distance between entities from partitions
@@ -1281,6 +1301,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "sqrt": sqrt})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_left_vectors_and_partition_ids(self, metric_field, metric, sqrt):
         """
         target: test calculated distance between vectors and partition entities
@@ -1314,6 +1335,7 @@ class TestUtilityBase(TestcaseBase):
                                                      "sqrt": sqrt})
 
     @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.skip(reason="calc_distance interface is no longer supported")
     def test_calc_distance_right_vectors_and_partition_ids(self, metric_field, metric, sqrt):
         """
         target: test calculated distance between vectors and partition entities
@@ -3340,7 +3362,7 @@ class TestUtilityRBAC(TestcaseBase):
         collection_w.flush(check_task=CheckTasks.check_permission_deny)
         default_term_expr = f'{ct.default_int64_field_name} in [0, 1]'
         collection_w.query(default_term_expr, check_task=CheckTasks.check_permission_deny)
-        # self.utility_wrap.bulk_load(c_name, check_task=CheckTasks.check_permission_deny)
+        # self.utility_wrap.bulk_insert(c_name, check_task=CheckTasks.check_permission_deny)
 
         # Global permission deny
         self.init_collection_wrap(name=c_name_2, check_task=CheckTasks.check_permission_deny)
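
Editorial note (not part of the patch): the test_utility.py hunks above only add a pytest skip marker to each calc_distance case rather than deleting them. For readers unfamiliar with the marker, a minimal stand-alone sketch of the same pattern follows; the test name and body here are hypothetical, and the real cases depend on this repo's TestcaseBase fixtures rather than a bare function.

import pytest

# Hypothetical, self-contained illustration of the skip pattern used in the hunks above.
# pytest still collects the test but reports it as skipped without running the body.
@pytest.mark.skip(reason="calc_distance interface is no longer supported")
def test_calc_distance_placeholder():
    assert True  # never executed while the skip marker is present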