diff --git a/tests/python_client/common/bulk_insert_data.py b/tests/python_client/common/bulk_insert_data.py
index 0913a98f54..4c9bfc9bad 100644
--- a/tests/python_client/common/bulk_insert_data.py
+++ b/tests/python_client/common/bulk_insert_data.py
@@ -34,6 +34,7 @@ class DataField:
     fp16_vec_field = "float16_vec_field"
     int_field = "int_scalar"
     string_field = "string_scalar"
+    text_field = "text_scalar"
     bool_field = "bool_scalar"
     float_field = "float_scalar"
     double_field = "double_scalar"
@@ -403,6 +404,23 @@ def gen_string_in_numpy_file(dir, data_field, rows, start=0, force=False):
     return file_name


+def gen_text_in_numpy_file(dir, data_field, rows, start=0, force=False, nullable=False):
+    file_name = f"{data_field}.npy"
+    file = f"{dir}/{file_name}"
+    if not os.path.exists(file) or force:
+        # non vector columns
+        data = []
+        if rows > 0:
+            data = [fake.text() + " milvus " for i in range(start, rows+start)]
+        if nullable:
+            data = [None if random.random() < 0.5 else fake.text() + " milvus " for _ in range(rows)]
+        arr = np.array(data)
+        # print(f"file_name: {file_name} data type: {arr.dtype}")
+        log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}")
+        np.save(file, arr)
+    return file_name
+
+
 def gen_dynamic_field_in_numpy_file(dir, rows, start=0, force=False):
     file_name = f"$meta.npy"
     file = f"{dir}/{file_name}"
@@ -553,6 +571,11 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
             data = [gen_unique_str(str(i)) for i in range(start, rows + start)]
         else:
             data = [None for _ in range(start, rows + start)]
+    elif data_field == DataField.text_field:
+        if not nullable:
+            data = [fake.text() + " milvus " for i in range(start, rows + start)]
+        else:
+            data = [None if random.random() < 0.5 else fake.text() + " milvus " for _ in range(start, rows + start)]
     elif data_field == DataField.bool_field:
         if not nullable:
             data = [random.choice([True, False]) for i in range(start, rows + start)]
@@ -573,7 +596,7 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
                 for i in range(start, rows + start)])
         else:
             data = pd.Series(
-                [np.array(None) for i in range(start, rows + start)])
+                [None for i in range(start, rows + start)])
     elif data_field == DataField.array_int_field:
         if not nullable:
             data = pd.Series(
@@ -581,7 +604,7 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
                 for i in range(start, rows + start)])
         else:
             data = pd.Series(
-                [np.array(None) for i in range(start, rows + start)])
+                [None for i in range(start, rows + start)])
    elif data_field == DataField.array_float_field:
         if not nullable:
             data = pd.Series(
@@ -589,7 +612,7 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
                 for i in range(start, rows + start)])
         else:
             data = pd.Series(
-                [np.array(None) for i in range(start, rows + start)])
+                [None for i in range(start, rows + start)])
     elif data_field == DataField.array_string_field:
         if not nullable:
             data = pd.Series(
@@ -597,7 +620,9 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
                 for i in range(start, rows + start)])
         else:
             data = pd.Series(
-                [np.array(None) for i in range(start, rows + start)])
+                [None for i in range(start, rows + start)])
+    else:
+        raise Exception("unsupported field name")
     return data


@@ -714,6 +739,14 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
         elif data_field == DataField.string_field:
             if not nullable:
                 d[data_field] = gen_unique_str(str(r + start))
+        elif data_field == DataField.text_field:
+            if not nullable:
+                d[data_field] = fake.text() + " milvus "
+            else:
+                if random.random() < 0.5:
+                    d[data_field] = None
+                else:
+                    d[data_field] = fake.text() + " milvus "
         elif data_field == DataField.bool_field:
             if not nullable:
                 d[data_field] = random.choice([True, False])
@@ -746,6 +779,8 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
                 d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)]
             else:
                 d[data_field] = None
+        else:
+            raise Exception("unsupported field name")
         if enable_dynamic_field:
             d[str(r+start)] = r+start
             d["name"] = fake.name()
@@ -845,6 +880,8 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
                                                  vector_type=vector_type, rows=rows, dim=dim, force=force)
         elif data_field == DataField.string_field:  # string field for numpy not supported yet at 2022-10-17
             file_name = gen_string_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
+        elif data_field == DataField.text_field:
+            file_name = gen_text_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force, nullable=nullable)
         elif data_field == DataField.bool_field:
             file_name = gen_bool_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
         elif data_field == DataField.json_field:
diff --git a/tests/python_client/testcases/test_bulk_insert.py b/tests/python_client/testcases/test_bulk_insert.py
index b29098fc00..c82d52b46f 100644
--- a/tests/python_client/testcases/test_bulk_insert.py
+++ b/tests/python_client/testcases/test_bulk_insert.py
@@ -770,6 +770,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
             cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field, nullable=nullable),
             cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
+            cf.gen_string_field(name=df.text_field, enable_match=True, nullable=nullable),
             cf.gen_json_field(name=df.json_field, nullable=nullable),
             cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
             cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
@@ -895,6 +896,11 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         query_data = [r[expr_field] for r in res][:len(self.collection_wrap.partitions)]
         res, _ = self.collection_wrap.query(expr=f"{expr_field} in {query_data}", output_fields=[expr_field])
         assert len(res) == len(query_data)
+        res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
+        if nullable is False:
+            assert len(res) == entities
+        else:
+            assert 0 < len(res) < entities
         if enable_partition_key:
             assert len(self.collection_wrap.partitions) > 1

@@ -929,6 +935,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
             cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field),
             cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
+            cf.gen_string_field(name=df.text_field, enable_match=True, nullable=nullable),
             cf.gen_json_field(name=df.json_field),
             cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
             cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
@@ -1042,6 +1049,11 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
         res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
         assert len(res) == len(query_data)
+        res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
+        if nullable is False:
+            assert len(res) == entities
+        else:
+            assert 0 < len(res) < entities
         if enable_partition_key:
             assert len(self.collection_wrap.partitions) > 1

@@ -1065,8 +1077,6 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
         """
         if enable_dynamic_field is False and include_meta is True:
             pytest.skip("include_meta only works with enable_dynamic_field")
-        if nullable is True:
-            pytest.skip("issue #36252")
         if enable_partition_key is True and nullable is True:
             pytest.skip("partition key field not support nullable")
         float_vec_field_dim = dim
@@ -1078,6 +1088,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
             cf.gen_int64_field(name=df.int_field, nullable=nullable),
             cf.gen_float_field(name=df.float_field, nullable=nullable),
             cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key, nullable=nullable),
+            cf.gen_string_field(name=df.text_field, enable_match=True, nullable=nullable),
             cf.gen_json_field(name=df.json_field, nullable=nullable),
             cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64, nullable=nullable),
             cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
@@ -1191,10 +1202,17 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
            assert "address" in fields_from_search
         # query data
         res, _ = self.collection_wrap.query(expr=f"{df.string_field} >= '0'", output_fields=[df.string_field])
-        assert len(res) == entities
+        if nullable is False:
+            assert len(res) == entities
         query_data = [r[df.string_field] for r in res][:len(self.collection_wrap.partitions)]
         res, _ = self.collection_wrap.query(expr=f"{df.string_field} in {query_data}", output_fields=[df.string_field])
-        assert len(res) == len(query_data)
+        if nullable is False:
+            assert len(res) == len(query_data)
+        res, _ = self.collection_wrap.query(expr=f"TextMatch({df.text_field}, 'milvus')", output_fields=[df.text_field])
+        if nullable is False:
+            assert len(res) == entities
+        else:
+            assert 0 < len(res) < entities
         if enable_partition_key:
             assert len(self.collection_wrap.partitions) > 1

diff --git a/tests/scripts/ci_e2e_4am.sh b/tests/scripts/ci_e2e_4am.sh
index f348d0f43e..854c1cf9df 100755
--- a/tests/scripts/ci_e2e_4am.sh
+++ b/tests/scripts/ci_e2e_4am.sh
@@ -116,10 +116,10 @@ fi

 if [[ "${MILVUS_HELM_RELEASE_NAME}" != *"msop"* ]]; then
   if [[ -n "${TEST_TIMEOUT:-}" ]]; then
-    timeout "${TEST_TIMEOUT}" pytest testcases --endpoint http://${MILVUS_SERVICE_NAME}:${MILVUS_SERVICE_PORT} --minio_host ${MINIO_SERVICE_NAME} -v -x -m BulkInsert -n 6 --timeout 180\
+    timeout "${TEST_TIMEOUT}" pytest testcases --endpoint http://${MILVUS_SERVICE_NAME}:${MILVUS_SERVICE_PORT} --minio_host ${MINIO_SERVICE_NAME} -v -x -m BulkInsert -n 6 --timeout 240\
     --html=${CI_LOG_PATH}/report_restful.html --self-contained-html
   else
-    pytest testcases --endpoint http://${MILVUS_SERVICE_NAME}:${MILVUS_SERVICE_PORT} --minio_host ${MINIO_SERVICE_NAME} -v -x -m BulkInsert -n 6 --timeout 180\
+    pytest testcases --endpoint http://${MILVUS_SERVICE_NAME}:${MILVUS_SERVICE_PORT} --minio_host ${MINIO_SERVICE_NAME} -v -x -m BulkInsert -n 6 --timeout 240\
     --html=${CI_LOG_PATH}/report_restful.html --self-contained-html
   fi
fi