test: update bulk insert bench test (#29534)

update bulk insert bench test

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>

parent 3824282d7b
commit fdbef35745
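The commit threads a new row_group_size parameter from the bench-test helpers down to DataFrame.to_parquet, so generated bulk-insert files can control how many rows land in each Parquet row group, and it adds the max_length constraint that VARCHAR array elements need. A minimal sketch of the underlying pandas/pyarrow behavior (file name and row counts here are illustrative, not taken from the diff):

import pandas as pd

df = pd.DataFrame({"id": range(10_000)})

# pandas forwards extra keyword arguments to the engine, so with
# engine='pyarrow' this becomes pyarrow.parquet.write_table(..., row_group_size=1000):
# the file is written with at most 1000 rows per row group.
df.to_parquet("example.parquet", engine="pyarrow", row_group_size=1000)

Smaller row groups trade file size for finer-grained reads, which is exactly the knob a bulk-insert benchmark wants to vary.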
@@ -87,7 +87,7 @@ class TestBulkInsertPerf(TestcaseBaseBulkInsert):
             cf.gen_json_field(name=df.json_field),
             cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
             cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=200),
             cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
             cf.gen_float_vec_field(name=df.vec_field, dim=dim),
         ]
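The only functional change in this hunk is adding max_length=200 to the VARCHAR array field. A hedged sketch of what the cf.gen_array_field wrapper presumably builds underneath, using the public pymilvus schema API (max_capacity=100 is an illustrative value, not taken from the diff):

from pymilvus import FieldSchema, DataType

# VARCHAR elements inside an ARRAY field need a per-element max_length,
# just like a scalar VARCHAR field; max_capacity bounds the element count.
# max_capacity=100 is hypothetical -- the repo wrapper may choose differently.
array_str = FieldSchema(name="array_string_field", dtype=DataType.ARRAY,
                        element_type=DataType.VARCHAR,
                        max_capacity=100, max_length=200)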
@@ -99,6 +99,7 @@ class TestBulkInsertPerf(TestcaseBaseBulkInsert):
             dim=dim,
             data_fields=data_fields,
             file_size=file_size,
+            row_group_size=None,
             file_nums=file_nums,
             array_length=array_len,
             enable_dynamic_field=enable_dynamic_field,
@@ -146,7 +147,7 @@ class TestBulkInsertPerf(TestcaseBaseBulkInsert):
             cf.gen_json_field(name=df.json_field),
             cf.gen_array_field(name=df.array_int_field, element_type=DataType.INT64),
             cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT),
-            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR),
+            cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=200),
             cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL),
             cf.gen_float_vec_field(name=df.vec_field, dim=dim),
         ]
@@ -670,7 +670,7 @@ def gen_dynamic_field_data_in_parquet_file(rows, start=0):
     return data


-def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, file_nums=1, array_length=None, err_type="", enable_dynamic_field=False):
+def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, row_group_size=None, file_nums=1, array_length=None, err_type="", enable_dynamic_field=False):
     # gen numpy files
     if err_type == "":
         err_type = "none"
@@ -690,8 +690,10 @@ def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, file
         df = pd.DataFrame(all_field_data)
         log.info(f"df: \n{df}")
         file_name = f"data-fields-{len(data_fields)}-rows-{rows}-dim-{dim}-file-num-{file_nums}-error-{err_type}-{int(time.time())}.parquet"
-        df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow')
+        if row_group_size is not None:
+            df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow', row_group_size=row_group_size)
+        else:
+            df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow')

         # get the file size
         if file_size is not None:
             batch_file_size = os.path.getsize(f"{data_source}/{file_name}")
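If the bench ever needs to confirm that the requested row-group size took effect, pyarrow exposes the file metadata directly; a small check along these lines (the file path is a placeholder):

import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")
print(pf.num_row_groups)                  # how many row groups the writer produced
print(pf.metadata.row_group(0).num_rows)  # rows in the first row group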
@@ -702,6 +704,9 @@ def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, file
             all_df = pd.concat([df for _ in range(total_batch)], axis=0, ignore_index=True)
             file_name = f"data-fields-{len(data_fields)}-rows-{total_rows}-dim-{dim}-file-num-{file_nums}-error-{err_type}-{int(time.time())}.parquet"
             log.info(f"all df: \n {all_df}")
-            all_df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow')
+            if row_group_size is not None:
+                all_df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow', row_group_size=row_group_size)
+            else:
+                all_df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow')
             batch_file_size = os.path.getsize(f"{data_source}/{file_name}")
             log.info(f"file_size with rows {total_rows} for {file_name}: {batch_file_size/1024/1024} MB")
@@ -717,6 +722,9 @@ def gen_parquet_files(float_vector, rows, dim, data_fields, file_size=None, file
                 all_field_data["$meta"] = gen_dynamic_field_data_in_parquet_file(rows=rows, start=0)
             df = pd.DataFrame(all_field_data)
             file_name = f"data-fields-{len(data_fields)}-rows-{rows}-dim-{dim}-file-num-{i}-error-{err_type}-{int(time.time())}.parquet"
-            df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow')
+            if row_group_size is not None:
+                df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow', row_group_size=row_group_size)
+            else:
+                df.to_parquet(f"{data_source}/{file_name}", engine='pyarrow')
             files.append(file_name)
             start_uid += rows
@@ -847,7 +855,7 @@ def prepare_bulk_insert_numpy_files(minio_endpoint="", bucket_name="milvus-bucke
     return files


-def prepare_bulk_insert_parquet_files(minio_endpoint="", bucket_name="milvus-bucket", rows=100, dim=128, array_length=None, file_size=None,
+def prepare_bulk_insert_parquet_files(minio_endpoint="", bucket_name="milvus-bucket", rows=100, dim=128, array_length=None, file_size=None, row_group_size=None,
                                       enable_dynamic_field=False, data_fields=[DataField.vec_field], float_vector=True, file_nums=1, force=False):
     """
     Generate column based files based on params in parquet format and copy them to the minio
@@ -879,7 +887,7 @@ def prepare_bulk_insert_parquet_files(minio_endpoint="", bucket_name="milvus-buc
     File name list or file name with sub-folder list
     """
     files = gen_parquet_files(rows=rows, dim=dim, float_vector=float_vector, enable_dynamic_field=enable_dynamic_field,
-                              data_fields=data_fields, array_length=array_length, file_size=file_size,
+                              data_fields=data_fields, array_length=array_length, file_size=file_size, row_group_size=row_group_size,
                               file_nums=file_nums)
     copy_files_to_minio(host=minio_endpoint, r_source=data_source, files=files, bucket_name=bucket_name, force=force)
     return files
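A hypothetical call showing the new knob end to end, based only on the signature above; the endpoint is a placeholder and the row counts are illustrative:

files = prepare_bulk_insert_parquet_files(
    minio_endpoint="127.0.0.1:9000",  # placeholder MinIO endpoint
    bucket_name="milvus-bucket",
    rows=100000,
    dim=128,
    data_fields=[DataField.vec_field],
    row_group_size=10000,             # cap each Parquet row group at 10k rows
)

Leaving row_group_size as None preserves the old behavior, since the else branch writes with pyarrow's default row-group sizing.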