test: add geometry datatype in import testcases (#45014)

/kind improvement

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
This commit is contained in:
zhuwenxing 2025-11-04 16:55:33 +08:00 committed by GitHub
parent 6327c9a514
commit 06933c25b8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 81 additions and 1 deletions

View File

@ -45,6 +45,7 @@ class DataField:
array_float_field = "array_float" array_float_field = "array_float"
array_string_field = "array_string" array_string_field = "array_string"
new_field = "new_field" new_field = "new_field"
geo_field = "geo"
class DataErrorType: class DataErrorType:
@ -100,6 +101,51 @@ def gen_binary_vectors(nb, dim):
return vectors return vectors
def gen_wkt_geometry(nb, bounds=(0, 100, 0, 100)):
    """
    Generate random WKT geometry strings for bulk insert.

    Each string is independently chosen to be a POINT, a LINESTRING with
    2 to 5 vertices, or an axis-aligned rectangular POLYGON. Coordinates are
    formatted with two decimal places.

    Args:
        nb: Number of geometry strings to generate
        bounds: Coordinate bounds as (min_x, max_x, min_y, max_y); min <= max
            is expected on each axis

    Returns:
        List of WKT strings (empty when nb <= 0)
    """
    min_x, max_x, min_y, max_y = bounds[0], bounds[1], bounds[2], bounds[3]

    # Rectangle sides are drawn from [max_side / 2, max_side]. The cap of 20 is
    # shrunk to the available span so that polygons stay inside bounds that are
    # narrower than 20 units; with the default bounds this is exactly the
    # previous 10..20 range and the same sequence of random draws.
    max_width = min(20, max_x - min_x)
    max_height = min(20, max_y - min_y)

    geom_types = ["POINT", "LINESTRING", "POLYGON"]
    geometries = []
    for _ in range(nb):
        geom_type = random.choice(geom_types)
        if geom_type == "POINT":
            x = random.uniform(min_x, max_x)
            y = random.uniform(min_y, max_y)
            wkt = f"POINT ({x:.2f} {y:.2f})"
        elif geom_type == "LINESTRING":
            points = []
            for _ in range(random.randint(2, 5)):
                x = random.uniform(min_x, max_x)
                y = random.uniform(min_y, max_y)
                points.append(f"{x:.2f} {y:.2f}")
            wkt = f"LINESTRING ({', '.join(points)})"
        else:  # POLYGON
            # Lower-left corner is placed so that the largest possible
            # rectangle still fits inside the bounds.
            x = random.uniform(min_x, max_x - max_width)
            y = random.uniform(min_y, max_y - max_height)
            width = random.uniform(max_width / 2, max_width)
            height = random.uniform(max_height / 2, max_height)
            x2 = x + width
            y2 = y + height
            # The ring repeats its first vertex at the end, as WKT requires.
            wkt = (
                f"POLYGON (({x:.2f} {y:.2f}, {x2:.2f} {y:.2f}, "
                f"{x2:.2f} {y2:.2f}, {x:.2f} {y2:.2f}, {x:.2f} {y:.2f}))"
            )
        geometries.append(wkt)
    return geometries
def gen_fp16_vectors(num, dim, for_json=False): def gen_fp16_vectors(num, dim, for_json=False):
""" """
generate float16 vector data generate float16 vector data
@ -468,6 +514,19 @@ def gen_json_in_numpy_file(dir, data_field, rows, start=0, force=False):
return file_name return file_name
def gen_geometry_in_numpy_file(dir, data_field, rows, start=0, force=False):
    """
    Save randomly generated WKT geometry strings to "<dir>/<data_field>.npy".

    The file is (re)written only when it does not exist yet or when force is
    set; otherwise the existing file is left untouched. When rows is not
    positive an empty array is saved. start is accepted but not used here.

    Returns:
        The file name (without the directory part)
    """
    file_name = f"{data_field}.npy"
    target = f"{dir}/{file_name}"

    # Nothing to do when the file is already there and no rewrite was requested.
    if os.path.exists(target) and not force:
        return file_name

    wkt_rows = gen_wkt_geometry(rows) if rows > 0 else []
    arr = np.array(wkt_rows)
    log.info(f"file_name: {file_name} data type: {arr.dtype} data shape: {arr.shape}")
    np.save(target, arr)
    return file_name
def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False, nullable=False, **kwargs): def gen_int_or_float_in_numpy_file(dir, data_field, rows, start=0, force=False, nullable=False, **kwargs):
file_name = f"{data_field}.npy" file_name = f"{data_field}.npy"
file = f"{dir}/{file_name}" file = f"{dir}/{file_name}"
@ -635,6 +694,12 @@ def gen_data_by_data_field(data_field, rows, start=0, float_vector=True, dim=128
for i in range(start, rows + start)]) for i in range(start, rows + start)])
else: else:
data = [None for _ in range(start, rows + start)] data = [None for _ in range(start, rows + start)]
elif data_field == DataField.geo_field:
if not nullable:
# Generate WKT geometry strings for parquet
data = gen_wkt_geometry(rows)
else:
data = [None for _ in range(start, rows + start)]
else: else:
raise Exception("unsupported field name") raise Exception("unsupported field name")
@ -796,6 +861,12 @@ def gen_dict_data_by_data_field(data_fields, rows, start=0, float_vector=True, d
d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)] d[data_field] = [gen_unique_str(str(i)) for i in range(array_length)]
else: else:
d[data_field] = None d[data_field] = None
elif data_field == DataField.geo_field:
if not nullable:
# Generate a single WKT geometry string
d[data_field] = gen_wkt_geometry(1)[0]
else:
d[data_field] = None
else: else:
raise Exception("unsupported field name") raise Exception("unsupported field name")
if enable_dynamic_field: if enable_dynamic_field:
@ -906,6 +977,8 @@ def gen_npy_files(float_vector, rows, dim, data_fields, file_size=None, file_num
file_name = gen_bool_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force) file_name = gen_bool_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
elif data_field == DataField.json_field: elif data_field == DataField.json_field:
file_name = gen_json_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force) file_name = gen_json_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
elif data_field == DataField.geo_field:
file_name = gen_geometry_in_numpy_file(dir=data_source_new, data_field=data_field, rows=rows, force=force)
else: else:
file_name = gen_int_or_float_in_numpy_file(dir=data_source_new, data_field=data_field, file_name = gen_int_or_float_in_numpy_file(dir=data_source_new, data_field=data_field,
rows=rows, force=force, nullable=nullable, shuffle_pk=shuffle_pk) rows=rows, force=force, nullable=nullable, shuffle_pk=shuffle_pk)

View File

@ -684,9 +684,13 @@ def gen_json_field(name=ct.default_json_field_name, description=ct.default_desc,
def gen_geometry_field(name="geo", description=ct.default_desc, is_primary=False, **kwargs):
    """
    Build a GEOMETRY field by delegating to gen_scalar_field.

    Args:
        name: Field name, "geo" by default
        description: Field description
        is_primary: Forwarded to gen_scalar_field as is_primary
        **kwargs: Extra keyword arguments forwarded unchanged to gen_scalar_field

    Returns:
        Whatever gen_scalar_field returns for DataType.GEOMETRY
    """
    # This function used to be defined twice in a row; the later definition
    # rebinds the name, so only its signature was ever in effect. It is kept
    # here as the single definition.
    # NOTE(review): the dropped definition defaulted name to
    # ct.default_geometry_field_name - confirm that "geo" is the intended default.
    return gen_scalar_field(DataType.GEOMETRY, name=name, description=description, is_primary=is_primary, **kwargs)
def gen_array_field(name=ct.default_array_field_name, element_type=DataType.INT64, max_capacity=ct.default_max_capacity, def gen_array_field(name=ct.default_array_field_name, element_type=DataType.INT64, max_capacity=ct.default_max_capacity,
description=ct.default_desc, is_primary=False, **kwargs): description=ct.default_desc, is_primary=False, **kwargs):
return gen_scalar_field(DataType.ARRAY, name=name, description=description, is_primary=is_primary, return gen_scalar_field(DataType.ARRAY, name=name, description=description, is_primary=is_primary,
element_type=element_type, max_capacity=max_capacity, **kwargs) element_type=element_type, max_capacity=max_capacity, **kwargs)
def gen_int8_field(name=ct.default_int8_field_name, description=ct.default_desc, is_primary=False, **kwargs): def gen_int8_field(name=ct.default_int8_field_name, description=ct.default_desc, is_primary=False, **kwargs):

View File

@ -789,6 +789,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable), cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable), cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable), cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
cf.gen_geometry_field(name=df.geo_field),
cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim), cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim), cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim), cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim),
@ -984,6 +985,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key), cf.gen_string_field(name=df.string_field, is_partition_key=enable_partition_key),
cf.gen_string_field(name=df.text_field, enable_analyzer=True, enable_match=True, nullable=nullable), cf.gen_string_field(name=df.text_field, enable_analyzer=True, enable_match=True, nullable=nullable),
cf.gen_json_field(name=df.json_field), cf.gen_json_field(name=df.json_field),
cf.gen_geometry_field(name=df.geo_field),
cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim), cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim), cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim), cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim),
@ -1165,6 +1167,7 @@ class TestBulkInsert(TestcaseBaseBulkInsert):
cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable), cf.gen_array_field(name=df.array_float_field, element_type=DataType.FLOAT, nullable=nullable),
cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable), cf.gen_array_field(name=df.array_string_field, element_type=DataType.VARCHAR, max_length=100, nullable=nullable),
cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable), cf.gen_array_field(name=df.array_bool_field, element_type=DataType.BOOL, nullable=nullable),
cf.gen_geometry_field(name=df.geo_field),
cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim), cf.gen_float_vec_field(name=df.float_vec_field, dim=float_vec_field_dim),
cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim), cf.gen_binary_vec_field(name=df.binary_vec_field, dim=binary_vec_field_dim),
cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim), cf.gen_bfloat16_vec_field(name=df.bf16_vec_field, dim=bf16_vec_field_dim),