test: Add more sparse test cases (#33916)

issue: https://github.com/milvus-io/milvus/issues/31483

Signed-off-by: elstic <hao.wang@zilliz.com>

parent 6d5747cb3e
commit 1216a4bcd8
@@ -262,6 +262,11 @@ class TestcaseBase(Base):
         if is_binary:
             default_schema = cf.gen_default_binary_collection_schema(auto_id=auto_id, dim=dim,
                                                                      primary_field=primary_field)
+        if vector_data_type == ct.sparse_vector:
+            default_schema = cf.gen_default_sparse_schema(auto_id=auto_id, primary_field=primary_field,
+                                                          enable_dynamic_field=enable_dynamic_field,
+                                                          with_json=with_json,
+                                                          multiple_dim_array=multiple_dim_array)
         if is_all_data_type:
             default_schema = cf.gen_collection_schema_all_datatype(auto_id=auto_id, dim=dim,
                                                                    primary_field=primary_field,
@@ -289,6 +294,9 @@ class TestcaseBase(Base):
             # This condition will be removed after auto index feature
             if is_binary:
                 collection_w.create_index(ct.default_binary_vec_field_name, ct.default_bin_flat_index)
+            elif vector_data_type == ct.sparse_vector:
+                for vector_name in vector_name_list:
+                    collection_w.create_index(vector_name, ct.default_sparse_inverted_index)
             else:
                 if len(multiple_dim_array) == 0 or is_all_data_type == False:
                     vector_name_list.append(ct.default_float_vec_field_name)
@@ -145,6 +145,12 @@ def gen_double_field(name=ct.default_double_field_name, is_primary=False, descri

 def gen_float_vec_field(name=ct.default_float_vec_field_name, is_primary=False, dim=ct.default_dim,
                         description=ct.default_desc, vector_data_type="FLOAT_VECTOR", **kwargs):
+    if vector_data_type == "SPARSE_FLOAT_VECTOR":
+        dtype = DataType.SPARSE_FLOAT_VECTOR
+        float_vec_field, _ = ApiFieldSchemaWrapper().init_field_schema(name=name, dtype=dtype,
+                                                                       description=description,
+                                                                       is_primary=is_primary, **kwargs)
+        return float_vec_field
     if vector_data_type == "FLOAT_VECTOR":
         dtype = DataType.FLOAT_VECTOR
     elif vector_data_type == "FLOAT16_VECTOR":
@@ -358,9 +364,14 @@ def gen_collection_schema_all_datatype(description=ct.default_desc,
     else:
         multiple_dim_array.insert(0, dim)
         for i in range(len(multiple_dim_array)):
-            fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
-                                              dim=multiple_dim_array[i],
-                                              vector_data_type=ct.all_float_vector_types[i%3]))
+            if ct.all_float_vector_types[i%3] != ct.sparse_vector:
+                fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.all_float_vector_types[i%3]}",
+                                                  dim=multiple_dim_array[i],
+                                                  vector_data_type=ct.all_float_vector_types[i%3]))
+            else:
+                # Sparse vector fields do not take a dim parameter
+                fields.append(gen_float_vec_field(name=f"multiple_vector_{ct.sparse_vector}",
+                                                  vector_data_type=ct.sparse_vector))

     schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
                                                                     primary_field=primary_field, auto_id=auto_id,
@@ -384,8 +395,17 @@ def gen_default_binary_collection_schema(description=ct.default_desc, primary_fi


 def gen_default_sparse_schema(description=ct.default_desc, primary_field=ct.default_int64_field_name,
-                              auto_id=False, **kwargs):
+                              auto_id=False, with_json=False, multiple_dim_array=[], **kwargs):
     fields = [gen_int64_field(), gen_float_field(), gen_string_field(), gen_sparse_vec_field()]
+    if with_json:
+        fields.insert(-1, gen_json_field())
+
+    if len(multiple_dim_array) != 0:
+        for i in range(len(multiple_dim_array)):
+            vec_name = ct.default_sparse_vec_field_name + "_" + str(i)
+            vec_field = gen_sparse_vec_field(name=vec_name)
+            fields.append(vec_field)
     sparse_schema, _ = ApiCollectionSchemaWrapper().init_collection_schema(fields=fields, description=description,
                                                                            primary_field=primary_field,
                                                                            auto_id=auto_id, **kwargs)
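
Note: a minimal usage sketch for the extended generator (a hypothetical call, not part of the commit; the resulting field names assume the defaults used by these helpers). With with_json=True the JSON field is inserted before the default sparse field, and each entry in multiple_dim_array adds one extra sparse field; the dims themselves are ignored, since sparse fields take no dim:

    # hypothetical sketch of calling the extended helper
    schema = cf.gen_default_sparse_schema(with_json=True, multiple_dim_array=[128, 256])
    # assumed resulting field order:
    # int64, float, varchar, json_field, sparse_vector, sparse_vector_0, sparse_vector_1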
@@ -418,7 +438,7 @@ def gen_vectors(nb, dim, vector_data_type="FLOAT_VECTOR"):
         vectors = gen_fp16_vectors(nb, dim)[1]
     elif vector_data_type == "BFLOAT16_VECTOR":
         vectors = gen_bf16_vectors(nb, dim)[1]
-    elif vector_data_type == "SPARSE_VECTOR":
+    elif vector_data_type == "SPARSE_FLOAT_VECTOR":
         vectors = gen_sparse_vectors(nb, dim)

     if dim > 1:
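
Note: gen_sparse_vectors itself is not part of this diff; the tests assume it returns one row per entity in the dict format pymilvus accepts for SPARSE_FLOAT_VECTOR, roughly:

    # hedged sketch of the assumed sparse row format
    rows = [
        {0: 0.42, 17: 0.9, 1024: 0.1},   # {dimension_index: value}
        {3: 0.7, 17: 0.05},              # rows need not share nonzero dims
    ]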
@@ -508,10 +528,10 @@ def gen_general_default_list_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
             index = 2
         del insert_list[index]
     if len(multiple_dim_array) != 0:
-        if len(multiple_vector_field_name) != len(multiple_dim_array):
-            log.error("multiple vector feature is enabled, please input the vector field name list "
-                      "not including the default vector field")
-        assert len(multiple_vector_field_name) == len(multiple_dim_array)
+        # if len(multiple_vector_field_name) != len(multiple_dim_array):
+        #     log.error("multiple vector feature is enabled, please input the vector field name list "
+        #               "not including the default vector field")
+        # assert len(multiple_vector_field_name) == len(multiple_dim_array)
         for i in range(len(multiple_dim_array)):
             new_float_vec_values = gen_vectors(nb, multiple_dim_array[i], vector_data_type=vector_data_type)
             insert_list.append(new_float_vec_values)
@@ -832,7 +852,7 @@ def gen_default_list_sparse_data(nb=ct.default_nb, dim=ct.default_dim, start=0,
     string_values = [str(i) for i in range(start, start + nb)]
     json_values = [{"number": i, "string": str(i), "bool": bool(i), "list": [j for j in range(0, i)]}
                    for i in range(start, start + nb)]
-    sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_VECTOR")
+    sparse_vec_values = gen_vectors(nb, dim, vector_data_type="SPARSE_FLOAT_VECTOR")
     if with_json:
         data = [int_values, float_values, string_values, json_values, sparse_vec_values]
     else:
@@ -1772,7 +1792,7 @@ def insert_data(collection_w, nb=ct.default_nb, is_binary=False, is_all_data_typ
                                                      multiple_vector_field_name=vector_name_list,
                                                      vector_data_type=vector_data_type,
                                                      auto_id=auto_id, primary_field=primary_field)
-    elif vector_data_type == "FLOAT16_VECTOR" or "BFLOAT16_VECTOR":
+    elif vector_data_type in ct.all_float_vector_types:
         default_data = gen_general_default_list_data(nb // num, dim=dim, start=start, with_json=with_json,
                                                      random_primary_key=random_primary_key,
                                                      multiple_dim_array=multiple_dim_array,
@@ -1972,14 +1992,10 @@ def extract_vector_field_name_list(collection_w):
     fields = schema_dict.get('fields')
     vector_name_list = []
     for field in fields:
-        if str(field['type']) in ["101", "102", "103"]:
-            if field['name'] != ct.default_float_vec_field_name:
-                vector_name_list.append(field['name'])
-
-    for field in fields:
-        if str(field['type']) == 'DataType.FLOAT_VECTOR' \
-                or str(field['type']) == 'DataType.FLOAT16_VECTOR' \
-                or str(field['type']) == 'DataType.BFLOAT16_VECTOR':
+        if field['type'] == DataType.FLOAT_VECTOR \
+                or field['type'] == DataType.FLOAT16_VECTOR \
+                or field['type'] == DataType.BFLOAT16_VECTOR \
+                or field['type'] == DataType.SPARSE_FLOAT_VECTOR:
             if field['name'] != ct.default_float_vec_field_name:
                 vector_name_list.append(field['name'])

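
Note: the rewrite above replaces two string-based loops (numeric type codes "101"/"102"/"103" plus stringified enum names) with a single comparison against DataType members, and adds SPARSE_FLOAT_VECTOR. An equivalent set-membership form, assuming field['type'] holds a pymilvus DataType member as the new code relies on:

    vector_types = {DataType.FLOAT_VECTOR, DataType.FLOAT16_VECTOR,
                    DataType.BFLOAT16_VECTOR, DataType.SPARSE_FLOAT_VECTOR}
    if field['type'] in vector_types and field['name'] != ct.default_float_vec_field_name:
        vector_name_list.append(field['name'])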
@@ -2120,11 +2136,13 @@ def gen_vectors_based_on_vector_type(num, dim, vector_data_type):
         fp16_vectors: the bytes used for insert
     return: raw_vectors and fp16_vectors
     """
-    if vector_data_type == "FLOAT_VECTOR":
+    if vector_data_type == ct.float_type:
         vectors = [[random.random() for _ in range(dim)] for _ in range(num)]
-    elif vector_data_type == "FLOAT16_VECTOR":
+    elif vector_data_type == ct.float16_type:
         vectors = gen_fp16_vectors(num, dim)[1]
-    elif vector_data_type == "BFLOAT16_VECTOR":
+    elif vector_data_type == ct.bfloat16_type:
         vectors = gen_bf16_vectors(num, dim)[1]
+    elif vector_data_type == ct.sparse_vector:
+        vectors = gen_sparse_vectors(num, dim)

     return vectors
@@ -44,7 +44,8 @@ default_binary_vec_field_name = "binary_vector"
 float_type = "FLOAT_VECTOR"
 float16_type = "FLOAT16_VECTOR"
 bfloat16_type = "BFLOAT16_VECTOR"
-all_float_vector_types = [float_type, float16_type, bfloat16_type]
+sparse_vector = "SPARSE_FLOAT_VECTOR"
+all_float_vector_types = [float16_type, bfloat16_type, sparse_vector]
 default_sparse_vec_field_name = "sparse_vector"
 default_partition_name = "_default"
 default_resource_group_name = '__default_resource_group'
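
Note: after this hunk, FLOAT_VECTOR is dropped from the list and SPARSE_FLOAT_VECTOR is added:

    # values after this change
    ct.all_float_vector_types == ["FLOAT16_VECTOR", "BFLOAT16_VECTOR", "SPARSE_FLOAT_VECTOR"]

Every test parametrized over ct.all_float_vector_types therefore now covers sparse vectors instead of plain float vectors, which is why the range-search test later in this commit pins its own explicit dense-type list.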
@@ -50,6 +50,7 @@ vectors = [[random.random() for _ in range(default_dim)] for _ in range(default_
 default_search_field = ct.default_float_vec_field_name
 default_search_params = ct.default_search_params
 max_vector_field_num = ct.max_vector_field_num
+sparse_vector_data_type = "SPARSE_FLOAT_VECTOR"


 class TestCollectionParams(TestcaseBase):
@@ -1047,6 +1048,24 @@ class TestCollectionParams(TestcaseBase):
         error = {ct.err_code: 65535, ct.err_msg: "maximum field's number should be limited to 64"}
         self.collection_wrap.init_collection(c_name, schema=schema, check_task=CheckTasks.err_res, check_items=error)

+    @pytest.mark.tags(CaseLabel.L2)
+    def test_collection_multi_sparse_vectors(self):
+        """
+        target: test multiple sparse vector fields in a collection
+        method: create a collection with 2 sparse vector fields
+        expected: collection is created successfully
+        """
+        # 1. connect
+        self._connect()
+        # 2. create collection with multiple sparse vectors
+        c_name = cf.gen_unique_str(prefix)
+        fields = [cf.gen_int64_field(is_primary=True), cf.gen_float_field(),
+                  cf.gen_float_vec_field(vector_data_type=sparse_vector_data_type),
+                  cf.gen_float_vec_field(name="tmp", vector_data_type=sparse_vector_data_type)]
+        schema = cf.gen_collection_schema(fields=fields)
+        self.collection_wrap.init_collection(c_name, schema=schema,
+                                             check_task=CheckTasks.check_collection_property,
+                                             check_items={exp_name: c_name, exp_schema: schema})


 class TestCollectionOperation(TestcaseBase):
     """
@@ -1,5 +1,7 @@
 import random
 from time import sleep
+
+import numpy as np
 import pytest
 import copy

@@ -1442,6 +1444,47 @@ class TestIndexInvalid(TestcaseBase):
                                 check_items={ct.err_code: 1,
                                              ct.err_msg: f"<'int' object has no attribute 'items'"})

+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("metric_type", ["L2", "COSINE", " ", "invalid"])
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_invalid_sparse_metric_type(self, metric_type, index):
+        """
+        target: create a sparse index with an unsupported metric_type
+        method: create an index with an unsupported metric_type
+        expected: raise exception
+        """
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data()
+        collection_w.insert(data=data)
+        param = cf.get_index_params_params(index)
+        params = {"index_type": index, "metric_type": metric_type, "params": param}
+        error = {ct.err_code: 65535, ct.err_msg: "only IP is the supported metric type for sparse index"}
+        index, _ = self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params,
+                                              check_task=CheckTasks.err_res,
+                                              check_items=error)
+
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("ratio", [-0.5, 1, 3])
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_invalid_sparse_ratio(self, ratio, index):
+        """
+        target: create a sparse index with an unsupported ratio parameter
+        method: create an index with drop_ratio_build outside [0, 1)
+        expected: raise exception
+        """
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data()
+        collection_w.insert(data=data)
+        params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}}
+        error = {ct.err_code: 1100, ct.err_msg: f"invalid drop_ratio_build: {ratio}, must be in range [0, 1): invalid parameter[expected=valid index params"}
+        index, _ = self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params,
+                                              check_task=CheckTasks.err_res,
+                                              check_items=error)
+

 @pytest.mark.tags(CaseLabel.GPU)
 class TestNewIndexAsync(TestcaseBase):
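
Note: these negative tests encode the two constraints on sparse indexes that their error messages assert: the metric type must be IP, and drop_ratio_build must lie in [0, 1). A sketch of a parameter set the tests imply is valid (the index type name is an assumption; ct.all_index_types[9:11] is expected to hold the sparse index types):

    valid_index_params = {
        "index_type": "SPARSE_INVERTED_INDEX",  # assumed sparse index type
        "metric_type": "IP",                    # the only supported sparse metric
        "params": {"drop_ratio_build": 0.2},    # must be in [0, 1)
    }
    collection_w.create_index(ct.default_sparse_vec_field_name, valid_index_params)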
@@ -1348,6 +1348,25 @@ class TestInsertInvalid(TestcaseBase):
         error = {ct.err_code: 65535, ct.err_msg: "value '+Inf' is not a number or infinity"}
         collection_w.insert(data=data, check_task=CheckTasks.err_res, check_items=error)

+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    @pytest.mark.parametrize("invalid_vector_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
+    def test_invalid_sparse_vector_data(self, index, invalid_vector_type):
+        """
+        target: insert data of an illegal (dense) vector type into a sparse field
+        method: insert data whose vector column has a dense vector type
+        expected: raise exception
+        """
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+        nb = 100
+        data = cf.gen_default_list_sparse_data(nb=nb)[:-1]
+        invalid_vec = cf.gen_vectors(nb, dim=128, vector_data_type=invalid_vector_type)
+        data.append(invalid_vec)
+        error = {ct.err_code: 1, ct.err_msg: 'input must be a sparse matrix in supported format'}
+        collection_w.insert(data=data, check_task=CheckTasks.err_res, check_items=error)
+

 class TestInsertInvalidBinary(TestcaseBase):
     """
@@ -1872,6 +1891,30 @@ class TestUpsertValid(TestcaseBase):
         collection_w.upsert(df)
         assert collection_w.num_entities == ct.default_nb

+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_upsert_sparse_data(self, index):
+        """
+        target: upsert sparse data repeatedly and verify count(*)
+        method: upsert the same sparse data several times, then count
+        expected: the number of entities stays correct
+        """
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data(nb=ct.default_nb)
+        collection_w.upsert(data=data)
+        assert collection_w.num_entities == ct.default_nb
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+        collection_w.load()
+        for i in range(5):
+            collection_w.upsert(data=data)
+        collection_w.query(expr=f'{ct.default_int64_field_name} >= 0', output_fields=[ct.default_count_output],
+                           check_task=CheckTasks.check_query_results,
+                           check_items={"exp_res": [{"count(*)": ct.default_nb}]})
+

 class TestUpsertInvalid(TestcaseBase):
     """ Invalid test case of Upsert interface """
@@ -3691,6 +3691,37 @@ class TestQueryCount(TestcaseBase):
                            check_task=CheckTasks.check_query_results,
                            check_items={exp_res: [{count: res}]})

+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_counts_expression_sparse_vectors(self, index):
+        """
+        target: test count(*) with expressions on a sparse vector collection
+        method: count with expr
+        expected: verify count
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data()
+        collection_w.insert(data)
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+        collection_w.load()
+        collection_w.query(expr=default_expr, output_fields=[count],
+                           check_task=CheckTasks.check_query_results,
+                           check_items={exp_res: [{count: ct.default_nb}]})
+        expr = "int64 > 50 && int64 < 100 && float < 75"
+        collection_w.query(expr=expr, output_fields=[count],
+                           check_task=CheckTasks.check_query_results,
+                           check_items={exp_res: [{count: 24}]})
+        batch_size = 100
+        collection_w.query_iterator(batch_size=batch_size, expr=default_expr,
+                                    check_task=CheckTasks.check_query_iterator,
+                                    check_items={"count": ct.default_nb,
+                                                 "batch_size": batch_size})
+

 class TestQueryIterator(TestcaseBase):
     """
@@ -6380,6 +6380,35 @@ class TestSearchPagination(TestcaseBase):
                                          default_limit, offset=offset)[0]
         assert res1[0].ids == res2[0].ids

+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("offset", [1, 5, 20])
+    def test_search_sparse_with_pagination(self, offset):
+        """
+        target: test sparse search with pagination
+        method: 1. connect and create a collection
+                2. search with pagination offset
+                3. search with offset+limit
+                4. compare the two search results; the corresponding ids should be the same
+        expected: search successfully and ids are correct
+        """
+        # 1. create a collection
+        auto_id = False
+        collection_w, _, _, insert_ids = \
+            self.init_collection_general(
+                prefix, True, auto_id=auto_id, vector_data_type=ct.sparse_vector)[0:4]
+        # 2. search with offset
+        search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}, "offset": offset}
+        search_vectors = cf.gen_default_list_sparse_data()[-1][-2:]
+        search_res = collection_w.search(search_vectors, ct.default_sparse_vec_field_name,
+                                         search_param, default_limit)[0]
+        # 3. search with offset + limit
+        _search_param = {"metric_type": "IP", "params": {"drop_ratio_search": "0.2"}}
+        res = collection_w.search(search_vectors[:default_nq], ct.default_sparse_vec_field_name, _search_param,
+                                  default_limit + offset)[0]
+        assert len(search_res[0].ids) == len(res[0].ids[offset:])
+        assert sorted(search_res[0].distances, key=numpy.float32) == sorted(
+            res[0].distances[offset:], key=numpy.float32)
+

 class TestSearchPaginationInvalid(TestcaseBase):
     """ Test case of search pagination """
@@ -6932,7 +6961,7 @@ class TestCollectionRangeSearch(TestcaseBase):
     ******************************************************************
     """
     @pytest.mark.tags(CaseLabel.L0)
-    @pytest.mark.parametrize("vector_data_type", ct.all_float_vector_types)
+    @pytest.mark.parametrize("vector_data_type", ["FLOAT_VECTOR", "FLOAT16_VECTOR", "BFLOAT16_VECTOR"])
     def test_range_search_default(self, index_type, metric, vector_data_type):
         """
         target: verify the range search returns correct results
@@ -8346,6 +8375,33 @@ class TestCollectionRangeSearch(TestcaseBase):
                                          "limit": nb_old + nb_new,
                                          "_async": _async})

+    @pytest.mark.tags(CaseLabel.L2)
+    def test_range_search_sparse(self):
+        """
+        target: test sparse index normal range search
+        method: create connection, collection, insert and range search
+        expected: range search successfully
+        """
+        # 1. initialize with data
+        collection_w = self.init_collection_general(prefix, True, nb=5000,
+                                                    with_json=True,
+                                                    vector_data_type=ct.sparse_vector)[0]
+        range_filter = random.uniform(0.5, 1)
+        radius = random.uniform(0, 0.5)
+
+        # 2. range search
+        range_search_params = {"metric_type": "IP",
+                               "params": {"radius": radius, "range_filter": range_filter}}
+        d = cf.gen_default_list_sparse_data(nb=1)
+        search_res = collection_w.search(d[-1][-1:], ct.default_sparse_vec_field_name,
+                                         range_search_params, default_limit,
+                                         default_search_exp)[0]
+
+        # 3. check search results
+        for hits in search_res:
+            for distance in hits.distances:
+                assert range_filter >= distance > radius
+

 class TestCollectionLoadOperation(TestcaseBase):
     """ Test case of search combining load and other functions """
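
Note: the final assertion captures the IP range-search contract: with metric_type="IP" a larger score means a closer match, so the accepted window is range_filter >= distance > radius. A fixed-window variant of the search params used above, for illustration:

    range_search_params = {"metric_type": "IP",
                           "params": {"radius": 0.2,         # exclusive lower bound
                                      "range_filter": 0.9}}  # inclusive upper bound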
@@ -10656,6 +10712,53 @@ class TestSearchGroupBy(TestcaseBase):
                             check_task=CheckTasks.check_search_results,
                             check_items={"nq": nq, "limit": limit})

+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_sparse_vectors_group_by(self, index):
+        """
+        target: test that group-by search works on a collection with sparse vectors
+        method: 1. create a collection
+                2. create index
+                3. grouping search
+        verify: search successfully
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        nb = 5000
+        data = cf.gen_default_list_sparse_data(nb=nb)
+        # update the varchar field so that its values repeat
+        _data = [random.randint(1, 100) for _ in range(nb)]
+        str_data = [str(i) for i in _data]
+        data[2] = str_data
+        collection_w.insert(data)
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+        collection_w.load()
+
+        nq = 2
+        limit = 20
+        search_params = ct.default_sparse_search_params
+
+        search_vectors = cf.gen_default_list_sparse_data(nb=nq)[-1][-2:]
+        # search grouped by the varchar field
+        res = collection_w.search(data=search_vectors, anns_field=ct.default_sparse_vec_field_name,
+                                  param=search_params, limit=limit,
+                                  group_by_field="varchar",
+                                  output_fields=["varchar"],
+                                  check_task=CheckTasks.check_search_results,
+                                  check_items={"nq": nq, "limit": limit})
+
+        hit = res[0]
+        set_varchar = set()
+        for item in hit:
+            a = list(item.fields.values())
+            set_varchar.add(a[0])
+        # if group by is in effect, there are no duplicate varchar values
+        assert len(hit) == len(set_varchar)
+

 class TestCollectionHybridSearchValid(TestcaseBase):
     """ Test case of search interface """
@@ -12534,6 +12637,64 @@ class TestCollectionHybridSearchValid(TestcaseBase):
         for i in range(nq):
             assert is_sorted_descend(res[i].distances)

+    @pytest.mark.tags(CaseLabel.L2)
+    def test_hybrid_search_sparse_normal(self):
+        """
+        target: test hybrid search after loading sparse vectors
+        method: hybrid search on a collection with multiple sparse vector fields
+        expected: hybrid search successfully with limit(topK)
+        """
+        nb, auto_id, dim, enable_dynamic_field = 20000, False, 768, False
+        # 1. init collection
+        collection_w, insert_vectors, _, insert_ids = self.init_collection_general(prefix, True, nb=nb,
+                                                                                   multiple_dim_array=[dim, dim*2], with_json=False,
+                                                                                   vector_data_type="SPARSE_FLOAT_VECTOR")[0:4]
+        # 2. extract vector field name
+        vector_name_list = cf.extract_vector_field_name_list(collection_w)
+        # 3. prepare search params
+        req_list = []
+        search_res_dict_array = []
+        k = 60
+
+        for i in range(len(vector_name_list)):
+            # vector = cf.gen_sparse_vectors(1, dim)
+            vector = insert_vectors[0][i+3][-1:]
+            search_res_dict = {}
+            search_param = {
+                "data": vector,
+                "anns_field": vector_name_list[i],
+                "param": {"metric_type": "IP", "offset": 0},
+                "limit": default_limit,
+                "expr": "int64 > 0"}
+            req = AnnSearchRequest(**search_param)
+            req_list.append(req)
+            # search to get the baseline for hybrid_search
+            search_res = collection_w.search(vector, vector_name_list[i],
+                                             default_search_params, default_limit,
+                                             default_search_exp,
+                                             check_task=CheckTasks.check_search_results,
+                                             check_items={"nq": 1,
+                                                          "ids": insert_ids,
+                                                          # "limit": default_limit
+                                                          }
+                                             )[0]
+            ids = search_res[0].ids
+            for j in range(len(ids)):
+                search_res_dict[ids[j]] = 1 / (j + k + 1)
+            search_res_dict_array.append(search_res_dict)
+        # 4. calculate hybrid search baseline for RRFRanker
+        ids_answer, score_answer = cf.get_hybrid_search_base_results_rrf(search_res_dict_array)
+        # 5. hybrid search
+        hybrid_res = collection_w.hybrid_search(req_list, RRFRanker(k), default_limit,
+                                                check_task=CheckTasks.check_search_results,
+                                                check_items={"nq": 1,
+                                                             "ids": insert_ids,
+                                                             "limit": default_limit})[0]
+        # 6. compare results through the re-calculated distances
+        for i in range(len(score_answer[:default_limit])):
+            delta = math.fabs(score_answer[i] - hybrid_res[0].distances[i])
+            assert delta < hybrid_search_epsilon
+

 class TestSparseSearch(TestcaseBase):
     """ Add some test cases for the sparse vector """
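
Note: the hand-computed baseline above assigns each returned id a reciprocal-rank score of 1 / (k + rank + 1) per request; cf.get_hybrid_search_base_results_rrf is assumed to sum these per id, which is what RRFRanker(k) should reproduce. A self-contained sketch of that fusion step:

    # hedged sketch of the assumed RRF fusion
    def rrf_fuse(result_dicts, k=60):
        fused = {}
        for res in result_dicts:             # one dict per AnnSearchRequest
            for _id, score in res.items():   # score is already 1 / (rank + k + 1)
                fused[_id] = fused.get(_id, 0.0) + score
        # highest fused score first, matching the hybrid search result order
        return sorted(fused.items(), key=lambda kv: kv[1], reverse=True)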
@@ -12550,7 +12711,7 @@ class TestSparseSearch(TestcaseBase):
         c_name = cf.gen_unique_str(prefix)
         schema = cf.gen_default_sparse_schema(auto_id=False)
         collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
-        data = cf.gen_default_list_sparse_data()
+        data = cf.gen_default_list_sparse_data(nb=10000)
         collection_w.insert(data)
         params = cf.get_index_params_params(index)
         index_params = {"index_type": index, "metric_type": "IP", "params": params}
@@ -12562,6 +12723,12 @@ class TestSparseSearch(TestcaseBase):
                             check_task=CheckTasks.check_search_results,
                             check_items={"nq": default_nq,
                                          "limit": default_limit})
+        expr = "int64 < 100 "
+        collection_w.search(data[-1][-1:], ct.default_sparse_vec_field_name,
+                            ct.default_sparse_search_params, default_limit,
+                            expr,
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq})

     @pytest.mark.tags(CaseLabel.L2)
     @pytest.mark.parametrize("index", ct.all_index_types[9:11])
@@ -12624,3 +12791,83 @@ class TestSparseSearch(TestcaseBase):
         term_expr = f'{ct.default_int64_field_name} in [0, 1, 10, 100]'
         res = collection_w.query(term_expr)
         assert len(res) == 4
+
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.parametrize("ratio", [0.01, 0.1, 0.5, 0.9])
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_search_sparse_ratio(self, ratio, index):
+        """
+        target: create a sparse index while adjusting the ratio parameter
+        method: create a sparse index with different drop_ratio_build values, then search
+        expected: search successfully
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema(auto_id=False)
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data(nb=10000)
+        collection_w.insert(data)
+        params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}}
+        collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index)
+        collection_w.load()
+        assert collection_w.has_index(index_name=index) == True
+        search_params = {"metric_type": "IP", "params": {"drop_ratio_search": ratio}}
+        collection_w.search(data[-1][-1:], ct.default_sparse_vec_field_name,
+                            search_params, default_limit,
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq,
+                                         "limit": default_limit})
+
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_sparse_vector_search_output_field(self, index):
+        """
+        target: search sparse vectors with output fields
+        method: create sparse vectors, then search with output_fields
+        expected: normal search
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data(nb=10000)
+        collection_w.insert(data)
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+
+        collection_w.load()
+        d = cf.gen_default_list_sparse_data(nb=1)
+        collection_w.search(d[-1][-1:], ct.default_sparse_vec_field_name,
+                            ct.default_sparse_search_params, 5,
+                            output_fields=["float", "sparse_vector"],
+                            check_task=CheckTasks.check_search_results,
+                            check_items={"nq": default_nq,
+                                         "limit": default_limit,
+                                         "output_fields": ["float", "sparse_vector"]
+                                         })
+
+    @pytest.mark.tags(CaseLabel.L2)
+    @pytest.mark.parametrize("index", ct.all_index_types[9:11])
+    def test_sparse_vector_search_iterator(self, index):
+        """
+        target: search sparse vectors with the search iterator
+        method: create sparse vectors, then search with search_iterator
+        expected: normal search
+        """
+        self._connect()
+        c_name = cf.gen_unique_str(prefix)
+        schema = cf.gen_default_sparse_schema()
+        collection_w, _ = self.collection_wrap.init_collection(c_name, schema=schema)
+        data = cf.gen_default_list_sparse_data(nb=10000)
+        collection_w.insert(data)
+        params = cf.get_index_params_params(index)
+        index_params = {"index_type": index, "metric_type": "IP", "params": params}
+        collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
+
+        collection_w.load()
+        batch_size = 10
+        collection_w.search_iterator(data[-1][-1:], ct.default_sparse_vec_field_name,
+                                     ct.default_sparse_search_params, batch_size,
+                                     check_task=CheckTasks.check_search_iterator,
+                                     check_items={"batch_size": batch_size})