mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-07 19:31:51 +08:00
test: Cherry pick sparse invert index algo tests (#39816)
related issue: https://github.com/milvus-io/milvus/issues/39332
related pr: #39691
---------
Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
This commit is contained in:
parent
ee25af4c9b
commit
3a951f2160
@ -251,8 +251,8 @@ class ResponseChecker:
|
||||
assert res["enable_dynamic_field"] == check_items.get("enable_dynamic_field", True)
|
||||
if check_items.get("num_partitions", 1):
|
||||
assert res["num_partitions"] == check_items.get("num_partitions", 1)
|
||||
if check_items.get("id_name", "id"):
|
||||
assert res["fields"][0]["name"] == check_items.get("id_name", "id")
|
||||
if check_items.get("primary_field", None) is not None:
|
||||
assert res["fields"][0]["name"] == check_items.get("primary_field")
|
||||
if check_items.get("vector_name", "vector"):
|
||||
assert res["fields"][1]["name"] == check_items.get("vector_name", "vector")
|
||||
if check_items.get("dim", None) is not None:
|
||||
@ -372,13 +372,14 @@ class ResponseChecker:
|
||||
log.info("search_results_check: Numbers of query searched is correct")
|
||||
enable_milvus_client_api = check_items.get("enable_milvus_client_api", False)
|
||||
# log.debug(search_res)
|
||||
pk_name = check_items.get('primary_field', 'id')
|
||||
for hits in search_res:
|
||||
searched_original_vectors = []
|
||||
ids = []
|
||||
vector_id = 0
|
||||
if enable_milvus_client_api:
|
||||
for hit in hits:
|
||||
ids.append(hit['id'])
|
||||
ids.append(hit[pk_name])
|
||||
else:
|
||||
ids = list(hits.ids)
|
||||
if (len(hits) != check_items["limit"]) \
|
||||
|
||||
@ -220,10 +220,12 @@ all_index_types = ["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ",
|
||||
"SPARSE_INVERTED_INDEX", "SPARSE_WAND",
|
||||
"GPU_IVF_FLAT", "GPU_IVF_PQ"]
|
||||
|
||||
inverted_index_algo = ['TAAT_NAIVE', 'DAAT_WAND', 'DAAT_MAXSCORE']
|
||||
|
||||
default_all_indexes_params = [{}, {"nlist": 128}, {"nlist": 128}, {"nlist": 128, "m": 16, "nbits": 8},
|
||||
{"M": 32, "efConstruction": 360}, {"nlist": 128}, {},
|
||||
{}, {"nlist": 64},
|
||||
{"drop_ratio_build": 0.2}, {"drop_ratio_build": 0.2},
|
||||
{}, {"drop_ratio_build": 0.2},
|
||||
{"nlist": 64}, {"nlist": 64, "m": 16, "nbits": 8}]
|
||||
|
||||
default_all_search_params_params = [{}, {"nprobe": 32}, {"nprobe": 32}, {"nprobe": 32},
|
||||
|
||||
@ -306,7 +306,7 @@ class TestMilvusClientCollectionValid(TestMilvusClientV2Base):
|
||||
"consistency_level": 0,
|
||||
"enable_dynamic_field": False,
|
||||
"num_partitions": 16,
|
||||
"id_name": "id_string",
|
||||
"primary_field": "id_string",
|
||||
"vector_name": "embeddings"}
|
||||
if nullable:
|
||||
check_items["nullable_fields"] = ["nullable_field", "array_field"]
|
||||
|
||||
@ -247,39 +247,54 @@ class TestMilvusClientSearchValid(TestMilvusClientV2Base):
|
||||
client = self._client()
|
||||
collection_name = cf.gen_unique_str(prefix)
|
||||
# 1. create collection
|
||||
self.create_collection(client, collection_name, default_dim, consistency_level="Bounded")
|
||||
schema = self.create_schema(client, enable_dynamic_field=True)[0]
|
||||
pk_name = 'pk_varchar'
|
||||
schema.add_field(pk_name, DataType.VARCHAR, max_length=64, is_primary=True,
|
||||
auto_id=False)
|
||||
schema.add_field(default_vector_field_name, DataType.FLOAT_VECTOR, dim=default_dim)
|
||||
schema.add_field(default_string_field_name, DataType.VARCHAR, max_length=64)
|
||||
schema.add_field(default_float_field_name, DataType.FLOAT)
|
||||
self.create_collection(client, collection_name, schema=schema, consistency_level="Bounded")
|
||||
collections = self.list_collections(client)[0]
|
||||
assert collection_name in collections
|
||||
self.describe_collection(client, collection_name,
|
||||
check_task=CheckTasks.check_describe_collection_property,
|
||||
check_items={"collection_name": collection_name,
|
||||
"dim": default_dim,
|
||||
"dim": default_dim, "primary_field": pk_name,
|
||||
"consistency_level": 0})
|
||||
old_name = collection_name
|
||||
new_name = collection_name + "new"
|
||||
self.rename_collection(client, old_name, new_name)
|
||||
# 2. insert
|
||||
rng = np.random.default_rng(seed=19530)
|
||||
rows = [{default_primary_key_field_name: i, default_vector_field_name: list(rng.random((1, default_dim))[0]),
|
||||
default_float_field_name: i * 1.0, default_string_field_name: str(i)} for i in range(default_nb)]
|
||||
rows = [{pk_name: str(i),
|
||||
default_vector_field_name: list(rng.random((1, default_dim))[0]),
|
||||
default_string_field_name: str(i),
|
||||
default_float_field_name: i*1.0
|
||||
} for i in range(default_nb)]
|
||||
self.insert(client, new_name, rows)
|
||||
self.flush(client, new_name)
|
||||
index_params = self.prepare_index_params(client)[0]
|
||||
index_params.add_index(default_vector_field_name, metric_type="COSINE")
|
||||
self.create_index(client, new_name, index_params=index_params)
|
||||
self.load_collection(client, new_name)
|
||||
# assert self.num_entities(client, collection_name)[0] == default_nb
|
||||
# 3. search
|
||||
vectors_to_search = rng.random((1, default_dim))
|
||||
insert_ids = [i for i in range(default_nb)]
|
||||
insert_ids = [str(i) for i in range(default_nb)]
|
||||
self.search(client, new_name, vectors_to_search,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"enable_milvus_client_api": True,
|
||||
"nq": len(vectors_to_search),
|
||||
"ids": insert_ids,
|
||||
"ids": insert_ids, "primary_field": pk_name,
|
||||
"limit": default_limit})
|
||||
# 4. query
|
||||
self.query(client, new_name, filter=default_search_exp,
|
||||
filter = f"{default_float_field_name} >= 0"
|
||||
self.query(client, new_name, filter=filter,
|
||||
check_task=CheckTasks.check_query_results,
|
||||
check_items={exp_res: rows,
|
||||
"with_vec": True,
|
||||
"primary_field": default_primary_key_field_name})
|
||||
"primary_field": pk_name})
|
||||
self.release_collection(client, new_name)
|
||||
self.drop_collection(client, new_name)
|
||||
|
||||
|
||||
@ -28,8 +28,8 @@ pytest-parallel
|
||||
pytest-random-order
|
||||
|
||||
# pymilvus
|
||||
pymilvus==2.5.3
|
||||
pymilvus[bulk_writer]==2.5.3
|
||||
pymilvus==2.5.5rc5
|
||||
pymilvus[bulk_writer]==2.5.5rc5
|
||||
|
||||
# for customize config test
|
||||
python-benedict==0.24.3
|
||||
|
||||
@ -281,6 +281,7 @@ class TestAsyncMilvusClient(TestMilvusClientV2Base):
|
||||
"params": {"ef": "96"}},
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"enable_milvus_client_api": True,
|
||||
"primary_field": ct.default_string_field_name,
|
||||
"nq": ct.default_nq,
|
||||
"limit": ct.default_limit})
|
||||
tasks.append(default_search_task)
|
||||
@ -307,6 +308,7 @@ class TestAsyncMilvusClient(TestMilvusClientV2Base):
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={
|
||||
"enable_milvus_client_api": True,
|
||||
"primary_field": ct.default_string_field_name,
|
||||
"nq": ct.default_nq,
|
||||
"limit": 5})
|
||||
tasks.append(filter_params_search_task)
|
||||
|
||||
@ -2315,9 +2315,10 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
@pytest.mark.parametrize("expr", ["text_match"])
|
||||
@pytest.mark.parametrize("offset", [10])
|
||||
@pytest.mark.parametrize("tokenizer", ["jieba"])
|
||||
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
|
||||
def test_full_text_search_with_jieba_tokenizer(
|
||||
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
|
||||
):
|
||||
self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key,
|
||||
empty_percent, index_type, nq, inverted_index_algo):
|
||||
"""
|
||||
target: test full text search
|
||||
method: 1. enable full text search with jieba tokenizer and insert data with varchar
|
||||
@ -2430,6 +2431,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
|
||||
"params": {
|
||||
"bm25_k1": 1.5,
|
||||
"bm25_b": 0.75,
|
||||
"inverted_index_algo": inverted_index_algo
|
||||
}
|
||||
}
|
||||
)
|
||||
@ -3302,8 +3304,9 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
|
||||
@pytest.mark.parametrize("enable_inverted_index", [True])
|
||||
@pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
|
||||
@pytest.mark.parametrize("tokenizer", ["standard"])
|
||||
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
|
||||
def test_hybrid_search_with_full_text_search(
|
||||
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type
|
||||
self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, inverted_index_algo
|
||||
):
|
||||
"""
|
||||
target: test full text search
|
||||
@ -3403,6 +3406,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
|
||||
"params": {
|
||||
"bm25_k1": 1.5,
|
||||
"bm25_b": 0.75,
|
||||
"inverted_index_algo": inverted_index_algo
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
@ -1484,6 +1484,28 @@ class TestIndexInvalid(TestcaseBase):
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items=error)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("inverted_index_algo", ["INVALID_ALGO"])
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
def test_invalid_sparse_inverted_index_algo(self, inverted_index_algo, index):
    """
    target: index creation with an unsupported inverted_index_algo parameter
    method: build an index on the sparse vector field, passing an
            inverted_index_algo value that is not one of the supported algorithms
    expected: raise exception
    """
    # NOTE(review): the [9:11] slice presumably selects the sparse index types
    # from ct.all_index_types -- confirm against the list definition.
    c_name = cf.gen_unique_str(prefix)
    schema = cf.gen_default_sparse_schema()
    collection_w = self.init_collection_wrap(name=c_name, schema=schema)
    data = cf.gen_default_list_sparse_data()
    collection_w.insert(data=data)

    params = {"index_type": index,
              "metric_type": "IP",
              "params": {"inverted_index_algo": inverted_index_algo}}
    # Only the first fragment interpolates a value; the second is a plain literal.
    error = {ct.err_code: 999,
             ct.err_msg: f"sparse inverted index algo {inverted_index_algo} not found or not supported, "
                         "supported: [TAAT_NAIVE DAAT_WAND DAAT_MAXSCORE]"}
    # The return value is not needed: the err_res check task validates the failure,
    # and rebinding it to `index` would shadow the parametrized argument.
    self.index_wrap.init_index(collection_w.collection, ct.default_sparse_vec_field_name, params,
                               check_task=CheckTasks.err_res,
                               check_items=error)
|
||||
|
||||
|
||||
@pytest.mark.tags(CaseLabel.GPU)
|
||||
class TestNewIndexAsync(TestcaseBase):
|
||||
|
||||
@ -3104,24 +3104,23 @@ class TestCollectionSearch(TestcaseBase):
|
||||
assert set(ids).issubset(filter_ids_set)
|
||||
|
||||
# 5. search again with expression template and search hints
|
||||
if expr != "": # TODO: remove this when issue #39013 is fixed
|
||||
search_param = default_search_params.copy()
|
||||
search_param.update({"hints": "iterative_filter"})
|
||||
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, nb,
|
||||
expr=expr, expr_params=expr_params, _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": min(nb, len(filter_ids)),
|
||||
"_async": _async})
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
filter_ids_set = set(filter_ids)
|
||||
for hits in search_res:
|
||||
ids = hits.ids
|
||||
assert set(ids).issubset(filter_ids_set)
|
||||
search_param = default_search_params.copy()
|
||||
search_param.update({"hints": "iterative_filter"})
|
||||
search_res, _ = collection_w.search(vectors[:default_nq], default_search_field,
|
||||
search_param, nb,
|
||||
expr=expr, expr_params=expr_params, _async=_async,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"ids": insert_ids,
|
||||
"limit": min(nb, len(filter_ids)),
|
||||
"_async": _async})
|
||||
if _async:
|
||||
search_res.done()
|
||||
search_res = search_res.result()
|
||||
filter_ids_set = set(filter_ids)
|
||||
for hits in search_res:
|
||||
ids = hits.ids
|
||||
assert set(ids).issubset(filter_ids_set)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("bool_type", [True, False, "true", "false"])
|
||||
@ -12860,7 +12859,8 @@ class TestSparseSearch(TestcaseBase):
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
|
||||
def test_sparse_index_search(self, index):
|
||||
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
|
||||
def test_sparse_index_search(self, index, inverted_index_algo):
|
||||
"""
|
||||
target: verify that sparse index for sparse vectors can be searched properly
|
||||
method: create connection, collection, insert and search
|
||||
@ -12873,12 +12873,16 @@ class TestSparseSearch(TestcaseBase):
|
||||
data = cf.gen_default_list_sparse_data(nb=3000)
|
||||
collection_w.insert(data)
|
||||
params = cf.get_index_params_params(index)
|
||||
params.update({"inverted_index_algo": inverted_index_algo})
|
||||
index_params = {"index_type": index, "metric_type": "IP", "params": params}
|
||||
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
|
||||
collection_w.load()
|
||||
|
||||
_params = cf.get_search_params_params(index)
|
||||
_params.update({"dim_max_score_ratio": 1.05})
|
||||
search_params = {"params": _params}
|
||||
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
|
||||
ct.default_sparse_search_params, default_limit,
|
||||
search_params, default_limit,
|
||||
output_fields=[ct.default_sparse_vec_field_name],
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
@ -12887,7 +12891,7 @@ class TestSparseSearch(TestcaseBase):
|
||||
"output_fields": [ct.default_sparse_vec_field_name]})
|
||||
expr = "int64 < 100 "
|
||||
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
|
||||
ct.default_sparse_search_params, default_limit,
|
||||
search_params, default_limit,
|
||||
expr=expr, output_fields=[ct.default_sparse_vec_field_name],
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
@ -12923,7 +12927,8 @@ class TestSparseSearch(TestcaseBase):
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
|
||||
def test_sparse_index_enable_mmap_search(self, index):
|
||||
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
|
||||
def test_sparse_index_enable_mmap_search(self, index, inverted_index_algo):
|
||||
"""
|
||||
target: verify that the sparse indexes of sparse vectors can be searched properly after turning on mmap
|
||||
method: create connection, collection, enable mmap, insert and search
|
||||
@ -12939,6 +12944,7 @@ class TestSparseSearch(TestcaseBase):
|
||||
collection_w.insert(data)
|
||||
|
||||
params = cf.get_index_params_params(index)
|
||||
params.update({"inverted_index_algo": inverted_index_algo})
|
||||
index_params = {"index_type": index, "metric_type": "IP", "params": params}
|
||||
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
|
||||
|
||||
@ -12968,9 +12974,9 @@ class TestSparseSearch(TestcaseBase):
|
||||
assert len(res) == 4
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L1)
|
||||
@pytest.mark.parametrize("ratio", [0.01, 0.1, 0.5, 0.9])
|
||||
@pytest.mark.parametrize("drop_ratio_build", [0.01])
|
||||
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
|
||||
def test_search_sparse_ratio(self, ratio, index):
|
||||
def test_search_sparse_ratio(self, drop_ratio_build, index):
|
||||
"""
|
||||
target: create a sparse index by adjusting the ratio parameter.
|
||||
method: create a sparse index by adjusting the ratio parameter.
|
||||
@ -12982,16 +12988,28 @@ class TestSparseSearch(TestcaseBase):
|
||||
collection_w = self.init_collection_wrap(c_name, schema=schema)
|
||||
data = cf.gen_default_list_sparse_data(nb=4000)
|
||||
collection_w.insert(data)
|
||||
params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": ratio}}
|
||||
params = {"index_type": index, "metric_type": "IP", "params": {"drop_ratio_build": drop_ratio_build}}
|
||||
collection_w.create_index(ct.default_sparse_vec_field_name, params, index_name=index)
|
||||
collection_w.load()
|
||||
assert collection_w.has_index(index_name=index)[0] is True
|
||||
search_params = {"metric_type": "IP", "params": {"drop_ratio_search": ratio}}
|
||||
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
|
||||
search_params, default_limit,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": default_limit})
|
||||
_params = {"drop_ratio_search": 0.2}
|
||||
for dim_max_score_ratio in [0.5, 0.99, 1, 1.3]:
|
||||
_params.update({"dim_max_score_ratio": dim_max_score_ratio})
|
||||
search_params = {"metric_type": "IP", "params": _params}
|
||||
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
|
||||
search_params, default_limit,
|
||||
check_task=CheckTasks.check_search_results,
|
||||
check_items={"nq": default_nq,
|
||||
"limit": default_limit})
|
||||
error = {ct.err_code: 999,
|
||||
ct.err_msg: "should be in range [0.500000, 1.300000]"}
|
||||
for invalid_ratio in [0.49, 1.4]:
|
||||
_params.update({"dim_max_score_ratio": invalid_ratio})
|
||||
search_params = {"metric_type": "IP", "params": _params}
|
||||
collection_w.search(data[-1][0:default_nq], ct.default_sparse_vec_field_name,
|
||||
search_params, default_limit,
|
||||
check_task=CheckTasks.err_res,
|
||||
check_items=error)
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
|
||||
@ -13024,8 +13042,8 @@ class TestSparseSearch(TestcaseBase):
|
||||
|
||||
@pytest.mark.tags(CaseLabel.L2)
|
||||
@pytest.mark.parametrize("index", ct.all_index_types[9:11])
|
||||
@pytest.mark.xfail(reason="issue #36174")
|
||||
def test_sparse_vector_search_iterator(self, index):
|
||||
@pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
|
||||
def test_sparse_vector_search_iterator(self, index, inverted_index_algo):
|
||||
"""
|
||||
target: create sparse vectors and search iterator
|
||||
method: create sparse vectors and search iterator
|
||||
@ -13038,6 +13056,7 @@ class TestSparseSearch(TestcaseBase):
|
||||
data = cf.gen_default_list_sparse_data(nb=4000)
|
||||
collection_w.insert(data)
|
||||
params = cf.get_index_params_params(index)
|
||||
params.update({"inverted_index_algo": inverted_index_algo})
|
||||
index_params = {"index_type": index, "metric_type": "IP", "params": params}
|
||||
collection_w.create_index(ct.default_sparse_vec_field_name, index_params, index_name=index)
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ import copy
|
||||
import numpy as np
|
||||
import requests
|
||||
from sklearn import preprocessing
|
||||
from pymilvus import Milvus, DataType
|
||||
from pymilvus import MilvusClient, DataType
|
||||
from utils.util_log import test_log as log
|
||||
from utils.util_k8s import init_k8s_client_config
|
||||
|
||||
@ -115,9 +115,9 @@ def get_milvus(host, port, uri=None, handler=None, **kwargs):
|
||||
handler = "GRPC"
|
||||
try_connect = kwargs.get("try_connect", True)
|
||||
if uri is not None:
|
||||
milvus = Milvus(uri=uri, handler=handler, try_connect=try_connect)
|
||||
milvus = MilvusClient(uri=uri, handler=handler, try_connect=try_connect)
|
||||
else:
|
||||
milvus = Milvus(host=host, port=port, handler=handler, try_connect=try_connect)
|
||||
milvus = MilvusClient(uri=f"http://{host}:{port}", handler=handler, try_connect=try_connect)
|
||||
return milvus
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user