test: Add tests for search by ids (#46756)
related issue: #46755
Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
parent 941c6eaed7
commit 7018151c7d
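The commit below adds tests that drive a new `ids` argument through the test wrappers' `search()` methods: instead of query vectors in `data`, a list of primary keys is passed and the stored vectors of those keys serve as the queries. A minimal usage sketch of the pattern the tests exercise, assuming the pymilvus client exposes the same `ids` keyword the wrappers forward (endpoint, collection, and field names are placeholders):

from pymilvus import MilvusClient

client = MilvusClient(uri="http://localhost:19530")  # placeholder endpoint

# Search by primary keys instead of raw query vectors; `data` is omitted.
res = client.search(
    collection_name="my_collection",            # placeholder collection
    ids=[0, 1],                                  # primary keys whose stored vectors become the queries
    anns_field="float_vector",                   # placeholder vector field name
    search_params={"metric_type": "COSINE"},
    limit=10,
)
# One hit list is returned per id, i.e. len(res) == len(ids), as the tests below assert via nq.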
@@ -171,10 +171,10 @@ class TestMilvusClientV2Base(Base):
        return res, check_result

    @trace()
-    def search(self, client, collection_name, data, limit=10, filter=None, output_fields=None, search_params=None,
+    def search(self, client, collection_name, data=None, limit=10, filter=None, output_fields=None, search_params=None,
               timeout=None, check_task=None, check_items=None, **kwargs):
        timeout = TIMEOUT if timeout is None else timeout
-        kwargs.update({"timeout": timeout})
+        # kwargs.update({"timeout": timeout})

        func_name = sys._getframe().f_code.co_name
        res, check = api_request([client.search, collection_name, data, filter, limit,

@@ -165,7 +165,7 @@ class ApiCollectionWrapper:
        return res, check_result

    @trace()
-    def search(self, data, anns_field, param, limit, expr=None,
+    def search(self, data=None, anns_field=None, param=None, limit=None, expr=None,
               partition_names=None, output_fields=None, timeout=None, round_decimal=-1,
               check_task=None, check_items=None, **kwargs):
        timeout = TIMEOUT if timeout is None else timeout

@@ -197,7 +197,7 @@ class ApiCollectionWrapper:
        return res, check_result

    @trace()
-    def search_iterator(self, data, anns_field, param, batch_size, limit=-1, expr=None,
+    def search_iterator(self, data=None, anns_field=None, param=None, batch_size=None, limit=-1, expr=None,
                        partition_names=None, output_fields=None, timeout=None, round_decimal=-1,
                        check_task=None, check_items=None, **kwargs):
        timeout = TIMEOUT if timeout is None else timeout
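The wrapper changes above only make `data` optional; the new `ids` keyword is forwarded through `**kwargs` and validated downstream. A minimal sketch of the mutual-exclusion rule implied by the error messages asserted later in this commit ("Either ids or data must be provided, not both"); the helper name and placement are hypothetical, the real check lives in pymilvus/Milvus, not in these wrappers:

# Hypothetical helper for illustration only; mirrors the error message
# "Either ids or data must be provided, not both" asserted in the tests below.
def resolve_search_input(data=None, ids=None):
    if data is not None and ids is not None:
        raise ValueError("Either ids or data must be provided, not both")
    if data is None and ids is None:
        raise ValueError("Either ids or data must be provided")
    # With ids, the stored vectors of those primary keys become the query vectors.
    return {"ids": ids} if ids is not None else {"data": data}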
@@ -1824,6 +1824,30 @@ class TestMilvusClientStructArraySearch(TestMilvusClientV2Base):
        assert check
        assert len(results[0]) > 0

+    @pytest.mark.tags(CaseLabel.L1)
+    def test_search_struct_array_not_support_search_by_pk(self):
+        """
+        target: test searching with multiple vectors (EmbeddingList) in struct array does not support search by pk
+        method: search using EmbeddingList by pk
+        expected: search failed with error
+        """
+        collection_name = cf.gen_unique_str(f"{prefix}_search")
+
+        client = self._client()
+        # Create collection with data and index
+        self.create_collection_with_index(client, collection_name)
+
+        # Search using EmbeddingList
+        error = {ct.err_code: 999,
+                 ct.err_msg: "array of vector is not supported for search by IDs"}
+        self.search(client,
+                    collection_name,
+                    ids=[0, 1],
+                    anns_field="clips[clip_embedding1]",
+                    search_params={"metric_type": "MAX_SIM_COSINE"},
+                    limit=10,
+                    check_task=CheckTasks.err_res, check_items=error)
+
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("retrieval_ann_ratio", [1.0, 3.0, 5.0, 10.0])
    def test_search_with_retrieval_ann_ratio(self, retrieval_ann_ratio):
@@ -668,7 +668,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base):
        vector_name_list = [self.float_vector_field_name1, self.float_vector_field_name2]
        # 3. prepare search params for each vector field
        req_list = []
-        nq = 1  # only works for nq=1, as the limitation of get_hybrid_search_base_results_rrf()
+        nq = 1  # only works for nq=1, as the limitation of get_hybrid_search_base_results_rrf()
        search_res_dict_array = []
        for field_name in vector_name_list:
            search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR)

@@ -797,7 +797,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base):
        limit = 200

        field_names = [self.sparse_vector_field_name1, self.sparse_vector_field_name2]
-        nq = len(field_names)  # nq should equal the number of field names, as it would search nq by nq next
+        nq = len(field_names)  # nq should equal the number of field names, as it would search nq by nq next
        search_data = cf.gen_varchar_data(length=10, nb=nq, text_mode=True)

        # 0. search

@@ -1321,7 +1321,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base):
                    check_items=check_items)


-class TestCollectionHybridSearchValid(TestcaseBase):
+class TestCollectionHybridSearch(TestcaseBase):
    """ Test case of search interface """

    @pytest.fixture(scope="function", params=["JACCARD", "HAMMING"])

@@ -1728,3 +1728,27 @@ class TestCollectionHybridSearchValid(TestcaseBase):
        for i in range(len(score_answer[:default_limit])):
            delta = math.fabs(score_answer[i] - hybrid_res[0].distances[i])
            assert delta < hybrid_search_epsilon
+
+    @pytest.mark.tags(CaseLabel.L1)
+    def test_hybrid_search_not_support_search_by_pk(self):
+        """
+        Test case: Hybrid search does not support search by pk
+        Scenario:
+        - Create connection, collection, and insert data.
+        - Build a hybrid search sub-request that passes 'ids' instead of 'data'.
+        Expected:
+        - AnnSearchRequest rejects the 'ids' argument with a TypeError
+        """
+        nq = 2
+        req_limit = 10
+        ids_to_search = [0, 1]
+        # generate hybrid search request list
+        sub_params = {
+            "ids": ids_to_search,
+            "anns_field": ct.default_float_vec_field_name,
+            "param": {},
+            "limit": req_limit
+        }
+        with pytest.raises(TypeError,
+                           match="AnnSearchRequest.__init__.*got an unexpected keyword argument 'ids'"):
+            req = AnnSearchRequest(**sub_params)
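For contrast with the negative test above, hybrid search sub-requests are built from query vectors; AnnSearchRequest takes data, anns_field, param, and limit, and has no ids parameter, which is exactly what the TypeError asserts. A minimal sketch of the supported construction (vector and field name are placeholders):

from pymilvus import AnnSearchRequest

# Supported form: the sub-request carries query vectors in `data`.
req = AnnSearchRequest(
    data=[[0.1] * 128],                 # placeholder query vector(s)
    anns_field="float_vector",          # placeholder vector field name
    param={"metric_type": "COSINE"},
    limit=10,
)
# Passing ids=... instead raises TypeError, as exercised in the test above.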
@@ -264,7 +264,8 @@ class TestCollectionRangeSearch(TestcaseBase):
        # assert distances_tmp.count(1.0) == 1

    @pytest.mark.tags(CaseLabel.L1)
-    def test_range_search_cosine(self):
+    @pytest.mark.parametrize("search_by_pk", [True, False])
+    def test_range_search_cosine(self, search_by_pk):
        """
        target: test range search normal case
        method: create connection, collection, insert and search

@@ -278,9 +279,18 @@
        # 2. range search
        range_search_params = {"metric_type": "COSINE",
                               "params": {"radius": radius, "range_filter": range_filter}}
-        search_res = collection_w.search(vectors[:nq], default_search_field,
-                                         range_search_params, default_limit,
-                                         default_search_exp)[0]
+        vectors_to_search = vectors[:nq]
+        ids_to_search = None
+        if search_by_pk is True:
+            vectors_to_search = None
+            ids_to_search = [0, 1]
+        search_res = collection_w.search(
+            data=vectors_to_search,
+            ids=ids_to_search,
+            anns_field=default_search_field,
+            param=range_search_params,
+            limit=default_limit,
+            expr=default_search_exp)[0]

        # 3. check search results
        for hits in search_res:
File diff suppressed because it is too large
@@ -228,7 +228,9 @@ class TestGroupSearch(TestMilvusClientV2Base):
                assert len(set(group_values)) == 1

            # when strict_group_size=false, it shall return results with group counts = limit
-            res1 = self.search(client, self.collection_name, data=search_vectors, anns_field=self.vector_fields[j],
+            res1 = self.search(client, self.collection_name,
+                               data=search_vectors,
+                               anns_field=self.vector_fields[j],
                               search_params=search_params, limit=limit,
                               group_by_field=group_by_field, filter=f"{output_field} is not null",
                               group_size=group_size, strict_group_size=False,

@@ -537,7 +539,8 @@ class TestGroupSearch(TestMilvusClientV2Base):
            assert len(grpby_field_values) == len(set(grpby_field_values))

    @pytest.mark.tags(CaseLabel.L0)
-    def test_search_pagination_group_size(self):
+    @pytest.mark.parametrize("search_by_pk", [True, False])
+    def test_search_pagination_group_size(self, search_by_pk):
        """
        verify search group by works with pagination and group_size
        """

@@ -550,14 +553,22 @@ class TestGroupSearch(TestMilvusClientV2Base):
        default_search_exp = f"{self.primary_field} >= 0"
        grpby_field = self.inverted_string_field
        default_search_field = self.vector_fields[1]
+        ids_to_search = None
        search_vectors = cf.gen_vectors(1, dim=self.dims[1],
                                        vector_data_type=cf.get_field_dtype_by_field_name(collection_info,
                                                                                           self.vector_fields[1]))
+        if search_by_pk is True:
+            query_res = self.query(client, self.collection_name, limit=1, output_fields=[self.primary_field])[0]
+            ids_to_search = [query_res[0].get(self.primary_field)]
+            search_vectors = None
        all_pages_ids = []
        all_pages_grpby_field_values = []
        res_count = limit * group_size
        for r in range(page_rounds):
-            page_res = self.search(client, self.collection_name, data=search_vectors, anns_field=default_search_field,
+            page_res = self.search(client, self.collection_name,
+                                   data=search_vectors,
+                                   ids=ids_to_search,
+                                   anns_field=default_search_field,
                                   search_params=search_param, limit=limit, offset=limit * r,
                                   filter=default_search_exp,
                                   group_by_field=grpby_field, group_size=group_size,

@@ -578,7 +589,10 @@ class TestGroupSearch(TestMilvusClientV2Base):
        assert hit_rate >= expect_hit_rate

        total_count = limit * group_size * page_rounds
-        total_res = self.search(client, self.collection_name, data=search_vectors, anns_field=default_search_field,
+        total_res = self.search(client, self.collection_name,
+                                data=search_vectors,
+                                ids=ids_to_search,
+                                anns_field=default_search_field,
                                search_params=search_param, limit=limit * page_rounds,
                                filter=default_search_exp,
                                group_by_field=grpby_field, group_size=group_size,
@@ -210,3 +210,39 @@ class TestSearchIterator(TestcaseBase):
                                     check_task=CheckTasks.err_res,
                                     check_items={"err_code": 1,
                                                  "err_msg": "Not support search iteration over multiple vectors at present"})
+
+    @pytest.mark.tags(CaseLabel.L2)
+    def test_search_iterator_not_support_search_by_pk(self):
+        """
+        target: test search iterator does not support search by pk
+        method: 1. search iterator by pk
+        expected: search failed with error
+        """
+        # 1. initialize with data
+        batch_size = 100
+        dim = 128
+        collection_w = self.init_collection_general(
+            prefix, True, dim=dim, is_index=False)[0]
+        collection_w.create_index(field_name, {"metric_type": "L2"})
+        collection_w.load()
+        # 2. search iterator
+        search_params = {"metric_type": "L2"}
+        ids_to_search = [1]
+        collection_w.search_iterator(
+            ids=ids_to_search,
+            anns_field=field_name,
+            param=search_params,
+            batch_size=batch_size,
+            check_task=CheckTasks.err_res,
+            check_items={"err_code": 999,
+                         "err_msg": "object of type 'NoneType' has no len()"})
+
+        collection_w.search_iterator(
+            data=vectors[:1],
+            ids=ids_to_search,
+            anns_field=field_name,
+            param=search_params,
+            batch_size=batch_size,
+            check_task=CheckTasks.err_res,
+            check_items={"err_code": 999,
+                         "err_msg": "Either ids or data must be provided, not both"})
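For contrast with the two failure cases above, the supported iterator path still takes query vectors. A minimal sketch of the ORM-style call the wrapper forwards to (collection name, field name, and vector are placeholders, assuming an already created, indexed, and loaded collection):

from pymilvus import Collection

collection = Collection("iter_collection")   # placeholder, assumes an existing loaded collection
search_params = {"metric_type": "L2"}
iterator = collection.search_iterator(
    data=[[0.0] * 128],          # query vector(s); passing ids instead is rejected, as tested above
    anns_field="float_vector",   # placeholder vector field name
    param=search_params,
    batch_size=100,
)
while True:
    page = iterator.next()
    if not page:                 # an empty page signals the end of iteration
        iterator.close()
        break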
@@ -440,7 +440,8 @@ class TestMilvusClientSearchPagination(TestMilvusClientV2Base):

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("offset", [0, 100])
-    def test_search_pagination_with_expression(self, offset):
+    @pytest.mark.parametrize("search_by_pk", [True, False])
+    def test_search_pagination_with_expression(self, offset, search_by_pk):
        """
        target: Test search pagination functionality with filtering expressions
        method: 1. Create collection and insert test data

@@ -470,12 +471,17 @@
        elif len(filter_ids) - offset < default_limit:
            limit = len(filter_ids) - offset
        # 3. search with a high nprobe for better accuracy
-        search_params = {"metric_type": "COSINE", "params": {"nprobe": 128}, "offset": offset}
+        search_params = {"metric_type": "COSINE", "params": {"nprobe": 128}, "offset": offset}
+        ids_to_search = None
+        vectors_to_search = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
+        if search_by_pk:
+            ids_to_search = [k for k in range(default_nq)]
+            vectors_to_search = None
        search_res_with_offset, _ = self.search(
            client,
            collection_name,
-            vectors_to_search[:default_nq],
+            data=vectors_to_search,
+            ids=ids_to_search,
            anns_field=self.float_vector_field_name,
            search_params=search_params,
            limit=default_limit,

@@ -492,7 +498,8 @@
        search_res_full, _ = self.search(
            client,
            collection_name,
-            vectors_to_search[:default_nq],
+            data=vectors_to_search,
+            ids=ids_to_search,
            anns_field=self.float_vector_field_name,
            search_params=search_params_full,
            limit=default_limit + offset,

@@ -516,7 +523,8 @@
        search_res_with_offset, _ = self.search(
            client,
            collection_name,
-            vectors_to_search[:default_nq],
+            data=vectors_to_search,
+            ids=ids_to_search,
            anns_field=self.float_vector_field_name,
            search_params=search_params,
            limit=default_limit,

@@ -533,7 +541,8 @@
        search_res_full, _ = self.search(
            client,
            collection_name,
-            vectors_to_search[:default_nq],
+            data=vectors_to_search,
+            ids=ids_to_search,
            anns_field=self.float_vector_field_name,
            search_params=search_params_full,
            limit=default_limit + offset,
@@ -233,6 +233,14 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
                r = r.to_dict()
                assert any([token in r["entity"][field] for token in top_10_tokens])

+            # verify Text Match supports search by pk
+            collection_w.search(ids=[1, 2],
+                                anns_field=ann_field,
+                                param={}, limit=100,
+                                expr=expr, output_fields=["id", field],
+                                check_task=CheckTasks.check_search_results,
+                                check_items={"nq": 2, "limit": 100})
+
    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("enable_partition_key", [True, False])
    @pytest.mark.parametrize("enable_inverted_index", [True, False])
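The hunk above combines the new search-by-pk path with a text match filter expression. A minimal client-level sketch of the same idea (collection, field names, and the match token are placeholders, and it assumes the client accepts the `ids` argument the way these tests do):

from pymilvus import MilvusClient

client = MilvusClient(uri="http://localhost:19530")       # placeholder endpoint
res = client.search(
    collection_name="tm_collection",                       # placeholder collection
    ids=[1, 2],                                            # search by primary keys
    anns_field="emb",                                      # placeholder vector field name
    filter="TEXT_MATCH(word, 'milvus')",                   # text match filter on a VARCHAR field
    output_fields=["id", "word"],
    limit=100,
)
# One hit list per primary key is expected, i.e. nq == len(ids), as asserted above.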
@@ -252,7 +252,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
            name=cf.gen_unique_str(prefix), schema=schema
        )
        res, result = collection_w.describe()
-        log.info(f"collection describe {res}")
+        # log.info(f"collection describe {res}")
        assert not result, (
            "create collection with unsupported tokenizer should be failed"
        )

@@ -342,7 +342,7 @@
            name=cf.gen_unique_str(prefix), schema=schema
        )
        res, result = collection_w.describe()
-        log.info(f"collection describe {res}")
+        # log.info(f"collection describe {res}")
        assert result, (
            "create collection with valid input/output should be successful"
        )

@@ -539,7 +539,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
            hybrid_data.append(tmp)
        data = hybrid_data + data
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -853,7 +853,7 @@
            hybrid_data.append(tmp)
        data = hybrid_data + data
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(df[i : i + batch_size])

@@ -947,7 +947,7 @@
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
-        log.info(f"collection describe {collection_w.describe()}")
+        # log.info(f"collection describe {collection_w.describe()}")
        fake = fake_en
        language = "en"
        if tokenizer == "jieba":
@@ -987,8 +987,8 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
-        log.info("analyze documents")
+        # log.info(f"dataframe\n{df}")
+        # log.info("analyze documents")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = list(word_freq.keys())

@@ -1024,7 +1024,7 @@
        for i in range(nq):
            assert len(res_list[i]) == limit
            search_text = search_data[i]
-            log.info(f"res: {res_list[i]}")
+            # log.info(f"res: {res_list[i]}")
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]

@@ -1128,7 +1128,7 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -1248,7 +1248,7 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -1402,7 +1402,7 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -1540,7 +1540,7 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -1703,7 +1703,7 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -1733,7 +1733,7 @@
        # describe index info to verify
        res = collection_w.indexes
        index_info = [r.to_dict() for r in res]
-        log.info(f"index info: {index_info}")
+        # log.info(f"index info: {index_info}")
        for info in index_info:
            if info["index_name"] == "text_sparse_emb":
                assert info["index_param"]["index_type"] == index_type
@@ -1834,7 +1834,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -1950,7 +1950,7 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -2069,7 +2069,7 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -2176,7 +2176,7 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
@@ -2323,13 +2323,13 @@ class TestSearchWithFullTextSearch(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        most_freq_word = word_freq.most_common(10)
        tokens = [item[0] for item in most_freq_word]
        if len(tokens) == 0:
-            log.info("empty tokens, add a dummy token")
+            # log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):

@@ -2378,7 +2378,7 @@
            limit=limit,
        )
        candidates_num = len(res)
-        log.info(f"search data: {search_data}")
+        # log.info(f"search data: {search_data}")
        # use offset = 0 to get all the results
        full_res_list, _ = collection_w.search(
            data=search_data,

@@ -2411,7 +2411,7 @@
        for i in range(nq):
            assert 0 < len(res_list[i]) <= min(limit, candidates_num)
            search_text = search_data[i]
-            log.info(f"res: {res_list[i]}")
+            # log.info(f"res: {res_list[i]}")
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]
@@ -2431,11 +2431,25 @@
                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
                    search_text, result_text, language=language
                )
-                log.info(f"overlap {overlap}")
+                # log.info(f"overlap {overlap}")
                assert len(overlap) > 0, (
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
                )

+        # verify full text search does not support search by pk
+        error = {ct.err_code: 100,
+                 ct.err_msg: f"not allowed to retrieve raw data of field text_sparse_emb"}
+        collection_w.search(
+            ids=[0, 1],
+            anns_field="text_sparse_emb",
+            expr=filter,
+            param={},
+            limit=limit,
+            offset=offset,
+            output_fields=["id", "text"],
+            check_task=CheckTasks.err_res, check_items=error
+        )
+
    @pytest.mark.tags(CaseLabel.L0)
    @pytest.mark.parametrize("nq", [2])
    @pytest.mark.parametrize("empty_percent", [0.5])
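The error message above suggests why search by pk is rejected for full text search: resolving a primary key into a query would require reading back the stored BM25 sparse vector, and raw retrieval of that function output field is not allowed. The supported path keeps passing query text, as the other tests in this file do; a minimal ORM-style sketch (collection and field names are placeholders):

from pymilvus import Collection

collection = Collection("fts_collection")        # placeholder, assumes BM25 schema is already set up
# Full text search takes raw query text in `data`; the sparse embedding is
# computed server-side by the BM25 function, so neither vectors nor ids are passed.
res_list = collection.search(
    data=["how to use milvus full text search"], # query text, not a vector and not ids
    anns_field="text_sparse_emb",
    param={},
    limit=10,
    output_fields=["id", "text"],
)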
@@ -2545,7 +2559,7 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = []

@@ -2553,7 +2567,7 @@
            if len(item[0]) == 2:
                tokens.append(item[0])
        if len(tokens) == 0:
-            log.info("empty tokens, add a dummy token")
+            # log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):

@@ -2603,7 +2617,7 @@
            limit=limit,
        )
        candidates_num = len(res)
-        log.info(f"search data: {search_data}")
+        # log.info(f"search data: {search_data}")
        # use offset = 0 to get all the results
        full_res_list, _ = collection_w.search(
            data=search_data,

@@ -2636,7 +2650,7 @@
        for i in range(nq):
            assert 0 < len(res_list[i]) <= min(limit, candidates_num)
            search_text = search_data[i]
-            log.info(f"res: {res_list[i]}")
+            # log.info(f"res: {res_list[i]}")
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]

@@ -2656,7 +2670,7 @@
                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
                    search_text, result_text, language=language
                )
-                log.info(f"overlap {overlap}")
+                # log.info(f"overlap {overlap}")
                assert len(overlap) > 0, (
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
                )

@@ -2763,13 +2777,13 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        most_freq_word = word_freq.most_common(10)
        tokens = [item[0] for item in most_freq_word]
        if len(tokens) == 0:
-            log.info("empty tokens, add a dummy token")
+            # log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        collection_w.create_index(
            "emb",

@@ -2818,7 +2832,7 @@
            limit=limit,
        )
        candidates_num = len(res)
-        log.info(f"search data: {search_data}")
+        # log.info(f"search data: {search_data}")
        # use offset = 0 to get all the results
        full_res_list, _ = collection_w.search(
            data=search_data,

@@ -2851,7 +2865,7 @@
        for i in range(nq):
            assert 0 < len(res_list[i]) <= min(limit, candidates_num)
            search_text = search_data[i]
-            log.info(f"res: {res_list[i]}")
+            # log.info(f"res: {res_list[i]}")
            res = res_list[i]
            for j in range(len(res)):
                r = res[j]

@@ -2871,7 +2885,7 @@
                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
                    search_text, result_text, language=language
                )
-                log.info(f"overlap {overlap}")
+                # log.info(f"overlap {overlap}")
                assert len(overlap) > 0, (
                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
                )
@@ -2976,12 +2990,12 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = list(word_freq.keys())
        if len(tokens) == 0:
-            log.info("empty tokens, add a dummy token")
+            # log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):

@@ -3014,7 +3028,7 @@
        collection_w.load()
        limit = 1000
        search_data = [fake.text().lower() + random.choice(tokens) for _ in range(nq)]
-        log.info(f"search data: {search_data}")
+        # log.info(f"search data: {search_data}")
        # get distance with search data
        res_list, _ = collection_w.search(
            data=search_data,

@@ -3045,7 +3059,7 @@
        )
        # verify correctness
        for i in range(nq):
-            log.info(f"res: {len(res_list[i])}")
+            # log.info(f"res: {len(res_list[i])}")
            assert len(res_list[i]) < limit  # less than limit, because the range is set
            res = res_list[i]
            for j in range(len(res)):

@@ -3153,12 +3167,12 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = list(word_freq.keys())
        if len(tokens) == 0:
-            log.info("empty tokens, add a dummy token")
+            # log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):

@@ -3192,7 +3206,7 @@
        search_data = [
            fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)
        ]
-        log.info(f"search data: {search_data}")
+        # log.info(f"search data: {search_data}")
        # get distance with search data
        batch_size = 100
        limit = batch_size * 10
@@ -3320,7 +3334,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -3353,7 +3367,7 @@
        nq = 2
        limit = 100
        search_data = ["" for _ in range(nq)]
-        log.info(f"search data: {search_data}")
+        # log.info(f"search data: {search_data}")
        res, _ = collection_w.search(
            data=search_data,
            anns_field="text_sparse_emb",

@@ -3462,12 +3476,12 @@
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        texts = df["text"].to_list()
        word_freq = cf.analyze_documents(texts, language=language)
        tokens = list(word_freq.keys())
        if len(tokens) == 0:
-            log.info("empty tokens, add a dummy token")
+            # log.info("empty tokens, add a dummy token")
            tokens = ["dummy"]
        batch_size = 5000
        for i in range(0, len(df), batch_size):

@@ -3508,7 +3522,7 @@
        search_data = cf.gen_vectors(
            nb=nq, dim=1000, vector_data_type=DataType.FLOAT_VECTOR
        )
-        log.info(f"search data: {search_data}")
+        # log.info(f"search data: {search_data}")
        error = {
            ct.err_code: 65535,
            ct.err_msg: "please provide varchar/text for BM25 Function based search",
@@ -3637,7 +3651,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
-        log.info(f"dataframe\n{df}")
+        # log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(

@@ -3705,7 +3719,7 @@
        assert len(res_list) == nq
        # check the result correctness
        for i in range(nq):
-            log.info(f"res length: {len(res_list[i])}")
+            # log.info(f"res length: {len(res_list[i])}")
            if enable_group_by_field:
                assert len(res_list[i]) == len(language_list)
            else:
@@ -3868,9 +3882,9 @@ class TestFullTextSearchMultiAnalyzer(TestcaseBase):
        )
        assert len(results) == 1
        assert len(results[0]) > 0
-        log.info(
-            f"Query '{test['query']}' with analyzer '{test['analyzer_name']}' returned {len(results[0])} results"
-        )
+        # log.info(
+        #     f"Query '{test['query']}' with analyzer '{test['analyzer_name']}' returned {len(results[0])} results"
+        # )

    @pytest.mark.tags(CaseLabel.L0)
    def test_multi_analyzer_fallback(self):

@@ -4084,8 +4098,8 @@
            output_fields=["doc_id", "language", "article_content"],
            limit=10,
        )
-        log.info(test)
-        log.info(results)
+        # log.info(test)
+        # log.info(results)
        assert len(results) == 1
        assert len(results[0]) > 0
        if test["analyzer_name"] == "eng":

@@ -4113,8 +4127,8 @@
            output_fields=["doc_id", "language", "article_content"],
            limit=10,
        )
-        log.info(test)
-        log.info(results)
+        # log.info(test)
+        # log.info(results)
        assert len(results) == 1
        assert len(results[0]) > 0
        for r in results[0]:

@@ -4162,7 +4176,7 @@
        res_diff = res_set - mock_res_set
        mock_res_diff = mock_res_set - res_set
        if res_diff or mock_res_diff:
-            log.error(f"result diff: {res_diff}, {mock_res_diff}")
+            # log.error(f"result diff: {res_diff}, {mock_res_diff}")
            assert False, (
                f"result diff: {res_diff} in origin but not in mock, {mock_res_diff} in mock but not in origin"
            )