test: Add tests for search by ids (#46756)

related issue: #46755

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
yanliang567 2026-01-05 13:25:23 +08:00 committed by GitHub
parent 941c6eaed7
commit 7018151c7d
11 changed files with 2289 additions and 76 deletions
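
The capability exercised throughout this diff is searching by primary keys instead of query vectors. A minimal sketch of the two call styles, for orientation only: it assumes a running Milvus instance and a pymilvus MilvusClient whose search() accepts an ids keyword (the parameter these tests forward through the wrappers below); collection and field names are illustrative, not taken from the diff.

# Sketch only (not part of the diff): contrasts vector search with the
# search-by-ids style exercised in this PR. Assumes MilvusClient.search
# accepts an `ids` keyword; names and dimensions are illustrative.
from pymilvus import MilvusClient

client = MilvusClient(uri="http://localhost:19530")

# conventional search: query vectors supplied via `data`
res_by_vector = client.search(
    collection_name="demo_collection",
    data=[[0.1] * 128],
    anns_field="float_vector",
    search_params={"metric_type": "COSINE"},
    limit=10,
)

# search by ids: vectors are looked up from the entities with the given
# primary keys, so `data` is omitted
res_by_pk = client.search(
    collection_name="demo_collection",
    ids=[0, 1],
    anns_field="float_vector",
    search_params={"metric_type": "COSINE"},
    limit=10,
)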

View File

@ -171,10 +171,10 @@ class TestMilvusClientV2Base(Base):
return res, check_result
@trace()
def search(self, client, collection_name, data, limit=10, filter=None, output_fields=None, search_params=None,
def search(self, client, collection_name, data=None, limit=10, filter=None, output_fields=None, search_params=None,
timeout=None, check_task=None, check_items=None, **kwargs):
timeout = TIMEOUT if timeout is None else timeout
kwargs.update({"timeout": timeout})
# kwargs.update({"timeout": timeout})
func_name = sys._getframe().f_code.co_name
res, check = api_request([client.search, collection_name, data, filter, limit,

View File

@ -165,7 +165,7 @@ class ApiCollectionWrapper:
return res, check_result
@trace()
def search(self, data, anns_field, param, limit, expr=None,
def search(self, data=None, anns_field=None, param=None, limit=None, expr=None,
partition_names=None, output_fields=None, timeout=None, round_decimal=-1,
check_task=None, check_items=None, **kwargs):
timeout = TIMEOUT if timeout is None else timeout
@ -197,7 +197,7 @@ class ApiCollectionWrapper:
return res, check_result
@trace()
def search_iterator(self, data, anns_field, param, batch_size, limit=-1, expr=None,
def search_iterator(self, data=None, anns_field=None, param=None, batch_size=None, limit=-1, expr=None,
partition_names=None, output_fields=None, timeout=None, round_decimal=-1,
check_task=None, check_items=None, **kwargs):
timeout = TIMEOUT if timeout is None else timeout
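
The relaxed defaults above (data=None, anns_field=None, param=None) exist so the new cases can call the wrappers with primary keys only, forwarding ids through **kwargs. A hedged sketch of such a call, mirroring the tests later in this diff (field and metric names are illustrative):

# Sketch only: wrapper call without query vectors, relying on the new
# optional defaults; `ids` passes through **kwargs to the underlying search.
search_res = collection_w.search(
    ids=[0, 1],
    anns_field="float_vector",
    param={"metric_type": "L2"},
    limit=10,
)[0]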

View File

@ -1824,6 +1824,30 @@ class TestMilvusClientStructArraySearch(TestMilvusClientV2Base):
assert check
assert len(results[0]) > 0
@pytest.mark.tags(CaseLabel.L1)
def test_search_struct_array_not_support_search_by_pk(self):
"""
target: test that searching with multiple vectors (EmbeddingList) in a struct array does not support search by pk
method: search the EmbeddingList field by pk
expected: search fails with an error
"""
collection_name = cf.gen_unique_str(f"{prefix}_search")
client = self._client()
# Create collection with data and index
self.create_collection_with_index(client, collection_name)
# Search using EmbeddingList
error = {ct.err_code: 999,
ct.err_msg: "array of vector is not supported for search by IDs"}
self.search(client,
collection_name,
ids=[0, 1],
anns_field="clips[clip_embedding1]",
search_params={"metric_type": "MAX_SIM_COSINE"},
limit=10,
check_task=CheckTasks.err_res, check_items=error)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("retrieval_ann_ratio", [1.0, 3.0, 5.0, 10.0])
def test_search_with_retrieval_ann_ratio(self, retrieval_ann_ratio):

View File

@ -668,7 +668,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base):
vector_name_list = [self.float_vector_field_name1, self.float_vector_field_name2]
# 3. prepare search params for each vector field
req_list = []
nq = 1 # only works for nq=1, as the limitation of get_hybrid_search_base_results_rrf()
nq = 1  # only works for nq=1, due to the limitation of get_hybrid_search_base_results_rrf()
search_res_dict_array = []
for field_name in vector_name_list:
search_data = cf.gen_vectors(nq, self.float_vector_dim, vector_data_type=DataType.FLOAT_VECTOR)
@ -797,7 +797,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base):
limit = 200
field_names = [self.sparse_vector_field_name1, self.sparse_vector_field_name2]
nq = len(field_names) # nq should equal to number of filed names, as it would search nq by nq next
nq = len(field_names)  # nq should equal the number of field names, as it searches nq by nq next
search_data = cf.gen_varchar_data(length=10, nb=nq, text_mode=True)
# 0. search
@ -1321,7 +1321,7 @@ class TestMilvusClientHybridSearch(TestMilvusClientV2Base):
check_items=check_items)
class TestCollectionHybridSearchValid(TestcaseBase):
class TestCollectionHybridSearch(TestcaseBase):
""" Test case of search interface """
@pytest.fixture(scope="function", params=["JACCARD", "HAMMING"])
@ -1728,3 +1728,27 @@ class TestCollectionHybridSearchValid(TestcaseBase):
for i in range(len(score_answer[:default_limit])):
delta = math.fabs(score_answer[i] - hybrid_res[0].distances[i])
assert delta < hybrid_search_epsilon
@pytest.mark.tags(CaseLabel.L1)
def test_hybrid_search_not_support_search_by_pk(self):
"""
Test case: Hybrid search does not support search by pk
Scenario:
- Build an AnnSearchRequest with 'ids' instead of vector data.
Expected:
- AnnSearchRequest raises TypeError for the unexpected 'ids' argument
"""
nq = 2
req_limit = 10
ids_to_search = [0, 1]
# generate hybrid search request list
sub_params = {
"ids": ids_to_search,
"anns_field": ct.default_float_vec_field_name,
"param": {},
"limit": req_limit
}
with pytest.raises(TypeError,
match="AnnSearchRequest.__init__.*got an unexpected keyword argument 'ids'"):
req = AnnSearchRequest(**sub_params)
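
For contrast, AnnSearchRequest in current pymilvus is built from query vectors, which is why the ids keyword above raises TypeError. A minimal sketch of the supported construction (values are illustrative):

# Sketch only: the supported AnnSearchRequest takes `data`, not `ids`.
from pymilvus import AnnSearchRequest

req = AnnSearchRequest(
    data=[[0.1] * 128],
    anns_field="float_vector",
    param={"metric_type": "COSINE"},
    limit=10,
)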

View File

@ -264,7 +264,8 @@ class TestCollectionRangeSearch(TestcaseBase):
# assert distances_tmp.count(1.0) == 1
@pytest.mark.tags(CaseLabel.L1)
def test_range_search_cosine(self):
@pytest.mark.parametrize("search_by_pk", [True, False])
def test_range_search_cosine(self, search_by_pk):
"""
target: test range search normal case
method: create connection, collection, insert and search
@ -278,9 +279,18 @@ class TestCollectionRangeSearch(TestcaseBase):
# 2. range search
range_search_params = {"metric_type": "COSINE",
"params": {"radius": radius, "range_filter": range_filter}}
search_res = collection_w.search(vectors[:nq], default_search_field,
range_search_params, default_limit,
default_search_exp)[0]
vectors_to_search = vectors[:nq]
ids_to_search = None
if search_by_pk is True:
vectors_to_search = None
ids_to_search = [0, 1]
search_res = collection_w.search(
data=vectors_to_search,
ids=ids_to_search,
anns_field=default_search_field,
param=range_search_params,
limit=default_limit,
expr=default_search_exp)[0]
# 3. check search results
for hits in search_res:

File diff suppressed because it is too large

View File

@ -228,7 +228,9 @@ class TestGroupSearch(TestMilvusClientV2Base):
assert len(set(group_values)) == 1
# when strict_group_size=false, it shall return results with group counts = limit
res1 = self.search(client, self.collection_name, data=search_vectors, anns_field=self.vector_fields[j],
res1 = self.search(client, self.collection_name,
data=search_vectors,
anns_field=self.vector_fields[j],
search_params=search_params, limit=limit,
group_by_field=group_by_field, filter=f"{output_field} is not null",
group_size=group_size, strict_group_size=False,
@ -537,7 +539,8 @@ class TestGroupSearch(TestMilvusClientV2Base):
assert len(grpby_field_values) == len(set(grpby_field_values))
@pytest.mark.tags(CaseLabel.L0)
def test_search_pagination_group_size(self):
@pytest.mark.parametrize("search_by_pk", [True, False])
def test_search_pagination_group_size(self, search_by_pk):
"""
verify search group by works with pagination and group_size
"""
@ -550,14 +553,22 @@ class TestGroupSearch(TestMilvusClientV2Base):
default_search_exp = f"{self.primary_field} >= 0"
grpby_field = self.inverted_string_field
default_search_field = self.vector_fields[1]
ids_to_search = None
search_vectors = cf.gen_vectors(1, dim=self.dims[1],
vector_data_type=cf.get_field_dtype_by_field_name(collection_info,
self.vector_fields[1]))
if search_by_pk is True:
query_res = self.query(client, self.collection_name, limit=1, output_fields=[self.primary_field])[0]
ids_to_search = [query_res[0].get(self.primary_field)]
search_vectors = None
all_pages_ids = []
all_pages_grpby_field_values = []
res_count = limit * group_size
for r in range(page_rounds):
page_res = self.search(client, self.collection_name, data=search_vectors, anns_field=default_search_field,
page_res = self.search(client, self.collection_name,
data=search_vectors,
ids=ids_to_search,
anns_field=default_search_field,
search_params=search_param, limit=limit, offset=limit * r,
filter=default_search_exp,
group_by_field=grpby_field, group_size=group_size,
@ -578,7 +589,10 @@ class TestGroupSearch(TestMilvusClientV2Base):
assert hit_rate >= expect_hit_rate
total_count = limit * group_size * page_rounds
total_res = self.search(client, self.collection_name, data=search_vectors, anns_field=default_search_field,
total_res = self.search(client, self.collection_name,
data=search_vectors,
ids=ids_to_search,
anns_field=default_search_field,
search_params=search_param, limit=limit * page_rounds,
filter=default_search_exp,
group_by_field=grpby_field, group_size=group_size,

View File

@ -210,3 +210,39 @@ class TestSearchIterator(TestcaseBase):
check_task=CheckTasks.err_res,
check_items={"err_code": 1,
"err_msg": "Not support search iteration over multiple vectors at present"})
@pytest.mark.tags(CaseLabel.L2)
def test_search_iterator_not_support_search_by_pk(self):
"""
target: test search iterator does not support search by pk
method: 1. call search_iterator with ids only 2. call search_iterator with both data and ids
expected: both calls fail with an error
"""
# 1. initialize with data
batch_size = 100
dim = 128
collection_w = self.init_collection_general(
prefix, True, dim=dim, is_index=False)[0]
collection_w.create_index(field_name, {"metric_type": "L2"})
collection_w.load()
# 2. search iterator
search_params = {"metric_type": "L2"}
ids_to_search = [1]
collection_w.search_iterator(
ids=ids_to_search,
anns_field=field_name,
param=search_params,
batch_size=batch_size,
check_task=CheckTasks.err_res,
check_items={"err_code": 999,
"err_msg": "object of type 'NoneType' has no len()"})
collection_w.search_iterator(
data=vectors[:1],
ids=ids_to_search,
anns_field=field_name,
param=search_params,
batch_size=batch_size,
check_task=CheckTasks.err_res,
check_items={"err_code": 999,
"err_msg": "Either ids or data must be provided, not both"})

View File

@ -440,7 +440,8 @@ class TestMilvusClientSearchPagination(TestMilvusClientV2Base):
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("offset", [0, 100])
def test_search_pagination_with_expression(self, offset):
@pytest.mark.parametrize("search_by_pk", [True, False])
def test_search_pagination_with_expression(self, offset, search_by_pk):
"""
target: Test search pagination functionality with filtering expressions
method: 1. Create collection and insert test data
@ -470,12 +471,17 @@ class TestMilvusClientSearchPagination(TestMilvusClientV2Base):
elif len(filter_ids) - offset < default_limit:
limit = len(filter_ids) - offset
# 3. search with a high nprobe for better accuracy
search_params = {"metric_type": "COSINE", "params": {"nprobe": 128}, "offset": offset}
search_params = {"metric_type": "COSINE", "params": {"nprobe": 128}, "offset": offset}
ids_to_search = None
vectors_to_search = [[random.random() for _ in range(default_dim)] for _ in range(default_nq)]
if search_by_pk:
ids_to_search = [k for k in range(default_nq)]
vectors_to_search = None
search_res_with_offset, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
data=vectors_to_search,
ids=ids_to_search,
anns_field=self.float_vector_field_name,
search_params=search_params,
limit=default_limit,
@ -492,7 +498,8 @@ class TestMilvusClientSearchPagination(TestMilvusClientV2Base):
search_res_full, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
data=vectors_to_search,
ids=ids_to_search,
anns_field=self.float_vector_field_name,
search_params=search_params_full,
limit=default_limit + offset,
@ -516,7 +523,8 @@ class TestMilvusClientSearchPagination(TestMilvusClientV2Base):
search_res_with_offset, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
data=vectors_to_search,
ids=ids_to_search,
anns_field=self.float_vector_field_name,
search_params=search_params,
limit=default_limit,
@ -533,7 +541,8 @@ class TestMilvusClientSearchPagination(TestMilvusClientV2Base):
search_res_full, _ = self.search(
client,
collection_name,
vectors_to_search[:default_nq],
data=vectors_to_search,
ids=ids_to_search,
anns_field=self.float_vector_field_name,
search_params=search_params_full,
limit=default_limit + offset,

View File

@ -233,6 +233,14 @@ class TestSearchWithTextMatchFilter(TestcaseBase):
r = r.to_dict()
assert any([token in r["entity"][field] for token in top_10_tokens])
# verify Text Match supports search by pk
collection_w.search(ids=[1, 2],
anns_field=ann_field,
param={}, limit=100,
expr=expr, output_fields=["id", field],
check_task=CheckTasks.check_search_results,
check_items={"nq": 2, "limit": 100})
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("enable_partition_key", [True, False])
@pytest.mark.parametrize("enable_inverted_index", [True, False])

View File

@ -252,7 +252,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
name=cf.gen_unique_str(prefix), schema=schema
)
res, result = collection_w.describe()
log.info(f"collection describe {res}")
# log.info(f"collection describe {res}")
assert not result, (
"create collection with unsupported tokenizer should be failed"
)
@ -342,7 +342,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
name=cf.gen_unique_str(prefix), schema=schema
)
res, result = collection_w.describe()
log.info(f"collection describe {res}")
# log.info(f"collection describe {res}")
assert result, (
"create collection with valid input/output should be successful"
)
@ -539,7 +539,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
hybrid_data.append(tmp)
data = hybrid_data + data
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -853,7 +853,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
hybrid_data.append(tmp)
data = hybrid_data + data
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(df[i : i + batch_size])
@ -947,7 +947,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
log.info(f"collection describe {collection_w.describe()}")
# log.info(f"collection describe {collection_w.describe()}")
fake = fake_en
language = "en"
if tokenizer == "jieba":
@ -987,8 +987,8 @@ class TestInsertWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
log.info("analyze documents")
# log.info(f"dataframe\n{df}")
# log.info("analyze documents")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
@ -1024,7 +1024,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
for i in range(nq):
assert len(res_list[i]) == limit
search_text = search_data[i]
log.info(f"res: {res_list[i]}")
# log.info(f"res: {res_list[i]}")
res = res_list[i]
for j in range(len(res)):
r = res[j]
@ -1128,7 +1128,7 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -1248,7 +1248,7 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -1402,7 +1402,7 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -1540,7 +1540,7 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -1703,7 +1703,7 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -1733,7 +1733,7 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
# describe index info to verify
res = collection_w.indexes
index_info = [r.to_dict() for r in res]
log.info(f"index info: {index_info}")
# log.info(f"index info: {index_info}")
for info in index_info:
if info["index_name"] == "text_sparse_emb":
assert info["index_param"]["index_type"] == index_type
@ -1834,7 +1834,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -1950,7 +1950,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -2069,7 +2069,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -2176,7 +2176,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -2323,13 +2323,13 @@ class TestSearchWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
most_freq_word = word_freq.most_common(10)
tokens = [item[0] for item in most_freq_word]
if len(tokens) == 0:
log.info("empty tokens, add a dummy token")
# log.info("empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
for i in range(0, len(df), batch_size):
@ -2378,7 +2378,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
limit=limit,
)
candidates_num = len(res)
log.info(f"search data: {search_data}")
# log.info(f"search data: {search_data}")
# use offset = 0 to get all the results
full_res_list, _ = collection_w.search(
data=search_data,
@ -2411,7 +2411,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
for i in range(nq):
assert 0 < len(res_list[i]) <= min(limit, candidates_num)
search_text = search_data[i]
log.info(f"res: {res_list[i]}")
# log.info(f"res: {res_list[i]}")
res = res_list[i]
for j in range(len(res)):
r = res[j]
@ -2431,11 +2431,25 @@ class TestSearchWithFullTextSearch(TestcaseBase):
overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
search_text, result_text, language=language
)
log.info(f"overlap {overlap}")
# log.info(f"overlap {overlap}")
assert len(overlap) > 0, (
f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
)
# verify full text search does not support search by pk
error = {ct.err_code: 100,
ct.err_msg: f"not allowed to retrieve raw data of field text_sparse_emb"}
collection_w.search(
ids=[0, 1],
anns_field="text_sparse_emb",
expr=filter,
param={},
limit=limit,
offset=offset,
output_fields=["id", "text"],
check_task=CheckTasks.err_res, check_items=error
)
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("nq", [2])
@pytest.mark.parametrize("empty_percent", [0.5])
@ -2545,7 +2559,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = []
@ -2553,7 +2567,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
if len(item[0]) == 2:
tokens.append(item[0])
if len(tokens) == 0:
log.info("empty tokens, add a dummy token")
# log.info("empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
for i in range(0, len(df), batch_size):
@ -2603,7 +2617,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
limit=limit,
)
candidates_num = len(res)
log.info(f"search data: {search_data}")
# log.info(f"search data: {search_data}")
# use offset = 0 to get all the results
full_res_list, _ = collection_w.search(
data=search_data,
@ -2636,7 +2650,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
for i in range(nq):
assert 0 < len(res_list[i]) <= min(limit, candidates_num)
search_text = search_data[i]
log.info(f"res: {res_list[i]}")
# log.info(f"res: {res_list[i]}")
res = res_list[i]
for j in range(len(res)):
r = res[j]
@ -2656,7 +2670,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
search_text, result_text, language=language
)
log.info(f"overlap {overlap}")
# log.info(f"overlap {overlap}")
assert len(overlap) > 0, (
f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
)
@ -2763,13 +2777,13 @@ class TestSearchWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
most_freq_word = word_freq.most_common(10)
tokens = [item[0] for item in most_freq_word]
if len(tokens) == 0:
log.info("empty tokens, add a dummy token")
# log.info("empty tokens, add a dummy token")
tokens = ["dummy"]
collection_w.create_index(
"emb",
@ -2818,7 +2832,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
limit=limit,
)
candidates_num = len(res)
log.info(f"search data: {search_data}")
# log.info(f"search data: {search_data}")
# use offset = 0 to get all the results
full_res_list, _ = collection_w.search(
data=search_data,
@ -2851,7 +2865,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
for i in range(nq):
assert 0 < len(res_list[i]) <= min(limit, candidates_num)
search_text = search_data[i]
log.info(f"res: {res_list[i]}")
# log.info(f"res: {res_list[i]}")
res = res_list[i]
for j in range(len(res)):
r = res[j]
@ -2871,7 +2885,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
search_text, result_text, language=language
)
log.info(f"overlap {overlap}")
# log.info(f"overlap {overlap}")
assert len(overlap) > 0, (
f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
)
@ -2976,12 +2990,12 @@ class TestSearchWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
if len(tokens) == 0:
log.info("empty tokens, add a dummy token")
# log.info("empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
for i in range(0, len(df), batch_size):
@ -3014,7 +3028,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
collection_w.load()
limit = 1000
search_data = [fake.text().lower() + random.choice(tokens) for _ in range(nq)]
log.info(f"search data: {search_data}")
# log.info(f"search data: {search_data}")
# get distance with search data
res_list, _ = collection_w.search(
data=search_data,
@ -3045,7 +3059,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
)
# verify correctness
for i in range(nq):
log.info(f"res: {len(res_list[i])}")
# log.info(f"res: {len(res_list[i])}")
assert len(res_list[i]) < limit # less than limit, because the range is set
res = res_list[i]
for j in range(len(res)):
@ -3153,12 +3167,12 @@ class TestSearchWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
if len(tokens) == 0:
log.info("empty tokens, add a dummy token")
# log.info("empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
for i in range(0, len(df), batch_size):
@ -3192,7 +3206,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
search_data = [
fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)
]
log.info(f"search data: {search_data}")
# log.info(f"search data: {search_data}")
# get distance with search data
batch_size = 100
limit = batch_size * 10
@ -3320,7 +3334,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -3353,7 +3367,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
nq = 2
limit = 100
search_data = ["" for _ in range(nq)]
log.info(f"search data: {search_data}")
# log.info(f"search data: {search_data}")
res, _ = collection_w.search(
data=search_data,
anns_field="text_sparse_emb",
@ -3462,12 +3476,12 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
texts = df["text"].to_list()
word_freq = cf.analyze_documents(texts, language=language)
tokens = list(word_freq.keys())
if len(tokens) == 0:
log.info("empty tokens, add a dummy token")
# log.info("empty tokens, add a dummy token")
tokens = ["dummy"]
batch_size = 5000
for i in range(0, len(df), batch_size):
@ -3508,7 +3522,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase):
search_data = cf.gen_vectors(
nb=nq, dim=1000, vector_data_type=DataType.FLOAT_VECTOR
)
log.info(f"search data: {search_data}")
# log.info(f"search data: {search_data}")
error = {
ct.err_code: 65535,
ct.err_msg: "please provide varchar/text for BM25 Function based search",
@ -3637,7 +3651,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
# log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
@ -3705,7 +3719,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase):
assert len(res_list) == nq
# check the result correctness
for i in range(nq):
log.info(f"res length: {len(res_list[i])}")
# log.info(f"res length: {len(res_list[i])}")
if enable_group_by_field:
assert len(res_list[i]) == len(language_list)
else:
@ -3868,9 +3882,9 @@ class TestFullTextSearchMultiAnalyzer(TestcaseBase):
)
assert len(results) == 1
assert len(results[0]) > 0
log.info(
f"Query '{test['query']}' with analyzer '{test['analyzer_name']}' returned {len(results[0])} results"
)
# log.info(
# f"Query '{test['query']}' with analyzer '{test['analyzer_name']}' returned {len(results[0])} results"
# )
@pytest.mark.tags(CaseLabel.L0)
def test_multi_analyzer_fallback(self):
@ -4084,8 +4098,8 @@ class TestFullTextSearchMultiAnalyzer(TestcaseBase):
output_fields=["doc_id", "language", "article_content"],
limit=10,
)
log.info(test)
log.info(results)
# log.info(test)
# log.info(results)
assert len(results) == 1
assert len(results[0]) > 0
if test["analyzer_name"] == "eng":
@ -4113,8 +4127,8 @@ class TestFullTextSearchMultiAnalyzer(TestcaseBase):
output_fields=["doc_id", "language", "article_content"],
limit=10,
)
log.info(test)
log.info(results)
# log.info(test)
# log.info(results)
assert len(results) == 1
assert len(results[0]) > 0
for r in results[0]:
@ -4162,7 +4176,7 @@ class TestFullTextSearchMultiAnalyzer(TestcaseBase):
res_diff = res_set - mock_res_set
mock_res_diff = mock_res_set - res_set
if res_diff or mock_res_diff:
log.error(f"result diff: {res_diff}, {mock_res_diff}")
# log.error(f"result diff: {res_diff}, {mock_res_diff}")
assert False, (
f"result diff: {res_diff} in origin but not in mock, {mock_res_diff} in mock but not in origin"
)