From 7b26cef3be44edc82d8d07d2af61d09e27cc7a19 Mon Sep 17 00:00:00 2001 From: zhuwenxing Date: Wed, 2 Jul 2025 11:16:50 +0800 Subject: [PATCH] test: add group by for fts hybrid search (#43037) /kind improvement Signed-off-by: zhuwenxing --- tests/python_client/pytest.ini | 4 ++-- .../testcases/test_full_text_search.py | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/tests/python_client/pytest.ini b/tests/python_client/pytest.ini index 8869b6d5de..a4a39b4302 100644 --- a/tests/python_client/pytest.ini +++ b/tests/python_client/pytest.ini @@ -6,10 +6,10 @@ addopts = --host localhost --html=/tmp/ci_logs/report.html --self-contained-html log_format = [%(asctime)s - %(levelname)s - %(name)s]: %(message)s (%(filename)s:%(lineno)s) log_date_format = %Y-%m-%d %H:%M:%S - +markers = + tags: custom tags for test cases filterwarnings = ignore::DeprecationWarning asyncio_default_fixture_loop_scope = function - diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py index 31f3ee0858..1e85b0d901 100644 --- a/tests/python_client/testcases/test_full_text_search.py +++ b/tests/python_client/testcases/test_full_text_search.py @@ -3536,6 +3536,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): @pytest.mark.parametrize("empty_percent", [0]) @pytest.mark.parametrize("enable_partition_key", [True]) @pytest.mark.parametrize("enable_inverted_index", [True]) + @pytest.mark.parametrize("enable_group_by_field", [True, False]) @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"]) @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo) @@ -3547,6 +3548,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): empty_percent, index_type, inverted_index_algo, + enable_group_by_field, ): """ target: test full text search @@ -3561,6 +3563,11 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): dim = 128 fields = [ FieldSchema(name="id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name="language", + dtype=DataType.VARCHAR, + max_length=16, + ), FieldSchema( name="word", dtype=DataType.VARCHAR, @@ -3608,10 +3615,12 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): collection_w = self.init_collection_wrap( name=cf.gen_unique_str(prefix), schema=schema ) + language_list = ["en", "zh", "de", "jp", "unknown"] fake = fake_en data = [ { "id": i, + "language": random.choice(language_list), "word": fake.word().lower() if random.random() >= empty_percent else "", "sentence": fake.sentence().lower() if random.random() >= empty_percent @@ -3690,13 +3699,17 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): reqs=[bm25_search, dense_search, sparse_search], rerank=WeightedRanker(0.5, 0.5, 0.5), limit=limit, - output_fields=["id", "text"], + output_fields=["id", "text", "language"], + group_by_field="language" if enable_group_by_field else None, ) assert len(res_list) == nq # check the result correctness for i in range(nq): log.info(f"res length: {len(res_list[i])}") - assert len(res_list[i]) == limit + if enable_group_by_field: + assert len(res_list[i]) == len(language_list) + else: + assert len(res_list[i]) == limit class TestFullTextSearchMultiAnalyzer(TestcaseBase):