From e4b0f48bc0526babe6f4c9d7185a897b66ccec4e Mon Sep 17 00:00:00 2001
From: Feilong Hou <77430856+FeilongHou@users.noreply.github.com>
Date: Wed, 24 Dec 2025 09:49:19 +0800
Subject: [PATCH] test: add e2e test cases for highlighter (#46505)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### **User description**
Issue: #46504

test: create e2e test case for highlighter

On branch feature/highlighter
Changes to be committed:
	new file: milvus_client/test_milvus_client_highlighter.py

___

### **PR Type**
Tests

___

### **Description**
- Add comprehensive e2e test suite for LexicalHighlighter functionality
- Test highlighter initialization with collection setup and data insertion
- Validate highlighter with various parameters (tags, fragments, offsets)
- Test edge cases including Chinese characters, long text, and invalid inputs
- Verify error handling for invalid fragment sizes, offsets, and configurations

___

### Diagram Walkthrough
```mermaid
flowchart LR
  A["Test Suite Setup"] --> B["Highlighter Init Tests"]
  B --> C["Valid Test Cases"]
  C --> D["Fragment Parameters"]
  C --> E["Search Variations"]
  C --> F["Language Support"]
  B --> G["Invalid Test Cases"]
  G --> H["Parameter Validation"]
  G --> I["Error Handling"]
```
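For reviewers unfamiliar with the feature under test, the sketch below condenses the collection setup that the module fixture performs: a VARCHAR field with an analyzer feeding a BM25 function that populates a sparse vector field, which is then indexed with the BM25 metric. Field and function names follow the new test file; the URI, collection name, and dim value are placeholders for illustration and are not part of this PR.

```python
from pymilvus import MilvusClient, DataType, Function, FunctionType

# Condensed sketch of the fixture's collection setup (assumes a running Milvus;
# the URI, collection name, and dim are placeholders).
client = MilvusClient(uri="http://localhost:19530")

schema = client.create_schema(auto_id=False, enable_dynamic_field=True)
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=128)
# The analyzer must be enabled so the text can be tokenized for BM25 and highlighting.
schema.add_field(field_name="tr", datatype=DataType.VARCHAR, max_length=2000,
                 enable_analyzer=True, analyzer_params={"tokenizer": "standard"})
schema.add_field(field_name="tr_sparse_emb", datatype=DataType.SPARSE_FLOAT_VECTOR)

# BM25 function: derives the sparse embedding from the raw text field.
schema.add_function(Function(
    name="tr_bm25_emb",
    function_type=FunctionType.BM25,
    input_field_names=["tr"],
    output_field_names=["tr_sparse_emb"],
    params={},
))

index_params = client.prepare_index_params()
index_params.add_index(field_name="vector", index_type="AUTOINDEX")
index_params.add_index(field_name="tr_sparse_emb",
                       index_type="SPARSE_INVERTED_INDEX", metric_type="BM25")

client.create_collection("highlighter_demo", schema=schema,
                         index_params=index_params, consistency_level="Strong")
```

The real fixture additionally creates Chinese, multi-analyzer, and non-BM25 variants of the text field, each with its own sparse field and BM25 function; they are omitted here for brevity.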

File Walkthrough

Relevant files
Tests: test_milvus_client_highlighter.py - Add comprehensive LexicalHighlighter e2e test suite

tests/python_client/milvus_client/test_milvus_client_highlighter.py
  • Create new test file with 1163 lines of comprehensive highlighter test
    cases
  • Implement a single TestMilvusClientHighlighter class whose
    module-scoped fixture initializes the shared collection with
    pre-defined test data including English, Chinese, and long text
    samples
  • Add 15+ valid test methods covering basic usage, multiple tags,
    fragment parameters, offsets, numbers, sentences, and language support
  • Add 8+ invalid test methods validating error handling for invalid
    parameters and configurations
  • Test highlighter with BM25 search, text matching, and various analyzer
    configurations (a minimal usage sketch follows this list)
+1163/-0
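
As a companion to the walkthrough, here is a minimal sketch of the pattern these tests exercise: build a LexicalHighlighter (a recent pymilvus addition) and pass it via the highlighter argument of client.search against the BM25 sparse field. It reuses the hypothetical highlighter_demo collection from the schema sketch above; the inserted row mirrors the test data, and the exact fragment boundaries shown are an assumption based on the test's expected values, not guaranteed output.

```python
from pymilvus import MilvusClient, LexicalHighlighter

client = MilvusClient(uri="http://localhost:19530")  # placeholder URI

# One row from the test data so the highlighter has something to match.
client.insert("highlighter_demo", [{
    "id": 1,
    "vector": [0.0] * 128,
    "tr": "I have the seat full of water! Like, full of water!",
}])

# Wrap matched terms in custom tags and return a single 10-character fragment.
highlight = LexicalHighlighter(
    pre_tags=["{"], post_tags=["}"],
    highlight_search_text=True,
    fragment_offset=0,
    fragment_size=10,
    num_of_fragments=1,
)

results = client.search(
    "highlighter_demo",
    ["water"],                      # full-text query for the BM25 search
    anns_field="tr_sparse_emb",     # sparse field produced by the BM25 function
    search_params={"params": {"nlist": 128}, "metric_type": "BM25"},
    output_fields=["tr"],
    highlighter=highlight,
)

# Each hit carries a "highlight" map keyed by output field name, roughly:
# results[0][0]["highlight"]["tr"] -> ["{water}! Lik"]
```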
___ ## Summary by CodeRabbit * **Tests** * Added a comprehensive highlighter test suite covering: - Core highlighting with single and multi-analyzer setups and multi-tag variations - Fragment parameter behaviors and edge cases (size, offset, count) - Text-match and query-based highlighting, including BM25 and vector interactions - Sub-word, long-text/tag, case sensitivity, Chinese/multi-language scenarios - Error handling for invalid parameters, no-match cases, and other edge conditions - Module-scoped fixture preparing multilingual, long-form test data and teardown ✏️ Tip: You can customize this high-level summary in your review settings. --------- Signed-off-by: Eric Hou Co-authored-by: Eric Hou --- .../test_milvus_client_highlighter.py | 1164 +++++++++++++++++ 1 file changed, 1164 insertions(+) create mode 100644 tests/python_client/milvus_client/test_milvus_client_highlighter.py diff --git a/tests/python_client/milvus_client/test_milvus_client_highlighter.py b/tests/python_client/milvus_client/test_milvus_client_highlighter.py new file mode 100644 index 0000000000..53b64b4418 --- /dev/null +++ b/tests/python_client/milvus_client/test_milvus_client_highlighter.py @@ -0,0 +1,1164 @@ +from itertools import cycle +import pytest + +from base.client_v2_base import TestMilvusClientV2Base +from utils.util_log import test_log as log +from common import common_func as cf +from common import common_type as ct +from common.common_type import CaseLabel, CheckTasks +from utils.util_pymilvus import * +from pymilvus import Function, FunctionType, DataType, LexicalHighlighter + +prefix = "client_insert" +epsilon = ct.epsilon +default_nb = ct.default_nb +default_nb_medium = ct.default_nb_medium +default_nq = ct.default_nq +default_dim = ct.default_dim +default_limit = ct.default_limit +default_search_exp = "id >= 0" +exp_res = "exp_res" +default_search_string_exp = "varchar >= \"0\"" +default_search_mix_exp = "int64 >= 0 && varchar >= \"0\"" +default_invaild_string_exp = "varchar >= 0" +default_json_search_exp = "json_field[\"number\"] >= 0" +perfix_expr = 'varchar like "0%"' +default_search_field = ct.default_float_vec_field_name +default_search_params = ct.default_search_params +default_primary_key_field_name = "id" +default_vector_field_name = "vector" +default_text_field_name = "tr" +default_text_field_name_chinese = "tr_chinese" +default_text_field_name_no_BM25 = "tr_no_bm25" +default_text_field_name_multi_analyzer = "tr_multi_analyzer" +default_sparse_vector_field_name = f"{default_text_field_name}_sparse_emb" +default_sparse_vector_field_name_chinese = f"{default_text_field_name_chinese}_sparse_emb" +default_sparse_vector_field_name_multi_analyzer = f"{default_text_field_name_multi_analyzer}_sparse_emb" + +COLLECTION_NAME = "test_hightligher" + cf.gen_unique_str("_") +@pytest.mark.xdist_group("TestMilvusClientHighlighter") +class TestMilvusClientHighlighter(TestMilvusClientV2Base): + """ + ######################################################### + Init collection with highlighter so all the tests can use the same collection + This aims to save time for the tests + Also, highlighter is difficult to compare the results, + so we need to init the collection with pre-defined data + ######################################################### + """ + @pytest.fixture(scope="module", autouse=True) + def prepare_highlighter_collection(self, request): + """ + Ensure the shared highlighter collection exists before any tests in this module, + and drop it after all tests in this module complete. 
+ """ + client = self._client() + collection_name = COLLECTION_NAME + if client.has_collection(collection_name): + client.drop_collection(collection_name) + + analyzer_params = { + "tokenizer": "standard" + } + analyzer_params_2 = { + "tokenizer": { + "type": "jieba", + "dict": ["结巴分词器"], + "mode": "exact", + "hmm": False + } + } + multi_analyzer_params = { + "by_field": "language", + "analyzers": { + "en": {"type": "english"}, + "zh": {"type": "chinese"}, + "default": {"tokenizer": "icu"}, + }, + "alias": {"chinese": "zh", "eng": "en"}, + } + schema = client.create_schema(auto_id=False, enable_dynamic_field=True) + schema.add_field(field_name=default_primary_key_field_name, datatype=DataType.INT64, is_primary=True) + schema.add_field(field_name=default_vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=default_dim) + schema.add_field(field_name="language", datatype=DataType.VARCHAR, max_length=16) + schema.add_field(field_name=default_text_field_name, datatype=DataType.VARCHAR, nullable=True, max_length=2000, enable_analyzer=True, + analyzer_params=analyzer_params) + schema.add_field(field_name=default_text_field_name_chinese, datatype=DataType.VARCHAR, nullable=True, max_length=2000, enable_analyzer=True, + analyzer_params=analyzer_params_2) + schema.add_field(field_name=default_text_field_name_no_BM25, datatype=DataType.VARCHAR, nullable=True, max_length=2000, enable_analyzer=True, + analyzer_params=analyzer_params_2) + schema.add_field(field_name=default_text_field_name_multi_analyzer, datatype=DataType.VARCHAR, nullable=True, max_length=2000, enable_analyzer=True, + multi_analyzer_params=multi_analyzer_params) + schema.add_field(field_name=default_sparse_vector_field_name, datatype=DataType.SPARSE_FLOAT_VECTOR) + schema.add_field(field_name=default_sparse_vector_field_name_chinese, datatype=DataType.SPARSE_FLOAT_VECTOR) + schema.add_field(field_name=default_sparse_vector_field_name_multi_analyzer, datatype=DataType.SPARSE_FLOAT_VECTOR) + bm25_function = Function( + name=f"{default_text_field_name}_bm25_emb", + function_type=FunctionType.BM25, + input_field_names=[default_text_field_name], + output_field_names=[default_sparse_vector_field_name], + params={} + ) + bm25_function_2 = Function( + name=f"{default_text_field_name_chinese}_bm25_emb", + function_type=FunctionType.BM25, + input_field_names=[default_text_field_name_chinese], + output_field_names=[default_sparse_vector_field_name_chinese], + params={} + ) + bm25_function_multi_analyzer = Function( + name=f"{default_text_field_name_multi_analyzer}_bm25_emb", + function_type=FunctionType.BM25, + input_field_names=[default_text_field_name_multi_analyzer], + output_field_names=[default_sparse_vector_field_name_multi_analyzer], + params={} + ) + schema.add_function(bm25_function) + schema.add_function(bm25_function_2) + schema.add_function(bm25_function_multi_analyzer) + index_params = client.prepare_index_params() + index_params.add_index(field_name=default_primary_key_field_name, index_type="AUTOINDEX") + index_params.add_index(field_name=default_vector_field_name, index_type="AUTOINDEX") + index_params.add_index(field_name=default_sparse_vector_field_name, index_type="SPARSE_INVERTED_INDEX", metric_type="BM25") + index_params.add_index(field_name=default_sparse_vector_field_name_chinese, index_type="SPARSE_INVERTED_INDEX", metric_type="BM25") + index_params.add_index(field_name=default_sparse_vector_field_name_multi_analyzer, index_type="SPARSE_INVERTED_INDEX", metric_type="BM25") + 
index_params.add_index(field_name=default_text_field_name, index_type="AUTOINDEX") + index_params.add_index(field_name=default_text_field_name_chinese, index_type="AUTOINDEX") + client.create_collection(collection_name=collection_name, schema=schema, index_params=index_params, consistency_level="Strong") + + text = ["Is there a leakage?", + "A leakage of what?", + "I have the seat full of water! Like, full of water!", + "Must be water.", + "Let's add that to the words of wisdom", + "7654321 keyword 1234567", + "key key key key key key key key", + "trytrytrytrytrytry", + "word 1 word 12 word 123", + "1 2 3 4 5 6 7 8 9 10", + "A B C D 一二三四 milvus结巴分词器中文测试", + "There is a sub-word in this sentence", + "", + None, + ("Dusk settled gently upon Embermoor, turning streets into ribbons in twilight glow. " + "In this quiet district, young Teren roamed with restless intent. He sought purpose, " + "something bright enough to hush turmoil pressing within his chest.\n" + "Teren’s trouble began when curious murmurs drifted through town concerning shimmering " + "lights rising nightly beyond hills. Elders insisted it stemmed from old ruins, relics " + "left behind by wanderers long gone. Yet no one ventured there; timid hearts kept them " + "grounded.\n" + "One evening, driven by stubborn courage, Teren set out alone. Crisp wind brushed his " + "cheeks, guiding him through slender trees trembling under midnight hush. Crickets chirped " + "rhythmically, echoing his steady footsteps. He pressed forward until dim ruins emerged, " + "stones bent by centuries yet proud in their quiet endurance.\n" + "Upon entering, Teren sensed something stirring—bright pulses drifting through corridors " + "like living embers. One ember hovered close, swirling gently, studying him with earnest " + "curiosity. It emitted tender hums resonating deep within Teren’s chest, soothing worry " + "stitched into his spirit.\n" + "Ember drifted higher, inviting him to follow. Teren stepped through crumbled chambers " + "until they reached an inner court where hundreds curled in silent orbits—tiny spheres " + "burning with soft brilliance. Together they formed swirling constellations, shimmering " + "Instantly, warmth surged through him—not harsh, not wild, but gentle strength reminding " + "him he belonged in this immense world. Hidden burdens loosened. He felt courage blooming, " + "rooted in something deeper than fear.\n" + "When dawn arrived, Ember escorting him outside, Teren turned to ruins glowing faintly " + "beneath morning light. He understood now: these spirits lingered not for warning but for " + "guiding tender souls seeking direction.\n" + "He returned to Embermoor changed. 
Not every problem dissolved, yet Teren moved through " + "his days with renewed stride, carrying brilliance gifted during his midnight journey."), + ("黄昏降临在静谧城镇,灯影沿着街道缓缓铺展。青年林舟怀着不安在巷道行走,心跳与脚步相互呼应。他渴望找到方向,却被往昔失落缠绕。" + "传言提到远处丘陵夜晚会浮现微光,长者劝人别靠近,担忧未知带来风险。林舟仍被好奇牵引,选择独自前往。冷风掠过树梢,星辰悬挂高空,陪伴他穿越草径。" + "残破遗迹映入眼帘,石壁布满岁月痕迹。踏入其内,柔亮光点缓缓旋转,如同呼吸般起伏。某个光点靠近他,散发温暖振动,仿佛聆听他内心低语。" + "光点引领他走向空旷庭院,成群光辉在空域盘旋,编织出壮丽图景。那瞬间,林舟感到胸口释然,恐惧逐渐消散,勇气悄然生根。" + "黎明到来,他回望遗迹,光辉渐隐却留存于心。归途上,他步伐坚定,明白指引并未消失,只是化作持续前行力量。每当夜色再度降临,他都会抬头微笑,感谢那段静默旅程。"), + "甲,甲乙,甲乙丙,甲乙丙丁,甲乙丙丁戊,甲乙丙丁戊己,甲乙丙丁戊己庚,甲乙丙丁戊己庚辛,甲乙丙丁戊己庚辛壬,甲乙丙丁戊己庚辛壬癸"] + + l = len(text) + + rows = cf.gen_row_data_by_schema(nb=l, schema=schema) + for i, row in enumerate(rows): + row[default_text_field_name] = text[i] + row[default_text_field_name_chinese] = text[i] + row[default_text_field_name_no_BM25] = text[i] + row[default_text_field_name_multi_analyzer] = text[i] + row["language"] = "en" if i % 2 == 0 else "zh" + + client.insert(collection_name=collection_name, data=rows) + + def teardown(): + try: + if self.has_collection(self._client(), COLLECTION_NAME): + self.drop_collection(self._client(), COLLECTION_NAME) + except Exception: + pass + request.addfinalizer(teardown) + + """ + ****************************************************************** + # The following are valid test cases + ****************************************************************** + """ + @pytest.mark.tags(CaseLabel.L0) + def test_milvus_client_highlighter_basic(self): + """ + target: Test highlighter can be successfully used + method: + 1. Search the data + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + pre_tags = ["<<<<<<<"] + post_tags = [">>>>>>"] + highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags, + highlight_search_text = True, + fragment_offset=0, + fragment_size = 10, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + + expected = [[f"{pre_tags[0]}water{post_tags[0]}."], + [f"{pre_tags[0]}water{post_tags[0]}! Lik"]] + + results = client.search( + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] in expected + + + # test with multi analyzer + # BUG: #46498 + ''' + results = client.search( + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name_multi_analyzer, + output_fields=[default_text_field_name_multi_analyzer], + highlighter = highlight + ) + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name_multi_analyzer] in expected + ''' + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_multiple_tags(self): + """ + target: Test highlighter can be successfully used with multiple tags + method: + 1. 
Search the data + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + pre_tags = ["{", "<", "="] + post_tags = ["}", ">", "="] + highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags, + highlight_search_text = True, + fragment_offset=0, + fragment_size = 100, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + + expected = [] + for pre, post in cycle(zip(pre_tags, post_tags)): + expected.append(f"{pre}key{post}") + if len(expected) == 8: + break + expected = " ".join(expected) + + results = client.search( + collection_name, + ["key"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] == [expected] + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_fragment_parameters(self): + """ + target: Test highlighter can be successfully used with fragment parameters + method: + 1. Search the data with different fragment parameters + nested list: row corresponds to fragment_size, column corresponds to num_of_fragments + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + fragment_size = [1, 9, 100] + num_of_fragments = [0, 1, 2] + + # row 0 corresponds to fragment_size = 1 column 0 corresponds to num_of_fragments = 0 + # row 1 corresponds to fragment_size = 9 column 1 corresponds to num_of_fragments = 1 + # row 2 corresponds to fragment_size = 100 column 2 corresponds to num_of_fragments = 2 + expected = [[[[]], [["{water}"]], [['{water}'], ["{water}", "{water}"]]], + [[[]], [['{water}.'], ['{water}! Li']], [['{water}.'], ['{water}! Li', '{water}!']]], + [[[]], [['{water}.'], ['{water}! Like, full of {water}!']], [['{water}.'], ['{water}! Like, full of {water}!']]]] + + for i, size in enumerate(fragment_size): + for j, num in enumerate(num_of_fragments): + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = size, + num_of_fragments=num) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] in expected[i][j] + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_fragment_offset(self): + """ + target: Test highlighter can be successfully used with fragment offset + method: + 1. Search the data with different fragment offset + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + fragment_offset = [0, 5, 100] + + expected = [[["=water="], ['=water=', '=water=']], + [['t be =water='], ['l of =water=', 'l of =water=']], + [['Must be =water='], ['I have the seat full of =water=', 'I have the seat full of water! 
Like, full of =water=']]] + + for i, offset in enumerate(fragment_offset): + highlight = LexicalHighlighter(pre_tags=["="], post_tags=["="], + highlight_search_text = True, + fragment_offset=offset, + fragment_size = 5, + num_of_fragments=2) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] in expected[i] + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_number(self): + """ + target: Test highlighter can be successfully used with numbers + method: + 1. Search the data with different fragment size + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + + expected = [[['word {1}'], ['{1} 2 3']], + [['7654321 keyword {1234567}']]] + highlight = LexicalHighlighter(pre_tags=["{", "<"], post_tags=["}", ">"], + highlight_search_text = True, + fragment_offset=100, + fragment_size = 5, + num_of_fragments=2) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + + results = client.search( + collection_name, + ["1", "1234567"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + for i, result in enumerate(results): + # assert to make sure the results are not empty + assert result != [] + for res in result: + assert res['highlight'][default_text_field_name] in expected[i] + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_offset_greater_than_fragment_size(self): + """ + target: Test highlighter can be successfully used with offset greater than fragment size + method: + 1. Search the data with offset greater than fragment size + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = ["4321 {keyword}"] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=5, + fragment_size = 1, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["keyword"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] == expected + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_search_sentence(self): + """ + target: Test highlighter can be successfully used with search sentence + method: + 1. 
Search the data with search sentence + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = [['{A} {leakage} {of}', 'e of {what}?'], + ['re a {leakage}'], + ['{A} B C D 一二'], + ['full {of} wa', 'full {of} wa'], + ['ords {of} wi'] + ] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=5, + fragment_size = 10, + num_of_fragments=10) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["A leakage of what?"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] in expected + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_nonexistent_keyword(self): + """ + target: Test highlighter can be successfully used with nonexistent keyword + method: + 1. Search the data with nonexistent keyword + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = [] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 1, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["nonexistent", "", "NULL", "None", "null", 'NaN'], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + for result in results: + assert result == expected + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_sub_word(self): + """ + target: Test highlighter can be successfully used with sub word + method: + 1. Search the data with sub word + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = ["{sub}"] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 1, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["sub"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] == expected + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_long_tags(self): + """ + target: Test highlighter can be successfully used with long tags + method: + 1. 
Search the data with long tags + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + pre_tags = ["{" * 999] + post_tags = ["}" * 999] + expected = [f"{pre_tags[0]}water{post_tags[0]}"] + highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags, + highlight_search_text = True, + fragment_offset=0, + fragment_size = 1, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] == expected + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_long_search_text(self): + """ + target: Test highlighter can be successfully used with long search text + method: + 1. Search the data with long search text + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = ["{Embermoor}", "{Embermoor}"] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 1, + num_of_fragments=10) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["Embermoor"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] == expected + + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_lower_upper_case(self): + """ + target: Test highlighter can be successfully used with text match and highlight search text + method: + 1. Search the data with text match and highlight search text + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 100, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["WATER"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + assert results[0] == [] + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_chinese_characters(self): + """ + target: Test highlighter can be successfully used with chinese characters + method: + 1. 
Search the data with chinese characters + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = ["{二}"] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 1, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + + results = client.search( + collection_name, + ["二"], + search_params=search_params, + anns_field=default_sparse_vector_field_name_chinese, + output_fields=[default_text_field_name_chinese], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name_chinese] == expected + + expected = ["{结巴分词器}"] + results = client.search( + collection_name, + ["结巴分词器"], + search_params=search_params, + anns_field=default_sparse_vector_field_name_chinese, + output_fields=[default_text_field_name_chinese], + highlighter = highlight + ) + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name_chinese] == expected + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_chinese_characters_long_text(self): + """ + target: Test highlighter can be successfully used with chinese characters + method: + 1. Search the data with chinese characters + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = ["{呼}","{如同呼吸般起伏}"] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 1, + num_of_fragments=10) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + + results = client.search( + collection_name, + ["如同呼吸般起伏"], + search_params=search_params, + anns_field=default_sparse_vector_field_name_chinese, + output_fields=[default_text_field_name_chinese], + highlighter = highlight + ) + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name_chinese] == expected + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_with_query(self): + """ + target: Test highlighter can be successfully used with query + method: + 1. Search the data with query + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + pre_tags = ["<<<<<<<"] + post_tags = [">>>>>>"] + highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags, + highlight_search_text = True, + fragment_offset=0, + fragment_size = 10, + num_of_fragments=1, + queries=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "seat"}]) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + + expected = [[f"{pre_tags[0]}water{post_tags[0]}."], + [f"{pre_tags[0]}water{post_tags[0]}! Lik"]] + + results = client.search( + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight + ) + + # assert to make sure the results are not empty + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name] in expected + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_text_match(self): + """ + target: Test highlighter can be successfully used with text match + method: + 1. 
Search the data with text match + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = ['{water}! Like, full of !'] + + highlight = LexicalHighlighter(pre_tags=["{", "<"], post_tags=["}", ">"], + highlight_search_text = False, + queries=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "water"}]) + + new_search_params = {"metric_type": "COSINE"} + + + vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 2", output_fields=[default_vector_field_name])[0][default_vector_field_name] + + results = client.search( + collection_name, + [vector], + search_params=new_search_params, + anns_field=default_vector_field_name, + output_fields=[default_text_field_name_no_BM25], + limit=1, + highlighter = highlight + ) + + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name_no_BM25] == expected + + # test with mismatched tags + expected = ['{water}! Like, full of {water>!'] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}", ">"], + highlight_search_text = False, + queries=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "water"}]) + + results = client.search( + collection_name, + [vector], + search_params=new_search_params, + anns_field=default_vector_field_name, + output_fields=[default_text_field_name_no_BM25], + limit=1, + highlighter = highlight + ) + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name_no_BM25] == expected + + # test with num_of_fragments > 1 + expected = ['{water}', '{water}'] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = False, + fragment_size = 1, + num_of_fragments=10, + queries=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "water"}]) + + results = client.search( + collection_name, + [vector], + search_params=new_search_params, + anns_field=default_vector_field_name, + output_fields=[default_text_field_name_no_BM25], + limit=1, + highlighter = highlight + ) + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name_no_BM25] == expected + + # test no match + expected = [] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = False, + queries=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "nonexistent"}]) + vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 1", output_fields=[default_vector_field_name])[0][default_vector_field_name] + results = client.search( + collection_name, + [vector], + search_params=new_search_params, + anns_field=default_vector_field_name, + output_fields=[default_text_field_name_no_BM25], + limit=1, + highlighter = highlight + ) + assert results[0][0]["highlight"][default_text_field_name_no_BM25] == expected + + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.skip(reason="skip for now wait for future optimization") + def test_milvus_client_highlighter_chinese_characters_repeating_text(self): + """ + target: Test highlighter can be successfully used with repeating text + method: + 1. 
Search the data with repeating text + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + expected = ["{甲乙丙丁戊己庚辛壬}"] + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 1, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + results = client.search( + collection_name, + ["甲乙丙丁戊己庚辛壬"], + search_params=search_params, + anns_field=default_sparse_vector_field_name_chinese, + output_fields=[default_text_field_name_chinese], + highlighter = highlight + ) + assert results[0] != [] + for result in results[0]: + assert result['highlight'][default_text_field_name_chinese] == expected + + + + """ + ****************************************************************** + # The following are invalid test cases + ****************************************************************** + """ + @pytest.mark.tags(CaseLabel.L0) + @pytest.mark.parametrize("fragment_size", [0, -1, 0.1]) + def test_milvus_client_highlighter_fragment_size_invalid(self, fragment_size): + """ + target: Test highlighter can be successfully used with fragment size zero + method: + 1. Search the data with fragment size zero + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = fragment_size, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + + error = {ct.err_code: 1100, + ct.err_msg: f"invalid fragment_size: {fragment_size}: invalid parameter"} + self.search( + client, + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight, + check_task=CheckTasks.err_res, + check_items=error + ) + + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("fragment_offset", [-1, 0.1]) + def test_milvus_client_highlighter_fragment_offset_invalid(self, fragment_offset): + """ + target: Test highlighter can be successfully used with fragment offset negative + method: + 1. Search the data with fragment offset negative + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=fragment_offset, + fragment_size = 10, + num_of_fragments=10) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + error = {ct.err_code: 1100, + ct.err_msg: f"invalid fragment_offset: {fragment_offset}: invalid parameter"} + self.search( + client, + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight, + check_task=CheckTasks.err_res, + check_items=error + ) + + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("num_of_fragments", [-1, 0.1]) + def test_milvus_client_highlighter_number_of_fragments_invalid(self, num_of_fragments): + """ + target: Test highlighter can be successfully used with number of fragments invalid + method: + 1. 
Search the data with number of fragments invalid + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 10, + num_of_fragments=num_of_fragments) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + error = {ct.err_code: 1100, + ct.err_msg: f"invalid num_of_fragments: {num_of_fragments}: invalid parameter"} + self.search( + client, + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight, + check_task=CheckTasks.err_res, + check_items=error + ) + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_text_match_invalid_search_params(self): + """ + target: Test highlighter can be successfully used with text match invalid search params + method: + 1. Search the data with text match invalid search params + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + pre_tags = ["<<<<<<<"] + post_tags = [">>>>>>"] + highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags, + highlight_search_text = True, + fragment_offset=0, + fragment_size = 10, + num_of_fragments=1, + queries=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "seat"}]) + search_params = {"metric_type": "COSINE"} + vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 2", output_fields=[default_vector_field_name])[0][default_vector_field_name] + + error = {ct.err_code: 1100, + ct.err_msg: f"Search highlight only support with metric type \"BM25\" but was: : invalid parameter"} + self.search( + client, + collection_name, + [vector], + search_params=search_params, + anns_field=default_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight, + check_task=CheckTasks.err_res, + check_items=error + ) + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_text_match_invalid_anns_field(self): + """ + target: Test highlighter can be successfully used with text match invalid anns field + method: + 1. 
Search the data with text match invalid anns field + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + pre_tags = ["<<<<<<<"] + post_tags = [">>>>>>"] + highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags, + highlight_search_text = True, + fragment_offset=0, + fragment_size = 10, + num_of_fragments=1, + queries=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "seat"}]) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 2", output_fields=[default_vector_field_name])[0][default_vector_field_name] + + + # textMatch with BM25 anns field + error = {ct.err_code: 1100, + ct.err_msg: f"please provide varchar/text for BM25 Function based search, got FloatVector: invalid parameter"} + self.search( + client, + collection_name, + [vector], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight, + check_task=CheckTasks.err_res, + check_items=error + ) + + # highlight search text with vector anns field + error = {ct.err_code: 5, + ct.err_msg: f"service internal error: Search with highlight failed, input field of BM25 annsField not found"} + self.search( + client, + collection_name, + ["water"], + search_params=search_params, + anns_field=default_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight, + check_task=CheckTasks.err_res, + check_items=error + ) + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_text_match_with_highlight_search_text(self): + """ + target: Test highlighter can be successfully used with text match and highlight search text + method: + 1. Search the data with text match and highlight search text + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + + highlight = LexicalHighlighter(pre_tags=["{", "<"], post_tags=["}", ">"], + highlight_search_text = True, + queries=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "water"}]) + + new_search_params = {"metric_type": "COSINE"} + + + vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 2", output_fields=[default_vector_field_name])[0][default_vector_field_name] + error = {ct.err_code: 1100, + ct.err_msg: f"Search highlight only support with metric type \"BM25\" but was: : invalid parameter"} + self.search( + client, + collection_name, + [vector], + search_params=new_search_params, + anns_field=default_vector_field_name, + output_fields=[default_text_field_name_no_BM25], + limit=1, + highlighter = highlight, + check_task=CheckTasks.err_res, + check_items=error + ) + + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_with_filter(self): + """ + target: Test highlighter can be successfully used with query invalid search params + method: + 1. 
Search the data with query invalid search params + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + + highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 10, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + error = {ct.err_code: 1100, + ct.err_msg: f"failed to create query plan: cannot parse expression: TEXT_MATCH({default_text_field_name}, \"seat\"), " + f"error: field \"{default_text_field_name}\" does not enable match: invalid parameter"} + + self.search( + client, + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight, + filter=f'TEXT_MATCH({default_text_field_name}, "seat")', + check_task=CheckTasks.err_res, + check_items=error + ) + + @pytest.mark.tags(CaseLabel.L1) + def test_milvus_client_highlighter_empty_tags(self): + """ + target: Test highlighter can be successfully used with filter invalid filter + method: + 1. Search the data with filter invalid filter + expected: Step 1 should result success + """ + client = self._client() + collection_name = COLLECTION_NAME + highlight = LexicalHighlighter(pre_tags=[], post_tags=[], + highlight_search_text = True, + fragment_offset=0, + fragment_size = 10, + num_of_fragments=1) + search_params = {"params": {"nlist": 128}, "metric_type": "BM25"} + error = {ct.err_code: 1100, + ct.err_msg: f"pre_tags cannot be empty list: invalid parameter"} + self.search( + client, + collection_name, + ["water"], + search_params=search_params, + anns_field=default_sparse_vector_field_name, + output_fields=[default_text_field_name], + highlighter = highlight, + check_task=CheckTasks.err_res, + check_items=error + ) \ No newline at end of file