milvus/tests/python_client/milvus_client/test_milvus_client_highlighter.py
aoiasd 342ba550bf
enhance: update highlight ci (#46573)
relate: https://github.com/milvus-io/milvus/issues/46571

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
- Core invariant: the LexicalHighlighter API now expects the match
queries under the parameter name highlight_query (not queries); all call
sites must pass highlight_query to supply match data. This PR assumes
the underlying highlighter behavior and processing of those query values
are unchanged.
- Logic simplified/removed: removed the legacy `queries` keyword argument from the tests
and updated all calls to pass highlight_query
(tests/python_client/milvus_client/test_milvus_client_highlighter.py).
This eliminates a redundant/incorrect keyword alias and aligns tests
with the consolidated LexicalHighlighter constructor parameter name.
- Why this does NOT introduce data loss or behavior regression: the
change is a parameter-name rename only — no parsing, matching, or
storage logic was modified. Tests now construct LexicalHighlighter with
pre_tags/post_tags/highlight_search_text/fragment_* and pass the query
list under highlight_query; the highlighter execution path
(client.search → highlighter processing → result['highlight']) is
untouched, so existing highlight outputs and stored data remain
unchanged.
- Other changes: bumped pymilvus test dependency to 2.7.0rc93 in
tests/python_client/requirements.txt to match the updated constructor
signature; scope of change is limited to tests and dependency pinning
(no production code changes).
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
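
Below is a minimal sketch of the call-site change these notes describe; the tag strings and `some_text_field` are illustrative, while the constructor parameters mirror the tests in this file (pymilvus 2.7.0rc93):

```python
from pymilvus import LexicalHighlighter

highlighter = LexicalHighlighter(
    pre_tags=["<em>"], post_tags=["</em>"],    # illustrative tag strings
    highlight_search_text=True,
    fragment_offset=0, fragment_size=10, num_of_fragments=1,
    # the list formerly passed as `queries` now goes under `highlight_query`
    highlight_query=[{"type": "TextMatch", "field": "some_text_field", "text": "water"}],
)
# client.search(..., highlighter=highlighter) is unchanged; the fragments are read from
# result["highlight"][<output field>] exactly as before.
```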

Signed-off-by: aoiasd <zhicheng.yue@zilliz.com>
2025-12-24 19:07:18 +08:00

from itertools import cycle
import pytest
from base.client_v2_base import TestMilvusClientV2Base
from utils.util_log import test_log as log
from common import common_func as cf
from common import common_type as ct
from common.common_type import CaseLabel, CheckTasks
from utils.util_pymilvus import *
from pymilvus import Function, FunctionType, DataType, LexicalHighlighter
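# NOTE: constructing LexicalHighlighter with the highlight_query parameter requires the
# pymilvus version pinned in tests/python_client/requirements.txt (2.7.0rc93 at the time
# of this change); see https://github.com/milvus-io/milvus/issues/46571.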
prefix = "client_insert"
epsilon = ct.epsilon
default_nb = ct.default_nb
default_nb_medium = ct.default_nb_medium
default_nq = ct.default_nq
default_dim = ct.default_dim
default_limit = ct.default_limit
default_search_exp = "id >= 0"
exp_res = "exp_res"
default_search_string_exp = "varchar >= \"0\""
default_search_mix_exp = "int64 >= 0 && varchar >= \"0\""
default_invalid_string_exp = "varchar >= 0"
default_json_search_exp = "json_field[\"number\"] >= 0"
prefix_expr = 'varchar like "0%"'
default_search_field = ct.default_float_vec_field_name
default_search_params = ct.default_search_params
default_primary_key_field_name = "id"
default_vector_field_name = "vector"
default_text_field_name = "tr"
default_text_field_name_chinese = "tr_chinese"
default_text_field_name_no_BM25 = "tr_no_bm25"
default_text_field_name_multi_analyzer = "tr_multi_analyzer"
default_sparse_vector_field_name = f"{default_text_field_name}_sparse_emb"
default_sparse_vector_field_name_chinese = f"{default_text_field_name_chinese}_sparse_emb"
default_sparse_vector_field_name_multi_analyzer = f"{default_text_field_name_multi_analyzer}_sparse_emb"
COLLECTION_NAME = "test_highlighter" + cf.gen_unique_str("_")
@pytest.mark.xdist_group("TestMilvusClientHighlighter")
class TestMilvusClientHighlighter(TestMilvusClientV2Base):
"""
#########################################################
Init collection with highlighter so all the tests can use the same collection
This aims to save time for the tests
Also, highlighter is difficult to compare the results,
so we need to init the collection with pre-defined data
#########################################################
"""
@pytest.fixture(scope="module", autouse=True)
def prepare_highlighter_collection(self, request):
"""
Ensure the shared highlighter collection exists before any tests in this module,
and drop it after all tests in this module complete.
"""
client = self._client()
collection_name = COLLECTION_NAME
if client.has_collection(collection_name):
client.drop_collection(collection_name)
analyzer_params = {
"tokenizer": "standard"
}
analyzer_params_2 = {
"tokenizer": {
"type": "jieba",
"dict": ["结巴分词器"],
"mode": "exact",
"hmm": False
}
}
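# Multi-analyzer config: each row is analyzed with the analyzer selected by the value of
# its "language" field ("en"/"zh"), "default" is the fallback, and "alias" maps alternative
# names onto the configured analyzers.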
multi_analyzer_params = {
"by_field": "language",
"analyzers": {
"en": {"type": "english"},
"zh": {"type": "chinese"},
"default": {"tokenizer": "icu"},
},
"alias": {"chinese": "zh", "eng": "en"},
}
schema = client.create_schema(auto_id=False, enable_dynamic_field=True)
schema.add_field(field_name=default_primary_key_field_name, datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name=default_vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=default_dim)
schema.add_field(field_name="language", datatype=DataType.VARCHAR, max_length=16)
schema.add_field(field_name=default_text_field_name, datatype=DataType.VARCHAR, nullable=True, max_length=2000, enable_analyzer=True,
analyzer_params=analyzer_params)
schema.add_field(field_name=default_text_field_name_chinese, datatype=DataType.VARCHAR, nullable=True, max_length=2000, enable_analyzer=True,
analyzer_params=analyzer_params_2)
schema.add_field(field_name=default_text_field_name_no_BM25, datatype=DataType.VARCHAR, nullable=True, max_length=2000, enable_analyzer=True,
analyzer_params=analyzer_params_2)
schema.add_field(field_name=default_text_field_name_multi_analyzer, datatype=DataType.VARCHAR, nullable=True, max_length=2000, enable_analyzer=True,
multi_analyzer_params=multi_analyzer_params)
schema.add_field(field_name=default_sparse_vector_field_name, datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name=default_sparse_vector_field_name_chinese, datatype=DataType.SPARSE_FLOAT_VECTOR)
schema.add_field(field_name=default_sparse_vector_field_name_multi_analyzer, datatype=DataType.SPARSE_FLOAT_VECTOR)
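# The BM25 functions below derive each *_sparse_emb field from its text field at insert
# time, so the highlighter tests can run full-text (BM25) search against the sparse vectors.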
bm25_function = Function(
name=f"{default_text_field_name}_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=[default_text_field_name],
output_field_names=[default_sparse_vector_field_name],
params={}
)
bm25_function_2 = Function(
name=f"{default_text_field_name_chinese}_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=[default_text_field_name_chinese],
output_field_names=[default_sparse_vector_field_name_chinese],
params={}
)
bm25_function_multi_analyzer = Function(
name=f"{default_text_field_name_multi_analyzer}_bm25_emb",
function_type=FunctionType.BM25,
input_field_names=[default_text_field_name_multi_analyzer],
output_field_names=[default_sparse_vector_field_name_multi_analyzer],
params={}
)
schema.add_function(bm25_function)
schema.add_function(bm25_function_2)
schema.add_function(bm25_function_multi_analyzer)
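# SPARSE_INVERTED_INDEX with metric_type "BM25" on the function-output sparse fields is
# required for BM25 full-text search.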
index_params = client.prepare_index_params()
index_params.add_index(field_name=default_primary_key_field_name, index_type="AUTOINDEX")
index_params.add_index(field_name=default_vector_field_name, index_type="AUTOINDEX")
index_params.add_index(field_name=default_sparse_vector_field_name, index_type="SPARSE_INVERTED_INDEX", metric_type="BM25")
index_params.add_index(field_name=default_sparse_vector_field_name_chinese, index_type="SPARSE_INVERTED_INDEX", metric_type="BM25")
index_params.add_index(field_name=default_sparse_vector_field_name_multi_analyzer, index_type="SPARSE_INVERTED_INDEX", metric_type="BM25")
index_params.add_index(field_name=default_text_field_name, index_type="AUTOINDEX")
index_params.add_index(field_name=default_text_field_name_chinese, index_type="AUTOINDEX")
client.create_collection(collection_name=collection_name, schema=schema, index_params=index_params, consistency_level="Strong")
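# Pre-defined corpus: short English sentences with keyword/number patterns, an empty string
# and a None row for the nullable text fields, long English and Chinese passages, and a
# repeated-prefix Chinese string, so the highlight expectations below stay deterministic.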
text = ["Is there a leakage?",
"A leakage of what?",
"I have the seat full of water! Like, full of water!",
"Must be water.",
"Let's add that to the words of wisdom",
"7654321 keyword 1234567",
"key key key key key key key key",
"trytrytrytrytrytry",
"word 1 word 12 word 123",
"1 2 3 4 5 6 7 8 9 10",
"A B C D 一二三四 milvus结巴分词器中文测试",
"There is a sub-word in this sentence",
"",
None,
("Dusk settled gently upon Embermoor, turning streets into ribbons in twilight glow. "
"In this quiet district, young Teren roamed with restless intent. He sought purpose, "
"something bright enough to hush turmoil pressing within his chest.\n"
"Terens trouble began when curious murmurs drifted through town concerning shimmering "
"lights rising nightly beyond hills. Elders insisted it stemmed from old ruins, relics "
"left behind by wanderers long gone. Yet no one ventured there; timid hearts kept them "
"grounded.\n"
"One evening, driven by stubborn courage, Teren set out alone. Crisp wind brushed his "
"cheeks, guiding him through slender trees trembling under midnight hush. Crickets chirped "
"rhythmically, echoing his steady footsteps. He pressed forward until dim ruins emerged, "
"stones bent by centuries yet proud in their quiet endurance.\n"
"Upon entering, Teren sensed something stirring—bright pulses drifting through corridors "
"like living embers. One ember hovered close, swirling gently, studying him with earnest "
"curiosity. It emitted tender hums resonating deep within Terens chest, soothing worry "
"stitched into his spirit.\n"
"Ember drifted higher, inviting him to follow. Teren stepped through crumbled chambers "
"until they reached an inner court where hundreds curled in silent orbits—tiny spheres "
"burning with soft brilliance. Together they formed swirling constellations, shimmering "
"Instantly, warmth surged through him—not harsh, not wild, but gentle strength reminding "
"him he belonged in this immense world. Hidden burdens loosened. He felt courage blooming, "
"rooted in something deeper than fear.\n"
"When dawn arrived, Ember escorting him outside, Teren turned to ruins glowing faintly "
"beneath morning light. He understood now: these spirits lingered not for warning but for "
"guiding tender souls seeking direction.\n"
"He returned to Embermoor changed. Not every problem dissolved, yet Teren moved through "
"his days with renewed stride, carrying brilliance gifted during his midnight journey."),
("黄昏降临在静谧城镇,灯影沿着街道缓缓铺展。青年林舟怀着不安在巷道行走,心跳与脚步相互呼应。他渴望找到方向,却被往昔失落缠绕。"
"传言提到远处丘陵夜晚会浮现微光,长者劝人别靠近,担忧未知带来风险。林舟仍被好奇牵引,选择独自前往。冷风掠过树梢,星辰悬挂高空,陪伴他穿越草径。"
"残破遗迹映入眼帘,石壁布满岁月痕迹。踏入其内,柔亮光点缓缓旋转,如同呼吸般起伏。某个光点靠近他,散发温暖振动,仿佛聆听他内心低语。"
"光点引领他走向空旷庭院,成群光辉在空域盘旋,编织出壮丽图景。那瞬间,林舟感到胸口释然,恐惧逐渐消散,勇气悄然生根。"
"黎明到来,他回望遗迹,光辉渐隐却留存于心。归途上,他步伐坚定,明白指引并未消失,只是化作持续前行力量。每当夜色再度降临,他都会抬头微笑,感谢那段静默旅程。"),
"甲,甲乙,甲乙丙,甲乙丙丁,甲乙丙丁戊,甲乙丙丁戊己,甲乙丙丁戊己庚,甲乙丙丁戊己庚辛,甲乙丙丁戊己庚辛壬,甲乙丙丁戊己庚辛壬癸"]
l = len(text)
rows = cf.gen_row_data_by_schema(nb=l, schema=schema)
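# All text fields share the same source text so results are comparable across analyzers;
# "language" alternates between "en" and "zh" to exercise the multi-analyzer routing.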
for i, row in enumerate(rows):
row[default_text_field_name] = text[i]
row[default_text_field_name_chinese] = text[i]
row[default_text_field_name_no_BM25] = text[i]
row[default_text_field_name_multi_analyzer] = text[i]
row["language"] = "en" if i % 2 == 0 else "zh"
client.insert(collection_name=collection_name, data=rows)
def teardown():
try:
if self.has_collection(self._client(), COLLECTION_NAME):
self.drop_collection(self._client(), COLLECTION_NAME)
except Exception:
pass
request.addfinalizer(teardown)
"""
******************************************************************
# The following are valid test cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
def test_milvus_client_highlighter_basic(self):
"""
target: Test highlighter can be successfully used
method:
1. Search the data
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
pre_tags = ["<<<<<<<"]
post_tags = [">>>>>>"]
highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags,
highlight_search_text = True,
fragment_offset=0,
fragment_size = 10,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
expected = [[f"{pre_tags[0]}water{post_tags[0]}."],
[f"{pre_tags[0]}water{post_tags[0]}! Lik"]]
results = client.search(
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] in expected
# test with multi analyzer
# BUG: #46498
'''
results = client.search(
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name_multi_analyzer,
output_fields=[default_text_field_name_multi_analyzer],
highlighter = highlight
)
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name_multi_analyzer] in expected
'''
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_multiple_tags(self):
"""
target: Test highlighter can be successfully used with multiple tags
method:
1. Search the data
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
pre_tags = ["{", "<", "="]
post_tags = ["}", ">", "="]
highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags,
highlight_search_text = True,
fragment_offset=0,
fragment_size = 100,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
expected = []
for pre, post in cycle(zip(pre_tags, post_tags)):
expected.append(f"{pre}key{post}")
if len(expected) == 8:
break
expected = " ".join(expected)
results = client.search(
collection_name,
["key"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] == [expected]
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_fragment_parameters(self):
"""
target: Test highlighter can be successfully used with fragment parameters
method:
1. Search the data with different fragment parameters
nested list: row corresponds to fragment_size, column corresponds to num_of_fragments
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
fragment_size = [1, 9, 100]
num_of_fragments = [0, 1, 2]
# row 0 corresponds to fragment_size = 1 column 0 corresponds to num_of_fragments = 0
# row 1 corresponds to fragment_size = 9 column 1 corresponds to num_of_fragments = 1
# row 2 corresponds to fragment_size = 100 column 2 corresponds to num_of_fragments = 2
expected = [[[[]], [["{water}"]], [['{water}'], ["{water}", "{water}"]]],
[[[]], [['{water}.'], ['{water}! Li']], [['{water}.'], ['{water}! Li', '{water}!']]],
[[[]], [['{water}.'], ['{water}! Like, full of {water}!']], [['{water}.'], ['{water}! Like, full of {water}!']]]]
for i, size in enumerate(fragment_size):
for j, num in enumerate(num_of_fragments):
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = size,
num_of_fragments=num)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] in expected[i][j]
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_fragment_offset(self):
"""
target: Test highlighter can be successfully used with fragment offset
method:
1. Search the data with different fragment offset
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
fragment_offset = [0, 5, 100]
expected = [[["=water="], ['=water=', '=water=']],
[['t be =water='], ['l of =water=', 'l of =water=']],
[['Must be =water='], ['I have the seat full of =water=', 'I have the seat full of water! Like, full of =water=']]]
for i, offset in enumerate(fragment_offset):
highlight = LexicalHighlighter(pre_tags=["="], post_tags=["="],
highlight_search_text = True,
fragment_offset=offset,
fragment_size = 5,
num_of_fragments=2)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] in expected[i]
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_number(self):
"""
target: Test highlighter can be successfully used with numbers
method:
1. Search the data with numeric search terms
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = [[['word {1}'], ['{1} 2 3']],
[['7654321 keyword {1234567}']]]
highlight = LexicalHighlighter(pre_tags=["{", "<"], post_tags=["}", ">"],
highlight_search_text = True,
fragment_offset=100,
fragment_size = 5,
num_of_fragments=2)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["1", "1234567"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
for i, result in enumerate(results):
# assert to make sure the results are not empty
assert result != []
for res in result:
assert res['highlight'][default_text_field_name] in expected[i]
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_offset_greater_than_fragment_size(self):
"""
target: Test highlighter can be successfully used with offset greater than fragment size
method:
1. Search the data with offset greater than fragment size
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = ["4321 {keyword}"]
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=5,
fragment_size = 1,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["keyword"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] == expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_search_sentence(self):
"""
target: Test highlighter can be successfully used with search sentence
method:
1. Search the data with search sentence
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = [['{A} {leakage} {of}', 'e of {what}?'],
['re a {leakage}'],
['{A} B C D 一二'],
['full {of} wa', 'full {of} wa'],
['ords {of} wi']
]
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=5,
fragment_size = 10,
num_of_fragments=10)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["A leakage of what?"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] in expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_nonexistent_keyword(self):
"""
target: Test highlighter can be successfully used with nonexistent keyword
method:
1. Search the data with nonexistent keyword
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = []
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 1,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["nonexistent", "", "NULL", "None", "null", 'NaN'],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
for result in results:
assert result == expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_sub_word(self):
"""
target: Test highlighter can be successfully used with sub word
method:
1. Search the data with sub word
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = ["{sub}"]
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 1,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["sub"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] == expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_long_tags(self):
"""
target: Test highlighter can be successfully used with long tags
method:
1. Search the data with long tags
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
pre_tags = ["{" * 999]
post_tags = ["}" * 999]
expected = [f"{pre_tags[0]}water{post_tags[0]}"]
highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags,
highlight_search_text = True,
fragment_offset=0,
fragment_size = 1,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] == expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_long_search_text(self):
"""
target: Test highlighter can be successfully used with long search text
method:
1. Search the data with long search text
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = ["{Embermoor}", "{Embermoor}"]
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 1,
num_of_fragments=10)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["Embermoor"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] == expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_lower_upper_case(self):
"""
target: Test that highlighting respects the case of the stored text
method:
1. Search the data with an upper-case term ("WATER")
expected: Step 1 should return empty results, since the upper-case term does not match the stored lower-case text
"""
client = self._client()
collection_name = COLLECTION_NAME
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 100,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["WATER"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
assert results[0] == []
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_chinese_characters(self):
"""
target: Test highlighter can be successfully used with chinese characters
method:
1. Search the data with chinese characters
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = ["{二}"]
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 1,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
[""],
search_params=search_params,
anns_field=default_sparse_vector_field_name_chinese,
output_fields=[default_text_field_name_chinese],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name_chinese] == expected
expected = ["{结巴分词器}"]
results = client.search(
collection_name,
["结巴分词器"],
search_params=search_params,
anns_field=default_sparse_vector_field_name_chinese,
output_fields=[default_text_field_name_chinese],
highlighter = highlight
)
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name_chinese] == expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_chinese_characters_long_text(self):
"""
target: Test highlighter can be successfully used with chinese characters
method:
1. Search the data with chinese characters
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = ["{呼}","{如同呼吸般起伏}"]
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 1,
num_of_fragments=10)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["如同呼吸般起伏"],
search_params=search_params,
anns_field=default_sparse_vector_field_name_chinese,
output_fields=[default_text_field_name_chinese],
highlighter = highlight
)
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name_chinese] == expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_with_query(self):
"""
target: Test highlighter can be successfully used together with an extra highlight_query
method:
1. Search the data with a highlighter that also carries a TextMatch highlight_query
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
pre_tags = ["<<<<<<<"]
post_tags = [">>>>>>"]
highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags,
highlight_search_text = True,
fragment_offset=0,
fragment_size = 10,
num_of_fragments=1,
highlight_query=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "seat"}])
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
expected = [[f"{pre_tags[0]}water{post_tags[0]}."],
[f"{pre_tags[0]}water{post_tags[0]}! Lik"]]
results = client.search(
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight
)
# assert to make sure the results are not empty
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name] in expected
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_text_match(self):
"""
target: Test highlighter can be successfully used with text match
method:
1. Search the data with text match
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = ['{water}! Like, full of <water>!']
highlight = LexicalHighlighter(pre_tags=["{", "<"], post_tags=["}", ">"],
highlight_search_text = False,
highlight_query=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "water"}])
new_search_params = {"metric_type": "COSINE"}
vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 2", output_fields=[default_vector_field_name])[0][default_vector_field_name]
results = client.search(
collection_name,
[vector],
search_params=new_search_params,
anns_field=default_vector_field_name,
output_fields=[default_text_field_name_no_BM25],
limit=1,
highlighter = highlight
)
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name_no_BM25] == expected
# test with mismatched tags
expected = ['{water}! Like, full of {water>!']
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}", ">"],
highlight_search_text = False,
highlight_query=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "water"}])
results = client.search(
collection_name,
[vector],
search_params=new_search_params,
anns_field=default_vector_field_name,
output_fields=[default_text_field_name_no_BM25],
limit=1,
highlighter = highlight
)
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name_no_BM25] == expected
# test with num_of_fragments > 1
expected = ['{water}', '{water}']
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = False,
fragment_size = 1,
num_of_fragments=10,
highlight_query=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "water"}])
results = client.search(
collection_name,
[vector],
search_params=new_search_params,
anns_field=default_vector_field_name,
output_fields=[default_text_field_name_no_BM25],
limit=1,
highlighter = highlight
)
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name_no_BM25] == expected
# test no match
expected = []
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = False,
highlight_query=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "nonexistent"}])
vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 1", output_fields=[default_vector_field_name])[0][default_vector_field_name]
results = client.search(
collection_name,
[vector],
search_params=new_search_params,
anns_field=default_vector_field_name,
output_fields=[default_text_field_name_no_BM25],
limit=1,
highlighter = highlight
)
assert results[0][0]["highlight"][default_text_field_name_no_BM25] == expected
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.skip(reason="skip for now wait for future optimization")
def test_milvus_client_highlighter_chinese_characters_repeating_text(self):
"""
target: Test highlighter can be successfully used with repeating text
method:
1. Search the data with repeating text
expected: Step 1 should result success
"""
client = self._client()
collection_name = COLLECTION_NAME
expected = ["{甲乙丙丁戊己庚辛壬}"]
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 1,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
results = client.search(
collection_name,
["甲乙丙丁戊己庚辛壬"],
search_params=search_params,
anns_field=default_sparse_vector_field_name_chinese,
output_fields=[default_text_field_name_chinese],
highlighter = highlight
)
assert results[0] != []
for result in results[0]:
assert result['highlight'][default_text_field_name_chinese] == expected
"""
******************************************************************
# The following are invalid test cases
******************************************************************
"""
@pytest.mark.tags(CaseLabel.L0)
@pytest.mark.parametrize("fragment_size", [0, -1, 0.1])
def test_milvus_client_highlighter_fragment_size_invalid(self, fragment_size):
"""
target: Test highlighter with invalid fragment_size values
method:
1. Search the data with an invalid fragment_size (zero, negative, or non-integer)
expected: Step 1 should fail with an "invalid fragment_size" error
"""
client = self._client()
collection_name = COLLECTION_NAME
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = fragment_size,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
error = {ct.err_code: 1100,
ct.err_msg: f"invalid fragment_size: {fragment_size}: invalid parameter"}
self.search(
client,
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight,
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("fragment_offset", [-1, 0.1])
def test_milvus_client_highlighter_fragment_offset_invalid(self, fragment_offset):
"""
target: Test highlighter with invalid fragment_offset values
method:
1. Search the data with an invalid fragment_offset (negative or non-integer)
expected: Step 1 should fail with an "invalid fragment_offset" error
"""
client = self._client()
collection_name = COLLECTION_NAME
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=fragment_offset,
fragment_size = 10,
num_of_fragments=10)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
error = {ct.err_code: 1100,
ct.err_msg: f"invalid fragment_offset: {fragment_offset}: invalid parameter"}
self.search(
client,
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight,
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("num_of_fragments", [-1, 0.1])
def test_milvus_client_highlighter_number_of_fragments_invalid(self, num_of_fragments):
"""
target: Test highlighter with invalid num_of_fragments values
method:
1. Search the data with an invalid num_of_fragments (negative or non-integer)
expected: Step 1 should fail with an "invalid num_of_fragments" error
"""
client = self._client()
collection_name = COLLECTION_NAME
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 10,
num_of_fragments=num_of_fragments)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
error = {ct.err_code: 1100,
ct.err_msg: f"invalid num_of_fragments: {num_of_fragments}: invalid parameter"}
self.search(
client,
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight,
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_text_match_invalid_search_params(self):
"""
target: Test highlighter with highlight_search_text enabled on a non-BM25 search
method:
1. Search with a dense vector and COSINE metric while highlight_search_text is enabled
expected: Step 1 should fail because search-text highlighting only supports metric type "BM25"
"""
client = self._client()
collection_name = COLLECTION_NAME
pre_tags = ["<<<<<<<"]
post_tags = [">>>>>>"]
highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags,
highlight_search_text = True,
fragment_offset=0,
fragment_size = 10,
num_of_fragments=1,
highlight_query=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "seat"}])
search_params = {"metric_type": "COSINE"}
vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 2", output_fields=[default_vector_field_name])[0][default_vector_field_name]
error = {ct.err_code: 1100,
ct.err_msg: f"Search highlight only support with metric type \"BM25\" but was: : invalid parameter"}
self.search(
client,
collection_name,
[vector],
search_params=search_params,
anns_field=default_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight,
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_text_match_invalid_anns_field(self):
"""
target: Test highlighter with a mismatched query type and anns_field
method:
1. Search a BM25 sparse anns_field with a dense vector, then a dense anns_field with search text
expected: Both searches should fail with parameter / internal errors
"""
client = self._client()
collection_name = COLLECTION_NAME
pre_tags = ["<<<<<<<"]
post_tags = [">>>>>>"]
highlight = LexicalHighlighter(pre_tags=pre_tags, post_tags=post_tags,
highlight_search_text = True,
fragment_offset=0,
fragment_size = 10,
num_of_fragments=1,
highlight_query=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "seat"}])
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 2", output_fields=[default_vector_field_name])[0][default_vector_field_name]
# textMatch with BM25 anns field
error = {ct.err_code: 1100,
ct.err_msg: f"please provide varchar/text for BM25 Function based search, got FloatVector: invalid parameter"}
self.search(
client,
collection_name,
[vector],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight,
check_task=CheckTasks.err_res,
check_items=error
)
# highlight search text with vector anns field
error = {ct.err_code: 5,
ct.err_msg: f"service internal error: Search with highlight failed, input field of BM25 annsField not found"}
self.search(
client,
collection_name,
["water"],
search_params=search_params,
anns_field=default_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight,
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_text_match_with_highlight_search_text(self):
"""
target: Test highlight_search_text combined with a TextMatch highlight_query on a non-BM25 search
method:
1. Search with a dense vector and COSINE metric while highlight_search_text is enabled
expected: Step 1 should fail because search-text highlighting only supports metric type "BM25"
"""
client = self._client()
collection_name = COLLECTION_NAME
highlight = LexicalHighlighter(pre_tags=["{", "<"], post_tags=["}", ">"],
highlight_search_text = True,
highlight_query=[{"type": "TextMatch", "field": default_text_field_name_no_BM25, "text": "water"}])
new_search_params = {"metric_type": "COSINE"}
vector = client.query(collection_name, filter=f"{default_primary_key_field_name} == 2", output_fields=[default_vector_field_name])[0][default_vector_field_name]
error = {ct.err_code: 1100,
ct.err_msg: f"Search highlight only support with metric type \"BM25\" but was: : invalid parameter"}
self.search(
client,
collection_name,
[vector],
search_params=new_search_params,
anns_field=default_vector_field_name,
output_fields=[default_text_field_name_no_BM25],
limit=1,
highlighter = highlight,
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_with_filter(self):
"""
target: Test highlighter combined with a TEXT_MATCH filter on a field that does not enable match
method:
1. Search the data with a TEXT_MATCH filter expression on a text field without enable_match
expected: Step 1 should fail with a "does not enable match" error
"""
client = self._client()
collection_name = COLLECTION_NAME
highlight = LexicalHighlighter(pre_tags=["{"], post_tags=["}"],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 10,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
error = {ct.err_code: 1100,
ct.err_msg: f"failed to create query plan: cannot parse expression: TEXT_MATCH({default_text_field_name}, \"seat\"), "
f"error: field \"{default_text_field_name}\" does not enable match: invalid parameter"}
self.search(
client,
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight,
filter=f'TEXT_MATCH({default_text_field_name}, "seat")',
check_task=CheckTasks.err_res,
check_items=error
)
@pytest.mark.tags(CaseLabel.L1)
def test_milvus_client_highlighter_empty_tags(self):
"""
target: Test highlighter with empty pre_tags/post_tags lists
method:
1. Search the data with a highlighter whose pre_tags and post_tags are empty lists
expected: Step 1 should fail with a "pre_tags cannot be empty list" error
"""
client = self._client()
collection_name = COLLECTION_NAME
highlight = LexicalHighlighter(pre_tags=[], post_tags=[],
highlight_search_text = True,
fragment_offset=0,
fragment_size = 10,
num_of_fragments=1)
search_params = {"params": {"nlist": 128}, "metric_type": "BM25"}
error = {ct.err_code: 1100,
ct.err_msg: f"pre_tags cannot be empty list: invalid parameter"}
self.search(
client,
collection_name,
["water"],
search_params=search_params,
anns_field=default_sparse_vector_field_name,
output_fields=[default_text_field_name],
highlighter = highlight,
check_task=CheckTasks.err_res,
check_items=error
)