diff --git a/tests/python_client/base/client_v2_base.py b/tests/python_client/base/client_v2_base.py
index e67e38294f..20b67c26b5 100644
--- a/tests/python_client/base/client_v2_base.py
+++ b/tests/python_client/base/client_v2_base.py
@@ -171,9 +171,9 @@ class TestMilvusClientV2Base(Base):
                               limit=limit, output_fields=output_fields, search_params=search_params, **kwargs).run()
         return res, check_result
-
+
     @trace()
-    def hybrid_search(self, client, collection_name, reqs, rerank, limit=10,
+    def hybrid_search(self, client, collection_name, reqs, rerank, limit=10,
                       output_fields=None, timeout=None, partition_names=None,
                       check_task=None, check_items=None, **kwargs):
         timeout = TIMEOUT if timeout is None else timeout
@@ -919,4 +919,14 @@ class TestMilvusClientV2Base(Base):
         check_result = ResponseChecker(res, func_name, check_task, check_items, check, **kwargs).run()
         return res, check_result
 
+    @trace()
+    def run_analyzer(self, client, text, analyzer_params, timeout=None, check_task=None, check_items=None, **kwargs):
+        timeout = TIMEOUT if timeout is None else timeout
+        kwargs.update({"timeout": timeout})
+        func_name = sys._getframe().f_code.co_name
+        res, check = api_request([client.run_analyzer, text, analyzer_params], **kwargs)
+        check_result = ResponseChecker(res, func_name, check_task, check_items, check, text=text,
+                                       analyzer_params=analyzer_params, **kwargs).run()
+        return res, check_result
+
diff --git a/tests/python_client/common/phrase_match_generator.py b/tests/python_client/common/phrase_match_generator.py
index e77b045434..726d857378 100644
--- a/tests/python_client/common/phrase_match_generator.py
+++ b/tests/python_client/common/phrase_match_generator.py
@@ -368,3 +368,112 @@ class PhraseMatchTestGenerator:
             matched_docs.extend(doc_id)
 
         return matched_docs
+
+
+class KoreanTextGenerator:
+    def __init__(self):
+        # Sports/activities (nouns)
+        self.activities = [
+            "수영", "축구", "농구", "테니스",
+            "배구", "야구", "골프", "럭비",
+            "달리기", "자전거", "스케이트", "스키",
+            "서핑", "다이빙", "등산", "요가",
+            "춤", "하이킹", "독서", "요리"
+        ]
+
+        # Verbs (dictionary form)
+        self.verbs = [
+            "좋아하다", "즐기다", "하다", "배우다",
+            "가르치다", "보다", "시작하다", "계속하다",
+            "연습하다", "선호하다", "마스터하다", "도전하다"
+        ]
+
+        # Connectors
+        self.connectors = [
+            "그리고", "또는", "하지만", "그런데",
+            "그래서", "또한", "게다가", "그러면서",
+            "동시에", "함께"
+        ]
+
+        # Modifiers (frequency/degree)
+        self.modifiers = [
+            "매우", "자주", "가끔", "열심히",
+            "전문적으로", "규칙적으로", "매일", "일주일에 한 번",
+            "취미로", "진지하게"
+        ]
+
+    def object_particle(self, noun):
+        # Pick the object particle 을/를 by whether the last Hangul syllable
+        # of the noun has a final consonant (batchim)
+        code = ord(noun[-1]) - 0xAC00
+        return "을" if 0 <= code < 11172 and code % 28 != 0 else "를"
+
+    def conjugate_verb(self, verb):
+        # Naive Korean verb conjugation into the polite "-아/어요" style;
+        # good enough for generating analyzer test data
+        if verb.endswith("하다"):
+            return verb.replace("하다", "해요")
+        elif verb.endswith("다"):
+            return verb[:-1] + "아요"
+        return verb
+
+    def sentence(self):
+        # Pick the building blocks of the sentence
+        activity = random.choice(self.activities)
+        verb = random.choice(self.verbs)
+        modifier = random.choice(self.modifiers)
+
+        # Conjugate the verb
+        conjugated_verb = self.conjugate_verb(verb)
+
+        # Build the sentence (Korean word order: subject + object + modifier + verb)
+        sentence = f"저는 {activity}{self.object_particle(activity)} {modifier} {conjugated_verb}"
+
+        # Randomly append a connector and a second activity clause
+        if random.choice([True, False]):
+            connector = random.choice(self.connectors)
+            second_activity = random.choice(self.activities)
+            second_verb = self.conjugate_verb(random.choice(self.verbs))
+            sentence += f" {connector} {second_activity}도 {second_verb}"
+
+        return sentence + "."
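+
+    # Usage sketch (illustrative only; output is random):
+    #   gen = KoreanTextGenerator()
+    #   gen.sentence()    -> e.g. "저는 수영을 매일 해요 그리고 독서도 시작해요."
+    #   gen.paragraph(2)  -> two such sentences joined by a newline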
+
+    def paragraph(self, num_sentences=3):
+        return '\n'.join([self.sentence() for _ in range(num_sentences)])
+
+    def text(self, num_sentences=5):
+        return '\n'.join([self.sentence() for _ in range(num_sentences)])
+
+
+def generate_text_by_analyzer(analyzer_params):
+    """
+    Generate text data that matches the given analyzer parameters
+
+    Args:
+        analyzer_params: Dictionary containing the analyzer parameters
+
+    Returns:
+        str: Generated text data
+    """
+    tokenizer = analyzer_params["tokenizer"]
+    if tokenizer == "standard":
+        fake = Faker("en_US")
+    elif tokenizer == "jieba":
+        fake = Faker("zh_CN")
+    elif isinstance(tokenizer, dict) and tokenizer.get("type") == "lindera":
+        # lindera: pick a text generator matching the dictionary's language
+        if tokenizer["dict_kind"] == "ipadic":
+            fake = Faker("ja_JP")
+        elif tokenizer["dict_kind"] == "ko-dic":
+            fake = KoreanTextGenerator()
+        elif tokenizer["dict_kind"] == "cc-cedict":
+            fake = Faker("zh_CN")
+        else:
+            raise ValueError("Invalid dict_kind")
+    else:
+        raise ValueError("Invalid analyzer parameters")
+
+    text = fake.text()
+    stop_words = []
+    if "filter" in analyzer_params:
+        for filter_config in analyzer_params["filter"]:
+            if filter_config["type"] == "stop":
+                stop_words.extend(filter_config["stop_words"])
+
+    # append the stop words so the stop filter provably has words to remove
+    text += " " + " ".join(stop_words)
+    return text
diff --git a/tests/python_client/milvus_client/test_milvus_client_analyzer.py b/tests/python_client/milvus_client/test_milvus_client_analyzer.py
new file mode 100644
index 0000000000..a4ba5a9fa5
--- /dev/null
+++ b/tests/python_client/milvus_client/test_milvus_client_analyzer.py
@@ -0,0 +1,61 @@
+import pytest
+
+from base.client_v2_base import TestMilvusClientV2Base
+from common.common_type import CaseLabel
+from common.phrase_match_generator import generate_text_by_analyzer
+
+
+class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
+    analyzer_params_list = [
+        {
+            "tokenizer": "standard",
+            "filter": [
+                {
+                    "type": "stop",
+                    "stop_words": ["is", "the", "this", "a", "an", "and", "or"],
+                }
+            ],
+        },
+        {
+            "tokenizer": "jieba",
+        },
+        {
+            "tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
+            "filter": [
+                {
+                    "type": "stop",
+                    "stop_words": ["は", "が", "の", "に", "を", "で", "と", "た"],
+                }
+            ],
+        },
+        {"tokenizer": {"type": "lindera", "dict_kind": "ko-dic"}},
+        {"tokenizer": {"type": "lindera", "dict_kind": "cc-cedict"}},
+    ]
+
+    @pytest.mark.tags(CaseLabel.L1)
+    @pytest.mark.parametrize("analyzer_params", analyzer_params_list)
+    def test_analyzer(self, analyzer_params):
+        """
+        target: test analyzer
+        method: run the analyzer with different analyzer params and collect the tokens
+        expected: tokens are non-empty, appear in the input text, and contain no stop words
+        """
+        client = self._client()
+        text = generate_text_by_analyzer(analyzer_params)
+        res, _ = self.run_analyzer(client, text, analyzer_params)
+        tokens = res.tokens
+        # Check that tokens were generated at all
+        assert len(tokens) > 0, "No tokens were generated"
+
+        # Check that tokens relate to the input text (every token should be a substring of the text)
+        assert all(
+            token.lower() in text.lower() for token in tokens
+        ), "some of the tokens do not appear in the original text"
+
+        # Check that the stop filter removed its stop words
+        if "filter" in analyzer_params:
+            for filter_config in analyzer_params["filter"]:
+                if filter_config["type"] == "stop":
+                    stop_words = filter_config["stop_words"]
+                    assert not any(
+                        token in stop_words for token in tokens
+                    ), "some of the tokens are stop words"
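Note: outside the test harness, the call the new wrapper exercises can be made
directly on a pymilvus MilvusClient. A minimal sketch, assuming a server at
localhost:19530 (the URI is illustrative):

    from pymilvus import MilvusClient

    client = MilvusClient(uri="http://localhost:19530")
    analyzer_params = {
        "tokenizer": "standard",
        "filter": [{"type": "stop", "stop_words": ["is", "the", "a"]}],
    }
    res = client.run_analyzer("The quick brown fox is fast", analyzer_params)
    print(res.tokens)  # the stop words "is" and "the" should be absent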
diff --git a/tests/python_client/requirements.txt b/tests/python_client/requirements.txt
index 8d9355207f..b329fc050f 100644
--- a/tests/python_client/requirements.txt
+++ b/tests/python_client/requirements.txt
@@ -28,8 +28,8 @@ pytest-parallel
 pytest-random-order
 
 # pymilvus
-pymilvus==2.6.0rc79
-pymilvus[bulk_writer]==2.6.0rc79
+pymilvus==2.6.0rc81
+pymilvus[bulk_writer]==2.6.0rc81
 
 # for customize config test
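Note: the test added below drives text match through three lindera
dictionaries. A minimal sketch of the two pieces it combines (values
illustrative, mirroring the test body):

    # analyzer bound to the VARCHAR field at schema creation time
    analyzer_params = {"tokenizer": {"type": "lindera", "dict_kind": "ipadic"}}
    # boolean filter evaluated by query(); matches rows whose tokenized
    # field shares tokens with the given text
    expr = "text_match(sentence, '私は水泳が好きです')"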
diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py
index 19a25d698d..32b07ea236 100644
--- a/tests/python_client/testcases/test_query.py
+++ b/tests/python_client/testcases/test_query.py
@@ -5,6 +5,7 @@ from utils.util_log import test_log as log
 from common.common_type import CaseLabel, CheckTasks
 from common import common_type as ct
 from common import common_func as cf
+from common.phrase_match_generator import KoreanTextGenerator
 from common.code_mapping import ConnectionErrorMessage as cem
 from base.client_base import TestcaseBase
 from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_EVENTUALLY
@@ -29,6 +30,10 @@ Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
 fake_de = Faker("de_DE")
+fake_jp = Faker("ja_JP")
+fake_ko = Faker("ko_KR")
+
+
 # patch faker to generate text with specific distribution
 cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
@@ -5734,6 +5739,79 @@ class TestQueryTextMatch(TestcaseBase):
         res, _ = collection_w.query(expr=expr, output_fields=["id", field])
         pytest.assume(len(res) == 0, f"res len {len(res)}, data size {data_size}")
 
+    @pytest.mark.parametrize("dict_kind", ["ipadic", "ko-dic", "cc-cedict"])
+    def test_query_text_match_with_lindera_tokenizer(self, dict_kind):
+        """
+        target: test text match with the lindera tokenizer
+        method: 1. enable text match with the lindera tokenizer and insert varchar data in the matching language
+                2. query with text match using a sentence sampled from the inserted data
+                3. verify the result
+        expected: text match succeeds and returns matching rows
+        """
+        analyzer_params = {
+            "tokenizer": {
+                "type": "lindera",
+                "dict_kind": dict_kind
+            }
+        }
+        if dict_kind == "ipadic":
+            fake = fake_jp
+        elif dict_kind == "ko-dic":
+            fake = KoreanTextGenerator()
+        elif dict_kind == "cc-cedict":
+            fake = fake_zh
+        else:
+            fake = fake_en
+        dim = 128
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="sentence",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+        ]
+        schema = CollectionSchema(fields=fields, description="test collection")
+        data_size = 5000
+        collection_w = self.init_collection_wrap(
+            name=cf.gen_unique_str(prefix), schema=schema
+        )
+        data = [
+            {
+                "id": i,
+                "sentence": fake.sentence(),
+                "emb": [random.random() for _ in range(dim)],
+            }
+            for i in range(data_size)
+        ]
+        df = pd.DataFrame(data)
+        log.info(f"dataframe\n{df}")
+        batch_size = 5000
+        for i in range(0, len(df), batch_size):
+            # slicing already handles the final partial batch
+            collection_w.insert(data[i: i + batch_size])
+        collection_w.flush()
+        collection_w.create_index(
+            "emb",
+            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
+        )
+        collection_w.load()
+        # query the sentence field with text match on a sentence sampled from the corpus
+        text_fields = ["sentence"]
+        for field in text_fields:
+            match_text = df["sentence"].iloc[0]
+            expr = f"text_match({field}, '{match_text}')"
+            log.info(f"expr: {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            assert len(res) > 0
 
     @pytest.mark.tags(CaseLabel.L0)
     def test_query_text_match_with_combined_expression_for_single_field(self):
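Note: a possible follow-up (not part of this diff) would combine the two new
pieces, using run_analyzer to pick a single token as the match term instead of
a whole sentence. A sketch, assuming a pymilvus MilvusClient handle `client`
exists alongside the ORM wrapper `collection_w` from the test above:

    tokens = client.run_analyzer(df["sentence"].iloc[0], analyzer_params).tokens
    expr = f"text_match(sentence, '{tokens[0]}')"
    res, _ = collection_w.query(expr=expr, output_fields=["id", "sentence"])
    assert len(res) > 0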