test: add run_analyzer api test and lindera tokenizer test (#40160)

/kind improvement


/hold

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
zhuwenxing 2025-02-26 10:29:55 +08:00 committed by GitHub
parent 828ecacadc
commit 01966280da
5 changed files with 262 additions and 4 deletions


@@ -171,9 +171,9 @@ class TestMilvusClientV2Base(Base):
limit=limit, output_fields=output_fields, search_params=search_params,
**kwargs).run()
return res, check_result
@trace()
def hybrid_search(self, client, collection_name, reqs, rerank, limit=10,
output_fields=None, timeout=None, partition_names=None,
check_task=None, check_items=None, **kwargs):
timeout = TIMEOUT if timeout is None else timeout
@@ -919,4 +919,14 @@ class TestMilvusClientV2Base(Base):
check_result = ResponseChecker(res, func_name, check_task, check_items, check, **kwargs).run()
return res, check_result
@trace()
def run_analyzer(self, client, text, analyzer_params, timeout=None, check_task=None, check_items=None, **kwargs):
timeout = TIMEOUT if timeout is None else timeout
kwargs.update({"timeout": timeout})
func_name = sys._getframe().f_code.co_name
res, check = api_request([client.run_analyzer, text, analyzer_params], **kwargs)
check_result = ResponseChecker(res, func_name, check_task, check_items, check, text=text,
analyzer_params=analyzer_params, **kwargs).run()
return res, check_result
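
A minimal usage sketch of this wrapper from inside a test case, assuming `client` comes from `self._client()` as elsewhere in this suite:

res, _ = self.run_analyzer(client, "hello world", {"tokenizer": "standard"})
assert len(res.tokens) > 0  # res.tokens holds the analyzer output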


@@ -368,3 +368,112 @@ class PhraseMatchTestGenerator:
matched_docs.extend(doc_id)
return matched_docs
class KoreanTextGenerator:
def __init__(self):
# Sports/Activities (Nouns)
self.activities = [
"수영", "축구", "농구", "테니스",
"배구", "야구", "골프", "럭비",
"달리기", "자전거", "스케이트", "스키",
"서핑", "다이빙", "등산", "요가",
"", "하이킹", "독서", "요리"
]
# Verbs (Base Form)
self.verbs = [
"좋아하다", "즐기다", "하다", "배우다",
"가르치다", "보다", "시작하다", "계속하다",
"연습하다", "선호하다", "마스터하다", "도전하다"
]
# Connectors
self.connectors = [
"그리고", "또는", "하지만", "그런데",
"그래서", "또한", "게다가", "그러면서",
"동시에", "함께"
]
# Modifiers (Frequency/Degree)
self.modifiers = [
"매우", "자주", "가끔", "열심히",
"전문적으로", "규칙적으로", "매일", "일주일에 한 번",
"취미로", "진지하게"
]
def conjugate_verb(self, verb):
# Simple Korean verb conjugation (using informal style "-아/어요")
if verb.endswith("하다"):
return verb.replace("하다", "해요")
elif verb.endswith("다"):
return verb[:-1] + "아요"
return verb
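# Worked examples of the simplified rules above (a sketch, not full Korean grammar):
#   "좋아하다" -> "좋아해요"  (하다 -> 해요)
#   "배우다" -> "배우아요"  ("다" dropped, "아요" appended; natural Korean would be "배워요")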
def sentence(self):
# Build basic sentence structure
activity = random.choice(self.activities)
verb = random.choice(self.verbs)
modifier = random.choice(self.modifiers)
# Conjugate verb
conjugated_verb = self.conjugate_verb(verb)
# Build sentence (Korean word order: Subject + Object + Modifier + Verb)
sentence = f"저는 {activity}를/을 {modifier} {conjugated_verb}"
# Randomly add connector and another activity
if random.choice([True, False]):
connector = random.choice(self.connectors)
second_activity = random.choice(self.activities)
second_verb = self.conjugate_verb(random.choice(self.verbs))
sentence += f" {connector} {second_activity}{second_verb}"
return sentence + "."
def paragraph(self, num_sentences=3):
return '\n'.join([self.sentence() for _ in range(num_sentences)])
def text(self, num_sentences=5):
return '\n'.join([self.sentence() for _ in range(num_sentences)])
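# Usage sketch (output varies with the random seed):
#   gen = KoreanTextGenerator()
#   gen.sentence()    # e.g. "저는 수영를/을 매일 좋아해요."
#   gen.paragraph(3)  # three such sentences joined by newlines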
def generate_text_by_analyzer(analyzer_params):
"""
Generate text data based on the given analyzer parameters
Args:
analyzer_params: Dictionary containing the analyzer parameters
Returns:
str: Generated text data
"""
if analyzer_params["tokenizer"] == "standard":
fake = Faker("en_US")
elif analyzer_params["tokenizer"] == "jieba":
fake = Faker("zh_CN")
elif analyzer_params["tokenizer"]["type"] == "lindera":
# Generate random Japanese text
if analyzer_params["tokenizer"]["dict_kind"] == "ipadic":
fake = Faker("ja_JP")
elif analyzer_params["tokenizer"]["dict_kind"] == "ko-dic":
fake = KoreanTextGenerator()
elif analyzer_params["tokenizer"]["dict_kind"] == "cc-cedict":
fake = Faker("zh_CN")
else:
raise ValueError("Invalid dict_kind")
else:
raise ValueError("Invalid analyzer parameters")
text = fake.text()
stop_words = []
if "filter" in analyzer_params:
for filter in analyzer_params["filter"]:
if filter["type"] == "stop":
stop_words.extend(filter["stop_words"])
# add stop words to the text
text += " " + " ".join(stop_words)
return text
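
A quick sketch of how this helper is exercised; the ko-dic branch routes through KoreanTextGenerator, while the other branches draw random text from Faker locales:

params = {"tokenizer": {"type": "lindera", "dict_kind": "ko-dic"}}
text = generate_text_by_analyzer(params)  # Korean sentences from KoreanTextGenerator
print(text)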


@@ -0,0 +1,61 @@
import pytest
from base.client_v2_base import TestMilvusClientV2Base
from common.common_type import CaseLabel
from common.phrase_match_generator import generate_text_by_analyzer
class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
analyzer_params_list = [
{
"tokenizer": "standard",
"filter": [
{
"type": "stop",
"stop_words": ["is", "the", "this", "a", "an", "and", "or"],
}
],
},
{
"tokenizer": "jieba",
},
{
"tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
"filter": [
{
"type": "stop",
"stop_words": ["", "", "", "", "", "", "", ""],
}
],
},
{"tokenizer": {"type": "lindera", "dict_kind": "ko-dic"}},
{"tokenizer": {"type": "lindera", "dict_kind": "cc-cedict"}},
]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("analyzer_params", analyzer_params_list)
def test_analyzer(self, analyzer_params):
"""
target: test analyzer
method: use different analyzer params, then run analyzer to get the tokens
expected: verify the tokens
"""
client = self._client()
text = generate_text_by_analyzer(analyzer_params)
res, result = self.run_analyzer(client, text, analyzer_params)
tokens = res.tokens
# Check tokens are not empty
assert len(tokens) > 0, "No tokens were generated"
# Check tokens are related to the input text (every token should be a substring of the text)
assert all(
token.lower() in text.lower() for token in tokens
), "some of the tokens do not appear in the original text"
if "filter" in analyzer_params:
for filter in analyzer_params["filter"]:
if filter["type"] == "stop":
stop_words = filter["stop_words"]
assert not any(
token in stop_words for token in tokens
), "some of the tokens are stop words"


@@ -28,8 +28,8 @@ pytest-parallel
pytest-random-order
# pymilvus
-pymilvus==2.6.0rc79
-pymilvus[bulk_writer]==2.6.0rc79
+pymilvus==2.6.0rc81
+pymilvus[bulk_writer]==2.6.0rc81
# for customize config test


@@ -5,6 +5,7 @@ from utils.util_log import test_log as log
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from common.phrase_match_generator import KoreanTextGenerator
from common.code_mapping import ConnectionErrorMessage as cem
from base.client_base import TestcaseBase
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_EVENTUALLY
@@ -29,6 +30,10 @@ Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
fake_de = Faker("de_DE")
fake_jp = Faker("ja_JP")
fake_ko = Faker("ko_KR")
# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
@@ -5734,6 +5739,79 @@ class TestQueryTextMatch(TestcaseBase):
res, _ = collection_w.query(expr=expr, output_fields=["id", field])
pytest.assume(len(res) == 0, f"res len {len(res)}, data size {data_size}")
@pytest.mark.parametrize("dict_kind", ["ipadic", "ko-dic", "cc-cedict"])
def test_query_text_match_with_Lindera_tokenizer(self, dict_kind):
"""
target: test text match with lindera tokenizer
method: 1. enable text match, use lindera tokenizer and insert data with varchar in different lang
2. get the most common words and query with text match
3. verify the result
expected: get the correct token, text match successfully and result is correct
"""
analyzer_params = {
"tokenizer": {
"type": "lindera",
"dict_kind": dict_kind
}
}
if dict_kind == "ipadic":
fake = fake_jp
elif dict_kind == "ko-dic":
fake = KoreanTextGenerator()
elif dict_kind == "cc-cedict":
fake = fake_zh
else:
fake = fake_en
dim = 128
fields = [
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
FieldSchema(
name="sentence",
dtype=DataType.VARCHAR,
max_length=65535,
enable_analyzer=True,
enable_match=True,
analyzer_params=analyzer_params,
),
FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields=fields, description="test collection")
data_size = 5000
collection_w = self.init_collection_wrap(
name=cf.gen_unique_str(prefix), schema=schema
)
data = [
{
"id": i,
"sentence": fake.sentence(),
"emb": [random.random() for _ in range(dim)],
}
for i in range(data_size)
]
df = pd.DataFrame(data)
log.info(f"dataframe\n{df}")
batch_size = 5000
for i in range(0, len(df), batch_size):
collection_w.insert(
data[i: i + batch_size]
if i + batch_size < len(df)
else data[i: len(df)]
)
collection_w.flush()
collection_w.create_index(
"emb",
{"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
)
collection_w.load()
# analyze the corpus
text_fields = ["sentence"]
# query sentence field with word list
for field in text_fields:
match_text = df["sentence"].iloc[0]
expr = f"text_match({field}, '{match_text}')"
log.info(f"expr: {expr}")
res, _ = collection_w.query(expr=expr, output_fields=["id", field])
assert len(res) > 0
@pytest.mark.tags(CaseLabel.L0)
def test_query_text_match_with_combined_expression_for_single_field(self):