From 3df2156ee23b598b3c17ddc9f700f089f758d9ca Mon Sep 17 00:00:00 2001
From: zhuwenxing
Date: Wed, 30 Apr 2025 14:18:54 +0800
Subject: [PATCH] test: [cherry-pick] add icu tokenizer testcases (#41630)

pr: https://github.com/milvus-io/milvus/pull/41501
/kind improvement

Signed-off-by: zhuwenxing

---
 tests/python_client/base/client_base.py      |  19 +-
 tests/python_client/common/common_func.py    |  22 ++-
 .../common/phrase_match_generator.py         | 108 ----------
 tests/python_client/common/text_generator.py | 179 ++++++++++++++++++
 .../test_milvus_client_analyzer.py           |   9 +-
 tests/python_client/testcases/test_query.py  | 164 +++++++++++++++-
 6 files changed, 377 insertions(+), 124 deletions(-)
 create mode 100644 tests/python_client/common/text_generator.py

diff --git a/tests/python_client/base/client_base.py b/tests/python_client/base/client_base.py
index 9024fbe5c3..e3d2407f88 100644
--- a/tests/python_client/base/client_base.py
+++ b/tests/python_client/base/client_base.py
@@ -1,4 +1,3 @@
-import pytest
 import sys
 from typing import Dict, List
 from pymilvus import DefaultConfig
@@ -18,7 +17,7 @@ from common import common_func as cf
 from common import common_type as ct
 from common.common_params import IndexPrams
 
-from pymilvus import ResourceGroupInfo, DataType, utility
+from pymilvus import ResourceGroupInfo, DataType, utility, MilvusClient
 import pymilvus
 
 
@@ -170,6 +169,22 @@ class TestcaseBase(Base):
         log.info(f"server version: {server_version}")
         return res
 
+
+    def get_tokens_by_analyzer(self, text, analyzer_params):
+        if cf.param_info.param_uri:
+            uri = cf.param_info.param_uri
+        else:
+            uri = "http://" + cf.param_info.param_host + ":" + str(cf.param_info.param_port)
+
+        client = MilvusClient(
+            uri=uri,
+            token=cf.param_info.param_token
+        )
+        res = client.run_analyzer(text, analyzer_params, with_detail=True, with_hash=True)
+        tokens = [r['token'] for r in res.tokens]
+        return tokens
+
+
     # def init_async_milvus_client(self):
     #     uri = cf.param_info.param_uri or f"http://{cf.param_info.param_host}:{cf.param_info.param_port}"
     #     kwargs = {
diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py
index 4c04f9adf6..c1addc7e0f 100644
--- a/tests/python_client/common/common_func.py
+++ b/tests/python_client/common/common_func.py
@@ -26,7 +26,7 @@ import jieba
 import re
 import inspect
 
-from pymilvus import CollectionSchema, DataType, FunctionType, Function, MilvusException
+from pymilvus import CollectionSchema, DataType, FunctionType, Function, MilvusException, MilvusClient
 from bm25s.tokenization import Tokenizer
 
 
@@ -266,6 +266,24 @@ def analyze_documents(texts, language="en"):
     return word_freq
 
 
+def analyze_documents_with_analyzer_params(texts, analyzer_params):
+    if param_info.param_uri:
+        uri = param_info.param_uri
+    else:
+        uri = "http://" + param_info.param_host + ":" + str(param_info.param_port)
+
+    client = MilvusClient(
+        uri=uri,
+        token=param_info.param_token
+    )
+    freq = Counter()
+    res = client.run_analyzer(texts, analyzer_params, with_detail=True, with_hash=True)
+    for r in res:
+        freq.update(t['token'] for t in r.tokens)
+    log.info(f"word freq {freq.most_common(10)}")
+    return freq
+
+
 def check_token_overlap(text_a, text_b, language="en"):
     word_freq_a = analyze_documents([text_a], language)
     word_freq_b = analyze_documents([text_b], language)
@@ -2446,7 +2464,7 @@ def gen_json_field_expressions_all_single_operator():
                    "json_field is null", "json_field IS NULL", "json_field is not null", "json_field IS NOT NULL",
                    "json_field['a'] is null",
"json_field['a'] IS NULL", "json_field['a'] is not null", "json_field['a'] IS NOT NULL" ] - + return expressions diff --git a/tests/python_client/common/phrase_match_generator.py b/tests/python_client/common/phrase_match_generator.py index 726d857378..97b174924e 100644 --- a/tests/python_client/common/phrase_match_generator.py +++ b/tests/python_client/common/phrase_match_generator.py @@ -369,111 +369,3 @@ class PhraseMatchTestGenerator: return matched_docs - - - -class KoreanTextGenerator: - def __init__(self): - # Sports/Activities (Nouns) - self.activities = [ - "수영", "축구", "농구", "테니스", - "배구", "야구", "골프", "럭비", - "달리기", "자전거", "스케이트", "스키", - "서핑", "다이빙", "등산", "요가", - "춤", "하이킹", "독서", "요리" - ] - - # Verbs (Base Form) - self.verbs = [ - "좋아하다", "즐기다", "하다", "배우다", - "가르치다", "보다", "시작하다", "계속하다", - "연습하다", "선호하다", "마스터하다", "도전하다" - ] - - # Connectors - self.connectors = [ - "그리고", "또는", "하지만", "그런데", - "그래서", "또한", "게다가", "그러면서", - "동시에", "함께" - ] - - # Modifiers (Frequency/Degree) - self.modifiers = [ - "매우", "자주", "가끔", "열심히", - "전문적으로", "규칙적으로", "매일", "일주일에 한 번", - "취미로", "진지하게" - ] - - def conjugate_verb(self, verb): - # Simple Korean verb conjugation (using informal style "-아/어요") - if verb.endswith("하다"): - return verb.replace("하다", "해요") - elif verb.endswith("다"): - return verb[:-1] + "아요" - return verb - - def sentence(self): - # Build basic sentence structure - activity = random.choice(self.activities) - verb = random.choice(self.verbs) - modifier = random.choice(self.modifiers) - - # Conjugate verb - conjugated_verb = self.conjugate_verb(verb) - - # Build sentence (Korean word order: Subject + Object + Modifier + Verb) - sentence = f"저는 {activity}를/을 {modifier} {conjugated_verb}" - - # Randomly add connector and another activity - if random.choice([True, False]): - connector = random.choice(self.connectors) - second_activity = random.choice(self.activities) - second_verb = self.conjugate_verb(random.choice(self.verbs)) - sentence += f" {connector} {second_activity}도 {second_verb}" - - return sentence + "." 
-
-    def paragraph(self, num_sentences=3):
-        return '\n'.join([self.sentence() for _ in range(num_sentences)])
-
-    def text(self, num_sentences=5):
-        return '\n'.join([self.sentence() for _ in range(num_sentences)])
-
-
-def generate_text_by_analyzer(analyzer_params):
-    """
-    Generate text data based on the given analyzer parameters
-
-    Args:
-        analyzer_params: Dictionary containing the analyzer parameters
-
-    Returns:
-        str: Generated text data
-    """
-    if analyzer_params["tokenizer"] == "standard":
-        fake = Faker("en_US")
-    elif analyzer_params["tokenizer"] == "jieba":
-        fake = Faker("zh_CN")
-    elif analyzer_params["tokenizer"]["type"] == "lindera":
-        # Generate random Japanese text
-        if analyzer_params["tokenizer"]["dict_kind"] == "ipadic":
-            fake = Faker("ja_JP")
-        elif analyzer_params["tokenizer"]["dict_kind"] == "ko-dic":
-            fake = KoreanTextGenerator()
-        elif analyzer_params["tokenizer"]["dict_kind"] == "cc-cedict":
-            fake = Faker("zh_CN")
-        else:
-            raise ValueError("Invalid dict_kind")
-    else:
-        raise ValueError("Invalid analyzer parameters")
-
-    text = fake.text()
-    stop_words = []
-    if "filter" in analyzer_params:
-        for filter in analyzer_params["filter"]:
-            if filter["type"] == "stop":
-                stop_words.extend(filter["stop_words"])
-
-    # add stop words to the text
-    text += " " + " ".join(stop_words)
-    return text
diff --git a/tests/python_client/common/text_generator.py b/tests/python_client/common/text_generator.py
new file mode 100644
index 0000000000..fe857cb68c
--- /dev/null
+++ b/tests/python_client/common/text_generator.py
@@ -0,0 +1,179 @@
+from faker import Faker
+import random
+
+class ICUTextGenerator:
+    """
+    ICU (International Components for Unicode) text generator:
+    generates test words and sentences that mix multiple languages (English, Chinese, Japanese, German, Korean), emojis, and special symbols.
+    """
+    def __init__(self):
+        self.fake_en = Faker("en_US")
+        self.fake_zh = Faker("zh_CN")
+        self.fake_ja = Faker("ja_JP")
+        self.fake_de = Faker("de_DE")
+        self.korean_samples = [
+            "안녕하세요 세계", "파이썬 프로그래밍", "데이터 분석", "인공지능",
+            "밀버스 테스트", "한국어 샘플", "자연어 처리"
+        ]
+        self.emojis = ["😊", "🐍", "🚀", "🌏", "💡", "🔥", "✨", "👍"]
+        self.specials = ["#", "@", "$"]
+
+    def word(self):
+        """
+        Generate a single word, randomly drawn from one of the languages, the emoji set, or the special symbols.
+        """
+        parts = [
+            self.fake_en.word(),
+            self.fake_zh.word(),
+            self.fake_ja.word(),
+            self.fake_de.word(),
+            random.choice(self.korean_samples),
+            random.choice(self.emojis),
+            random.choice(self.specials),
+        ]
+        return random.choice(parts)
+
+    def sentence(self):
+        """
+        Generate a sentence containing multiple languages, emojis, and special symbols.
+        """
+        parts = [
+            self.fake_en.sentence(),
+            self.fake_zh.sentence(),
+            self.fake_ja.sentence(),
+            self.fake_de.sentence(),
+            random.choice(self.korean_samples),
+            " ".join(random.sample(self.emojis, 2)),
+            " ".join(random.sample(self.specials, 2)),
+        ]
+        random.shuffle(parts)
+        return " ".join(parts)
+
+    def paragraph(self, num_sentences=3):
+        """
+        Generate a paragraph containing multiple sentences, each with multiple languages, emojis, and special symbols.
+        """
+        return ' '.join([self.sentence() for _ in range(num_sentences)])
+
+    def text(self, num_sentences=5):
+        """
+        Generate multiple sentences containing multiple languages, emojis, and special symbols.
+        """
+        return ' '.join([self.sentence() for _ in range(num_sentences)])
+
+
+class KoreanTextGenerator:
+    """
+    KoreanTextGenerator: Generate test sentences containing Korean activities, verbs, connectors, and modifiers.
+    """
+    def __init__(self):
+        # Sports/Activities (Nouns)
+        self.activities = [
+            "수영", "축구", "농구", "테니스",
+            "배구", "야구", "골프", "럭비",
+            "달리기", "자전거", "스케이트", "스키",
+            "서핑", "다이빙", "등산", "요가",
+            "춤", "하이킹", "독서", "요리"
+        ]
+
+        # Verbs (Base Form)
+        self.verbs = [
+            "좋아하다", "즐기다", "하다", "배우다",
+            "가르치다", "보다", "시작하다", "계속하다",
+            "연습하다", "선호하다", "마스터하다", "도전하다"
+        ]
+
+        # Connectors
+        self.connectors = [
+            "그리고", "또는", "하지만", "그런데",
+            "그래서", "또한", "게다가", "그러면서",
+            "동시에", "함께"
+        ]
+
+        # Modifiers (Frequency/Degree)
+        self.modifiers = [
+            "매우", "자주", "가끔", "열심히",
+            "전문적으로", "규칙적으로", "매일", "일주일에 한 번",
+            "취미로", "진지하게"
+        ]
+
+    def conjugate_verb(self, verb):
+        # Simple Korean verb conjugation (using informal style "-아/어요")
+        if verb.endswith("하다"):
+            return verb.replace("하다", "해요")
+        elif verb.endswith("다"):
+            return verb[:-1] + "아요"
+        return verb
+
+    def word(self):
+        return random.choice(self.activities + self.verbs + self.modifiers + self.connectors)
+
+    def sentence(self):
+        # Build basic sentence structure
+        activity = random.choice(self.activities)
+        verb = random.choice(self.verbs)
+        modifier = random.choice(self.modifiers)
+
+        # Conjugate verb
+        conjugated_verb = self.conjugate_verb(verb)
+
+        # Build sentence (Korean word order: Subject + Object + Modifier + Verb)
+        sentence = f"저는 {activity}를/을 {modifier} {conjugated_verb}"
+
+        # Randomly add connector and another activity
+        if random.choice([True, False]):
+            connector = random.choice(self.connectors)
+            second_activity = random.choice(self.activities)
+            second_verb = self.conjugate_verb(random.choice(self.verbs))
+            sentence += f" {connector} {second_activity}도 {second_verb}"
+
+        return sentence + "."
+
+    def paragraph(self, num_sentences=3):
+        return '\n'.join([self.sentence() for _ in range(num_sentences)])
+
+    def text(self, num_sentences=5):
+        return '\n'.join([self.sentence() for _ in range(num_sentences)])
+
+
+def generate_text_by_analyzer(analyzer_params):
+    """
+    Generate text data based on the given analyzer parameters
+
+    Args:
+        analyzer_params: Dictionary containing the analyzer parameters
+
+    Returns:
+        str: Generated text data
+    """
+    if analyzer_params["tokenizer"] == "standard":
+        fake = Faker("en_US")
+    elif analyzer_params["tokenizer"] == "jieba":
+        fake = Faker("zh_CN")
+    elif analyzer_params["tokenizer"] == "icu":
+        fake = ICUTextGenerator()
+    elif analyzer_params["tokenizer"]["type"] == "lindera":
+        # Generate random Japanese text
+        if analyzer_params["tokenizer"]["dict_kind"] == "ipadic":
+            fake = Faker("ja_JP")
+        elif analyzer_params["tokenizer"]["dict_kind"] == "ko-dic":
+            fake = KoreanTextGenerator()
+        elif analyzer_params["tokenizer"]["dict_kind"] == "cc-cedict":
+            fake = Faker("zh_CN")
+        else:
+            raise ValueError("Invalid dict_kind")
+    else:
+        raise ValueError("Invalid analyzer parameters")
+
+    text = fake.text()
+    stop_words = []
+    if "filter" in analyzer_params:
+        for filter in analyzer_params["filter"]:
+            if filter["type"] == "stop":
+                stop_words.extend(filter["stop_words"])
+
+    # add stop words to the text
+    text += " " + " ".join(stop_words)
+    return text
diff --git a/tests/python_client/milvus_client/test_milvus_client_analyzer.py b/tests/python_client/milvus_client/test_milvus_client_analyzer.py
index 2fce8edb21..3b0faa7805 100644
--- a/tests/python_client/milvus_client/test_milvus_client_analyzer.py
+++ b/tests/python_client/milvus_client/test_milvus_client_analyzer.py
@@ -2,7 +2,7 @@ import pytest
 
 from base.client_v2_base import TestMilvusClientV2Base
 from common.common_type import CaseLabel
-from common.phrase_match_generator import generate_text_by_analyzer
+from common.text_generator import generate_text_by_analyzer
 
 
 class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
@@ -19,6 +19,9 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
         {
             "tokenizer": "jieba",
         },
+        {
+            "tokenizer": "icu"
+        },
         # {
         #     "tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
         #     "filter": [
@@ -41,8 +44,8 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
         """
         client = self._client()
         text = generate_text_by_analyzer(analyzer_params)
-        res, result = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
-        res_2, result_2 = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
+        res, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
+        res_2, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
         # verify the result are the same when run analyzer twice
         for i in range(len(res.tokens)):
             assert res.tokens[i]["token"] == res_2.tokens[i]["token"]
diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py
index 84598df431..18dffeba1a 100644
--- a/tests/python_client/testcases/test_query.py
+++ b/tests/python_client/testcases/test_query.py
@@ -1,11 +1,9 @@
-import jieba
-
 import utils.util_pymilvus as ut
 from utils.util_log import test_log as log
 from common.common_type import CaseLabel, CheckTasks
 from common import common_type as ct
 from common import common_func as cf
-from common.phrase_match_generator import KoreanTextGenerator
+from common.text_generator import KoreanTextGenerator, ICUTextGenerator
 from common.code_mapping import ConnectionErrorMessage as cem
 from base.client_base import TestcaseBase
 from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_EVENTUALLY
@@ -17,7 +15,6 @@ from pymilvus import (
 )
 import threading
 from pymilvus import DefaultConfig
 import time
-
 import pytest
 import random
 import numpy as np
@@ -4734,7 +4731,10 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
         # query single field for one token
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
@@ -4879,7 +4879,10 @@ class TestQueryTextMatch(TestcaseBase):
 
         # query single field for one token
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
@@ -4912,6 +4915,140 @@ class TestQueryTextMatch(TestcaseBase):
             assert any(
                 [token in r[field] for token in top_10_tokens]), f"top 10 tokens {top_10_tokens} not in {r[field]}"
 
+    @pytest.mark.tags(CaseLabel.L0)
+    @pytest.mark.parametrize("enable_partition_key", [True, False])
+    @pytest.mark.parametrize("enable_inverted_index", [True, False])
+    @pytest.mark.parametrize("tokenizer", ["icu"])
+    def test_query_text_match_with_icu_tokenizer(
+        self, tokenizer, enable_inverted_index, enable_partition_key
+    ):
+        """
+        target: test text match with icu tokenizer
+        method: 1. enable text match and insert data with varchar
+                2. get the most common words and query with text match
+                3. verify the result
+        expected: text match successfully and result is correct
+        """
+        analyzer_params = {
+            "tokenizer": tokenizer,
+        }
+        dim = 128
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="word",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                is_partition_key=enable_partition_key,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="sentence",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="paragraph",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="text",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+        ]
+        schema = CollectionSchema(fields=fields, description="test collection")
+        data_size = 3000
+        collection_w = self.init_collection_wrap(
+            name=cf.gen_unique_str(prefix), schema=schema
+        )
+        fake = ICUTextGenerator()
+        data = [
+            {
+                "id": i,
+                "word": fake.word().lower(),
+                "sentence": fake.sentence().lower(),
+                "paragraph": fake.paragraph().lower(),
+                "text": fake.text().lower(),
+                "emb": [random.random() for _ in range(dim)],
+            }
+            for i in range(data_size)
+        ]
+        df = pd.DataFrame(data)
+        log.info(f"dataframe\n{df}")
+        batch_size = 5000
+        for i in range(0, len(df), batch_size):
+            collection_w.insert(
+                data[i: i + batch_size]
+                if i + batch_size < len(df)
+                else data[i: len(df)]
+            )
+        # the inverted index can only be applied after the collection is flushed;
+        # a growing segment may not have it applied, even under strong consistency.
+        collection_w.flush()
+        collection_w.create_index(
+            "emb",
+            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
+        )
+        if enable_inverted_index:
+            collection_w.create_index("word", {"index_type": "INVERTED"})
+        collection_w.load()
+        # analyze the corpus
+        text_fields = ["word", "sentence", "paragraph", "text"]
+        wf_map = {}
+        for field in text_fields:
+            wf_map[field] = cf.analyze_documents_with_analyzer_params(df[field].tolist(), analyzer_params)
+        # query single field for one token
+        for field in text_fields:
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
+            expr = f"text_match({field}, '{token}')"
+            log.info(f"expr: {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            assert len(res) > 0
+            log.info(f"res len {len(res)}")
+            for r in res:
+                assert token in r[field]
+
+            # verify inverted index
+            if enable_inverted_index:
+                if field == "word":
+                    expr = f"{field} == '{token}'"
+                    log.info(f"expr: {expr}")
+                    res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+                    log.info(f"res len {len(res)}")
+                    for r in res:
+                        assert r[field] == token
+        # query single field for multiple words
+        for field in text_fields:
+            # match top 10 most common words
+            top_10_tokens = []
+            for word, count in wf_map[field].most_common(10):
+                top_10_tokens.append(word)
+            string_of_top_10_words = " ".join(top_10_tokens)
+            expr = f"text_match({field}, '{string_of_top_10_words}')"
+            log.info(f"expr {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            log.info(f"res len {len(res)}")
+            for r in res:
+                assert any([token in r[field] for token in top_10_tokens])
+
+
     @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("enable_partition_key", [True])
     @pytest.mark.parametrize("enable_inverted_index", [True])
@@ -5014,7 +5151,10 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
         # query single field for one token
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
@@ -5153,7 +5293,10 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
         # query single field for one token
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
@@ -6532,7 +6675,10 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language="en")
         # query single field for one word
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             tm_expr = f"text_match({field}, '{token}')"
             int_expr = "age > 10"
             combined_expr = f"{tm_expr} {combine_op} {int_expr}"
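
Note (not part of the patch): a minimal standalone sketch of how the analyzer helpers introduced above are expected to be exercised against a running Milvus instance. It mirrors analyze_documents_with_analyzer_params; the URI and token are placeholder connection values and the sample texts are illustrative only.

from collections import Counter

from pymilvus import MilvusClient

# Placeholder endpoint; in the test suite these values come from cf.param_info instead.
client = MilvusClient(uri="http://localhost:19530", token="")

analyzer_params = {"tokenizer": "icu"}
texts = [
    "Milvus 全文检索 テスト 데이터 분석 🚀",
    "ICU tokenizer handles mixed scripts and symbols #test",
]

# run_analyzer returns one result per input text; each result exposes .tokens,
# and every token entry carries a 'token' key, as used by the helpers in this patch.
freq = Counter()
for res in client.run_analyzer(texts, analyzer_params, with_detail=True, with_hash=True):
    freq.update(t["token"] for t in res.tokens)

print(freq.most_common(10))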