From 3df2156ee23b598b3c17ddc9f700f089f758d9ca Mon Sep 17 00:00:00 2001
From: zhuwenxing
Date: Wed, 30 Apr 2025 14:18:54 +0800
Subject: [PATCH] test: [cherry-pick] add icu tokenizer testcases (#41630)

pr: https://github.com/milvus-io/milvus/pull/41501
/kind improvement

Signed-off-by: zhuwenxing

---
 tests/python_client/base/client_base.py      |  19 +-
 tests/python_client/common/common_func.py    |  22 ++-
 .../common/phrase_match_generator.py         | 108 ----------
 tests/python_client/common/text_generator.py | 179 ++++++++++++++++++
 .../test_milvus_client_analyzer.py           |   9 +-
 tests/python_client/testcases/test_query.py  | 164 +++++++++++++++-
 6 files changed, 377 insertions(+), 124 deletions(-)
 create mode 100644 tests/python_client/common/text_generator.py

diff --git a/tests/python_client/base/client_base.py b/tests/python_client/base/client_base.py
index 9024fbe5c3..e3d2407f88 100644
--- a/tests/python_client/base/client_base.py
+++ b/tests/python_client/base/client_base.py
@@ -1,4 +1,3 @@
-import pytest
 import sys
 from typing import Dict, List
 from pymilvus import DefaultConfig
@@ -18,7 +17,7 @@ from common import common_func as cf
 from common import common_type as ct
 from common.common_params import IndexPrams
 
-from pymilvus import ResourceGroupInfo, DataType, utility
+from pymilvus import ResourceGroupInfo, DataType, utility, MilvusClient
 import pymilvus
 
 
@@ -170,6 +169,22 @@ class TestcaseBase(Base):
         log.info(f"server version: {server_version}")
         return res
 
+
+    def get_tokens_by_analyzer(self, text, analyzer_params):
+        if cf.param_info.param_uri:
+            uri = cf.param_info.param_uri
+        else:
+            uri = "http://" + cf.param_info.param_host + ":" + str(cf.param_info.param_port)
+
+        client = MilvusClient(
+            uri=uri,
+            token=cf.param_info.param_token
+        )
+        res = client.run_analyzer(text, analyzer_params, with_detail=True, with_hash=True)
+        tokens = [r['token'] for r in res.tokens]
+        return tokens
+
+
     # def init_async_milvus_client(self):
     #     uri = cf.param_info.param_uri or f"http://{cf.param_info.param_host}:{cf.param_info.param_port}"
     #     kwargs = {
diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py
index 4c04f9adf6..c1addc7e0f 100644
--- a/tests/python_client/common/common_func.py
+++ b/tests/python_client/common/common_func.py
@@ -26,7 +26,7 @@ import jieba
 import re
 import inspect
 
-from pymilvus import CollectionSchema, DataType, FunctionType, Function, MilvusException
+from pymilvus import CollectionSchema, DataType, FunctionType, Function, MilvusException, MilvusClient
 from bm25s.tokenization import Tokenizer
 
 
@@ -266,6 +266,24 @@ def analyze_documents(texts, language="en"):
     return word_freq
 
 
+def analyze_documents_with_analyzer_params(texts, analyzer_params):
+    if param_info.param_uri:
+        uri = param_info.param_uri
+    else:
+        uri = "http://" + param_info.param_host + ":" + str(param_info.param_port)
+
+    client = MilvusClient(
+        uri=uri,
+        token=param_info.param_token
+    )
+    freq = Counter()
+    res = client.run_analyzer(texts, analyzer_params, with_detail=True, with_hash=True)
+    for r in res:
+        freq.update(t['token'] for t in r.tokens)
+    log.info(f"word freq {freq.most_common(10)}")
+    return freq
+
+
 def check_token_overlap(text_a, text_b, language="en"):
     word_freq_a = analyze_documents([text_a], language)
     word_freq_b = analyze_documents([text_b], language)
@@ -2446,7 +2464,7 @@ def gen_json_field_expressions_all_single_operator():
                    "json_field is null", "json_field IS NULL", "json_field is not null", "json_field IS NOT NULL",
                    "json_field['a'] is null",
"json_field['a'] IS NULL", "json_field['a'] is not null", "json_field['a'] IS NOT NULL" ] - + return expressions diff --git a/tests/python_client/common/phrase_match_generator.py b/tests/python_client/common/phrase_match_generator.py index 726d857378..97b174924e 100644 --- a/tests/python_client/common/phrase_match_generator.py +++ b/tests/python_client/common/phrase_match_generator.py @@ -369,111 +369,3 @@ class PhraseMatchTestGenerator: return matched_docs - - - -class KoreanTextGenerator: - def __init__(self): - # Sports/Activities (Nouns) - self.activities = [ - "수영", "축구", "농구", "테니스", - "배구", "야구", "골프", "럭비", - "달리기", "자전거", "스케이트", "스키", - "서핑", "다이빙", "등산", "요가", - "춤", "하이킹", "독서", "요리" - ] - - # Verbs (Base Form) - self.verbs = [ - "좋아하다", "즐기다", "하다", "배우다", - "가르치다", "보다", "시작하다", "계속하다", - "연습하다", "선호하다", "마스터하다", "도전하다" - ] - - # Connectors - self.connectors = [ - "그리고", "또는", "하지만", "그런데", - "그래서", "또한", "게다가", "그러면서", - "동시에", "함께" - ] - - # Modifiers (Frequency/Degree) - self.modifiers = [ - "매우", "자주", "가끔", "열심히", - "전문적으로", "규칙적으로", "매일", "일주일에 한 번", - "취미로", "진지하게" - ] - - def conjugate_verb(self, verb): - # Simple Korean verb conjugation (using informal style "-아/어요") - if verb.endswith("하다"): - return verb.replace("하다", "해요") - elif verb.endswith("다"): - return verb[:-1] + "아요" - return verb - - def sentence(self): - # Build basic sentence structure - activity = random.choice(self.activities) - verb = random.choice(self.verbs) - modifier = random.choice(self.modifiers) - - # Conjugate verb - conjugated_verb = self.conjugate_verb(verb) - - # Build sentence (Korean word order: Subject + Object + Modifier + Verb) - sentence = f"저는 {activity}를/을 {modifier} {conjugated_verb}" - - # Randomly add connector and another activity - if random.choice([True, False]): - connector = random.choice(self.connectors) - second_activity = random.choice(self.activities) - second_verb = self.conjugate_verb(random.choice(self.verbs)) - sentence += f" {connector} {second_activity}도 {second_verb}" - - return sentence + "." 
-
-    def paragraph(self, num_sentences=3):
-        return '\n'.join([self.sentence() for _ in range(num_sentences)])
-
-    def text(self, num_sentences=5):
-        return '\n'.join([self.sentence() for _ in range(num_sentences)])
-
-
-def generate_text_by_analyzer(analyzer_params):
-    """
-    Generate text data based on the given analyzer parameters
-
-    Args:
-        analyzer_params: Dictionary containing the analyzer parameters
-
-    Returns:
-        str: Generated text data
-    """
-    if analyzer_params["tokenizer"] == "standard":
-        fake = Faker("en_US")
-    elif analyzer_params["tokenizer"] == "jieba":
-        fake = Faker("zh_CN")
-    elif analyzer_params["tokenizer"]["type"] == "lindera":
-        # Generate random Japanese text
-        if analyzer_params["tokenizer"]["dict_kind"] == "ipadic":
-            fake = Faker("ja_JP")
-        elif analyzer_params["tokenizer"]["dict_kind"] == "ko-dic":
-            fake = KoreanTextGenerator()
-        elif analyzer_params["tokenizer"]["dict_kind"] == "cc-cedict":
-            fake = Faker("zh_CN")
-        else:
-            raise ValueError("Invalid dict_kind")
-    else:
-        raise ValueError("Invalid analyzer parameters")
-
-    text = fake.text()
-    stop_words = []
-    if "filter" in analyzer_params:
-        for filter in analyzer_params["filter"]:
-            if filter["type"] == "stop":
-                stop_words.extend(filter["stop_words"])
-
-    # add stop words to the text
-    text += " " + " ".join(stop_words)
-    return text
diff --git a/tests/python_client/common/text_generator.py b/tests/python_client/common/text_generator.py
new file mode 100644
index 0000000000..fe857cb68c
--- /dev/null
+++ b/tests/python_client/common/text_generator.py
@@ -0,0 +1,179 @@
+from faker import Faker
+import random
+
+class ICUTextGenerator:
+    """
+    ICU (International Components for Unicode) text generator:
+    generates test words and sentences that mix multiple languages (English, Chinese, Japanese, German, Korean), emojis, and special symbols.
+    """
+    def __init__(self):
+        self.fake_en = Faker("en_US")
+        self.fake_zh = Faker("zh_CN")
+        self.fake_ja = Faker("ja_JP")
+        self.fake_de = Faker("de_DE")
+        self.korean_samples = [
+            "안녕하세요 세계", "파이썬 프로그래밍", "데이터 분석", "인공지능",
+            "밀버스 테스트", "한국어 샘플", "자연어 처리"
+        ]
+        self.emojis = ["😊", "🐍", "🚀", "🌏", "💡", "🔥", "✨", "👍"]
+        self.specials = ["#", "@", "$"]
+
+    def word(self):
+        """
+        Generate a single word, randomly drawn from one of the languages, the emoji set, or the special symbols.
+        """
+        parts = [
+            self.fake_en.word(),
+            self.fake_zh.word(),
+            self.fake_ja.word(),
+            self.fake_de.word(),
+            random.choice(self.korean_samples),
+            random.choice(self.emojis),
+            random.choice(self.specials),
+        ]
+        return random.choice(parts)
+
+    def sentence(self):
+        """
+        Generate a sentence containing multiple languages, emojis, and special symbols.
+        """
+        parts = [
+            self.fake_en.sentence(),
+            self.fake_zh.sentence(),
+            self.fake_ja.sentence(),
+            self.fake_de.sentence(),
+            random.choice(self.korean_samples),
+            " ".join(random.sample(self.emojis, 2)),
+            " ".join(random.sample(self.specials, 2)),
+        ]
+        random.shuffle(parts)
+        return " ".join(parts)
+
+    def paragraph(self, num_sentences=3):
+        """
+        Generate a paragraph containing multiple sentences, each with multiple languages, emojis, and special symbols.
+        """
+        return ' '.join([self.sentence() for _ in range(num_sentences)])
+
+    def text(self, num_sentences=5):
+        """
+        Generate multiple sentences containing multiple languages, emojis, and special symbols.
+        """
+        return ' '.join([self.sentence() for _ in range(num_sentences)])
+
+
+class KoreanTextGenerator:
+    """
+    KoreanTextGenerator: Generate test sentences containing Korean activities, verbs, connectors, and modifiers.
+    """
+    def __init__(self):
+        # Sports/Activities (Nouns)
+        self.activities = [
+            "수영", "축구", "농구", "테니스",
+            "배구", "야구", "골프", "럭비",
+            "달리기", "자전거", "스케이트", "스키",
+            "서핑", "다이빙", "등산", "요가",
+            "춤", "하이킹", "독서", "요리"
+        ]
+
+        # Verbs (Base Form)
+        self.verbs = [
+            "좋아하다", "즐기다", "하다", "배우다",
+            "가르치다", "보다", "시작하다", "계속하다",
+            "연습하다", "선호하다", "마스터하다", "도전하다"
+        ]
+
+        # Connectors
+        self.connectors = [
+            "그리고", "또는", "하지만", "그런데",
+            "그래서", "또한", "게다가", "그러면서",
+            "동시에", "함께"
+        ]
+
+        # Modifiers (Frequency/Degree)
+        self.modifiers = [
+            "매우", "자주", "가끔", "열심히",
+            "전문적으로", "규칙적으로", "매일", "일주일에 한 번",
+            "취미로", "진지하게"
+        ]
+
+    def conjugate_verb(self, verb):
+        # Simple Korean verb conjugation (using informal style "-아/어요")
+        if verb.endswith("하다"):
+            return verb.replace("하다", "해요")
+        elif verb.endswith("다"):
+            return verb[:-1] + "아요"
+        return verb
+
+    def word(self):
+        return random.choice(self.activities + self.verbs + self.modifiers + self.connectors)
+
+    def sentence(self):
+        # Build basic sentence structure
+        activity = random.choice(self.activities)
+        verb = random.choice(self.verbs)
+        modifier = random.choice(self.modifiers)
+
+        # Conjugate verb
+        conjugated_verb = self.conjugate_verb(verb)
+
+        # Build sentence (Korean word order: Subject + Object + Modifier + Verb)
+        sentence = f"저는 {activity}를/을 {modifier} {conjugated_verb}"
+
+        # Randomly add connector and another activity
+        if random.choice([True, False]):
+            connector = random.choice(self.connectors)
+            second_activity = random.choice(self.activities)
+            second_verb = self.conjugate_verb(random.choice(self.verbs))
+            sentence += f" {connector} {second_activity}도 {second_verb}"
+
+        return sentence + "."
+
+    def paragraph(self, num_sentences=3):
+        return '\n'.join([self.sentence() for _ in range(num_sentences)])
+
+    def text(self, num_sentences=5):
+        return '\n'.join([self.sentence() for _ in range(num_sentences)])
+
+
+def generate_text_by_analyzer(analyzer_params):
+    """
+    Generate text data based on the given analyzer parameters
+
+    Args:
+        analyzer_params: Dictionary containing the analyzer parameters
+
+    Returns:
+        str: Generated text data
+    """
+    if analyzer_params["tokenizer"] == "standard":
+        fake = Faker("en_US")
+    elif analyzer_params["tokenizer"] == "jieba":
+        fake = Faker("zh_CN")
+    elif analyzer_params["tokenizer"] == "icu":
+        fake = ICUTextGenerator()
+    elif analyzer_params["tokenizer"]["type"] == "lindera":
+        # Generate random Japanese text
+        if analyzer_params["tokenizer"]["dict_kind"] == "ipadic":
+            fake = Faker("ja_JP")
+        elif analyzer_params["tokenizer"]["dict_kind"] == "ko-dic":
+            fake = KoreanTextGenerator()
+        elif analyzer_params["tokenizer"]["dict_kind"] == "cc-cedict":
+            fake = Faker("zh_CN")
+        else:
+            raise ValueError("Invalid dict_kind")
+    else:
+        raise ValueError("Invalid analyzer parameters")
+
+    text = fake.text()
+    stop_words = []
+    if "filter" in analyzer_params:
+        for filter in analyzer_params["filter"]:
+            if filter["type"] == "stop":
+                stop_words.extend(filter["stop_words"])
+
+    # add stop words to the text
+    text += " " + " ".join(stop_words)
+    return text
diff --git a/tests/python_client/milvus_client/test_milvus_client_analyzer.py b/tests/python_client/milvus_client/test_milvus_client_analyzer.py
index 2fce8edb21..3b0faa7805 100644
--- a/tests/python_client/milvus_client/test_milvus_client_analyzer.py
+++ b/tests/python_client/milvus_client/test_milvus_client_analyzer.py
@@ -2,7 +2,7 @@ import pytest
 
 from base.client_v2_base import TestMilvusClientV2Base
 from common.common_type import CaseLabel
-from common.phrase_match_generator import generate_text_by_analyzer
+from common.text_generator import generate_text_by_analyzer
 
 
 class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
@@ -19,6 +19,9 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
         {
             "tokenizer": "jieba",
         },
+        {
+            "tokenizer": "icu"
+        },
         # {
         #     "tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
         #     "filter": [
@@ -41,8 +44,8 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
         """
         client = self._client()
         text = generate_text_by_analyzer(analyzer_params)
-        res, result = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
-        res_2, result_2 = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
+        res, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
+        res_2, _ = self.run_analyzer(client, text, analyzer_params, with_detail=True, with_hash=True)
         # verify the result are the same when run analyzer twice
         for i in range(len(res.tokens)):
             assert res.tokens[i]["token"] == res_2.tokens[i]["token"]
diff --git a/tests/python_client/testcases/test_query.py b/tests/python_client/testcases/test_query.py
index 84598df431..18dffeba1a 100644
--- a/tests/python_client/testcases/test_query.py
+++ b/tests/python_client/testcases/test_query.py
@@ -1,11 +1,9 @@
-import jieba
-
 import utils.util_pymilvus as ut
 from utils.util_log import test_log as log
 from common.common_type import CaseLabel, CheckTasks
 from common import common_type as ct
 from common import common_func as cf
-from common.phrase_match_generator import KoreanTextGenerator
+from common.text_generator import KoreanTextGenerator, ICUTextGenerator
 from common.code_mapping import ConnectionErrorMessage as cem
 from base.client_base import TestcaseBase
 from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_EVENTUALLY
@@ -17,7 +15,6 @@ from pymilvus import (
 )
 import threading
 from pymilvus import DefaultConfig
 import time
-
 import pytest
 import random
 import numpy as np
@@ -4734,7 +4731,10 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
         # query single field for one token
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
@@ -4879,7 +4879,10 @@ class TestQueryTextMatch(TestcaseBase):
 
         # query single field for one token
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
@@ -4912,6 +4915,140 @@ class TestQueryTextMatch(TestcaseBase):
             assert any(
                 [token in r[field] for token in top_10_tokens]), f"top 10 tokens {top_10_tokens} not in {r[field]}"
 
+    @pytest.mark.tags(CaseLabel.L0)
+    @pytest.mark.parametrize("enable_partition_key", [True, False])
+    @pytest.mark.parametrize("enable_inverted_index", [True, False])
+    @pytest.mark.parametrize("tokenizer", ["icu"])
+    def test_query_text_match_with_icu_tokenizer(
+        self, tokenizer, enable_inverted_index, enable_partition_key
+    ):
+        """
+        target: test text match with icu tokenizer
+        method: 1. enable text match and insert data with varchar
+                2. get the most common words and query with text match
+                3. verify the result
+        expected: text match successfully and result is correct
+        """
+        analyzer_params = {
+            "tokenizer": tokenizer,
+        }
+        dim = 128
+        fields = [
+            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="word",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                is_partition_key=enable_partition_key,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="sentence",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="paragraph",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(
+                name="text",
+                dtype=DataType.VARCHAR,
+                max_length=65535,
+                enable_analyzer=True,
+                enable_match=True,
+                analyzer_params=analyzer_params,
+            ),
+            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
+        ]
+        schema = CollectionSchema(fields=fields, description="test collection")
+        data_size = 3000
+        collection_w = self.init_collection_wrap(
+            name=cf.gen_unique_str(prefix), schema=schema
+        )
+        fake = ICUTextGenerator()
+        data = [
+            {
+                "id": i,
+                "word": fake.word().lower(),
+                "sentence": fake.sentence().lower(),
+                "paragraph": fake.paragraph().lower(),
+                "text": fake.text().lower(),
+                "emb": [random.random() for _ in range(dim)],
+            }
+            for i in range(data_size)
+        ]
+        df = pd.DataFrame(data)
+        log.info(f"dataframe\n{df}")
+        batch_size = 5000
+        for i in range(0, len(df), batch_size):
+            collection_w.insert(
+                data[i: i + batch_size]
+                if i + batch_size < len(df)
+                else data[i: len(df)]
+            )
+        # the inverted index can only be applied after the collection is flushed;
+        # a growing segment may not have it applied, even under strong consistency.
+        collection_w.flush()
+        collection_w.create_index(
+            "emb",
+            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
+        )
+        if enable_inverted_index:
+            collection_w.create_index("word", {"index_type": "INVERTED"})
+        collection_w.load()
+        # analyze the corpus
+        text_fields = ["word", "sentence", "paragraph", "text"]
+        wf_map = {}
+        for field in text_fields:
+            wf_map[field] = cf.analyze_documents_with_analyzer_params(df[field].tolist(), analyzer_params)
+        # query single field for one token
+        for field in text_fields:
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
+            expr = f"text_match({field}, '{token}')"
+            log.info(f"expr: {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            assert len(res) > 0
+            log.info(f"res len {len(res)}")
+            for r in res:
+                assert token in r[field]
+
+            # verify inverted index
+            if enable_inverted_index:
+                if field == "word":
+                    expr = f"{field} == '{token}'"
+                    log.info(f"expr: {expr}")
+                    res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+                    log.info(f"res len {len(res)}")
+                    for r in res:
+                        assert r[field] == token
+        # query single field for multiple words
+        for field in text_fields:
+            # match top 10 most common words
+            top_10_tokens = []
+            for word, count in wf_map[field].most_common(10):
+                top_10_tokens.append(word)
+            string_of_top_10_words = " ".join(top_10_tokens)
+            expr = f"text_match({field}, '{string_of_top_10_words}')"
+            log.info(f"expr {expr}")
+            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
+            log.info(f"res len {len(res)}")
+            for r in res:
+                assert any([token in r[field] for token in top_10_tokens])
+
+
     @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("enable_partition_key", [True])
     @pytest.mark.parametrize("enable_inverted_index", [True])
@@ -5014,7 +5151,10 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
         # query single field for one token
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
@@ -5153,7 +5293,10 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language=language)
         # query single field for one token
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             expr = f"text_match({field}, '{token}')"
             log.info(f"expr: {expr}")
             res, _ = collection_w.query(expr=expr, output_fields=["id", field])
@@ -6532,7 +6675,10 @@ class TestQueryTextMatch(TestcaseBase):
             wf_map[field] = cf.analyze_documents(df[field].tolist(), language="en")
         # query single field for one word
         for field in text_fields:
-            token = wf_map[field].most_common()[0][0]
+            most_common_tokens = wf_map[field].most_common(10)
+            mid = len(most_common_tokens) // 2
+            idx = random.randint(0, max(0, mid - 1))
+            token = most_common_tokens[idx][0]
             tm_expr = f"text_match({field}, '{token}')"
             int_expr = "age > 10"
             combined_expr = f"{tm_expr} {combine_op} {int_expr}"
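
Note (not part of the patch): a minimal standalone sketch of how the analyzer helpers introduced above are expected to be exercised against a running Milvus instance. It mirrors analyze_documents_with_analyzer_params; the URI and token are placeholder connection values and the sample texts are illustrative only.

from collections import Counter

from pymilvus import MilvusClient

# Placeholder endpoint; in the test suite these values come from cf.param_info instead.
client = MilvusClient(uri="http://localhost:19530", token="")

analyzer_params = {"tokenizer": "icu"}
texts = [
    "Milvus 全文检索 テスト 데이터 분석 🚀",
    "ICU tokenizer handles mixed scripts and symbols #test",
]

# run_analyzer returns one result per input text; each result exposes .tokens,
# and every token entry carries a 'token' key, as used by the helpers in this patch.
freq = Counter()
for res in client.run_analyzer(texts, analyzer_params, with_detail=True, with_hash=True):
    freq.update(t["token"] for t in res.tokens)

print(freq.most_common(10))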