milvus/tests/python_client/common/text_generator.py
zhuwenxing b5fe6a5243
test: add icu tokenizer testcases (#41501)
/kind improvement

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
2025-04-24 18:54:38 +08:00

180 lines
6.2 KiB
Python

from faker import Faker
import random
class ICUTextGenerator:
    """
    ICU (International Components for Unicode) text generator.

    Produces test words/sentences mixing several languages (English, Chinese,
    Japanese, German, Korean) plus emojis and special symbols, for exercising
    the ICU tokenizer.
    """

    def __init__(self):
        self.fake_en = Faker("en_US")
        self.fake_zh = Faker("zh_CN")
        self.fake_ja = Faker("ja_JP")
        self.fake_de = Faker("de_DE")
        # Fixed Korean phrases used instead of a Faker locale provider.
        self.korean_samples = [
            "안녕하세요 세계", "파이썬 프로그래밍", "데이터 분석", "인공지능",
            "밀버스 테스트", "한국어 샘플", "자연어 처리"
        ]
        # Fix: the original list contained an empty string (likely a
        # mis-encoded emoji), which made word()/sentence() occasionally emit
        # empty tokens; it has been removed.
        self.emojis = ["😊", "🐍", "🚀", "🌏", "💡", "🔥", "👍"]
        self.specials = ["#", "@", "$"]

    def word(self):
        """
        Return a single random token drawn from one of the sources
        (a word in one of the four Faker locales, a Korean phrase,
        an emoji, or a special symbol).
        """
        candidates = [
            self.fake_en.word(),
            self.fake_zh.word(),
            self.fake_ja.word(),
            self.fake_de.word(),
            random.choice(self.korean_samples),
            random.choice(self.emojis),
            random.choice(self.specials),
        ]
        return random.choice(candidates)

    def sentence(self):
        """
        Return one sentence mixing all four Faker locales, one Korean phrase,
        two distinct emojis, and two distinct special symbols, in random order.
        """
        parts = [
            self.fake_en.sentence(),
            self.fake_zh.sentence(),
            self.fake_ja.sentence(),
            self.fake_de.sentence(),
            random.choice(self.korean_samples),
            " ".join(random.sample(self.emojis, 2)),
            " ".join(random.sample(self.specials, 2)),
        ]
        random.shuffle(parts)
        return " ".join(parts)

    def paragraph(self, num_sentences=3):
        """Return `num_sentences` mixed-language sentences joined by spaces."""
        return ' '.join(self.sentence() for _ in range(num_sentences))

    def text(self, num_sentences=5):
        """Return `num_sentences` mixed-language sentences joined by spaces."""
        return ' '.join(self.sentence() for _ in range(num_sentences))
class KoreanTextGenerator:
    """
    Generate Korean test sentences from fixed pools of activity nouns, verbs,
    connectors, and modifiers (e.g. for exercising the lindera/ko-dic
    tokenizer).
    """

    def __init__(self):
        # Sports/Activities (nouns).
        # Fix: the original list contained an empty string (likely a
        # mis-encoded word) that produced degenerate sentences such as
        # "저는 를/을 ..."; it has been removed.
        self.activities = [
            "수영", "축구", "농구", "테니스",
            "배구", "야구", "골프", "럭비",
            "달리기", "자전거", "스케이트", "스키",
            "서핑", "다이빙", "등산", "요가",
            "하이킹", "독서", "요리"
        ]
        # Verbs (dictionary/base form, all ending in "다").
        self.verbs = [
            "좋아하다", "즐기다", "하다", "배우다",
            "가르치다", "보다", "시작하다", "계속하다",
            "연습하다", "선호하다", "마스터하다", "도전하다"
        ]
        # Sentence connectors ("and", "or", "but", ...).
        self.connectors = [
            "그리고", "또는", "하지만", "그런데",
            "그래서", "또한", "게다가", "그러면서",
            "동시에", "함께"
        ]
        # Frequency/degree adverbs.
        self.modifiers = [
            "매우", "자주", "가끔", "열심히",
            "전문적으로", "규칙적으로", "매일", "일주일에 한 번",
            "취미로", "진지하게"
        ]

    def conjugate_verb(self, verb):
        """
        Conjugate a base-form verb into the informal polite style ("-아/어요").

        Deliberately simplified: "~하다" becomes "~해요"; any other verb
        ending in "다" drops "다" and appends "아요"; anything else is
        returned unchanged.
        """
        if verb.endswith("하다"):
            return verb.replace("하다", "해요")
        # Fix: the original checked endswith("") — always True (a mis-encoded
        # "다") — which made the final fallthrough unreachable and mangled
        # inputs not ending in "다".
        if verb.endswith("다"):
            return verb[:-1] + "아요"
        return verb

    def word(self):
        """Return one random word from any of the four pools."""
        return random.choice(self.activities + self.verbs + self.modifiers + self.connectors)

    def sentence(self):
        """
        Build one sentence in Korean word order
        (subject + object + modifier + verb), optionally extended with a
        connector and a second activity/verb clause.
        """
        activity = random.choice(self.activities)
        modifier = random.choice(self.modifiers)
        conjugated_verb = self.conjugate_verb(random.choice(self.verbs))
        sentence = f"저는 {activity}를/을 {modifier} {conjugated_verb}"
        # 50% chance to append a second clause.
        if random.choice([True, False]):
            connector = random.choice(self.connectors)
            second_activity = random.choice(self.activities)
            second_verb = self.conjugate_verb(random.choice(self.verbs))
            sentence += f" {connector} {second_activity}{second_verb}"
        return sentence + "."

    def paragraph(self, num_sentences=3):
        """Return `num_sentences` sentences joined by newlines."""
        return '\n'.join(self.sentence() for _ in range(num_sentences))

    def text(self, num_sentences=5):
        """Return `num_sentences` sentences joined by newlines."""
        return '\n'.join(self.sentence() for _ in range(num_sentences))
def generate_text_by_analyzer(analyzer_params):
    """
    Generate text data matching the language the given analyzer expects.

    Args:
        analyzer_params: dict whose "tokenizer" key is either a tokenizer name
            ("standard" -> English, "jieba" -> Chinese, "icu" -> mixed
            multilingual) or a dict describing a "lindera" tokenizer with a
            "dict_kind" of "ipadic" (Japanese), "ko-dic" (Korean), or
            "cc-cedict" (Chinese). An optional "filter" list may contain
            {"type": "stop", "stop_words": [...]} entries whose stop words are
            appended to the generated text.

    Returns:
        str: generated text.

    Raises:
        ValueError: if the tokenizer or dict_kind is not recognized.
    """
    tokenizer = analyzer_params["tokenizer"]
    if isinstance(tokenizer, str):
        if tokenizer == "standard":
            fake = Faker("en_US")
        elif tokenizer == "jieba":
            fake = Faker("zh_CN")
        elif tokenizer == "icu":
            fake = ICUTextGenerator()
        else:
            # Fix: an unknown string tokenizer previously fell through to
            # analyzer_params["tokenizer"]["type"], raising a confusing
            # TypeError (string indices) instead of ValueError.
            raise ValueError("Invalid analyzer parameters")
    elif isinstance(tokenizer, dict) and tokenizer.get("type") == "lindera":
        dict_kind = tokenizer.get("dict_kind")
        if dict_kind == "ipadic":
            fake = Faker("ja_JP")
        elif dict_kind == "ko-dic":
            fake = KoreanTextGenerator()
        elif dict_kind == "cc-cedict":
            fake = Faker("zh_CN")
        else:
            raise ValueError("Invalid dict_kind")
    else:
        raise ValueError("Invalid analyzer parameters")

    text = fake.text()

    # Append any configured stop words so they are guaranteed to appear in
    # the generated text. ("filter" renamed from the builtin-shadowing
    # loop variable; the trailing-space append when no stop words were
    # collected is also avoided.)
    stop_words = []
    for flt in analyzer_params.get("filter", []):
        if flt["type"] == "stop":
            stop_words.extend(flt["stop_words"])
    if stop_words:
        text += " " + " ".join(stop_words)
    return text