test: add run_analyzer api test and lindera tokenizer test (#40160)

/kind improvement
/hold

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>

parent 828ecacadc
commit 01966280da
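The new cases exercise the client-side run_analyzer API end to end. As a point of reference, a minimal sketch of the call shape the tests below rely on (only run_analyzer(text, analyzer_params) and the .tokens attribute of its result are taken from this diff; the endpoint and the sample text are illustrative):

from pymilvus import MilvusClient

# Illustrative endpoint for a local deployment.
client = MilvusClient(uri="http://localhost:19530")

analyzer_params = {
    "tokenizer": "standard",
    "filter": [{"type": "stop", "stop_words": ["is", "the", "a"]}],
}

# run_analyzer takes the raw text and the analyzer configuration and
# returns a result whose .tokens field holds the produced tokens.
res = client.run_analyzer("The quick brown fox is fast", analyzer_params)
print(res.tokens)  # tokens with the configured stop words filtered out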
@@ -171,9 +171,9 @@ class TestMilvusClientV2Base(Base):
                           limit=limit, output_fields=output_fields, search_params=search_params,
                           **kwargs).run()
        return res, check_result

    @trace()
    def hybrid_search(self, client, collection_name, reqs, rerank, limit=10,
                      output_fields=None, timeout=None, partition_names=None,
                      check_task=None, check_items=None, **kwargs):
        timeout = TIMEOUT if timeout is None else timeout
@@ -919,4 +919,14 @@ class TestMilvusClientV2Base(Base):
        check_result = ResponseChecker(res, func_name, check_task, check_items, check, **kwargs).run()
        return res, check_result

    @trace()
    def run_analyzer(self, client, text, analyzer_params, timeout=None, check_task=None, check_items=None, **kwargs):
        timeout = TIMEOUT if timeout is None else timeout
        kwargs.update({"timeout": timeout})
        func_name = sys._getframe().f_code.co_name
        res, check = api_request([client.run_analyzer, text, analyzer_params], **kwargs)
        check_result = ResponseChecker(res, func_name, check_task, check_items, check, text=text,
                                       analyzer_params=analyzer_params, **kwargs).run()
        return res, check_result
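Inside a test class derived from TestMilvusClientV2Base, this wrapper reduces an analyzer check to a one-liner; a brief sketch, assuming it runs inside such a test method:

# Sketch of a call site inside a TestMilvusClientV2Base subclass.
client = self._client()
analyzer_params = {"tokenizer": "standard"}
# The wrapper forwards to client.run_analyzer and runs ResponseChecker on the result.
res, check_result = self.run_analyzer(client, "hello world", analyzer_params)
assert len(res.tokens) > 0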
@@ -368,3 +368,112 @@ class PhraseMatchTestGenerator:
            matched_docs.extend(doc_id)

        return matched_docs


class KoreanTextGenerator:
    def __init__(self):
        # Sports/Activities (Nouns)
        self.activities = [
            "수영", "축구", "농구", "테니스",
            "배구", "야구", "골프", "럭비",
            "달리기", "자전거", "스케이트", "스키",
            "서핑", "다이빙", "등산", "요가",
            "춤", "하이킹", "독서", "요리"
        ]

        # Verbs (Base Form)
        self.verbs = [
            "좋아하다", "즐기다", "하다", "배우다",
            "가르치다", "보다", "시작하다", "계속하다",
            "연습하다", "선호하다", "마스터하다", "도전하다"
        ]

        # Connectors
        self.connectors = [
            "그리고", "또는", "하지만", "그런데",
            "그래서", "또한", "게다가", "그러면서",
            "동시에", "함께"
        ]

        # Modifiers (Frequency/Degree)
        self.modifiers = [
            "매우", "자주", "가끔", "열심히",
            "전문적으로", "규칙적으로", "매일", "일주일에 한 번",
            "취미로", "진지하게"
        ]

    def conjugate_verb(self, verb):
        # Simple Korean verb conjugation (informal polite "-아/어요" style)
        if verb.endswith("하다"):
            return verb.replace("하다", "해요")
        elif verb.endswith("다"):
            return verb[:-1] + "아요"
        return verb

    def sentence(self):
        # Build basic sentence structure
        activity = random.choice(self.activities)
        verb = random.choice(self.verbs)
        modifier = random.choice(self.modifiers)

        # Conjugate verb
        conjugated_verb = self.conjugate_verb(verb)

        # Build sentence (Korean word order: Subject + Object + Modifier + Verb)
        sentence = f"저는 {activity}를/을 {modifier} {conjugated_verb}"

        # Randomly add connector and another activity
        if random.choice([True, False]):
            connector = random.choice(self.connectors)
            second_activity = random.choice(self.activities)
            second_verb = self.conjugate_verb(random.choice(self.verbs))
            sentence += f" {connector} {second_activity}도 {second_verb}"

        return sentence + "."

    def paragraph(self, num_sentences=3):
        return '\n'.join([self.sentence() for _ in range(num_sentences)])

    def text(self, num_sentences=5):
        return '\n'.join([self.sentence() for _ in range(num_sentences)])
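KoreanTextGenerator mirrors the small slice of the Faker interface these tests use (sentence(), paragraph(), and text()), so it can stand in for a Faker instance where no suitable Korean corpus is available. A minimal sketch of that interchangeable use, with a hypothetical pick_generator helper that mirrors the per-dictionary selection done in the tests:

from faker import Faker

from common.phrase_match_generator import KoreanTextGenerator


def pick_generator(dict_kind):
    # Hypothetical helper: same mapping the tests use per lindera dictionary.
    if dict_kind == "ko-dic":
        return KoreanTextGenerator()  # Faker-like: exposes sentence() and text()
    if dict_kind == "ipadic":
        return Faker("ja_JP")
    return Faker("zh_CN")  # cc-cedict


gen = pick_generator("ko-dic")
print(gen.sentence())  # e.g. "저는 수영를/을 매일 해요."
print(gen.text())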
def generate_text_by_analyzer(analyzer_params):
    """
    Generate text data based on the given analyzer parameters

    Args:
        analyzer_params: Dictionary containing the analyzer parameters

    Returns:
        str: Generated text data
    """
    if analyzer_params["tokenizer"] == "standard":
        fake = Faker("en_US")
    elif analyzer_params["tokenizer"] == "jieba":
        fake = Faker("zh_CN")
    elif analyzer_params["tokenizer"]["type"] == "lindera":
        # Generate random Japanese/Korean/Chinese text depending on the dictionary
        if analyzer_params["tokenizer"]["dict_kind"] == "ipadic":
            fake = Faker("ja_JP")
        elif analyzer_params["tokenizer"]["dict_kind"] == "ko-dic":
            fake = KoreanTextGenerator()
        elif analyzer_params["tokenizer"]["dict_kind"] == "cc-cedict":
            fake = Faker("zh_CN")
        else:
            raise ValueError("Invalid dict_kind")
    else:
        raise ValueError("Invalid analyzer parameters")

    text = fake.text()
    stop_words = []
    if "filter" in analyzer_params:
        for filter in analyzer_params["filter"]:
            if filter["type"] == "stop":
                stop_words.extend(filter["stop_words"])

    # Append the stop words so the stop filter has something to remove
    text += " " + " ".join(stop_words)
    return text
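For the lindera configurations, the helper pairs the text language with the dictionary and then appends every configured stop word, so a later analyzer run can prove the stop filter actually removes them. A short sketch of the Korean case (the stop words here are illustrative; the import path matches the one used by the new test file below):

from common.phrase_match_generator import generate_text_by_analyzer

params = {
    "tokenizer": {"type": "lindera", "dict_kind": "ko-dic"},
    "filter": [{"type": "stop", "stop_words": ["는", "를"]}],
}
text = generate_text_by_analyzer(params)
# Korean sentences from KoreanTextGenerator, with " 는 를" appended
# so a stop filter configured with those words has something to drop.
print(text)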
@@ -0,0 +1,61 @@
import pytest

from base.client_v2_base import TestMilvusClientV2Base
from common.common_type import CaseLabel
from common.phrase_match_generator import generate_text_by_analyzer


class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
    analyzer_params_list = [
        {
            "tokenizer": "standard",
            "filter": [
                {
                    "type": "stop",
                    "stop_words": ["is", "the", "this", "a", "an", "and", "or"],
                }
            ],
        },
        {
            "tokenizer": "jieba",
        },
        {
            "tokenizer": {"type": "lindera", "dict_kind": "ipadic"},
            "filter": [
                {
                    "type": "stop",
                    "stop_words": ["は", "が", "の", "に", "を", "で", "と", "た"],
                }
            ],
        },
        {"tokenizer": {"type": "lindera", "dict_kind": "ko-dic"}},
        {"tokenizer": {"type": "lindera", "dict_kind": "cc-cedict"}},
    ]

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("analyzer_params", analyzer_params_list)
    def test_analyzer(self, analyzer_params):
        """
        target: test analyzer
        method: use different analyzer params, then run analyzer to get the tokens
        expected: verify the tokens
        """
        client = self._client()
        text = generate_text_by_analyzer(analyzer_params)
        res, result = self.run_analyzer(client, text, analyzer_params)
        tokens = res.tokens
        # Check tokens are not empty
        assert len(tokens) > 0, "No tokens were generated"

        # Check tokens are related to the input text (every token should be a substring of the text)
        assert all(
            token.lower() in text.lower() for token in tokens
        ), "some of the tokens do not appear in the original text"

        if "filter" in analyzer_params:
            for filter in analyzer_params["filter"]:
                if filter["type"] == "stop":
                    stop_words = filter["stop_words"]
                    assert not any(
                        token in stop_words for token in tokens
                    ), "some of the tokens are stop words"
@@ -28,8 +28,8 @@ pytest-parallel
pytest-random-order

# pymilvus
-pymilvus==2.6.0rc79
-pymilvus[bulk_writer]==2.6.0rc79
+pymilvus==2.6.0rc81
+pymilvus[bulk_writer]==2.6.0rc81


# for customize config test
@@ -5,6 +5,7 @@ from utils.util_log import test_log as log
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from common.phrase_match_generator import KoreanTextGenerator
from common.code_mapping import ConnectionErrorMessage as cem
from base.client_base import TestcaseBase
from pymilvus.orm.types import CONSISTENCY_STRONG, CONSISTENCY_BOUNDED, CONSISTENCY_EVENTUALLY
@@ -29,6 +30,10 @@ Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
fake_de = Faker("de_DE")
fake_jp = Faker("ja_JP")
fake_ko = Faker("ko_KR")


# patch faker to generate text with specific distribution
cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
@@ -5734,6 +5739,79 @@ class TestQueryTextMatch(TestcaseBase):
        res, _ = collection_w.query(expr=expr, output_fields=["id", field])
        pytest.assume(len(res) == 0, f"res len {len(res)}, data size {data_size}")

    @pytest.mark.parametrize("dict_kind", ["ipadic", "ko-dic", "cc-cedict"])
    def test_query_text_match_with_Lindera_tokenizer(self, dict_kind):
        """
        target: test text match with lindera tokenizer
        method: 1. enable text match, use the lindera tokenizer, and insert varchar data in different languages
                2. get the most common words and query with text match
                3. verify the result
        expected: get the correct tokens, text match succeeds, and the result is correct
        """
        analyzer_params = {
            "tokenizer": {
                "type": "lindera",
                "dict_kind": dict_kind
            }
        }
        if dict_kind == "ipadic":
            fake = fake_jp
        elif dict_kind == "ko-dic":
            fake = KoreanTextGenerator()
        elif dict_kind == "cc-cedict":
            fake = fake_zh
        else:
            fake = fake_en
        dim = 128
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
            FieldSchema(
                name="sentence",
                dtype=DataType.VARCHAR,
                max_length=65535,
                enable_analyzer=True,
                enable_match=True,
                analyzer_params=analyzer_params,
            ),
            FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ]
        schema = CollectionSchema(fields=fields, description="test collection")
        data_size = 5000
        collection_w = self.init_collection_wrap(
            name=cf.gen_unique_str(prefix), schema=schema
        )
        data = [
            {
                "id": i,
                "sentence": fake.sentence(),
                "emb": [random.random() for _ in range(dim)],
            }
            for i in range(data_size)
        ]
        df = pd.DataFrame(data)
        log.info(f"dataframe\n{df}")
        batch_size = 5000
        for i in range(0, len(df), batch_size):
            collection_w.insert(
                data[i: i + batch_size]
                if i + batch_size < len(df)
                else data[i: len(df)]
            )
        collection_w.flush()
        collection_w.create_index(
            "emb",
            {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 64}},
        )
        collection_w.load()
        # analyze the corpus
        text_fields = ["sentence"]
        # query the sentence field with the first generated sentence
        for field in text_fields:
            match_text = df["sentence"].iloc[0]
            expr = f"text_match({field}, '{match_text}')"
            log.info(f"expr: {expr}")
            res, _ = collection_w.query(expr=expr, output_fields=["id", field])
            assert len(res) > 0

    @pytest.mark.tags(CaseLabel.L0)
    def test_query_text_match_with_combined_expression_for_single_field(self):