import logging
import time
from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from base.client_v2_base import TestMilvusClientV2Base
import pytest
from idx_ngram import NGRAM

index_type = "NGRAM"
success = "success"
pk_field_name = 'id'
vector_field_name = 'vector'
content_field_name = 'content_ngram'
json_field_name = 'json_field'
dim = 32
default_nb = 2000
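# min_gram / max_gram bound the lengths of the character n-grams the NGRAM index builds from field values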
default_build_params = {"min_gram": 2, "max_gram": 3}


class TestNgramBuildParams(TestMilvusClientV2Base):
    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("params", NGRAM.build_params)
    def test_ngram_build_params(self, params):
        """
        Test the build params of NGRAM index
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field(content_field_name, datatype=DataType.VARCHAR, max_length=100)

        # Check if this test case requires JSON field
        build_params = params.get("params", None)
        has_json_params = (build_params is not None and
                           ("json_path" in build_params or "json_cast_type" in build_params))

        target_field_name = content_field_name  # Default to VARCHAR field

        if has_json_params:
            # Add JSON field for JSON-related parameter tests
            schema.add_field(json_field_name, datatype=DataType.JSON)
            target_field_name = json_field_name

        self.create_collection(client, collection_name, schema=schema)

        # Insert test data
        nb = default_nb
        rows = cf.gen_row_data_by_schema(nb=nb, schema=schema, start=0)

        if has_json_params:
            # Generate JSON test data with varied content
            json_keywords = ["stadium", "park", "school", "library", "hospital", "restaurant", "office", "store"]
            for i, row in enumerate(rows):
                keyword_idx = i % len(json_keywords)
                keyword = json_keywords[keyword_idx]
                row[content_field_name] = f"text content {i}"  # Still provide VARCHAR data
                row[json_field_name] = {
                    "body": f"This is a {keyword} building",
                    "title": f"Location {i}",
                    "description": f"Description for {keyword} number {i}"
                }
        else:
            # Generate VARCHAR test data with varied content
            varchar_keywords = ["stadium", "park", "school", "library", "hospital", "restaurant", "office", "store"]
            for i, row in enumerate(rows):
                keyword_idx = i % len(varchar_keywords)
                keyword = varchar_keywords[keyword_idx]
                row[content_field_name] = f"The {keyword} is large and beautiful number {i}"

        # Insert data in batches for better performance
        batch_size = 1000
        for i in range(0, nb, batch_size):
            batch_rows = rows[i:i + batch_size]
            self.insert(client, collection_name, batch_rows)
        self.flush(client, collection_name)

        # Create index
        index_params = self.prepare_index_params(client)[0]
        index_name = cf.gen_str_by_length(10, letters_only=True)
        index_params.add_index(field_name=target_field_name,
                               index_name=index_name,
                               index_type=index_type,
                               params=build_params)

        # Build index
        if params.get("expected", None) != success:
            self.create_index(client, collection_name, index_params,
                              check_task=CheckTasks.err_res,
                              check_items=params.get("expected"))
        else:
            self.create_index(client, collection_name, index_params)
            self.wait_for_index_ready(client, collection_name, index_name=index_name)

            # Create vector index before loading collection
            vector_index_params = self.prepare_index_params(client)[0]
            vector_index_params.add_index(field_name=vector_field_name,
                                          metric_type=cf.get_default_metric_for_vector_type(
                                              vector_type=DataType.FLOAT_VECTOR),
                                          index_type="IVF_FLAT",
                                          params={"nlist": 128})
            self.create_index(client, collection_name, vector_index_params)
            self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)

            # Load collection
            self.load_collection(client, collection_name)

            # Test query based on field type
            if has_json_params:
                filter_expr = f"{json_field_name}['body'] LIKE \"%stadium%\""
            else:
                filter_expr = f'{content_field_name} LIKE "%stadium%"'

            # Calculate expected count: 2000 data points with 8 keywords cycling
            # Each keyword appears 2000/8 = 250 times
            expected_count = default_nb // 8  # 250 matches for "stadium"

            self.query(client, collection_name, filter=filter_expr,
                       output_fields=["count(*)"],
                       check_task=CheckTasks.check_query_results,
                       check_items={"enable_milvus_client_api": True,
                                    "count(*)": expected_count})

            # Verify the index params are persisted
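            # Note: json_path and json_cast_type are excluded from the check below, as describe_index may not return them verbatim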
            idx_info = client.describe_index(collection_name, index_name)
            if build_params is not None:
                for key, value in build_params.items():
                    if value is not None and key not in ["json_path", "json_cast_type"]:
                        assert key in idx_info.keys()
                        assert str(value) in idx_info.values()

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("scalar_field_type", ct.all_scalar_data_types)
    def test_ngram_on_all_scalar_fields(self, scalar_field_type):
        """
        Test NGRAM index on all scalar field types and verify proper error handling
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)

        # Add the scalar field with appropriate parameters
        if scalar_field_type == DataType.VARCHAR:
            schema.add_field("scalar_field", datatype=scalar_field_type, max_length=1000)
        elif scalar_field_type == DataType.ARRAY:
            schema.add_field("scalar_field", datatype=scalar_field_type,
                             element_type=DataType.VARCHAR, max_capacity=10, max_length=100)
        else:
            schema.add_field("scalar_field", datatype=scalar_field_type)

        self.create_collection(client, collection_name, schema=schema)

        # Generate appropriate test data for each field type
        nb = default_nb
        rows = cf.gen_row_data_by_schema(nb=nb, schema=schema, start=0)

        # Update scalar field with appropriate test data
        if scalar_field_type == DataType.VARCHAR:
            # Generate varied VARCHAR data for better testing
            keywords = ["stadium", "park", "school", "library", "hospital", "restaurant", "office", "store"]
            for i, row in enumerate(rows):
                keyword_idx = i % len(keywords)
                keyword = keywords[keyword_idx]
                row["scalar_field"] = f"The {keyword} is a large building number {i}"
        elif scalar_field_type == DataType.JSON:
            # Generate varied JSON data for better testing
            keywords = ["school", "park", "mall", "library", "hospital", "restaurant", "office", "store"]
            for i, row in enumerate(rows):
                keyword_idx = i % len(keywords)
                keyword = keywords[keyword_idx]
                row["scalar_field"] = {
                    "body": f"This is a {keyword}",
                    "title": f"Location {i}",
                    "category": f"Category {keyword_idx}"
                }
        elif scalar_field_type == DataType.ARRAY:
            # Generate varied ARRAY data for better testing
            base_words = ["word", "text", "data", "item", "element"]
            keywords = ["stadium", "park", "school", "library", "hospital"]
            for i, row in enumerate(rows):
                base_idx = i % len(base_words)
                keyword_idx = i % len(keywords)
                row["scalar_field"] = [f"{base_words[base_idx]}1", f"{base_words[base_idx]}2", keywords[keyword_idx]]
        # For other scalar types, keep the auto-generated data

        # Insert data in batches for better performance
        batch_size = 1000
        for i in range(0, nb, batch_size):
            batch_rows = rows[i:i + batch_size]
            self.insert(client, collection_name, batch_rows)
        self.flush(client, collection_name)

        # Create index
        index_name = cf.gen_str_by_length(10, letters_only=True)
        index_params = self.prepare_index_params(client)[0]
        if scalar_field_type == DataType.JSON:
            # JSON field requires json_path and json_cast_type
            index_params.add_index(field_name="scalar_field",
                                   index_name=index_name,
                                   index_type=index_type,
                                   params={
                                       "min_gram": 2,
                                       "max_gram": 3,
                                       "json_path": "scalar_field['body']",
                                       "json_cast_type": "varchar"
                                   })
        else:
            index_params.add_index(field_name="scalar_field",
                                   index_name=index_name,
                                   index_type=index_type,
                                   params=default_build_params)

        # Check if the field type is supported for NGRAM index
        if scalar_field_type not in NGRAM.supported_field_types:
            self.create_index(client, collection_name, index_params,
                              check_task=CheckTasks.err_res,
                              check_items={"err_code": 999,
                                           "err_msg": "ngram index can only be created on VARCHAR or JSON field"})
        else:
            self.create_index(client, collection_name, index_params)
            self.wait_for_index_ready(client, collection_name, index_name=index_name)

        # Create vector index before loading collection
        vector_index_params = self.prepare_index_params(client)[0]
        vector_index_params.add_index(field_name=vector_field_name,
                                      metric_type=cf.get_default_metric_for_vector_type(
                                          vector_type=DataType.FLOAT_VECTOR),
                                      index_type="IVF_FLAT",
                                      params={"nlist": 128})
        self.create_index(client, collection_name, vector_index_params)
        self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)

        self.load_collection(client, collection_name)

        # Test query for supported types
        if scalar_field_type == DataType.VARCHAR:
            # Calculate expected count: 2000 data points with 8 keywords cycling
            # Each keyword appears 2000/8 = 250 times
            expected_count = default_nb // 8  # 250 matches for "stadium"
            filter_expr = 'scalar_field LIKE "%stadium%"'
            self.query(client, collection_name, filter=filter_expr,
                       output_fields=["count(*)"],
                       check_task=CheckTasks.check_query_results,
                       check_items={"enable_milvus_client_api": True,
                                    "count(*)": expected_count})
        elif scalar_field_type == DataType.JSON:
            # Calculate expected count: 2000 data points with 8 keywords cycling
            # Each keyword appears 2000/8 = 250 times
            expected_count = default_nb // 8  # 250 matches for "school"
            filter_expr = "scalar_field['body'] LIKE \"%school%\""
            self.query(client, collection_name, filter=filter_expr,
                       output_fields=["count(*)"],
                       check_task=CheckTasks.check_query_results,
                       check_items={"enable_milvus_client_api": True,
                                    "count(*)": expected_count})

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.skip(reason="skip for issue #44164")
    def test_ngram_alter_index_mmap_and_gram_values(self):
        """
        Test altering mmap and gram-value properties of an NGRAM index
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field("content_ngram", datatype=DataType.VARCHAR, max_length=20)
        self.create_collection(client, collection_name, schema=schema)

        # Insert data
        content_keywords = ["stadium", "park", "school", "library", "hospital", "restaurant", "office", "store"]
        rows = cf.gen_row_data_by_schema(nb=default_nb, schema=schema, start=0)
        for i, row in enumerate(rows):
            row["content_ngram"] = content_keywords[i % len(content_keywords)]
        self.insert(client, collection_name, rows)
        self.flush(client, collection_name)

        # Create index
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(field_name="content_ngram",
                               index_name="content_ngram",
                               index_type=index_type,
                               params={"min_gram": 2, "max_gram": 3})
        index_params.add_index(field_name=vector_field_name,
                               index_type="IVF_FLAT",
                               metric_type="COSINE",
                               params={"nlist": 128})
        self.create_index(client, collection_name, index_params)
        self.wait_for_index_ready(client, collection_name, index_name="content_ngram")
        self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)
        self.load_collection(client, collection_name)
        # Query to check that the index was created
        res = self.query(client, collection_name, filter="content_ngram LIKE 'stad_%'",
                         output_fields=["id", "content_ngram"])[0]
        assert len(res) == default_nb // len(content_keywords)

        # Release collection before altering the ngram index
        self.release_collection(client, collection_name)
        # Alter index mmap properties
        self.alter_index_properties(client, collection_name, index_name="content_ngram", properties={"mmap.enabled": True})
        res = self.describe_index(client, collection_name, index_name="content_ngram")[0]
        assert res.get('mmap.enabled', None) == 'True'
        # Load the collection and query again
        self.load_collection(client, collection_name)
        res = self.query(client, collection_name, filter="content_ngram LIKE 'stad_%'",
                         output_fields=["id", "content_ngram"])[0]
        assert len(res) == default_nb // len(content_keywords)

        # Altering index gram value properties is not supported
        self.release_collection(client, collection_name)
        error = {ct.err_code: 1, ct.err_msg: "invalid mmap.enabled value: True, expected: true, false"}
        self.alter_index_properties(client, collection_name, index_name="content_ngram", properties={"min_gram": 3, "max_gram": 4},
                                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L2)
    def test_ngram_search_with_diff_length_of_filter_value(self):
        """
        Test NGRAM LIKE filtering with filter values of different lengths relative to min_gram and max_gram
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field("content_no_index", datatype=DataType.VARCHAR, max_length=10)
        schema.add_field("content_ngram", datatype=DataType.VARCHAR, max_length=10)

        self.create_collection(client, collection_name, schema=schema)

        # Insert test data
        insert_times = 2
        content_keywords = ["stadium", "park", "school", "library", "hospital", "restaurant", "office", "store"]
        for i in range(insert_times):
            rows = cf.gen_row_data_by_schema(nb=default_nb, schema=schema, start=i * default_nb)
            for j, row in enumerate(rows):
                row["content_no_index"] = content_keywords[j % len(content_keywords)]
                row["content_ngram"] = content_keywords[j % len(content_keywords)]
            self.insert(client, collection_name, rows)
        self.flush(client, collection_name)

        # Create vector index before loading collection
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(field_name=vector_field_name,
                               metric_type="COSINE",
                               index_type="IVF_FLAT",
                               params={"nlist": 128})
        min_gram = 2
        max_gram = 4
        index_params.add_index(field_name="content_ngram",
                               index_type=index_type,
                               params={"min_gram": min_gram, "max_gram": max_gram})
        self.create_index(client, collection_name, index_params)
        self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)
        self.wait_for_index_ready(client, collection_name, index_name="content_ngram")
        self.load_collection(client, collection_name)

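        # Each LIKE pattern below is evaluated on both the NGRAM-indexed field and the unindexed field;
        # the result sets must be identical, so the index may only accelerate filtering, never change results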
        # Test query 0: filter value length is less than min_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:min_gram - 1]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:min_gram - 1]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query 1: filter value length is equal to min_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:min_gram]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:min_gram]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query 2: filter value length is less than max_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:max_gram - 1]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:max_gram - 1]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query 3: filter value length is equal to max_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:max_gram]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:max_gram]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query 4: filter value length is greater than max_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:max_gram + 1]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:max_gram + 1]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query with suffix match
        filter_expr = f'content_ngram LIKE "%{content_keywords[0][4:]}"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "%{content_keywords[0][4:]}"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query with infix match
        filter_expr = f'content_ngram LIKE "%{content_keywords[0][2:4]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "%{content_keywords[0][2:4]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

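        # In LIKE patterns '_' matches exactly one character, so "%st_d_um%" is expected to match "stadium"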
        # Test query with mixed wildcard match
        filter_expr = 'content_ngram LIKE "%st_d_um%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = 'content_no_index LIKE "%st_d_um%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

    @pytest.mark.tags(CaseLabel.L2)
    def test_ngram_search_with_multilingual_utf8_strings(self):
        """
        Test NGRAM index with multilingual and UTF-8 strings for LIKE filtering
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field("content_no_index", datatype=DataType.JSON)
        schema.add_field("content_ngram", datatype=DataType.JSON)

        self.create_collection(client, collection_name, schema=schema)

        # Multilingual test data with various UTF-8 characters
        multilingual_keywords = [
            "北京大学",  # Chinese
            "東京大学",  # Japanese
            "Московский",  # Russian
            "café",  # French with accent
            "naïve",  # French with diaeresis
            "München",  # German with umlaut
            "🏫学校🎓",  # Chinese with emojis
            "🌟star⭐",  # English with emojis
            "مدرسة",  # Arabic
            "Γειά",  # Greek
            "प्रविष्टि",  # Hindi/Devanagari
            "한국어",  # Korean
            "español",  # Spanish
            "português",  # Portuguese
            "中英mix英文",  # Mixed Chinese-English
            "café☕北京🏙️"  # Mixed with emojis and multiple languages
        ]

        # Insert test data
        insert_times = 2
        total_records = insert_times * default_nb
        for i in range(insert_times):
            rows = cf.gen_row_data_by_schema(nb=default_nb, schema=schema, start=i * default_nb)
            for j, row in enumerate(rows):
                keyword_idx = j % len(multilingual_keywords)
                keyword = multilingual_keywords[keyword_idx]
                row["content_no_index"] = {
                    "body": f"This is a {keyword} building",
                    "title": f"Location {i}",
                    "description": f"Description for {keyword} number {i}"
                }
                row["content_ngram"] = {
                    "body": f"This is a {keyword} building",
                    "title": f"Location {i}",
                    "description": f"Description for {keyword} number {i}"
                }
            self.insert(client, collection_name, rows)
        self.flush(client, collection_name)

        # Create vector index before loading collection
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(field_name=vector_field_name,
                               metric_type="COSINE",
                               index_type="IVF_FLAT",
                               params={"nlist": 128})

        # Create NGRAM index with appropriate parameters for multilingual content
        min_gram = 1  # Use 1 for better multilingual support
        max_gram = 3
        index_params.add_index(field_name="content_ngram",
                               index_name="content_ngram",
                               index_type=index_type,
                               params={"min_gram": min_gram, "max_gram": max_gram,
                                       "json_path": "content_ngram['body']", "json_cast_type": "varchar"})
        self.create_index(client, collection_name, index_params)
        self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)
        self.wait_for_index_ready(client, collection_name, index_name="content_ngram")
        self.load_collection(client, collection_name)

        expected_count_per_keyword = total_records // len(multilingual_keywords)

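        # Each check below runs the same LIKE filter on the indexed JSON path and the unindexed JSON field
        # and requires identical results, covering multi-byte UTF-8 patterns from different scripts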
        # Test 1: Chinese character search
        chinese_keyword = "北京"
        filter_expr = f'content_ngram["body"] LIKE "%{chinese_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{chinese_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 2: Japanese character search
        japanese_keyword = "東京"
        filter_expr = f'content_ngram["body"] LIKE "%{japanese_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{japanese_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 3: Russian Cyrillic character search
        russian_keyword = "Моск"
        filter_expr = f'content_ngram["body"] LIKE "%{russian_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{russian_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 4: French accent character search
        french_keyword = "café"
        filter_expr = f'content_ngram["body"] LIKE "%{french_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{french_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 5: Emoji character search
        emoji_keyword = "🏫"
        filter_expr = f'content_ngram["body"] LIKE "%{emoji_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{emoji_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 6: Star emoji search
        star_keyword = "⭐"
        filter_expr = f'content_ngram["body"] LIKE "%{star_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{star_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 7: Arabic character search
        arabic_keyword = "مدرسة"
        filter_expr = f'content_ngram["body"] LIKE "%{arabic_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{arabic_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 8: Korean character search
        korean_keyword = "한국"
        filter_expr = f'content_ngram["body"] LIKE "%{korean_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{korean_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 9: German umlaut character search
        german_keyword = "München"
        filter_expr = f'content_ngram["body"] LIKE "%{german_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{german_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 10: Mixed language search
        mixed_keyword = "mix"
        filter_expr = f'content_ngram["body"] LIKE "%{mixed_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{mixed_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 11: Complex multilingual keyword with emoji search
        complex_keyword = "café☕"
        filter_expr = f'content_ngram["body"] LIKE "%{complex_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{complex_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 12: Hindi/Devanagari character search
        hindi_keyword = "प्रविष्टि"
        filter_expr = f'content_ngram["body"] LIKE "%{hindi_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{hindi_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 13: Greek character search
        greek_keyword = "Γειά"
        filter_expr = f'content_ngram["body"] LIKE "%{greek_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{greek_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 14: Portuguese accented character search
        portuguese_keyword = "português"
        filter_expr = f'content_ngram["body"] LIKE "%{portuguese_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{portuguese_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 15: Single character search (especially important for CJK)
        single_char_keyword = "学"
        filter_expr = f'content_ngram["body"] LIKE "%{single_char_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{single_char_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        # Should match "北京大学", "東京大学", and "🏫学校🎓"
        assert len(res_ngram) >= expected_count_per_keyword * 2
        assert res_ngram == res_no_index