import logging
import time

import pytest

from utils.util_pymilvus import *
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from base.client_v2_base import TestMilvusClientV2Base
from idx_ngram import NGRAM

index_type = "NGRAM"
success = "success"

pk_field_name = 'id'
vector_field_name = 'vector'
content_field_name = 'content_ngram'
json_field_name = 'json_field'
dim = 32
default_nb = 2000
default_build_params = {"min_gram": 2, "max_gram": 3}


class TestNgramBuildParams(TestMilvusClientV2Base):

    @pytest.mark.tags(CaseLabel.L1)
    @pytest.mark.parametrize("params", NGRAM.build_params)
    def test_ngram_build_params(self, params):
        """
        Test the build params of NGRAM index
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field(content_field_name, datatype=DataType.VARCHAR, max_length=100)

        # Check if this test case requires a JSON field
        build_params = params.get("params", None)
        has_json_params = (build_params is not None
                           and ("json_path" in build_params or "json_cast_type" in build_params))
        target_field_name = content_field_name  # Default to the VARCHAR field
        if has_json_params:
            # Add a JSON field for JSON-related parameter tests
            schema.add_field(json_field_name, datatype=DataType.JSON)
            target_field_name = json_field_name

        self.create_collection(client, collection_name, schema=schema)

        # Insert test data
        nb = default_nb
        rows = cf.gen_row_data_by_schema(nb=nb, schema=schema, start=0)
        if has_json_params:
            # Generate JSON test data with varied content
            json_keywords = ["stadium", "park", "school", "library",
                             "hospital", "restaurant", "office", "store"]
            for i, row in enumerate(rows):
                keyword_idx = i % len(json_keywords)
                keyword = json_keywords[keyword_idx]
                row[content_field_name] = f"text content {i}"  # Still provide VARCHAR data
                row[json_field_name] = {
                    "body": f"This is a {keyword} building",
                    "title": f"Location {i}",
                    "description": f"Description for {keyword} number {i}"
                }
        else:
            # Generate VARCHAR test data with varied content
            varchar_keywords = ["stadium", "park", "school", "library",
                                "hospital", "restaurant", "office", "store"]
            for i, row in enumerate(rows):
                keyword_idx = i % len(varchar_keywords)
                keyword = varchar_keywords[keyword_idx]
                row[content_field_name] = f"The {keyword} is large and beautiful number {i}"

        # Insert data in batches for better performance
        batch_size = 1000
        for i in range(0, nb, batch_size):
            batch_rows = rows[i:i + batch_size]
            self.insert(client, collection_name, batch_rows)
        self.flush(client, collection_name)

        # Create index
        index_params = self.prepare_index_params(client)[0]
        index_name = cf.gen_str_by_length(10, letters_only=True)
        index_params.add_index(field_name=target_field_name, index_name=index_name,
                               index_type=index_type, params=build_params)

        # Build index
        if params.get("expected", None) != success:
            self.create_index(client, collection_name, index_params,
                              check_task=CheckTasks.err_res,
                              check_items=params.get("expected"))
        else:
            self.create_index(client, collection_name, index_params)
            self.wait_for_index_ready(client, collection_name, index_name=index_name)
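
            # Note: Milvus requires an index on every vector field before a collection can be
            # loaded, so the IVF_FLAT index below exists only to satisfy that requirement and is
            # not itself part of what this case verifies.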
            # Create vector index before loading collection
            vector_index_params = self.prepare_index_params(client)[0]
            vector_index_params.add_index(field_name=vector_field_name,
                                          metric_type=cf.get_default_metric_for_vector_type(
                                              vector_type=DataType.FLOAT_VECTOR),
                                          index_type="IVF_FLAT", params={"nlist": 128})
            self.create_index(client, collection_name, vector_index_params)
            self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)

            # Load collection
            self.load_collection(client, collection_name)

            # Test query based on field type
            if has_json_params:
                filter_expr = f"{json_field_name}['body'] LIKE \"%stadium%\""
            else:
                filter_expr = f'{content_field_name} LIKE "%stadium%"'

            # Calculate expected count: 2000 data points with 8 keywords cycling
            # Each keyword appears 2000/8 = 250 times
            expected_count = default_nb // 8  # 250 matches for "stadium"
            self.query(client, collection_name, filter=filter_expr,
                       output_fields=["count(*)"],
                       check_task=CheckTasks.check_query_results,
                       check_items={"enable_milvus_client_api": True, "count(*)": expected_count})

            # Verify the index params are persisted
            idx_info = client.describe_index(collection_name, index_name)
            if build_params is not None:
                for key, value in build_params.items():
                    if value is not None and key not in ["json_path", "json_cast_type"]:
                        assert key in idx_info.keys()
                        assert str(value) in idx_info.values()

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.parametrize("scalar_field_type", ct.all_scalar_data_types)
    def test_ngram_on_all_scalar_fields(self, scalar_field_type):
        """
        Test NGRAM index on all scalar field types and verify proper error handling
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)

        # Add the scalar field with appropriate parameters
        if scalar_field_type == DataType.VARCHAR:
            schema.add_field("scalar_field", datatype=scalar_field_type, max_length=1000)
        elif scalar_field_type == DataType.ARRAY:
            schema.add_field("scalar_field", datatype=scalar_field_type,
                             element_type=DataType.VARCHAR, max_capacity=10, max_length=100)
        else:
            schema.add_field("scalar_field", datatype=scalar_field_type)
        self.create_collection(client, collection_name, schema=schema)

        # Generate appropriate test data for each field type
        nb = default_nb
        rows = cf.gen_row_data_by_schema(nb=nb, schema=schema, start=0)

        # Update scalar field with appropriate test data
        if scalar_field_type == DataType.VARCHAR:
            # Generate varied VARCHAR data for better testing
            keywords = ["stadium", "park", "school", "library",
                        "hospital", "restaurant", "office", "store"]
            for i, row in enumerate(rows):
                keyword_idx = i % len(keywords)
                keyword = keywords[keyword_idx]
                row["scalar_field"] = f"The {keyword} is a large building number {i}"
        elif scalar_field_type == DataType.JSON:
            # Generate varied JSON data for better testing
            keywords = ["school", "park", "mall", "library",
                        "hospital", "restaurant", "office", "store"]
            for i, row in enumerate(rows):
                keyword_idx = i % len(keywords)
                keyword = keywords[keyword_idx]
                row["scalar_field"] = {
                    "body": f"This is a {keyword}",
                    "title": f"Location {i}",
                    "category": f"Category {keyword_idx}"
                }
        elif scalar_field_type == DataType.ARRAY:
            # Generate varied ARRAY data for better testing
            base_words = ["word", "text", "data", "item", "element"]
            keywords = ["stadium", "park", "school", "library", "hospital"]
            for i, row in enumerate(rows):
                base_idx = i % len(base_words)
                keyword_idx = i % len(keywords)
                row["scalar_field"] = [f"{base_words[base_idx]}1", f"{base_words[base_idx]}2",
                                       keywords[keyword_idx]]
        # For other scalar types, keep the auto-generated data
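
        # Only the VARCHAR and JSON rows get keyword-bearing values above; the remaining scalar
        # types keep auto-generated data because NGRAM index creation on them is expected to be
        # rejected (verified below), so their content is never queried.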

        # Insert data in batches for better performance
        batch_size = 1000
        for i in range(0, nb, batch_size):
            batch_rows = rows[i:i + batch_size]
            self.insert(client, collection_name, batch_rows)
        self.flush(client, collection_name)

        # Create index
        index_name = cf.gen_str_by_length(10, letters_only=True)
        index_params = self.prepare_index_params(client)[0]
        if scalar_field_type == DataType.JSON:
            # JSON field requires json_path and json_cast_type
            index_params.add_index(field_name="scalar_field", index_name=index_name,
                                   index_type=index_type,
                                   params={
                                       "min_gram": 2,
                                       "max_gram": 3,
                                       "json_path": "scalar_field['body']",
                                       "json_cast_type": "varchar"
                                   })
        else:
            index_params.add_index(field_name="scalar_field", index_name=index_name,
                                   index_type=index_type, params=default_build_params)

        # Check if the field type is supported for NGRAM index
        if scalar_field_type not in NGRAM.supported_field_types:
            self.create_index(client, collection_name, index_params,
                              check_task=CheckTasks.err_res,
                              check_items={"err_code": 999,
                                           "err_msg": "ngram index can only be created on VARCHAR or JSON field"})
        else:
            self.create_index(client, collection_name, index_params)
            self.wait_for_index_ready(client, collection_name, index_name=index_name)

            # Create vector index before loading collection
            vector_index_params = self.prepare_index_params(client)[0]
            vector_index_params.add_index(field_name=vector_field_name,
                                          metric_type=cf.get_default_metric_for_vector_type(
                                              vector_type=DataType.FLOAT_VECTOR),
                                          index_type="IVF_FLAT", params={"nlist": 128})
            self.create_index(client, collection_name, vector_index_params)
            self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)
            self.load_collection(client, collection_name)

            # Test query for supported types
            if scalar_field_type == DataType.VARCHAR:
                # Calculate expected count: 2000 data points with 8 keywords cycling
                # Each keyword appears 2000/8 = 250 times
                expected_count = default_nb // 8  # 250 matches for "stadium"
                filter_expr = 'scalar_field LIKE "%stadium%"'
                self.query(client, collection_name, filter=filter_expr,
                           output_fields=["count(*)"],
                           check_task=CheckTasks.check_query_results,
                           check_items={"enable_milvus_client_api": True, "count(*)": expected_count})
            elif scalar_field_type == DataType.JSON:
                # Calculate expected count: 2000 data points with 8 keywords cycling
                # Each keyword appears 2000/8 = 250 times
                expected_count = default_nb // 8  # 250 matches for "school"
                filter_expr = "scalar_field['body'] LIKE \"%school%\""
                self.query(client, collection_name, filter=filter_expr,
                           output_fields=["count(*)"],
                           check_task=CheckTasks.check_query_results,
                           check_items={"enable_milvus_client_api": True, "count(*)": expected_count})

    @pytest.mark.tags(CaseLabel.L2)
    @pytest.mark.skip(reason="skip for issue #44164")
    def test_ngram_alter_index_mmap_and_gram_values(self):
        """
        Test altering NGRAM index mmap properties and gram values
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field("content_ngram", datatype=DataType.VARCHAR, max_length=20)
        self.create_collection(client, collection_name, schema=schema)

        # Insert data
        content_keywords = ["stadium", "park", "school", "library",
                            "hospital", "restaurant", "office", "store"]
        rows = cf.gen_row_data_by_schema(nb=default_nb, schema=schema, start=0)
        for i, row in enumerate(rows):
            row["content_ngram"] = content_keywords[i % len(content_keywords)]

        self.insert(client, collection_name, rows)
        self.flush(client, collection_name)

        # Create index
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(field_name="content_ngram", index_name="content_ngram",
                               index_type=index_type, params={"min_gram": 2, "max_gram": 3})
        index_params.add_index(field_name=vector_field_name, index_type="IVF_FLAT",
                               metric_type="COSINE", params={"nlist": 128})
        self.create_index(client, collection_name, index_params)
        self.wait_for_index_ready(client, collection_name, index_name="content_ngram")
        self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)
        self.load_collection(client, collection_name)

        # Query to check that the index is usable
        res = self.query(client, collection_name, filter="content_ngram LIKE 'stad_%'",
                         output_fields=["id", "content_ngram"])[0]
        assert len(res) == default_nb // len(content_keywords)

        # Release the collection before altering the ngram index
        self.release_collection(client, collection_name)

        # Alter index mmap properties
        self.alter_index_properties(client, collection_name, index_name="content_ngram",
                                    properties={"mmap.enabled": True})
        res = self.describe_index(client, collection_name, index_name="content_ngram")[0]
        assert res.get('mmap.enabled', None) == 'True'

        # Load the collection and query again
        self.load_collection(client, collection_name)
        res = self.query(client, collection_name, filter="content_ngram LIKE 'stad_%'",
                         output_fields=["id", "content_ngram"])[0]
        assert len(res) == default_nb // len(content_keywords)

        # Altering the gram value properties is not supported
        self.release_collection(client, collection_name)
        error = {ct.err_code: 1, ct.err_msg: "invalid mmap.enabled value: True, expected: true, false"}
        self.alter_index_properties(client, collection_name, index_name="content_ngram",
                                    properties={"min_gram": 3, "max_gram": 4},
                                    check_task=CheckTasks.err_res, check_items=error)

    @pytest.mark.tags(CaseLabel.L2)
    def test_ngram_search_with_diff_length_of_filter_value(self):
        """
        Test NGRAM index LIKE filtering with filter values of different lengths
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field("content_no_index", datatype=DataType.VARCHAR, max_length=10)
        schema.add_field("content_ngram", datatype=DataType.VARCHAR, max_length=10)
        self.create_collection(client, collection_name, schema=schema)

        # Insert test data
        insert_times = 2
        content_keywords = ["stadium", "park", "school", "library",
                            "hospital", "restaurant", "office", "store"]
        for i in range(insert_times):
            rows = cf.gen_row_data_by_schema(nb=default_nb, schema=schema, start=i * default_nb)
            for j, row in enumerate(rows):
                row["content_no_index"] = content_keywords[j % len(content_keywords)]
                row["content_ngram"] = content_keywords[j % len(content_keywords)]
            self.insert(client, collection_name, rows)
        self.flush(client, collection_name)

        # Create vector index before loading collection
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(field_name=vector_field_name, metric_type="COSINE",
                               index_type="IVF_FLAT", params={"nlist": 128})
        min_gram = 2
        max_gram = 4
        index_params.add_index(field_name="content_ngram", index_type=index_type,
                               params={"min_gram": min_gram, "max_gram": max_gram})
        self.create_index(client, collection_name, index_params)
        self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)
        self.wait_for_index_ready(client, collection_name, index_name="content_ngram")
        self.load_collection(client, collection_name)
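
        # The queries below sweep the LIKE literal across lengths around [min_gram, max_gram].
        # An n-gram index stores substrings of length min_gram..max_gram, so a literal shorter
        # than min_gram presumably cannot be answered from the index alone; in every case the
        # results on the indexed column must equal a brute-force scan of the unindexed twin column.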

        # Test query 0: filter value length is less than min_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:min_gram - 1]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:min_gram - 1]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query 1: filter value length is equal to min_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:min_gram]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:min_gram]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query 2: filter value length is less than max_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:max_gram - 1]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:max_gram - 1]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query 3: filter value length is equal to max_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:max_gram]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:max_gram]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query 4: filter value length is greater than max_gram
        filter_expr = f'content_ngram LIKE "{content_keywords[0][:max_gram + 1]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "{content_keywords[0][:max_gram + 1]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index
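
        # Suffix ("%xyz") and infix ("%xyz%") patterns should also be able to use the ngram index,
        # since n-grams are extracted from every position of the stored strings rather than only
        # from the prefix; the same indexed-vs-unindexed comparison is applied below.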

        # Test query with suffix match
        filter_expr = f'content_ngram LIKE "%{content_keywords[0][4:]}"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "%{content_keywords[0][4:]}"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query with infix match
        filter_expr = f'content_ngram LIKE "%{content_keywords[0][2:4]}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = f'content_no_index LIKE "%{content_keywords[0][2:4]}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

        # Test query with mixed wildcard match
        filter_expr = 'content_ngram LIKE "%st_d_um%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= insert_times * default_nb // len(content_keywords)
        filter_expr = 'content_no_index LIKE "%st_d_um%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_no_index) >= insert_times * default_nb // len(content_keywords)
        assert res_ngram == res_no_index

    @pytest.mark.tags(CaseLabel.L2)
    def test_ngram_search_with_multilingual_utf8_strings(self):
        """
        Test NGRAM index with multilingual and UTF-8 strings for LIKE filtering
        """
        client = self._client()
        collection_name = cf.gen_collection_name_by_testcase_name()
        schema, _ = self.create_schema(client)
        schema.add_field(pk_field_name, datatype=DataType.INT64, is_primary=True, auto_id=False)
        schema.add_field(vector_field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
        schema.add_field("content_no_index", datatype=DataType.JSON)
        schema.add_field("content_ngram", datatype=DataType.JSON)
        self.create_collection(client, collection_name, schema=schema)

        # Multilingual test data with various UTF-8 characters
        multilingual_keywords = [
            "北京大学",       # Chinese
            "東京大学",       # Japanese
            "Московский",     # Russian
            "café",           # French with accent
            "naïve",          # French with diaeresis
            "München",        # German with umlaut
            "🏫学校🎓",       # Chinese with emojis
            "🌟star⭐",       # English with emojis
            "مدرسة",          # Arabic
            "Γειά",           # Greek
            "प्रविष्टि",          # Hindi/Devanagari
            "한국어",         # Korean
            "español",        # Spanish
            "português",      # Portuguese
            "中英mix英文",    # Mixed Chinese-English
            "café☕北京🏙️"    # Mixed with emojis and multiple languages
        ]

        # Insert test data
        insert_times = 2
        total_records = insert_times * default_nb
        for i in range(insert_times):
            rows = cf.gen_row_data_by_schema(nb=default_nb, schema=schema, start=i * default_nb)
            for j, row in enumerate(rows):
                keyword_idx = j % len(multilingual_keywords)
                keyword = multilingual_keywords[keyword_idx]
                row["content_no_index"] = {
                    "body": f"This is a {keyword} building",
                    "title": f"Location {i}",
                    "description": f"Description for {keyword} number {i}"
                }
                row["content_ngram"] = {
                    "body": f"This is a {keyword} building",
                    "title": f"Location {i}",
                    "description": f"Description for {keyword} number {i}"
                }
            self.insert(client, collection_name, rows)
        self.flush(client, collection_name)

        # Create vector index before loading collection
        index_params = self.prepare_index_params(client)[0]
        index_params.add_index(field_name=vector_field_name, metric_type="COSINE",
                               index_type="IVF_FLAT", params={"nlist": 128})
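
        # A single CJK character or emoji is one character but several UTF-8 bytes; min_gram is
        # set to 1 below (presumably counted in characters) so that single-character lookups such
        # as "学" are not shorter than the smallest indexed gram.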
        # Create NGRAM index with appropriate parameters for multilingual content
        min_gram = 1  # Use 1 for better multilingual support
        max_gram = 3
        index_params.add_index(field_name="content_ngram", index_name="content_ngram",
                               index_type=index_type,
                               params={"min_gram": min_gram, "max_gram": max_gram,
                                       "json_path": "content_ngram['body']",
                                       "json_cast_type": "varchar"})
        self.create_index(client, collection_name, index_params)
        self.wait_for_index_ready(client, collection_name, index_name=vector_field_name)
        self.wait_for_index_ready(client, collection_name, index_name="content_ngram")
        self.load_collection(client, collection_name)

        expected_count_per_keyword = total_records // len(multilingual_keywords)

        # Test 1: Chinese character search
        chinese_keyword = "北京"
        filter_expr = f'content_ngram["body"] LIKE "%{chinese_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{chinese_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 2: Japanese character search
        japanese_keyword = "東京"
        filter_expr = f'content_ngram["body"] LIKE "%{japanese_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{japanese_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 3: Russian Cyrillic character search
        russian_keyword = "Моск"
        filter_expr = f'content_ngram["body"] LIKE "%{russian_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{russian_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 4: French accent character search
        french_keyword = "café"
        filter_expr = f'content_ngram["body"] LIKE "%{french_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{french_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 5: Emoji character search
        emoji_keyword = "🏫"
        filter_expr = f'content_ngram["body"] LIKE "%{emoji_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{emoji_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index
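
        # Emoji such as 🏫 are multi-byte UTF-8 sequences (some lie outside the Basic Multilingual
        # Plane), so the emoji cases are a useful stress test for multi-byte handling in the
        # n-gram extraction.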

        # Test 6: Star emoji search
        star_keyword = "⭐"
        filter_expr = f'content_ngram["body"] LIKE "%{star_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{star_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 7: Arabic character search
        arabic_keyword = "مدرسة"
        filter_expr = f'content_ngram["body"] LIKE "%{arabic_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{arabic_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 8: Korean character search
        korean_keyword = "한국"
        filter_expr = f'content_ngram["body"] LIKE "%{korean_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{korean_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 9: German umlaut character search
        german_keyword = "München"
        filter_expr = f'content_ngram["body"] LIKE "%{german_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{german_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 10: Mixed language search
        mixed_keyword = "mix"
        filter_expr = f'content_ngram["body"] LIKE "%{mixed_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{mixed_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 11: Complex multilingual with emojis search
        complex_keyword = "café☕"
        filter_expr = f'content_ngram["body"] LIKE "%{complex_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{complex_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index

        # Test 12: Hindi/Devanagari character search
        hindi_keyword = "प्रविष्टि"
        filter_expr = f'content_ngram["body"] LIKE "%{hindi_keyword}%"'
        res_ngram = self.query(client, collection_name, filter=filter_expr,
                               output_fields=["id", "content_ngram"])[0]
        filter_expr = f'content_no_index["body"] LIKE "%{hindi_keyword}%"'
        res_no_index = self.query(client, collection_name, filter=filter_expr,
                                  output_fields=["id", "content_ngram"])[0]
        assert len(res_ngram) >= expected_count_per_keyword
        assert res_ngram == res_no_index
output_fields=["id", "content_ngram"])[0] assert len(res_ngram) >= expected_count_per_keyword assert res_ngram == res_no_index # Test 14: Portuguese with tilde character search portuguese_keyword = "português" filter_expr = f'content_ngram["body"] LIKE "%{portuguese_keyword}%"' res_ngram = self.query(client, collection_name, filter=filter_expr, output_fields=["id", "content_ngram"])[0] filter_expr = f'content_no_index["body"] LIKE "%{portuguese_keyword}%"' res_no_index = self.query(client, collection_name, filter=filter_expr, output_fields=["id", "content_ngram"])[0] assert len(res_ngram) >= expected_count_per_keyword assert res_ngram == res_no_index # Test 15: Test single character search (especially important for CJK) single_char_keyword = "学" filter_expr = f'content_ngram["body"] LIKE "%{single_char_keyword}%"' res_ngram = self.query(client, collection_name, filter=filter_expr, output_fields=["id", "content_ngram"])[0] filter_expr = f'content_no_index["body"] LIKE "%{single_char_keyword}%"' res_no_index = self.query(client, collection_name, filter=filter_expr, output_fields=["id", "content_ngram"])[0] # Should match both "北京大学" and "🏫学校🎓" assert len(res_ngram) >= expected_count_per_keyword * 2 assert res_ngram == res_no_index