diff --git a/tests/python_client/base/client_base.py b/tests/python_client/base/client_base.py
index e0f79ebc10..b1f5952cbc 100644
--- a/tests/python_client/base/client_base.py
+++ b/tests/python_client/base/client_base.py
@@ -138,6 +138,7 @@ class TestcaseBase(Base):
     Additional methods;
     Public methods that can be used for test cases.
     """
+    client = None
 
     def _connect(self, enable_milvus_client_api=False):
        """ Add a connection and create the connect """
@@ -152,6 +153,7 @@ class TestcaseBase(Base):
             self.connection_wrap.connect(alias=DefaultConfig.DEFAULT_USING,uri=uri,token=cf.param_info.param_token)
             res, is_succ = self.connection_wrap.MilvusClient(uri=uri, token=cf.param_info.param_token)
+            self.client = MilvusClient(uri=uri, token=cf.param_info.param_token)
         else:
             if cf.param_info.param_user and cf.param_info.param_password:
                 res, is_succ = self.connection_wrap.connect(alias=DefaultConfig.DEFAULT_USING,
@@ -165,6 +167,8 @@ class TestcaseBase(Base):
                                                             host=cf.param_info.param_host,
                                                             port=cf.param_info.param_port)
+        uri = "http://" + cf.param_info.param_host + ":" + str(cf.param_info.param_port)
+        self.client = MilvusClient(uri=uri, token=cf.param_info.param_token)
         server_version = utility.get_server_version()
         log.info(f"server version: {server_version}")
         return res
@@ -183,7 +187,7 @@ class TestcaseBase(Base):
         res = client.run_analyzer(text, analyzer_params, with_detail=True, with_hash=True)
         tokens = [r['token'] for r in res.tokens]
         return tokens
-
+
     # def init_async_milvus_client(self):
     #     uri = cf.param_info.param_uri or f"http://{cf.param_info.param_host}:{cf.param_info.param_port}"
diff --git a/tests/python_client/milvus_client/test_milvus_client_analyzer.py b/tests/python_client/milvus_client/test_milvus_client_analyzer.py
index 3b0faa7805..60885f96e8 100644
--- a/tests/python_client/milvus_client/test_milvus_client_analyzer.py
+++ b/tests/python_client/milvus_client/test_milvus_client_analyzer.py
@@ -18,6 +18,12 @@ class TestMilvusClientAnalyzer(TestMilvusClientV2Base):
         },
         {
             "tokenizer": "jieba",
+            "filter": [
+                {
+                    "type": "stop",
+                    "stop_words": ["is", "the", "this", "a", "an", "and", "or", "是", "的", "这", "一个", "和", "或"],
+                }
+            ],
         },
         {
             "tokenizer": "icu"
diff --git a/tests/python_client/testcases/test_full_text_search.py b/tests/python_client/testcases/test_full_text_search.py
index f966d6b55d..31f3ee0858 100644
--- a/tests/python_client/testcases/test_full_text_search.py
+++ b/tests/python_client/testcases/test_full_text_search.py
@@ -1,5 +1,13 @@
+import json
+
 from pymilvus import (
-    FieldSchema, CollectionSchema, DataType, Function, FunctionType, AnnSearchRequest, WeightedRanker
+    FieldSchema,
+    CollectionSchema,
+    DataType,
+    Function,
+    FunctionType,
+    AnnSearchRequest,
+    WeightedRanker,
 )
 from common.common_type import CaseLabel, CheckTasks
 from common import common_func as cf
@@ -15,6 +23,8 @@ from faker import Faker
 Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
+fake_jp = Faker("ja_JP")
+fake_de = Faker("de_DE")
 
 # patch faker to generate text with specific distribution
 cf.patch_faker_text(fake_en, cf.en_vocabularies_distribution)
@@ -77,7 +87,9 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase):
             ),
             FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
             FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
-            FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
+            FieldSchema(
+                name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR
+            ),
         ]
         schema = CollectionSchema(fields=fields, description="test collection")
description="test collection") text_fields = ["text", "paragraph"] @@ -98,7 +110,9 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase): @pytest.mark.tags(CaseLabel.L0) @pytest.mark.parametrize("tokenizer", ["standard"]) - def test_create_collection_for_full_text_search_twice_with_same_schema(self, tokenizer): + def test_create_collection_for_full_text_search_twice_with_same_schema( + self, tokenizer + ): """ target: test create collection with full text search twice with same schema method: create collection with full text search, use bm25 function, then create again @@ -141,7 +155,9 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase): ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR), - FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR), + FieldSchema( + name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR + ), ] schema = CollectionSchema(fields=fields, description="test collection") text_fields = ["text", "paragraph"] @@ -155,12 +171,8 @@ class TestCreateCollectionWIthFullTextSearch(TestcaseBase): ) schema.add_function(bm25_function) c_name = cf.gen_unique_str(prefix) - self.init_collection_wrap( - name=c_name, schema=schema - ) - collection_w = self.init_collection_wrap( - name=c_name, schema=schema - ) + self.init_collection_wrap(name=c_name, schema=schema) + collection_w = self.init_collection_wrap(name=c_name, schema=schema) res, _ = collection_w.describe() assert len(res["functions"]) == len(text_fields) @@ -176,7 +188,9 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase): @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("tokenizer", ["unsupported"]) @pytest.mark.skip(reason="check not implement may cause panic") - def test_create_collection_for_full_text_search_with_unsupported_tokenizer(self, tokenizer): + def test_create_collection_for_full_text_search_with_unsupported_tokenizer( + self, tokenizer + ): """ target: test create collection with full text search with unsupported tokenizer method: create collection with full text search, use bm25 function and unsupported tokenizer @@ -219,7 +233,9 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase): ), FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim), FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR), - FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR), + FieldSchema( + name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR + ), ] schema = CollectionSchema(fields=fields, description="test collection") text_fields = ["text", "paragraph"] @@ -237,12 +253,16 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase): ) res, result = collection_w.describe() log.info(f"collection describe {res}") - assert not result, "create collection with unsupported tokenizer should be failed" + assert not result, ( + "create collection with unsupported tokenizer should be failed" + ) @pytest.mark.tags(CaseLabel.L2) @pytest.mark.parametrize("valid_output", [True, False]) @pytest.mark.parametrize("valid_input", [True, False]) - def test_create_collection_for_full_text_search_with_invalid_input_output(self, valid_output, valid_input): + def test_create_collection_for_full_text_search_with_invalid_input_output( + self, valid_output, valid_input + ): """ target: test create collection with full text search with invalid input/output in bm25 function method: create collection with full text 
@@ -285,7 +305,9 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
             ),
             FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
             FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
-            FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
+            FieldSchema(
+                name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR
+            ),
         ]
         schema = CollectionSchema(fields=fields, description="test collection")
         if valid_input:
@@ -298,7 +320,7 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
             output_field_names = ["invalid_output"]
 
         bm25_function = Function(
-            name=f"text_bm25_emb",
+            name="text_bm25_emb",
             function_type=FunctionType.BM25,
             input_field_names=input_field_names,
             output_field_names=output_field_names,
@@ -307,9 +329,13 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
         schema.add_function(bm25_function)
         if (not valid_output) or (not valid_input):
             self.init_collection_wrap(
-                name=cf.gen_unique_str(prefix), schema=schema,
+                name=cf.gen_unique_str(prefix),
+                schema=schema,
                 check_task=CheckTasks.err_res,
-                check_items={ct.err_code: 1, ct.err_msg: "field not found in collection"}
+                check_items={
+                    ct.err_code: 1,
+                    ct.err_msg: "field not found in collection",
+                },
             )
         else:
             collection_w = self.init_collection_wrap(
@@ -317,7 +343,9 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
             )
             res, result = collection_w.describe()
             log.info(f"collection describe {res}")
-            assert result, "create collection with valid input/output should be successful"
+            assert result, (
+                "create collection with valid input/output should be successful"
+            )
 
     @pytest.mark.tags(CaseLabel.L1)
     def test_create_collection_for_full_text_search_with_field_not_tokenized(self):
@@ -363,25 +391,30 @@ class TestCreateCollectionWithFullTextSearchNegative(TestcaseBase):
             ),
             FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
             FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
-            FieldSchema(name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
+            FieldSchema(
+                name="paragraph_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR
+            ),
         ]
         schema = CollectionSchema(fields=fields, description="test collection")
         bm25_function = Function(
-            name=f"text_bm25_emb",
+            name="text_bm25_emb",
             function_type=FunctionType.BM25,
             input_field_names=["text"],
             output_field_names=["text_sparse_emb"],
-            params={
-            },
+            params={},
         )
         schema.add_function(bm25_function)
         check_task = CheckTasks.err_res
-        check_items = {ct.err_code: 65535, ct.err_msg: "BM25 function input field must set enable_analyzer to true"}
+        check_items = {
+            ct.err_code: 65535,
+            ct.err_msg: "BM25 function input field must set enable_analyzer to true",
+        }
         self.init_collection_wrap(
-            name=cf.gen_unique_str(prefix), schema=schema,
+            name=cf.gen_unique_str(prefix),
+            schema=schema,
             check_task=check_task,
-            check_items=check_items
+            check_items=check_items,
         )
@@ -393,7 +426,6 @@ class TestInsertWithFullTextSearch(TestcaseBase):
     ******************************************************************
     """
-
     @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("nullable", [False, True])
     @pytest.mark.parametrize("text_lang", ["en", "zh", "hybrid"])
@@ -469,8 +501,12 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                 {
                     "id": i,
                     "word": fake.word().lower(),
-                    "sentence": fake.sentence().lower() if random.random() < 0.5 else None,
-                    "paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
+                    "sentence": fake.sentence().lower()
+                    if random.random() < 0.5
+                    else None,
+                    "paragraph": fake.paragraph().lower()
+                    if random.random() < 0.5
+                    else None,
                     "text": fake.text().lower(),  # function input should not be None
                     "emb": [random.random() for _ in range(dim)],
                 }
@@ -507,13 +543,17 @@ class TestInsertWithFullTextSearch(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -524,16 +564,13 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                     "drop_ratio_build": 0.3,
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                }
-            }
+                },
+            },
         )
         collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         num_entities = collection_w.num_entities
-        res, _ = collection_w.query(
-            expr="",
-            output_fields=["count(*)"]
-        )
+        res, _ = collection_w.query(expr="", output_fields=["count(*)"])
         count = res[0]["count(*)"]
         assert len(data) == num_entities
         assert len(data) == count
@@ -543,7 +580,9 @@ class TestInsertWithFullTextSearch(TestcaseBase):
     @pytest.mark.parametrize("nullable", [False])
     @pytest.mark.parametrize("text_lang", ["en"])
     @pytest.mark.parametrize("tokenizer", ["standard"])
-    def test_insert_for_full_text_search_enable_dynamic_field(self, tokenizer, text_lang, nullable, enable_dynamic_field):
+    def test_insert_for_full_text_search_enable_dynamic_field(
+        self, tokenizer, text_lang, nullable, enable_dynamic_field
+    ):
         """
         target: test insert data with full text search and enable dynamic field
         method: 1. create collection with full text search and enable dynamic field
@@ -591,7 +630,11 @@ class TestInsertWithFullTextSearch(TestcaseBase):
             FieldSchema(name="emb", dtype=DataType.FLOAT_VECTOR, dim=dim),
             FieldSchema(name="text_sparse_emb", dtype=DataType.SPARSE_FLOAT_VECTOR),
         ]
-        schema = CollectionSchema(fields=fields, description="test collection", enable_dynamic_field=enable_dynamic_field)
+        schema = CollectionSchema(
+            fields=fields,
+            description="test collection",
+            enable_dynamic_field=enable_dynamic_field,
+        )
         bm25_function = Function(
             name="text_bm25_emb",
             function_type=FunctionType.BM25,
@@ -617,11 +660,15 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                 {
                     "id": i,
                     "word": fake.word().lower(),
-                    "sentence": fake.sentence().lower() if random.random() < 0.5 else None,
-                    "paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
+                    "sentence": fake.sentence().lower()
+                    if random.random() < 0.5
+                    else None,
+                    "paragraph": fake.paragraph().lower()
+                    if random.random() < 0.5
+                    else None,
                     "text": fake.text().lower(),  # function input should not be None
                     "emb": [random.random() for _ in range(dim)],
-                    f"dynamic_field_{i}": f"dynamic_value_{i}"
+                    f"dynamic_field_{i}": f"dynamic_value_{i}",
                 }
                 for i in range(data_size)
             ]
@@ -634,7 +681,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                     "paragraph": fake.paragraph().lower(),
                     "text": fake.text().lower(),
                     "emb": [random.random() for _ in range(dim)],
-                    f"dynamic_field_{i}": f"dynamic_value_{i}"
+                    f"dynamic_field_{i}": f"dynamic_value_{i}",
                 }
                 for i in range(data_size)
             ]
@@ -649,7 +696,7 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                     "paragraph": fake.paragraph().lower(),
                     "text": fake.text().lower(),
                     "emb": [random.random() for _ in range(dim)],
-                    f"dynamic_field_{i}": f"dynamic_value_{i}"
+                    f"dynamic_field_{i}": f"dynamic_value_{i}",
                 }
                 hybrid_data.append(tmp)
             data = hybrid_data + data
@@ -658,13 +705,17 @@ class TestInsertWithFullTextSearch(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(data), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(data)
-                else data[i: len(data)]
+                else data[i : len(data)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -675,16 +726,13 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                     "drop_ratio_build": 0.3,
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                }
-            }
+                },
+            },
         )
         collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         num_entities = collection_w.num_entities
-        res, _ = collection_w.query(
-            expr="",
-            output_fields=["count(*)"]
-        )
+        res, _ = collection_w.query(expr="", output_fields=["count(*)"])
         count = res[0]["count(*)"]
         assert len(data) == num_entities
         assert len(data) == count
@@ -693,7 +741,9 @@ class TestInsertWithFullTextSearch(TestcaseBase):
     @pytest.mark.parametrize("nullable", [True])
     @pytest.mark.parametrize("text_lang", ["en"])
     @pytest.mark.parametrize("tokenizer", ["standard"])
-    def test_insert_for_full_text_search_with_dataframe(self, tokenizer, text_lang, nullable):
+    def test_insert_for_full_text_search_with_dataframe(
+        self, tokenizer, text_lang, nullable
+    ):
         """
         target: test insert data for full text search with dataframe
         method: 1. insert data with varchar in dataframe format
@@ -765,8 +815,12 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                 {
                     "id": i,
                     "word": fake.word().lower(),
-                    "sentence": fake.sentence().lower() if random.random() < 0.5 else None,
-                    "paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
+                    "sentence": fake.sentence().lower()
+                    if random.random() < 0.5
+                    else None,
+                    "paragraph": fake.paragraph().lower()
+                    if random.random() < 0.5
+                    else None,
                     "text": fake.text().lower(),  # function input should not be None
                     "emb": [random.random() for _ in range(dim)],
                 }
@@ -802,10 +856,14 @@ class TestInsertWithFullTextSearch(TestcaseBase):
         log.info(f"dataframe\n{df}")
         batch_size = 5000
         for i in range(0, len(df), batch_size):
-            collection_w.insert(df[i: i + batch_size])
+            collection_w.insert(df[i : i + batch_size])
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -816,16 +874,13 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                     "drop_ratio_build": 0.3,
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                }
-            }
+                },
+            },
         )
         collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         num_entities = collection_w.num_entities
-        res, _ = collection_w.query(
-            expr="",
-            output_fields=["count(*)"]
-        )
+        res, _ = collection_w.query(expr="", output_fields=["count(*)"])
         count = res[0]["count(*)"]
         assert len(data) == num_entities
         assert len(data) == count
@@ -900,7 +955,11 @@ class TestInsertWithFullTextSearch(TestcaseBase):
             language = "zh"
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -911,8 +970,8 @@ class TestInsertWithFullTextSearch(TestcaseBase):
                     "drop_ratio_build": 0.3,
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                }
-            }
+                },
+            },
         )
         collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
@@ -929,31 +988,25 @@ class TestInsertWithFullTextSearch(TestcaseBase):
         ]
         df = pd.DataFrame(data)
         log.info(f"dataframe\n{df}")
-        log.info(f"analyze documents")
+        log.info("analyze documents")
         texts = df["text"].to_list()
         word_freq = cf.analyze_documents(texts, language=language)
         tokens = list(word_freq.keys())
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         num_entities = collection_w.num_entities
         # query with count(*)
-        res, _ = collection_w.query(
-            expr="",
-            output_fields=["count(*)"]
-        )
+        res, _ = collection_w.query(expr="", output_fields=["count(*)"])
         count = res[0]["count(*)"]
         assert len(data) == num_entities
         assert len(data) == count
         # query with expr
-        res, _ = collection_w.query(
-            expr="id >= 0",
-            output_fields=["text"]
-        )
+        res, _ = collection_w.query(expr="id >= 0", output_fields=["text"])
         assert len(res) == len(data)
 
         # search with text
@@ -965,7 +1018,8 @@ class TestInsertWithFullTextSearch(TestcaseBase):
             anns_field="text_sparse_emb",
             param={},
             limit=limit,
-            output_fields=["id", "text"])
+            output_fields=["id", "text"],
+        )
         assert len(res_list) == nq
         for i in range(nq):
             assert len(res_list[i]) == limit
@@ -975,9 +1029,12 @@ class TestInsertWithFullTextSearch(TestcaseBase):
             for j in range(len(res)):
                 r = res[j]
                 result_text = r.text
-                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
-                assert len(
-                    overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
+                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
+                    search_text, result_text, language=language
+                )
+                assert len(overlap) > 0, (
+                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
+                )
 
 
 # @pytest.mark.skip("skip")
@@ -991,7 +1048,9 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase):
     @pytest.mark.tags(CaseLabel.L1)
     @pytest.mark.parametrize("nullable", [True])
     @pytest.mark.parametrize("tokenizer", ["standard"])
-    def test_insert_with_full_text_search_with_non_varchar_data(self, tokenizer, nullable):
+    def test_insert_with_full_text_search_with_non_varchar_data(
+        self, tokenizer, nullable
+    ):
         """
         target: test insert data with full text search with non varchar data
         method: 1. insert data with non varchar data
@@ -1061,7 +1120,9 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase):
                 "word": fake.word().lower(),
                 "sentence": fake.sentence().lower(),
                 "paragraph": fake.paragraph().lower(),
-                "text": fake.text().lower() if random.random() < 0.5 else 1,  # mix some int data
+                "text": fake.text().lower()
+                if random.random() < 0.5
+                else 1,  # mix some int data
                 "emb": [random.random() for _ in range(dim)],
             }
             for i in range(data_size)
@@ -1071,13 +1132,17 @@ class TestInsertWithFullTextSearchNegative(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)],
+                else data[i : len(df)],
                 check_task=CheckTasks.err_res,
-                check_items={ct.err_code: 1, ct.err_msg: "inconsistent with defined schema"},
+                check_items={
+                    ct.err_code: 1,
+                    ct.err_msg: "inconsistent with defined schema",
+                },
             )
 
+
 # @pytest.mark.skip("skip")
 class TestUpsertWithFullTextSearch(TestcaseBase):
     """
@@ -1086,7 +1151,6 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
     ******************************************************************
     """
-
     @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("nullable", [False, True])
     @pytest.mark.parametrize("tokenizer", ["standard"])
@@ -1152,18 +1216,20 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
             name=cf.gen_unique_str(prefix), schema=schema
         )
         fake = fake_en
-        language = "en"
         if tokenizer == "jieba":
             fake = fake_zh
-            language = "zh"
 
         if nullable:
             data = [
                 {
                     "id": i,
                     "word": fake.word().lower(),
-                    "sentence": fake.sentence().lower() if random.random() < 0.5 else None,
-                    "paragraph": fake.paragraph().lower() if random.random() < 0.5 else None,
+                    "sentence": fake.sentence().lower()
+                    if random.random() < 0.5
+                    else None,
+                    "paragraph": fake.paragraph().lower()
+                    if random.random() < 0.5
+                    else None,
                     "text": fake.text().lower(),  # function input should not be None
                     "emb": [random.random() for _ in range(dim)],
                 }
@@ -1186,13 +1252,17 @@ class TestUpsertWithFullTextSearch(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
"metric_type": "L2", "params": {"M": 16, "efConstruction": 500}}, + { + "index_type": "HNSW", + "metric_type": "L2", + "params": {"M": 16, "efConstruction": 500}, + }, ) collection_w.create_index( "text_sparse_emb", @@ -1203,16 +1273,13 @@ class TestUpsertWithFullTextSearch(TestcaseBase): "drop_ratio_build": 0.3, "bm25_k1": 1.5, "bm25_b": 0.75, - } - } + }, + }, ) collection_w.create_index("text", {"index_type": "INVERTED"}) collection_w.load() num_entities = collection_w.num_entities - res, _ = collection_w.query( - expr="", - output_fields=["count(*)"] - ) + res, _ = collection_w.query(expr="", output_fields=["count(*)"]) count = res[0]["count(*)"] assert len(data) == num_entities assert len(data) == count @@ -1229,17 +1296,14 @@ class TestUpsertWithFullTextSearch(TestcaseBase): } for i in range(data_size // 2) ] - upsert_data += data[data_size // 2:] + upsert_data += data[data_size // 2 :] for i in range(0, len(upsert_data), batch_size): collection_w.upsert( - upsert_data[i: i + batch_size] + upsert_data[i : i + batch_size] if i + batch_size < len(upsert_data) - else upsert_data[i: len(upsert_data)] + else upsert_data[i : len(upsert_data)] ) - res, _ = collection_w.query( - expr="id >= 0", - output_fields=["*"] - ) + res, _ = collection_w.query(expr="id >= 0", output_fields=["*"]) upsert_data_map = {} for d in upsert_data: upsert_data_map[d["id"]] = d @@ -1260,7 +1324,9 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase): @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("nullable", [False]) @pytest.mark.parametrize("tokenizer", ["standard"]) - def test_upsert_for_full_text_search_with_no_varchar_data(self, tokenizer, nullable): + def test_upsert_for_full_text_search_with_no_varchar_data( + self, tokenizer, nullable + ): """ target: test upsert data for full text search with no varchar data method: 1. 
@@ -1321,10 +1387,8 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
             name=cf.gen_unique_str(prefix), schema=schema
         )
         fake = fake_en
-        language = "en"
         if tokenizer == "jieba":
             fake = fake_zh
-            language = "zh"
 
         data = [
             {
@@ -1342,13 +1406,17 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -1359,16 +1427,13 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
                     "drop_ratio_build": 0.3,
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                }
-            }
+                },
+            },
         )
         collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         num_entities = collection_w.num_entities
-        res, _ = collection_w.query(
-            expr="",
-            output_fields=["count(*)"]
-        )
+        res, _ = collection_w.query(expr="", output_fields=["count(*)"])
         count = res[0]["count(*)"]
         assert len(data) == num_entities
         assert len(data) == count
@@ -1380,16 +1445,16 @@ class TestUpsertWithFullTextSearchNegative(TestcaseBase):
                 "word": fake.word().lower(),
                 "sentence": fake.sentence().lower(),
                 "paragraph": fake.paragraph().lower(),
-                "text": fake.text().lower() if random.random() < 0.5 else 1,  # mix some int data
+                "text": fake.text().lower()
+                if random.random() < 0.5
+                else 1,  # mix some int data
                 "emb": [random.random() for _ in range(dim)],
             }
             for i in range(data_size)
         ]
         check_items = {ct.err_code: 1, ct.err_msg: "inconsistent with defined schema"}
         check_task = CheckTasks.err_res
-        collection_w.upsert(upsert_data,
-                            check_task=check_task,
-                            check_items=check_items)
+        collection_w.upsert(upsert_data, check_task=check_task, check_items=check_items)
 
 
 class TestDeleteWithFullTextSearch(TestcaseBase):
@@ -1479,13 +1544,17 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -1496,47 +1565,37 @@ class TestDeleteWithFullTextSearch(TestcaseBase):
                     "drop_ratio_build": 0.3,
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                }
-            }
+                },
+            },
         )
         collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         num_entities = collection_w.num_entities
-        res, _ = collection_w.query(
-            expr="",
-            output_fields=["count(*)"]
-        )
+        res, _ = collection_w.query(expr="", output_fields=["count(*)"])
         count = res[0]["count(*)"]
         assert len(data) == num_entities
         assert len(data) == count
 
         # delete half of the data
         delete_ids = [i for i in range(data_size // 2)]
-        collection_w.delete(
-            expr=f"id in {delete_ids}"
-        )
-        res, _ = collection_w.query(
-            expr="",
-            output_fields=["count(*)"]
-        )
+        collection_w.delete(expr=f"id in {delete_ids}")
+        res, _ = collection_w.query(expr="", output_fields=["count(*)"])
         count = res[0]["count(*)"]
         assert count == data_size // 2
 
         # query with delete expr and get empty result
-        res, _ = collection_w.query(
-            expr=f"id in {delete_ids}",
-            output_fields=["*"]
-        )
+        res, _ = collection_w.query(expr=f"id in {delete_ids}", output_fields=["*"])
         assert len(res) == 0
 
         # search with text has been deleted, not in the result
-        search_data = df["text"].to_list()[:data_size // 2]
+        search_data = df["text"].to_list()[: data_size // 2]
         res_list, _ = collection_w.search(
             data=search_data,
             anns_field="text_sparse_emb",
             param={},
             limit=100,
-            output_fields=["id", "text"])
+            output_fields=["id", "text"],
+        )
         for i in range(len(res_list)):
             query_text = search_data[i]
             result_texts = [r.text for r in res_list[i]]
@@ -1547,6 +1606,7 @@ class TestDeleteWithFullTextSearchNegative(TestcaseBase):
     """
     todo: add some negative cases
     """
+
     pass
 
 
@@ -1564,7 +1624,7 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
     @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX", "SPARSE_WAND"])
     @pytest.mark.parametrize("tokenizer", ["standard"])
     def test_create_index_for_full_text_search_default(
-            self, tokenizer, index_type, k, b
+        self, tokenizer, index_type, k, b
     ):
         """
         target: test create index for full text search
@@ -1631,8 +1691,12 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
             {
                 "id": i,
                 "word": fake.word().lower() if random.random() >= empty_percent else "",
-                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
-                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
+                "sentence": fake.sentence().lower()
+                if random.random() >= empty_percent
+                else "",
+                "paragraph": fake.paragraph().lower()
+                if random.random() >= empty_percent
+                else "",
                 "text": fake.text().lower() if random.random() >= empty_percent else "",
                 "emb": [random.random() for _ in range(dim)],
             }
@@ -1643,13 +1707,17 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -1659,8 +1727,8 @@ class TestCreateIndexWithFullTextSearch(TestcaseBase):
                 "params": {
                     "bm25_k1": k,
                     "bm25_b": b,
-                }
-            }
+                },
+            },
         )
         # describe index info to verify
         res = collection_w.indexes
@@ -1688,7 +1756,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
     @pytest.mark.parametrize("index_type", ["HNSW", "INVALID_INDEX_TYPE"])
     @pytest.mark.parametrize("tokenizer", ["standard"])
     def test_create_full_text_search_with_invalid_index_type(
-            self, tokenizer, index_type, k, b
+        self, tokenizer, index_type, k, b
     ):
         """
         target: test create index for full text search with invalid index type
@@ -1754,8 +1822,12 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
             {
                 "id": i,
                 "word": fake.word().lower() if random.random() >= empty_percent else "",
-                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
-                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
+                "sentence": fake.sentence().lower()
+                if random.random() >= empty_percent
+                else "",
+                "paragraph": fake.paragraph().lower()
+                if random.random() >= empty_percent
+                else "",
                 "text": fake.text().lower() if random.random() >= empty_percent else "",
                 "emb": [random.random() for _ in range(dim)],
             }
@@ -1766,13 +1838,17 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         error = {"err_code": 1100, "err_msg": "invalid"}
         collection_w.create_index(
@@ -1783,10 +1859,10 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
                 "params": {
                     "bm25_k1": k,
                     "bm25_b": b,
-                }
+                },
             },
             check_task=CheckTasks.err_res,
-            check_items=error
+            check_items=error,
         )
 
     @pytest.mark.tags(CaseLabel.L2)
@@ -1796,7 +1872,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
     @pytest.mark.parametrize("metric_type", ["COSINE", "L2", "IP"])
     @pytest.mark.parametrize("tokenizer", ["standard"])
     def test_create_full_text_search_index_with_invalid_metric_type(
-            self, tokenizer, index_type, metric_type, k, b
+        self, tokenizer, index_type, metric_type, k, b
     ):
         """
         target: test create index for full text search with invalid metric type
@@ -1862,8 +1938,12 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
             {
                 "id": i,
                 "word": fake.word().lower() if random.random() >= empty_percent else "",
-                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
-                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
+                "sentence": fake.sentence().lower()
+                if random.random() >= empty_percent
+                else "",
+                "paragraph": fake.paragraph().lower()
+                if random.random() >= empty_percent
+                else "",
                 "text": fake.text().lower() if random.random() >= empty_percent else "",
                 "emb": [random.random() for _ in range(dim)],
             }
@@ -1874,15 +1954,22 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
        )
-        error = {ct.err_code: 65535, ct.err_msg: "index metric type of BM25 function output field must be BM25"}
+        error = {
+            ct.err_code: 65535,
+            ct.err_msg: "index metric type of BM25 function output field must be BM25",
+        }
         collection_w.create_index(
             "text_sparse_emb",
             {
@@ -1891,10 +1978,10 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
                 "params": {
                     "bm25_k1": k,
                    "bm25_b": b,
-                }
+                },
             },
             check_task=CheckTasks.err_res,
-            check_items=error
+            check_items=error,
         )
 
     @pytest.mark.tags(CaseLabel.L2)
@@ -1903,7 +1990,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
     @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
     @pytest.mark.parametrize("tokenizer", ["standard"])
     def test_create_index_using_bm25_metric_type_for_non_bm25_output_field(
-            self, tokenizer, index_type, k, b
+        self, tokenizer, index_type, k, b
     ):
         """
         target: test create index using bm25 metric type for non bm25 output field (dense float vector or
@@ -1970,8 +2057,12 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
             {
                 "id": i,
                 "word": fake.word().lower() if random.random() >= empty_percent else "",
-                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
-                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
+                "sentence": fake.sentence().lower()
+                if random.random() >= empty_percent
+                else "",
+                "paragraph": fake.paragraph().lower()
+                if random.random() >= empty_percent
+                else "",
                 "text": fake.text().lower() if random.random() >= empty_percent else "",
                 "emb": [random.random() for _ in range(dim)],
             }
@@ -1982,16 +2073,23 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
-        error = {ct.err_code: 1100, ct.err_msg: "float vector index does not support metric type: BM25"}
+        error = {
+            ct.err_code: 1100,
+            ct.err_msg: "float vector index does not support metric type: BM25",
+        }
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "BM25", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "BM25",
+                "params": {"M": 16, "efConstruction": 500},
+            },
             check_task=CheckTasks.err_res,
-            check_items=error
+            check_items=error,
         )
 
     @pytest.mark.tags(CaseLabel.L0)
@@ -2000,7 +2098,7 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
     @pytest.mark.parametrize("index_type", ["SPARSE_INVERTED_INDEX"])
     @pytest.mark.parametrize("tokenizer", ["standard"])
     def test_create_full_text_search_with_invalid_bm25_params(
-            self, tokenizer, index_type, k, b
+        self, tokenizer, index_type, k, b
     ):
         """
         target: test create index for full text search with invalid bm25 params
@@ -2066,8 +2164,12 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
             {
                 "id": i,
                 "word": fake.word().lower() if random.random() >= empty_percent else "",
-                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
-                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
+                "sentence": fake.sentence().lower()
+                if random.random() >= empty_percent
+                else "",
+                "paragraph": fake.paragraph().lower()
+                if random.random() >= empty_percent
+                else "",
                 "text": fake.text().lower() if random.random() >= empty_percent else "",
                 "emb": [random.random() for _ in range(dim)],
             }
@@ -2078,17 +2180,24 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
 
         check_task = CheckTasks.err_res
-        error = {"err_code": 1100, "err_msg": "invalid"}  # todo, update error code and message
+        error = {
+            "err_code": 1100,
+            "err_msg": "invalid",
+        }  # todo, update error code and message
         collection_w.create_index(
             "text_sparse_emb",
             {
@@ -2097,10 +2206,10 @@ class TestCreateIndexWithFullTextSearchNegative(TestcaseBase):
                 "params": {
                     "bm25_k1": k,
                     "bm25_b": b,
-                }
+                },
             },
             check_task=check_task,
-            check_items=error
+            check_items=error,
         )
@@ -2122,7 +2231,15 @@ class TestSearchWithFullTextSearch(TestcaseBase):
     @pytest.mark.parametrize("tokenizer", ["standard"])
     @pytest.mark.parametrize("offset", [10, 0])
     def test_full_text_search_default(
-        self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
+        self,
+        offset,
+        tokenizer,
+        expr,
+        enable_inverted_index,
+        enable_partition_key,
+        empty_percent,
+        index_type,
+        nq,
     ):
         """
         target: test full text search
@@ -2194,8 +2311,12 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             {
                 "id": i,
                 "word": fake.word().lower() if random.random() >= empty_percent else "",
-                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
-                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
+                "sentence": fake.sentence().lower()
+                if random.random() >= empty_percent
+                else "",
+                "paragraph": fake.paragraph().lower()
+                if random.random() >= empty_percent
+                else "",
                 "text": fake.text().lower() if random.random() >= empty_percent else "",
                 "emb": [random.random() for _ in range(dim)],
             }
@@ -2208,19 +2329,23 @@ class TestSearchWithFullTextSearch(TestcaseBase):
         most_freq_word = word_freq.most_common(10)
         tokens = [item[0] for item in most_freq_word]
         if len(tokens) == 0:
-            log.info(f"empty tokens, add a dummy token")
+            log.info("empty tokens, add a dummy token")
             tokens = ["dummy"]
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.flush()
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
        )
         collection_w.create_index(
             "text_sparse_emb",
@@ -2230,15 +2355,15 @@ class TestSearchWithFullTextSearch(TestcaseBase):
                 "params": {
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                }
-            }
+                },
+            },
         )
         if enable_inverted_index:
             collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
         limit = 100
         token = random.choice(tokens)
-        search_data = [fake.text().lower() + f" {token} "  for _ in range(nq)]
+        search_data = [fake.text().lower() + f" {token} " for _ in range(nq)]
         if expr == "text_match":
             filter = f"TEXT_MATCH(text, '{token}')"
             res, _ = collection_w.query(
@@ -2262,7 +2387,8 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             param={},
             limit=limit + offset,
             offset=0,
-            output_fields=["id", "text"])
+            output_fields=["id", "text"],
+        )
         full_res_id_list = []
         for i in range(nq):
             res = full_res_list[i]
@@ -2278,7 +2404,8 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             param={},
             limit=limit,
             offset=offset,
-            output_fields=["id", "text"])
+            output_fields=["id", "text"],
+        )
 
         # verify correctness
         for i in range(nq):
@@ -2301,10 +2428,13 @@ class TestSearchWithFullTextSearch(TestcaseBase):
                 if expr == "id_range":
                     assert _id < data_size // 2
                 # verify search result has overlap with search text
-                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
+                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
+                    search_text, result_text, language=language
+                )
                 log.info(f"overlap {overlap}")
-                assert len(
-                    overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
+                assert len(overlap) > 0, (
+                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
+                )
 
     @pytest.mark.tags(CaseLabel.L0)
     @pytest.mark.parametrize("nq", [2])
@@ -2317,8 +2447,17 @@ class TestSearchWithFullTextSearch(TestcaseBase):
     @pytest.mark.parametrize("tokenizer", ["jieba"])
     @pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo)
     def test_full_text_search_with_jieba_tokenizer(
-        self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key,
-        empty_percent, index_type, nq, inverted_index_algo):
+        self,
+        offset,
+        tokenizer,
+        expr,
+        enable_inverted_index,
+        enable_partition_key,
+        empty_percent,
+        index_type,
+        nq,
+        inverted_index_algo,
+    ):
         """
         target: test full text search
         method: 1. enable full text search with jieba tokenizer and insert data with varchar
@@ -2332,7 +2471,7 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             lang_type = "english"
         analyzer_params = {
-            "type": lang_type, 
+            "type": lang_type,
         }
         dim = 128
         fields = [
@@ -2394,8 +2533,12 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             {
                 "id": i,
                 "word": fake.word().lower() if random.random() >= empty_percent else "",
-                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
-                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
+                "sentence": fake.sentence().lower()
+                if random.random() >= empty_percent
+                else "",
+                "paragraph": fake.paragraph().lower()
+                if random.random() >= empty_percent
+                else "",
                 "text": fake.text().lower() if random.random() >= empty_percent else "",
                 "emb": [random.random() for _ in range(dim)],
             }
@@ -2410,19 +2553,23 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             if len(item[0]) == 2:
                 tokens.append(item[0])
         if len(tokens) == 0:
-            log.info(f"empty tokens, add a dummy token")
+            log.info("empty tokens, add a dummy token")
             tokens = ["dummy"]
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
             )
         collection_w.flush()
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -2432,9 +2579,9 @@ class TestSearchWithFullTextSearch(TestcaseBase):
                 "params": {
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                    "inverted_index_algo": inverted_index_algo
-                }
-            }
+                    "inverted_index_algo": inverted_index_algo,
+                },
+            },
         )
         if enable_inverted_index:
             collection_w.create_index("text", {"index_type": "INVERTED"})
@@ -2465,7 +2612,8 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             param={},
             limit=limit + offset,
             offset=0,
-            output_fields=["id", "text"])
+            output_fields=["id", "text"],
+        )
         full_res_id_list = []
         for i in range(nq):
             res = full_res_list[i]
@@ -2481,7 +2629,8 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             param={},
             limit=limit,
             offset=offset,
-            output_fields=["id", "text"])
+            output_fields=["id", "text"],
+        )
 
         # verify correctness
         for i in range(nq):
@@ -2504,11 +2653,13 @@ class TestSearchWithFullTextSearch(TestcaseBase):
                 if expr == "id_range":
                     assert _id < data_size // 2
                 # verify search result has overlap with search text
-                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
+                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
+                    search_text, result_text, language=language
+                )
log.info(f"overlap {overlap}") - assert len( - overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}" - + assert len(overlap) > 0, ( + f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}" + ) @pytest.mark.tags(CaseLabel.L0) @pytest.mark.parametrize("nq", [2]) @@ -2520,7 +2671,15 @@ class TestSearchWithFullTextSearch(TestcaseBase): @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.parametrize("offset", [0]) def test_full_text_search_for_growing_segment( - self, offset, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq + self, + offset, + tokenizer, + expr, + enable_inverted_index, + enable_partition_key, + empty_percent, + index_type, + nq, ): """ target: test full text search @@ -2592,8 +2751,12 @@ class TestSearchWithFullTextSearch(TestcaseBase): { "id": i, "word": fake.word().lower() if random.random() >= empty_percent else "", - "sentence": fake.sentence().lower() if random.random() >= empty_percent else "", - "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "", + "sentence": fake.sentence().lower() + if random.random() >= empty_percent + else "", + "paragraph": fake.paragraph().lower() + if random.random() >= empty_percent + else "", "text": fake.text().lower() if random.random() >= empty_percent else "", "emb": [random.random() for _ in range(dim)], } @@ -2606,11 +2769,15 @@ class TestSearchWithFullTextSearch(TestcaseBase): most_freq_word = word_freq.most_common(10) tokens = [item[0] for item in most_freq_word] if len(tokens) == 0: - log.info(f"empty tokens, add a dummy token") + log.info("empty tokens, add a dummy token") tokens = ["dummy"] collection_w.create_index( "emb", - {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}}, + { + "index_type": "HNSW", + "metric_type": "L2", + "params": {"M": 16, "efConstruction": 500}, + }, ) collection_w.create_index( "text_sparse_emb", @@ -2620,8 +2787,8 @@ class TestSearchWithFullTextSearch(TestcaseBase): "params": { "bm25_k1": 1.5, "bm25_b": 0.75, - } - } + }, + }, ) if enable_inverted_index: collection_w.create_index("text", {"index_type": "INVERTED"}) @@ -2629,12 +2796,14 @@ class TestSearchWithFullTextSearch(TestcaseBase): batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i: i + batch_size] + data[i : i + batch_size] if i + batch_size < len(df) - else data[i: len(df)] + else data[i : len(df)] ) limit = 100 - search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)] + search_data = [ + fake.text().lower() + " " + random.choice(tokens) for _ in range(nq) + ] if expr == "text_match": filter = f"TextMatch(text, '{tokens[0]}')" res, _ = collection_w.query( @@ -2658,7 +2827,8 @@ class TestSearchWithFullTextSearch(TestcaseBase): param={}, limit=limit + offset, offset=0, - output_fields=["id", "text"]) + output_fields=["id", "text"], + ) full_res_id_list = [] for i in range(nq): res = full_res_list[i] @@ -2674,7 +2844,8 @@ class TestSearchWithFullTextSearch(TestcaseBase): param={}, limit=limit, offset=offset, - output_fields=["id", "text"]) + output_fields=["id", "text"], + ) # verify correctness for i in range(nq): @@ -2697,10 +2868,13 @@ class TestSearchWithFullTextSearch(TestcaseBase): if expr == "id_range": assert _id < data_size // 2 # verify search result has 
-                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(search_text, result_text, language=language)
+                overlap, word_freq_a, word_freq_b = cf.check_token_overlap(
+                    search_text, result_text, language=language
+                )
                 log.info(f"overlap {overlap}")
-                assert len(
-                    overlap) > 0, f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
+                assert len(overlap) > 0, (
+                    f"query text: {search_text}, \ntext: {result_text} \n overlap: {overlap} \n word freq a: {word_freq_a} \n word freq b: {word_freq_b}\n result: {r}"
+                )
 
     @pytest.mark.tags(CaseLabel.L1)
     @pytest.mark.parametrize("nq", [2])
@@ -2711,7 +2885,14 @@ class TestSearchWithFullTextSearch(TestcaseBase):
     @pytest.mark.parametrize("expr", [None])
     @pytest.mark.parametrize("tokenizer", ["standard"])
     def test_full_text_search_with_range_search(
-        self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq
+        self,
+        tokenizer,
+        expr,
+        enable_inverted_index,
+        enable_partition_key,
+        empty_percent,
+        index_type,
+        nq,
     ):
         """
         target: test full text search
@@ -2783,8 +2964,12 @@ class TestSearchWithFullTextSearch(TestcaseBase):
             {
                 "id": i,
                 "word": fake.word().lower() if random.random() >= empty_percent else "",
-                "sentence": fake.sentence().lower() if random.random() >= empty_percent else "",
-                "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "",
+                "sentence": fake.sentence().lower()
+                if random.random() >= empty_percent
+                else "",
+                "paragraph": fake.paragraph().lower()
+                if random.random() >= empty_percent
+                else "",
                 "text": fake.text().lower() if random.random() >= empty_percent else "",
                 "emb": [random.random() for _ in range(dim)],
             }
@@ -2796,18 +2981,22 @@ class TestSearchWithFullTextSearch(TestcaseBase):
         word_freq = cf.analyze_documents(texts, language=language)
         tokens = list(word_freq.keys())
         if len(tokens) == 0:
-            log.info(f"empty tokens, add a dummy token")
+            log.info("empty tokens, add a dummy token")
             tokens = ["dummy"]
         batch_size = 5000
         for i in range(0, len(df), batch_size):
             collection_w.insert(
-                data[i: i + batch_size]
+                data[i : i + batch_size]
                 if i + batch_size < len(df)
-                else data[i: len(df)]
+                else data[i : len(df)]
            )
         collection_w.create_index(
             "emb",
-            {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}},
+            {
+                "index_type": "HNSW",
+                "metric_type": "L2",
+                "params": {"M": 16, "efConstruction": 500},
+            },
         )
         collection_w.create_index(
             "text_sparse_emb",
@@ -2817,8 +3006,8 @@ class TestSearchWithFullTextSearch(TestcaseBase):
                 "params": {
                     "bm25_k1": 1.5,
                     "bm25_b": 0.75,
-                }
-            }
+                },
+            },
         )
         if enable_inverted_index:
             collection_w.create_index("text", {"index_type": "INVERTED"})
         collection_w.load()
@@ -2830,10 +3019,10 @@ class TestSearchWithFullTextSearch(TestcaseBase):
         res_list, _ = collection_w.search(
             data=search_data,
             anns_field="text_sparse_emb",
-            param={
-            },
+            param={},
             limit=limit,  # get a wider range of search result
-            output_fields=["id", "text"])
+            output_fields=["id", "text"],
+        )
 
         distance_list = []
         for i in range(nq):
@@ -2850,13 +3039,10 @@ class TestSearchWithFullTextSearch(TestcaseBase):
         res_list, _ = collection_w.search(
             data=search_data,
             anns_field="text_sparse_emb",
-            param={
-                "params": {
-                    "radius": low, "range_filter": high
-                }
-            },
+            param={"params": {"radius": low, "range_filter": high}},
             limit=limit,
-            output_fields=["id", "text"])
+            output_fields=["id", "text"],
+        )
 
         # verify correctness
         for i in range(nq):
log.info(f"res: {len(res_list[i])}") @@ -2876,7 +3062,14 @@ class TestSearchWithFullTextSearch(TestcaseBase): @pytest.mark.parametrize("expr", [None]) @pytest.mark.parametrize("tokenizer", ["standard"]) def test_full_text_search_with_search_iterator( - self, tokenizer, expr, enable_inverted_index, enable_partition_key, empty_percent, index_type, nq + self, + tokenizer, + expr, + enable_inverted_index, + enable_partition_key, + empty_percent, + index_type, + nq, ): """ target: test full text search @@ -2948,8 +3141,12 @@ class TestSearchWithFullTextSearch(TestcaseBase): { "id": i, "word": fake.word().lower() if random.random() >= empty_percent else "", - "sentence": fake.sentence().lower() if random.random() >= empty_percent else "", - "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "", + "sentence": fake.sentence().lower() + if random.random() >= empty_percent + else "", + "paragraph": fake.paragraph().lower() + if random.random() >= empty_percent + else "", "text": fake.text().lower() if random.random() >= empty_percent else "", "emb": [random.random() for _ in range(dim)], } @@ -2961,18 +3158,22 @@ class TestSearchWithFullTextSearch(TestcaseBase): word_freq = cf.analyze_documents(texts, language=language) tokens = list(word_freq.keys()) if len(tokens) == 0: - log.info(f"empty tokens, add a dummy token") + log.info("empty tokens, add a dummy token") tokens = ["dummy"] batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i: i + batch_size] + data[i : i + batch_size] if i + batch_size < len(df) - else data[i: len(df)] + else data[i : len(df)] ) collection_w.create_index( "emb", - {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}}, + { + "index_type": "HNSW", + "metric_type": "L2", + "params": {"M": 16, "efConstruction": 500}, + }, ) collection_w.create_index( "text_sparse_emb", @@ -2982,13 +3183,15 @@ class TestSearchWithFullTextSearch(TestcaseBase): "params": { "bm25_k1": 1.5, "bm25_b": 0.75, - } - } + }, + }, ) if enable_inverted_index: collection_w.create_index("text", {"index_type": "INVERTED"}) collection_w.load() - search_data = [fake.text().lower() + " " + random.choice(tokens) for _ in range(nq)] + search_data = [ + fake.text().lower() + " " + random.choice(tokens) for _ in range(nq) + ] log.info(f"search data: {search_data}") # get distance with search data batch_size = 100 @@ -3001,7 +3204,7 @@ class TestSearchWithFullTextSearch(TestcaseBase): "metric_type": "BM25", }, output_fields=["id", "text"], - limit=limit + limit=limit, ) iter_result = [] while True: @@ -3014,6 +3217,7 @@ class TestSearchWithFullTextSearch(TestcaseBase): for r in iter_result[:-1]: assert r == batch_size + class TestSearchWithFullTextSearchNegative(TestcaseBase): """ ****************************************************************** @@ -3030,7 +3234,13 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37022") def test_search_for_full_text_search_with_empty_string_search_data( - self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data + self, + tokenizer, + enable_inverted_index, + enable_partition_key, + empty_percent, + index_type, + invalid_search_data, ): """ target: test full text search @@ -3092,17 +3302,18 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): ) fake = fake_en if tokenizer == "jieba": - 
language = "zh" fake = fake_zh - else: - language = "en" data = [ { "id": i, "word": fake.word().lower() if random.random() >= empty_percent else "", - "sentence": fake.sentence().lower() if random.random() >= empty_percent else "", - "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "", + "sentence": fake.sentence().lower() + if random.random() >= empty_percent + else "", + "paragraph": fake.paragraph().lower() + if random.random() >= empty_percent + else "", "text": fake.text().lower() if random.random() >= empty_percent else "", "emb": [random.random() for _ in range(dim)], } @@ -3113,13 +3324,17 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i: i + batch_size] + data[i : i + batch_size] if i + batch_size < len(df) - else data[i: len(df)] + else data[i : len(df)] ) collection_w.create_index( "emb", - {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}}, + { + "index_type": "HNSW", + "metric_type": "L2", + "params": {"M": 16, "efConstruction": 500}, + }, ) collection_w.create_index( "text_sparse_emb", @@ -3129,8 +3344,8 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): "params": { "bm25_k1": 1.5, "bm25_b": 0.75, - } - } + }, + }, ) if enable_inverted_index: collection_w.create_index("text", {"index_type": "INVERTED"}) @@ -3150,7 +3365,6 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): for r in res: assert len(r) == 0 - @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("empty_percent", [0]) @pytest.mark.parametrize("enable_partition_key", [True]) @@ -3159,7 +3373,13 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): @pytest.mark.parametrize("invalid_search_data", ["sparse_vector", "dense_vector"]) @pytest.mark.parametrize("tokenizer", ["standard"]) def test_search_for_full_text_search_with_invalid_search_data( - self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, invalid_search_data + self, + tokenizer, + enable_inverted_index, + enable_partition_key, + empty_percent, + index_type, + invalid_search_data, ): """ target: test full text search @@ -3230,32 +3450,39 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): { "id": i, "word": fake.word().lower() if random.random() >= empty_percent else "", - "sentence": fake.sentence().lower() if random.random() >= empty_percent else "", - "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "", + "sentence": fake.sentence().lower() + if random.random() >= empty_percent + else "", + "paragraph": fake.paragraph().lower() + if random.random() >= empty_percent + else "", "text": fake.text().lower() if random.random() >= empty_percent else "", "emb": [random.random() for _ in range(dim)], } for i in range(data_size) ] df = pd.DataFrame(data) - corpus = df["text"].to_list() log.info(f"dataframe\n{df}") texts = df["text"].to_list() word_freq = cf.analyze_documents(texts, language=language) tokens = list(word_freq.keys()) if len(tokens) == 0: - log.info(f"empty tokens, add a dummy token") + log.info("empty tokens, add a dummy token") tokens = ["dummy"] batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i: i + batch_size] + data[i : i + batch_size] if i + batch_size < len(df) - else data[i: len(df)] + else data[i : len(df)] ) collection_w.create_index( "emb", - {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}}, + { 
+ "index_type": "HNSW", + "metric_type": "L2", + "params": {"M": 16, "efConstruction": 500}, + }, ) collection_w.create_index( "text_sparse_emb", @@ -3265,8 +3492,8 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): "params": { "bm25_k1": 1.5, "bm25_b": 0.75, - } - } + }, + }, ) if enable_inverted_index: collection_w.create_index("text", {"index_type": "INVERTED"}) @@ -3274,12 +3501,18 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): nq = 2 limit = 100 if invalid_search_data == "sparse_vector": - search_data = cf.gen_vectors(nb=nq, dim=1000, vector_data_type=DataType.SPARSE_FLOAT_VECTOR) + search_data = cf.gen_vectors( + nb=nq, dim=1000, vector_data_type=DataType.SPARSE_FLOAT_VECTOR + ) else: - search_data = cf.gen_vectors(nb=nq, dim=1000, vector_data_type=DataType.FLOAT_VECTOR) + search_data = cf.gen_vectors( + nb=nq, dim=1000, vector_data_type=DataType.FLOAT_VECTOR + ) log.info(f"search data: {search_data}") - error = {ct.err_code: 65535, - ct.err_msg: "please provide varchar/text for BM25 Function based search"} + error = { + ct.err_code: 65535, + ct.err_msg: "please provide varchar/text for BM25 Function based search", + } collection_w.search( data=search_data, anns_field="text_sparse_emb", @@ -3287,7 +3520,7 @@ class TestSearchWithFullTextSearchNegative(TestcaseBase): limit=limit, output_fields=["id", "text"], check_task=CheckTasks.err_res, - check_items=error + check_items=error, ) @@ -3307,7 +3540,13 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): @pytest.mark.parametrize("tokenizer", ["standard"]) @pytest.mark.parametrize("inverted_index_algo", ct.inverted_index_algo) def test_hybrid_search_with_full_text_search( - self, tokenizer, enable_inverted_index, enable_partition_key, empty_percent, index_type, inverted_index_algo + self, + tokenizer, + enable_inverted_index, + enable_partition_key, + empty_percent, + index_type, + inverted_index_algo, ): """ target: test full text search @@ -3374,11 +3613,17 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): { "id": i, "word": fake.word().lower() if random.random() >= empty_percent else "", - "sentence": fake.sentence().lower() if random.random() >= empty_percent else "", - "paragraph": fake.paragraph().lower() if random.random() >= empty_percent else "", + "sentence": fake.sentence().lower() + if random.random() >= empty_percent + else "", + "paragraph": fake.paragraph().lower() + if random.random() >= empty_percent + else "", "text": fake.text().lower() if random.random() >= empty_percent else "", "dense_emb": [random.random() for _ in range(dim)], - "neural_sparse_emb": cf.gen_vectors(nb=1, dim=1000, vector_data_type=DataType.SPARSE_FLOAT_VECTOR)[0], + "neural_sparse_emb": cf.gen_vectors( + nb=1, dim=1000, vector_data_type=DataType.SPARSE_FLOAT_VECTOR + )[0], } for i in range(data_size) ] @@ -3387,13 +3632,17 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): batch_size = 5000 for i in range(0, len(df), batch_size): collection_w.insert( - data[i: i + batch_size] + data[i : i + batch_size] if i + batch_size < len(df) - else data[i: len(df)] + else data[i : len(df)] ) collection_w.create_index( "dense_emb", - {"index_type": "HNSW", "metric_type": "L2", "params": {"M": 16, "efConstruction": 500}}, + { + "index_type": "HNSW", + "metric_type": "L2", + "params": {"M": 16, "efConstruction": 500}, + }, ) collection_w.create_index( "neural_sparse_emb", @@ -3407,9 +3656,9 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): "params": { "bm25_k1": 1.5, "bm25_b": 0.75, - "inverted_index_algo": 
inverted_index_algo - } - } + "inverted_index_algo": inverted_index_algo, + }, + }, ) if enable_inverted_index: collection_w.create_index("text", {"index_type": "INVERTED"}) @@ -3429,7 +3678,9 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): limit=limit, ) sparse_search = AnnSearchRequest( - data=cf.gen_vectors(nb=nq, dim=dim, vector_data_type=DataType.SPARSE_FLOAT_VECTOR), + data=cf.gen_vectors( + nb=nq, dim=dim, vector_data_type=DataType.SPARSE_FLOAT_VECTOR + ), anns_field="neural_sparse_emb", param={}, limit=limit, @@ -3439,7 +3690,7 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): reqs=[bm25_search, dense_search, sparse_search], rerank=WeightedRanker(0.5, 0.5, 0.5), limit=limit, - output_fields=["id", "text"] + output_fields=["id", "text"], ) assert len(res_list) == nq # check the result correctness @@ -3447,3 +3698,758 @@ class TestHybridSearchWithFullTextSearch(TestcaseBase): log.info(f"res length: {len(res_list[i])}") assert len(res_list[i]) == limit + +class TestFullTextSearchMultiAnalyzer(TestcaseBase): + """ + Comprehensive tests for multi_analyzer_params (multi-analyzer BM25) functionality in Milvus. + Covers schema creation, data insertion, indexing, searching, alias/default/fallback, edge cases, and more. + """ + + @pytest.mark.tags(CaseLabel.L0) + def test_create_collection_with_multi_analyzer(self): + """ + target: test create collection with multi_analyzer_params + method: create collection with multi_analyzer_params + expected: create collection successfully + """ + # Define multi_analyzer_params + multi_analyzer_params = { + "by_field": "language", + "analyzers": { + "en": {"type": "english"}, + "zh": {"type": "chinese"}, + "default": {"tokenizer": "icu"}, + }, + "alias": {"chinese": "zh", "eng": "en"}, + } + # Define fields + fields = [ + FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True), + FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16), + FieldSchema( + name="article_content", + dtype=DataType.VARCHAR, + max_length=1024, + enable_analyzer=True, + multi_analyzer_params=multi_analyzer_params, + ), + FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR), + ] + schema = CollectionSchema( + fields=fields, description="Multi-analyzer BM25 test collection" + ) + bm25_func = Function( + name="bm25", + function_type=FunctionType.BM25, + input_field_names=["article_content"], + output_field_names=["bm25_sparse_vector"], + ) + schema.add_function(bm25_func) + c_name = cf.gen_unique_str(prefix) + collection_w = self.init_collection_wrap(name=c_name, schema=schema) + res, _ = collection_w.describe() + assert len(res["functions"]) == 1 + assert res["fields"][2]["name"] == "article_content" + assert "multi_analyzer_params" in res["fields"][2]["params"] + assert ( + json.loads(res["fields"][2]["params"]["multi_analyzer_params"]) + == multi_analyzer_params + ) + + @pytest.mark.tags(CaseLabel.L0) + def test_insert_and_search_with_multi_analyzer(self): + """ + target: test insert and search with multi_analyzer + method: create collection, insert multilingual data, create index, search with analyzers + expected: insert and search works, correct analyzer is used + """ + multi_analyzer_params = { + "by_field": "language", + "analyzers": { + "en": {"type": "english"}, + "zh": {"type": "chinese"}, + "default": {"tokenizer": "standard"}, + }, + "alias": {"chinese": "zh", "eng": "en"}, + } + fields = [ + FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True), + FieldSchema(name="language", 
dtype=DataType.VARCHAR, max_length=16), + FieldSchema( + name="article_content", + dtype=DataType.VARCHAR, + max_length=1024, + enable_analyzer=True, + multi_analyzer_params=multi_analyzer_params, + ), + FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR), + ] + schema = CollectionSchema( + fields=fields, description="Multi-analyzer BM25 test collection" + ) + bm25_func = Function( + name="bm25", + function_type=FunctionType.BM25, + input_field_names=["article_content"], + output_field_names=["bm25_sparse_vector"], + ) + schema.add_function(bm25_func) + c_name = cf.gen_unique_str(prefix) + collection_w = self.init_collection_wrap(name=c_name, schema=schema) + # Prepare multilingual data + language_samples = { + "en": ["The quick brown fox.", "Machine learning is fun."], + "zh": ["自然语言处理很重要。", "人工智能改变世界。"], + "fr": ["L'intelligence artificielle.", "Traitement du langage naturel."], + "unknown": ["Some random text for default analyzer."], + } + data = [] + idx = 0 + for lang, samples in language_samples.items(): + for s in samples: + data.append({"doc_id": idx, "language": lang, "article_content": s}) + idx += 1 + collection_w.insert(data) + fake_map = { + "en": fake_en, + "zh": fake_zh, + "de": fake_de, + "jp": fake_jp, + "unknown": fake_en, + } + add_data = [] + for doc_id in range(idx, 3000): + lang = random.choice(["en", "zh", "de", "jp", "unknown"]) + content = fake_map[lang].sentence() + add_data.append( + {"doc_id": doc_id, "language": lang, "article_content": content} + ) + collection_w.insert(add_data) + collection_w.create_index( + "bm25_sparse_vector", + {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "BM25"}, + ) + collection_w.load() + # Search with different analyzers + analyzer_tests = [ + {"language": "en", "query": "machine learning", "analyzer_name": "en"}, + {"language": "zh", "query": "自然语言处理", "analyzer_name": "zh"}, + { + "language": "fr", + "query": "intelligence artificielle", + "analyzer_name": "default", + }, + {"language": "unknown", "query": "random text", "analyzer_name": "default"}, + ] + for test in analyzer_tests: + search_params = { + "metric_type": "BM25", + "analyzer_name": test["analyzer_name"], + } + results, _ = collection_w.search( + data=[test["query"]], + anns_field="bm25_sparse_vector", + param=search_params, + output_fields=["doc_id", "language", "article_content"], + limit=5, + ) + assert len(results) == 1 + assert len(results[0]) > 0 + log.info( + f"Query '{test['query']}' with analyzer '{test['analyzer_name']}' returned {len(results[0])} results" + ) + + @pytest.mark.tags(CaseLabel.L0) + def test_multi_analyzer_fallback(self): + """ + target: test fallback to default analyzer + method: insert data with languages not in analyzers, search without analyzer_name + expected: fallback to default analyzer + """ + multi_analyzer_params = { + "by_field": "language", + "analyzers": { + "en": {"type": "english"}, + "zh": {"type": "chinese"}, + "default": {"tokenizer": "standard"}, + }, + } + fields = [ + FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True), + FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16), + FieldSchema( + name="article_content", + dtype=DataType.VARCHAR, + max_length=1024, + enable_analyzer=True, + multi_analyzer_params=multi_analyzer_params, + ), + FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR), + ] + schema = CollectionSchema( + fields=fields, description="Multi-analyzer fallback test" + ) + bm25_func = Function( + name="bm25", + 
function_type=FunctionType.BM25, + input_field_names=["article_content"], + output_field_names=["bm25_sparse_vector"], + ) + schema.add_function(bm25_func) + c_name = cf.gen_unique_str(prefix) + collection_w = self.init_collection_wrap(name=c_name, schema=schema) + data = [ + { + "doc_id": 1, + "language": "en", + "article_content": "English text for testing.", + }, + {"doc_id": 2, "language": "zh", "article_content": "中文测试文本。"}, + { + "doc_id": 3, + "language": "fr", + "article_content": "Texte français pour les tests.", + }, + { + "doc_id": 4, + "language": "de", + "article_content": "Deutscher Text zum Testen.", + }, + { + "doc_id": 5, + "language": "unknown", + "article_content": "Text in unknown language.", + }, + ] + collection_w.insert(data) + collection_w.create_index( + "bm25_sparse_vector", + {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "BM25"}, + ) + collection_w.load() + fallback_tests = [ + {"language": "fr", "query": "texte français"}, + {"language": "de", "query": "deutscher text"}, + {"language": "unknown", "query": "unknown language"}, + ] + for test in fallback_tests: + search_params = {"metric_type": "BM25"} + results, _ = collection_w.search( + data=[test["query"]], + anns_field="bm25_sparse_vector", + param=search_params, + output_fields=["doc_id", "language", "article_content"], + limit=5, + ) + assert len(results) == 1 + assert len(results[0]) > 0 + + @pytest.mark.tags(CaseLabel.L0) + def test_multi_analyzer_alias(self): + """ + target: test alias for multi analyzer + method: insert data with languages in alias + expected: analyzer should be resolved correctly + """ + stop_words = ["a", "an", "the", "of", "to", " "] + multi_analyzer_params = { + "by_field": "language", + "analyzers": { + "en": { + "tokenizer": "standard", + "filter": [ + { + "type": "stop", # Specifies 'stop' as the filter type + "stop_words": stop_words, # Customizes stop words for this filter type + } + ], + }, + "zh": { + "tokenizer": "jieba", + "filter": [ + { + "type": "stop", # Specifies 'stop' as the filter type + "stop_words": stop_words, # Customizes stop words for this filter type + } + ], + }, + "default": {"tokenizer": "icu"}, + }, + "alias": {"chinese": "zh", "eng": "en"}, + } + fields = [ + FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True), + FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16), + FieldSchema( + name="article_content", + dtype=DataType.VARCHAR, + max_length=8192, + enable_analyzer=True, + multi_analyzer_params=multi_analyzer_params, + ), + FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR), + ] + schema = CollectionSchema( + fields=fields, description="Multi-analyzer fallback test" + ) + bm25_func = Function( + name="bm25", + function_type=FunctionType.BM25, + input_field_names=["article_content"], + output_field_names=["bm25_sparse_vector"], + ) + schema.add_function(bm25_func) + c_name = cf.gen_unique_str(prefix) + collection_w = self.init_collection_wrap(name=c_name, schema=schema) + data = [ + { + "doc_id": 1, + "language": "en", + "article_content": "English text for testing", + }, + { + "doc_id": 2, + "language": "eng", + "article_content": "English text for testing" + + " ".join(stop_words * 5), + }, + {"doc_id": 3, "language": "zh", "article_content": "中文测试文本 "}, + { + "doc_id": 4, + "language": "chinese", + "article_content": "中文测试文本 " + " ".join(stop_words * 5), + }, + { + "doc_id": 5, + "language": "fr", + "article_content": "Texte français pour les tests.", + }, + { + "doc_id": 6, + "language": 
"de", + "article_content": "Deutscher Text zum Testen.", + }, + { + "doc_id": 7, + "language": "unknown", + "article_content": "Text in unknown language.", + }, + { + "doc_id": 8, + "language": "default", + "article_content": " ".join(stop_words * 5), + }, + ] + # " ." * 1000 will be removed in en and zh analyzer, but will be kept in icu analyzer + # if chinese and eng are not go to the alias as expected, then doc is 8 will be returned + collection_w.insert(data) + collection_w.create_index( + "bm25_sparse_vector", + {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "BM25"}, + ) + collection_w.load() + alias_tests = [ + { + "analyzer_name": "eng", + "query": "English text for testing." + " ".join(stop_words * 10), + }, + { + "analyzer_name": "chinese", + "query": "中文测试文本。" + " ".join(stop_words * 10), + }, + ] + + for test in alias_tests: + search_params = { + "metric_type": "BM25", + "analyzer_name": test["analyzer_name"], + } + results, _ = collection_w.search( + data=[test["query"]], + anns_field="bm25_sparse_vector", + param=search_params, + output_fields=["doc_id", "language", "article_content"], + limit=10, + ) + log.info(test) + log.info(results) + assert len(results) == 1 + assert len(results[0]) > 0 + if test["analyzer_name"] == "eng": + # return id is 1,2 + assert results[0][0]["doc_id"] in [1, 2] + assert results[0][1]["doc_id"] in [1, 2] + elif test["analyzer_name"] == "chinese": + # return id is 3,4 + assert results[0][0]["doc_id"] in [3, 4] + assert results[0][1]["doc_id"] in [3, 4] + + alias_tests = [ + {"analyzer_name": "icu", "query": " ".join(stop_words * 10)}, + {"analyzer_name": "default", "query": " ".join(stop_words * 10)}, + ] + for test in alias_tests: + search_params = { + "metric_type": "BM25", + "analyzer_name": test["analyzer_name"], + } + results, _ = collection_w.search( + data=[test["query"]], + anns_field="bm25_sparse_vector", + param=search_params, + output_fields=["doc_id", "language", "article_content"], + limit=10, + ) + log.info(test) + log.info(results) + assert len(results) == 1 + assert len(results[0]) > 0 + for r in results[0]: + assert r["doc_id"] not in [1, 2, 3, 4] + + @pytest.mark.tags(CaseLabel.L0) + def test_multi_analyzer_correctness(self): + """ + target: test multi_analyzer correctness + method: create collection, insert and search using utility + expected: utility workflow works as expected + """ + from utils.util_fts import FTSMultiAnalyzerChecker + + self._connect() + client = self.client + c_name = cf.gen_unique_str(prefix) + language_field = "language" + text_field = "article_content" + ft_checker = FTSMultiAnalyzerChecker( + collection_name=c_name, + language_field_name=language_field, + text_field_name=text_field, + client=client, + ) + ft_checker.init_collection() + language_list = ["en", "zh", "fr", "jp"] + data = ft_checker.generate_test_data(num_rows=100, lang_list=language_list) + original_data, tokenized_data = ft_checker.insert_data(data) + original_data = pd.DataFrame(original_data) + ft_checker.create_index() + sample_data = random.sample(tokenized_data, 10) + for item in sample_data: + doc_id = item["doc_id"] + tokenized_query = item[text_field] + original_query = original_data.loc[ + original_data["doc_id"] == doc_id, text_field + ].iloc[0] + language = item[language_field] + res, mock_res = ft_checker.search( + original_query, tokenized_query, language, limit=5 + ) + res_set = set([r["doc_id"] for r in res[0]]) + mock_res_set = set([r["doc_id"] for r in mock_res[0]]) + res_diff = res_set - mock_res_set + 
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_multi_analyzer_correctness(self):
+        """
+        target: test multi_analyzer correctness end to end
+        method: create collection, insert and search via the FTSMultiAnalyzerChecker utility
+        expected: search results match those of the pre-tokenized mock collection
+        """
+        from utils.util_fts import FTSMultiAnalyzerChecker
+
+        self._connect()
+        client = self.client
+        c_name = cf.gen_unique_str(prefix)
+        language_field = "language"
+        text_field = "article_content"
+        ft_checker = FTSMultiAnalyzerChecker(
+            collection_name=c_name,
+            language_field_name=language_field,
+            text_field_name=text_field,
+            client=client,
+        )
+        ft_checker.init_collection()
+        language_list = ["en", "zh", "fr", "jp"]
+        data = ft_checker.generate_test_data(num_rows=100, lang_list=language_list)
+        original_data, tokenized_data = ft_checker.insert_data(data)
+        original_data = pd.DataFrame(original_data)
+        ft_checker.create_index()
+        sample_data = random.sample(tokenized_data, 10)
+        for item in sample_data:
+            doc_id = item["doc_id"]
+            tokenized_query = item[text_field]
+            original_query = original_data.loc[
+                original_data["doc_id"] == doc_id, text_field
+            ].iloc[0]
+            language = item[language_field]
+            res, mock_res = ft_checker.search(
+                original_query, tokenized_query, language, limit=5
+            )
+            res_set = {r["doc_id"] for r in res[0]}
+            mock_res_set = {r["doc_id"] for r in mock_res[0]}
+            res_diff = res_set - mock_res_set
+            mock_res_diff = mock_res_set - res_set
+            if res_diff or mock_res_diff:
+                log.error(f"result diff: {res_diff}, {mock_res_diff}")
+                assert False, (
+                    f"result diff: {res_diff} in original results but not in mock, "
+                    f"{mock_res_diff} in mock but not in original"
+                )
+
+
+class TestFullTextSearchMultiAnalyzerInvalid(TestcaseBase):
+    """
+    Negative and edge cases for multi_analyzer_params: malformed configs are
+    expected to fail at collection creation, while the nullable and alias edge
+    cases document the expected fallback behavior.
+    """
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_missing_by_field(self):
+        """
+        target: test missing by_field in multi_analyzer_params
+        method: create collection without by_field
+        expected: collection creation should fail because by_field is missing
+        """
+        missing_by_field = {
+            "analyzers": {
+                "en": {"type": "english"},
+                "default": {"tokenizer": "standard"},
+            }
+        }
+        fields = [
+            FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16),
+            FieldSchema(
+                name="article_content",
+                dtype=DataType.VARCHAR,
+                max_length=1024,
+                enable_analyzer=True,
+                multi_analyzer_params=missing_by_field,
+            ),
+            FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
+        ]
+        schema = CollectionSchema(
+            fields=fields, description="Invalid multi-analyzer test"
+        )
+        bm25_func = Function(
+            name="bm25",
+            function_type=FunctionType.BM25,
+            input_field_names=["article_content"],
+            output_field_names=["bm25_sparse_vector"],
+        )
+        schema.add_function(bm25_func)
+        collection_name = cf.gen_unique_str(prefix)
+        with pytest.raises(Exception):
+            self.init_collection_wrap(name=collection_name, schema=schema)
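+
+    # For contrast with the negative cases in this class, a minimal sketch of a
+    # well-formed multi_analyzer_params (assembled from the passing tests above;
+    # for reference only): "by_field" must name an existing scalar field and
+    # "analyzers" must contain a "default" entry, while "alias" is optional.
+    _VALID_PARAMS_EXAMPLE = {
+        "by_field": "language",
+        "analyzers": {
+            "en": {"type": "english"},
+            "default": {"tokenizer": "standard"},
+        },
+    }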
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_by_field_not_exist(self):
+        """
+        target: test by_field referencing a nonexistent field
+        method: create collection whose by_field names a field that does not exist
+        expected: collection creation should fail because the by_field does not exist
+        """
+        invalid_by_field = {
+            "by_field": "not_exist",
+            "analyzers": {
+                "en": {"type": "english"},
+                "default": {"tokenizer": "standard"},
+            },
+        }
+        fields = [
+            FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16),
+            FieldSchema(
+                name="article_content",
+                dtype=DataType.VARCHAR,
+                max_length=1024,
+                enable_analyzer=True,
+                multi_analyzer_params=invalid_by_field,
+            ),
+            FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
+        ]
+        schema = CollectionSchema(
+            fields=fields, description="Invalid multi-analyzer test"
+        )
+        bm25_func = Function(
+            name="bm25",
+            function_type=FunctionType.BM25,
+            input_field_names=["article_content"],
+            output_field_names=["bm25_sparse_vector"],
+        )
+        schema.add_function(bm25_func)
+        c_name = cf.gen_unique_str(prefix)
+        with pytest.raises(Exception):
+            self.init_collection_wrap(name=c_name, schema=schema)
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_by_field_is_nullable(self):
+        """
+        target: test nullable by_field in multi_analyzer_params
+        method: create collection whose by_field is nullable
+        expected: collection creation should succeed because by_field may be nullable
+        """
+        multi_analyzer_params = {
+            "by_field": "language",
+            "analyzers": {
+                "en": {"type": "english"},
+                "default": {"tokenizer": "standard"},
+            },
+        }
+        fields = [
+            FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(
+                name="language", dtype=DataType.VARCHAR, max_length=16, nullable=True
+            ),
+            FieldSchema(
+                name="article_content",
+                dtype=DataType.VARCHAR,
+                max_length=1024,
+                enable_analyzer=True,
+                multi_analyzer_params=multi_analyzer_params,
+            ),
+            FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
+        ]
+        schema = CollectionSchema(
+            fields=fields, description="Nullable by_field multi-analyzer test"
+        )
+        bm25_func = Function(
+            name="bm25",
+            function_type=FunctionType.BM25,
+            input_field_names=["article_content"],
+            output_field_names=["bm25_sparse_vector"],
+        )
+        schema.add_function(bm25_func)
+        c_name = cf.gen_unique_str(prefix)
+        collection_w = self.init_collection_wrap(name=c_name, schema=schema)
+
+        data = [
+            {
+                "doc_id": 1,
+                "language": "en",
+                "article_content": "English text for testing.",
+            },
+            {"doc_id": 2, "language": "zh", "article_content": "中文测试文本。"},
+            {
+                "doc_id": 3,
+                "language": "fr",
+                "article_content": "Texte français pour les tests.",
+            },
+            {
+                "doc_id": 4,
+                "language": "de",
+                "article_content": "Deutscher Text zum Testen.",
+            },
+            {
+                "doc_id": 5,
+                "language": "unknown",
+                "article_content": "Text in unknown language.",
+            },
+            {"doc_id": 6, "language": None, "article_content": "nullable test"},
+            {"doc_id": 7, "language": None, "article_content": "nullable test"},
+        ]
+        collection_w.insert(data)
+        collection_w.create_index(
+            "bm25_sparse_vector",
+            {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "BM25"},
+        )
+        collection_w.load()
+        query_tests = [
+            {"analyzer_name": "", "query": "texte français"},
+            {"analyzer_name": "de", "query": "deutscher text"},
+            {"analyzer_name": "unknown", "query": "unknown language"},
+            {"analyzer_name": None, "query": "nullable language"},
+        ]
+        for test in query_tests:
+            search_params = {
+                "metric_type": "BM25",
+                "analyzer_name": test["analyzer_name"],
+            }
+            results, _ = collection_w.search(
+                data=[test["query"]],
+                anns_field="bm25_sparse_vector",
+                param=search_params,
+                output_fields=["doc_id", "language", "article_content"],
+                limit=5,
+            )
+            assert len(results) == 1
+            assert len(results[0]) > 0
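+
+    # Note: for the rows above whose by_field value is NULL (doc_id 6 and 7),
+    # the text is presumably routed through the "default" analyzer, which is
+    # why the searches still return hits even without a matching analyzer_name.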
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_text_field_is_nullable(self):
+        """
+        target: test nullable text field used as BM25 function input
+        method: create collection where the analyzer-enabled text field is nullable
+        expected: collection creation should fail because a function input field cannot be nullable
+        """
+        multi_analyzer_params = {
+            "by_field": "language",
+            "analyzers": {
+                "en": {"type": "english"},
+                "default": {"tokenizer": "standard"},
+            },
+        }
+        fields = [
+            FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16),
+            FieldSchema(
+                name="article_content",
+                dtype=DataType.VARCHAR,
+                max_length=1024,
+                enable_analyzer=True,
+                multi_analyzer_params=multi_analyzer_params,
+                nullable=True,
+            ),
+            FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
+        ]
+        schema = CollectionSchema(
+            fields=fields, description="Invalid multi-analyzer test"
+        )
+        bm25_func = Function(
+            name="bm25",
+            function_type=FunctionType.BM25,
+            input_field_names=["article_content"],
+            output_field_names=["bm25_sparse_vector"],
+        )
+        schema.add_function(bm25_func)
+        c_name = cf.gen_unique_str(prefix)
+        error = {
+            ct.err_code: 65535,
+            ct.err_msg: "function input field cannot be nullable",
+        }
+        self.init_collection_wrap(
+            name=c_name, schema=schema, check_task=CheckTasks.err_res, check_items=error
+        )
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_missing_default_analyzer(self):
+        """
+        target: test missing default analyzer in multi_analyzer_params
+        method: create collection without a default analyzer
+        expected: collection creation should fail because no default analyzer is defined
+        """
+        missing_default = {
+            "by_field": "language",
+            "analyzers": {"en": {"type": "english"}, "zh": {"type": "chinese"}},
+        }
+        fields = [
+            FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16),
+            FieldSchema(
+                name="article_content",
+                dtype=DataType.VARCHAR,
+                max_length=1024,
+                enable_analyzer=True,
+                multi_analyzer_params=missing_default,
+            ),
+            FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
+        ]
+        schema = CollectionSchema(
+            fields=fields, description="Invalid multi-analyzer test"
+        )
+        bm25_func = Function(
+            name="bm25",
+            function_type=FunctionType.BM25,
+            input_field_names=["article_content"],
+            output_field_names=["bm25_sparse_vector"],
+        )
+        schema.add_function(bm25_func)
+        collection_name = cf.gen_unique_str(prefix)
+        with pytest.raises(Exception):
+            self.init_collection_wrap(name=collection_name, schema=schema)
+
+    @pytest.mark.tags(CaseLabel.L0)
+    def test_alias_point_not_exist_analyzer(self):
+        """
+        target: test alias pointing to a nonexistent analyzer
+        method: create collection with an alias entry that points to an undefined analyzer
+        expected: collection creation should succeed because lookups fall back to the default analyzer
+        """
+        alias_to_missing = {
+            "by_field": "language",
+            "analyzers": {
+                "en": {"type": "english"},
+                "zh": {"type": "chinese"},
+                "default": {"type": "english"},
+            },
+            "alias": {"chinese": "zh", "eng": "en", "fr": "fr"},
+        }
+        fields = [
+            FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True),
+            FieldSchema(name="language", dtype=DataType.VARCHAR, max_length=16),
+            FieldSchema(
+                name="article_content",
+                dtype=DataType.VARCHAR,
+                max_length=1024,
+                enable_analyzer=True,
+                multi_analyzer_params=alias_to_missing,
+            ),
+            FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
+        ]
+        schema = CollectionSchema(
+            fields=fields, description="Invalid multi-analyzer test"
+        )
+        bm25_func = Function(
+            name="bm25",
+            function_type=FunctionType.BM25,
+            input_field_names=["article_content"],
+            output_field_names=["bm25_sparse_vector"],
+        )
+        schema.add_function(bm25_func)
+        c_name = cf.gen_unique_str(prefix)
+        self.init_collection_wrap(name=c_name, schema=schema)
diff --git a/tests/python_client/utils/util_fts.py b/tests/python_client/utils/util_fts.py
new file mode 100644
index 0000000000..c206bbe594
--- /dev/null
+++ b/tests/python_client/utils/util_fts.py
@@ -0,0 +1,356 @@
+import random
+import time
+import logging
+from typing import List, Dict, Optional, Tuple
+import pandas as pd
+from faker import Faker
+from pymilvus import (
+    FieldSchema,
+    CollectionSchema,
+    DataType,
+    Function,
+    FunctionType,
+    Collection,
+    connections,
+    MilvusClient,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class FTSMultiAnalyzerChecker:
+    """
+    Utility checker for multi-analyzer full-text search testing.
+    Covers schema construction, multilingual test data generation, index
+    creation, and BM25 result verification.
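+
+    The core idea: alongside the real collection, a "mock" twin is created
+    whose only analyzer is a plain whitespace tokenizer. Documents are inserted
+    into the mock pre-tokenized on the client side with the same analyzers the
+    real collection is configured with, so BM25 results from the two
+    collections should agree whenever server-side analyzer dispatch is correct.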
+ """ + + # Constant definitions + DEFAULT_TEXT_MAX_LENGTH = 8192 + DEFAULT_LANG_MAX_LENGTH = 16 + DEFAULT_DOC_ID_START = 100 + + # Faker multilingual instances as class attributes to avoid repeated creation + fake_en = Faker("en_US") + fake_zh = Faker("zh_CN") + fake_fr = Faker("fr_FR") + fake_jp = Faker("ja_JP") + + def __init__( + self, + collection_name: str, + language_field_name: str, + text_field_name: str, + multi_analyzer_params: Optional[Dict] = None, + client: Optional[MilvusClient] = None, + ): + self.collection_name = collection_name + self.mock_collection_name = collection_name + "_mock" + self.language_field_name = language_field_name + self.text_field_name = text_field_name + self.multi_analyzer_params = ( + multi_analyzer_params + if multi_analyzer_params is not None + else { + "by_field": self.language_field_name, + "analyzers": { + "en": {"type": "english"}, + "zh": {"type": "chinese"}, + "icu": { + "tokenizer": "icu", + "filter": [{"type": "stop", "stop_words": [" "]}], + }, + "default": {"tokenizer": "whitespace"}, + }, + "alias": {"chinese": "zh", "eng": "en", "fr": "icu", "jp": "icu"}, + } + ) + self.mock_multi_analyzer_params = { + "by_field": self.language_field_name, + "analyzers": {"default": {"tokenizer": "whitespace"}}, + } + self.client = client + self.collection = None + self.mock_collection = None + + def resolve_analyzer(self, lang: str) -> str: + """ + Return the analyzer name according to the language. + Args: + lang (str): Language identifier + Returns: + str: Analyzer name + """ + if lang in self.multi_analyzer_params["analyzers"]: + return lang + if lang in self.multi_analyzer_params.get("alias", {}): + return self.multi_analyzer_params["alias"][lang] + return "default" + + def build_schema(self, multi_analyzer_params: dict) -> CollectionSchema: + """ + Build a collection schema with multi-analyzer parameters. + Args: + multi_analyzer_params (dict): Analyzer parameters + Returns: + CollectionSchema: Constructed collection schema + """ + fields = [ + FieldSchema(name="doc_id", dtype=DataType.INT64, is_primary=True), + FieldSchema( + name=self.language_field_name, + dtype=DataType.VARCHAR, + max_length=self.DEFAULT_LANG_MAX_LENGTH, + ), + FieldSchema( + name=self.text_field_name, + dtype=DataType.VARCHAR, + max_length=self.DEFAULT_TEXT_MAX_LENGTH, + enable_analyzer=True, + multi_analyzer_params=multi_analyzer_params, + ), + FieldSchema(name="bm25_sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR), + ] + schema = CollectionSchema( + fields=fields, description="Multi-analyzer BM25 schema test" + ) + bm25_func = Function( + name="bm25", + function_type=FunctionType.BM25, + input_field_names=[self.text_field_name], + output_field_names=["bm25_sparse_vector"], + ) + schema.add_function(bm25_func) + return schema + + def init_collection(self) -> None: + """ + Initialize Milvus collections, delete if exists first. 
+ """ + try: + if self.client.has_collection(self.collection_name): + self.client.drop_collection(self.collection_name) + if self.client.has_collection(self.mock_collection_name): + self.client.drop_collection(self.mock_collection_name) + self.collection = Collection( + name=self.collection_name, + schema=self.build_schema(self.multi_analyzer_params), + ) + self.mock_collection = Collection( + name=self.mock_collection_name, + schema=self.build_schema(self.mock_multi_analyzer_params), + ) + except Exception as e: + logger.error(f"collection init failed: {e}") + raise + + def get_tokens_by_analyzer(self, text: str, analyzer_params: dict) -> List[str]: + """ + Tokenize text according to analyzer parameters. + Args: + text (str): Text to be tokenized + analyzer_params (dict): Analyzer parameters + Returns: + List[str]: List of tokenized text + """ + try: + res = self.client.run_analyzer(text, analyzer_params) + # Filter out tokens that are just whitespace + return [token for token in res.tokens if token.strip()] + except Exception as e: + logger.error(f"Tokenization failed: {e}") + return [] + + def generate_test_data( + self, num_rows: int = 3000, lang_list: Optional[List[str]] = None + ) -> List[Dict]: + """ + Generate test data according to the schema, row count and language list. + Each row will contain language, article content and other fields. + Args: + num_rows (int): Number of data rows to generate + lang_list (Optional[List[str]]): List of languages + Returns: + List[Dict]: Generated test data list + """ + if lang_list is None: + lang_list = ["en", "eng", "zh", "fr", "chinese", "jp", ""] + data = [] + for i in range(num_rows): + lang = random.choice(lang_list) + # Generate article content according to language + if lang in ("en", "eng"): + content = self.fake_en.sentence() + elif lang in ("zh", "chinese"): + content = self.fake_zh.sentence() + elif lang == "fr": + content = self.fake_fr.sentence() + elif lang == "jp": + content = self.fake_jp.sentence() + else: + content = "" + row = { + "doc_id": i + self.DEFAULT_DOC_ID_START, + self.language_field_name: lang, + self.text_field_name: content, + } + data.append(row) + return data + + def tokenize_data_by_multi_analyzer( + self, data_list: List[Dict], verbose: bool = False + ) -> List[Dict]: + """ + Tokenize data according to multi-analyzer parameters. + Args: + data_list (List[Dict]): Data list + verbose (bool): Whether to print detailed information + Returns: + List[Dict]: Tokenized data list + """ + data_list_tokenized = [] + for row in data_list: + lang = row.get(self.language_field_name, None) + content = row.get(self.text_field_name, "") + doc_analyzer = self.resolve_analyzer(lang) + doc_analyzer_params = self.multi_analyzer_params["analyzers"][doc_analyzer] + content_tokens = self.get_tokens_by_analyzer(content, doc_analyzer_params) + tokenized_content = " ".join(content_tokens) + data_list_tokenized.append( + { + "doc_id": row.get("doc_id"), + self.language_field_name: lang, + self.text_field_name: tokenized_content, + } + ) + if verbose: + original_data = pd.DataFrame(data_list) + tokenized_data = pd.DataFrame(data_list_tokenized) + logger.info(f"Original data:\n{original_data}") + logger.info(f"Tokenized data:\n{tokenized_data}") + return data_list_tokenized + + def insert_data( + self, data: List[Dict], verbose: bool = False + ) -> Tuple[List[Dict], List[Dict]]: + """ + Insert test data and return original and tokenized data. 
+ Args: + data (List[Dict]): Original data list + verbose (bool): Whether to print detailed information + Returns: + Tuple[List[Dict], List[Dict]]: (original data, tokenized data) + """ + try: + self.collection.insert(data) + self.collection.flush() + except Exception as e: + logger.error(f"Failed to insert original data: {e}") + raise + t0 = time.time() + tokenized_data = self.tokenize_data_by_multi_analyzer(data, verbose=verbose) + t1 = time.time() + logger.info(f"Tokenization time: {t1 - t0}") + try: + self.mock_collection.insert(tokenized_data) + self.mock_collection.flush() + except Exception as e: + logger.error(f"Failed to insert tokenized data: {e}") + raise + return data, tokenized_data + + def create_index(self) -> None: + """ + Create BM25 index for sparse vector field. + """ + for c in [self.collection, self.mock_collection]: + try: + c.create_index( + "bm25_sparse_vector", + {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "BM25"}, + ) + c.load() + except Exception as e: + logger.error(f"Failed to create index: {e}") + raise + + def search( + self, origin_query: str, tokenized_query: str, language: str, limit: int = 10 + ) -> Tuple[list, list]: + """ + Search interface, perform BM25 search on main and mock collections respectively. + Args: + origin_query (str): Original query text + tokenized_query (str): Tokenized query text + language (str): Query language + limit (int): Number of results to return + Returns: + Tuple[list, list]: (main collection results, mock collection results) + """ + analyzer_name = self.resolve_analyzer(language) + search_params = {"metric_type": "BM25", "analyzer_name": analyzer_name} + logger.info(f"search_params: {search_params}") + try: + res = self.collection.search( + data=[origin_query], + anns_field="bm25_sparse_vector", + param=search_params, + output_fields=["doc_id"], + limit=limit, + ) + mock_res = self.mock_collection.search( + data=[tokenized_query], + anns_field="bm25_sparse_vector", + param=search_params, + output_fields=["doc_id"], + limit=limit, + ) + return res, mock_res + except Exception as e: + logger.error(f"Search failed: {e}") + return [], [] + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + connections.connect("default", host="10.104.25.52", port="19530") + client = MilvusClient(uri="http://10.104.25.52:19530") + ft = FTSMultiAnalyzerChecker( + "test_collection", "language", "article_content", client=client + ) + ft.init_collection() + ft.create_index() + language_list = ["jp", "en", "fr", "zh"] + data = ft.generate_test_data(1000, language_list) + _, tokenized_data = ft.insert_data(data) + search_sample_data = random.sample(tokenized_data, 10) + for row in search_sample_data: + tokenized_query = row[ft.text_field_name] + # Find the same doc_id in the original data and get the original query + # Use pandas to find the item with matching doc_id + # Convert data to DataFrame if it's not already + if not isinstance(data, pd.DataFrame): + data_df = pd.DataFrame(data) + else: + data_df = data + # Filter by doc_id and get the text field value + origin_query = data_df.loc[ + data_df["doc_id"] == row["doc_id"], ft.text_field_name + ].iloc[0] + logger.info(f"Query: {tokenized_query}") + logger.info(f"Origin Query: {origin_query}") + language = row[ft.language_field_name] + logger.info(f"language: {language}") + res, mock_res = ft.search(origin_query, tokenized_query, language) + logger.info(f"Main collection search result: {res}") + logger.info(f"Mock collection search result: {mock_res}") + if res and 
mock_res:
+        res_set = {r["doc_id"] for r in res[0]}
+        mock_res_set = {r["doc_id"] for r in mock_res[0]}
+        res_diff = res_set - mock_res_set
+        mock_res_diff = mock_res_set - res_set
+        logger.info(f"Diff: {res_diff}, {mock_res_diff}")
+        if res_diff or mock_res_diff:
+            logger.error(
+                f"Search results inconsistent: {res_diff}, {mock_res_diff}"
+            )
+            assert False, (
+                f"search results inconsistent: {res_diff} only in main collection, "
+                f"{mock_res_diff} only in mock collection"
+            )