test: add restful cases for full text search and some minor fix (#37148)

/kind improvement

---------

Signed-off-by: zhuwenxing <wenxing.zhu@zilliz.com>
zhuwenxing 2024-10-31 21:18:23 +08:00 committed by GitHub
parent d24970c090
commit 247f75180f
7 changed files with 528 additions and 53 deletions
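For orientation, the new cases exercise full text search end to end over the v2 RESTful API: create a collection whose BM25 function derives a sparse vector from a tokenized VarChar field, insert raw text, and search with query text instead of vectors. The sketch below is illustrative only; /v2/vectordb/collections/create appears in the hunks that follow, while the entities insert/search paths and the endpoint address are assumptions.

import requests

ENDPOINT = "http://localhost:19530"  # assumed local Milvus endpoint
HEADERS = {"Content-Type": "application/json"}

# Schema: the BM25 function turns the tokenized VarChar field into a sparse vector.
create_payload = {
    "collectionName": "full_text_demo",
    "schema": {
        "autoId": True,
        "fields": [
            {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
            {"fieldName": "document_content", "dataType": "VarChar",
             "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
                                   "analyzer_params": {"tokenizer": "default"},
                                   "enable_match": True}},
            {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
        ],
        "functions": [
            {"name": "bm25_fn", "type": "BM25",
             "inputFieldNames": ["document_content"],
             "outputFieldNames": ["sparse_vector"], "params": {}}
        ],
    },
    "indexParams": [
        {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
         "params": {"index_type": "SPARSE_INVERTED_INDEX"}}
    ],
}
requests.post(f"{ENDPOINT}/v2/vectordb/collections/create", headers=HEADERS, json=create_payload)

# Insert raw text; the sparse vector is produced server-side by the BM25 function.
requests.post(f"{ENDPOINT}/v2/vectordb/entities/insert", headers=HEADERS,  # assumed path
              json={"collectionName": "full_text_demo",
                    "data": [{"document_content": "milvus is a vector database"}]})

# Search with query text rather than a vector.
requests.post(f"{ENDPOINT}/v2/vectordb/entities/search", headers=HEADERS,  # assumed path
              json={"collectionName": "full_text_demo", "data": ["vector database"],
                    "limit": 10, "searchParams": {"params": {"drop_ratio_search": "0.2"}}})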


@@ -334,6 +334,7 @@ class CollectionClient(Requests):
self.endpoint = endpoint
self.api_key = token
self.db_name = None
self.name_list = []
self.headers = self.update_headers()
@classmethod
@@ -435,6 +436,10 @@ class CollectionClient(Requests):
def collection_create(self, payload, db_name="default"):
time.sleep(1) # wait for collection created and in case of rate limit
c_name = payload.get("collectionName", None)
db_name = payload.get("dbName", db_name)
self.name_list.append((db_name, c_name))
url = f'{self.endpoint}/v2/vectordb/collections/create'
if self.db_name is not None:
payload["dbName"] = self.db_name


@@ -50,6 +50,17 @@ class TestBase(Base):
rsp = self.collection_client.collection_drop(payload)
except Exception as e:
logger.error(e)
for item in self.collection_client.name_list:
db_name = item[0]
c_name = item[1]
payload = {
"collectionName": c_name,
"dbName": db_name
}
try:
self.collection_client.collection_drop(payload)
except Exception as e:
logger.error(e)
@pytest.fixture(scope="function", autouse=True)
def init_client(self, endpoint, token, minio_host, bucket_name, root_path):


@@ -103,7 +103,7 @@ class TestCreateCollection(TestBase):
"collectionName": name,
"dimension": dim,
"metricType": metric_type,
"params":{
"params": {
"enableDynamicField": enable_dynamic_field,
"shardsNum": request_shards_num,
"consistencyLevel": f"{consistency_level}",
@@ -147,7 +147,7 @@ class TestCreateCollection(TestBase):
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
@pytest.mark.parametrize("consistency_level", ["Strong", "Bounded"])
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@pytest.mark.parametrize("index_type", ["AUTOINDEX","IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("dim", [128])
def test_create_collections_with_all_params(self,
dim,
@@ -179,6 +179,7 @@ class TestCreateCollection(TestBase):
"FLAT": {},
"IVF_SQ8": {"nlist": 16384},
"HNSW": {"M": 16, "efConstruction": 500},
"BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5},
"AUTOINDEX": {}
}
@@ -197,15 +198,32 @@ class TestCreateCollection(TestBase):
{"fieldName": "book_id", "dataType": "Int64",
"isPrimary": primary_key_field == "book_id", "elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64",
"isPartitionKey": partition_key_field == "word_count", "isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}},
"isPartitionKey": partition_key_field == "word_count",
"isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}},
{"fieldName": "book_category", "dataType": "Int64",
"isPartitionKey": partition_key_field == "book_category",
"isClusteringKey": clustering_key_field == "book_category", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": "default"
},
"enable_match": True}},
{"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}},
{"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64",
"elementTypeParams": {"max_capacity": "1024"}},
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}
{"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [
@@ -214,7 +232,14 @@ class TestCreateCollection(TestBase):
"metricType": f"{metric_type}",
"indexType": index_type,
"params": index_param_map[index_type]
}]
},
{"fieldName": "sparse_vector",
"indexName": "sparse_vector_index",
"metricType": "BM25",
"indexType": "SPARSE_INVERTED_INDEX",
"params": index_param_map["BM25_SPARSE_INVERTED_INDEX"]
}
]
}
logging.info(f"create collection {name} with payload: {payload}")
@@ -244,6 +269,7 @@ class TestCreateCollection(TestBase):
assert rsp['data']['partitionsNum'] == num_partitions
assert rsp['data']['consistencyLevel'] == consistency_level
assert ttl_seconds_actual == ttl_seconds
assert len(rsp['data']["functions"]) == len(payload["schema"]["functions"])
#
# # check fields properties
fields = rsp['data']['fields']
@@ -259,11 +285,16 @@ class TestCreateCollection(TestBase):
# check index
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
assert len(index_info) == 1
assert index_info[0]["index_param"]['metric_type'] == metric_type
assert index_info[0]["index_param"]['index_type'] == index_type
assert index_info[0]["index_param"].get("params", {}) == index_param_map[index_type]
assert len(index_info) == 2
for index in index_info:
index_param = index["index_param"]
if index_param["index_type"] == "SPARSE_INVERTED_INDEX":
assert index_param["metric_type"] == "BM25"
assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"]
else:
assert index_param["metric_type"] == metric_type
assert index_param["index_type"] == index_type
assert index_param.get("params", {}) == index_param_map[index_type]
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("enable_dynamic_field", [True, False])
@@ -686,9 +717,6 @@ class TestCreateCollectionNegative(TestBase):
rsp = client.collection_create(payload)
assert rsp['code'] == 1801
@pytest.mark.parametrize("name",
[" ", "test_collection_" * 100, "test collection", "test/collection", "test\collection"])
def test_create_collections_with_invalid_collection_name(self, name):
@@ -797,6 +825,7 @@ class TestGetCollectionStats(TestBase):
rsp = client.collection_stats(collection_name=name)
assert rsp['data']['rowCount'] == nb
@pytest.mark.L0
class TestLoadReleaseCollection(TestBase):
@@ -845,6 +874,7 @@ class TestLoadReleaseCollection(TestBase):
rsp = client.collection_load_state(collection_name=name)
assert rsp['data']['loadState'] == "LoadStateNotLoad"
@pytest.mark.L0
class TestGetCollectionLoadState(TestBase):
@@ -1126,6 +1156,7 @@ class TestRenameCollection(TestBase):
assert new_name in all_collections
assert name not in all_collections
@pytest.mark.L1
class TestCollectionWithAuth(TestBase):
def test_drop_collections_with_invalid_api_key(self):


@@ -1,28 +1,42 @@
import random
from sklearn import preprocessing
import numpy as np
import sys
import json
import time
from utils import constant
from utils.utils import gen_collection_name
from utils.utils import gen_collection_name, patch_faker_text, en_vocabularies_distribution, \
zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
from utils.utils import gen_vector
from pymilvus import (
FieldSchema, CollectionSchema, DataType,
Collection
)
from faker import Faker
Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)
index_param_map = {
"FLAT": {},
"IVF_SQ8": {"nlist": 128},
"HNSW": {"M": 16, "efConstruction": 200},
"BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5},
"AUTOINDEX": {}
}
@pytest.mark.L0
class TestCreateIndex(TestBase):
@pytest.mark.parametrize("metric_type", ["L2"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "HNSW"])
@pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
@pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
@pytest.mark.parametrize("dim", [128])
def test_index_e2e(self, dim, metric_type, index_type):
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365")
def test_index_default(self, dim, metric_type, index_type):
"""
target: test create collection
method: create a collection with a simple schema
@@ -43,38 +57,21 @@ class TestCreateIndex(TestBase):
}
logger.info(f"create collection {name} with payload: {payload}")
rsp = client.collection_create(payload)
# insert data
for i in range(1):
data = []
for j in range(3000):
tmp = {
"book_id": j,
"word_count": j,
"book_describe": f"book_{j}",
"book_intro": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[
0].tolist(),
}
data.append(tmp)
payload = {
"collectionName": name,
"data": data
}
rsp = self.vector_client.vector_insert(payload)
c = Collection(name)
c.flush()
# list index, expect empty
rsp = self.index_client.index_list(name)
# create index
payload = {
"collectionName": name,
"indexParams": [{"fieldName": "book_intro", "indexName": "book_intro_vector",
"metricType": f"{metric_type}"}]
"indexParams": [
{"fieldName": "book_intro", "indexName": "book_intro_vector",
"metricType": f"{metric_type}",
"indexType": f"{index_type}",
"params": index_param_map[index_type]
}
]
}
if index_type == "HNSW":
payload["indexParams"][0]["params"] = {"index_type": "HNSW", "M": "16", "efConstruction": "200"}
if index_type == "AUTOINDEX":
payload["indexParams"][0]["params"] = {"index_type": "AUTOINDEX"}
rsp = self.index_client.index_create(payload)
assert rsp['code'] == 0
time.sleep(10)
@@ -90,8 +87,19 @@ class TestCreateIndex(TestBase):
assert expected_index[i]['fieldName'] == actual_index[i]['fieldName']
assert expected_index[i]['indexName'] == actual_index[i]['indexName']
assert expected_index[i]['metricType'] == actual_index[i]['metricType']
assert expected_index[i]["params"]['index_type'] == actual_index[i]['indexType']
assert expected_index[i]["indexType"] == actual_index[i]['indexType']
# check index by pymilvus
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
for index in index_info:
index_param = index["index_param"]
if index_param["index_type"] == "SPARSE_INVERTED_INDEX":
assert index_param["metric_type"] == "BM25"
assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"]
else:
assert index_param["metric_type"] == metric_type
assert index_param["index_type"] == index_type
assert index_param.get("params", {}) == index_param_map[index_type]
# drop index
for i in range(len(actual_index)):
payload = {
@@ -241,6 +249,119 @@ class TestCreateIndex(TestBase):
assert expected_index[i]['indexName'] == actual_index[i]['indexName']
assert expected_index[i]['params']['index_type'] == actual_index[i]['indexType']
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True])
@pytest.mark.parametrize("is_partition_key", [True])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("tokenizer", ['default', 'jieba'])
@pytest.mark.parametrize("index_type", ['SPARSE_INVERTED_INDEX', 'SPARSE_WAND'])
@pytest.mark.parametrize("bm25_k1", [1.2, 1.5])
@pytest.mark.parametrize("bm25_b", [0.7, 0.5])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365")
def test_create_index_for_full_text_search(self, nb, dim, insert_round, auto_id, is_partition_key,
enable_dynamic_schema, tokenizer, index_type, bm25_k1, bm25_b):
"""
Create a collection with a BM25 function, insert text data, build a sparse index for full text search, then verify the index parameters.
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx % 100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0
# create index
payload = {
"collectionName": name,
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector",
"metricType": "BM25",
"indexType": index_type,
"params": {"bm25_k1": bm25_k1, "bm25_b": bm25_b}
}
]
}
rsp = self.index_client.index_create(payload)
c = Collection(name)
index_info = [index.to_dict() for index in c.indexes]
logger.info(f"index_info: {index_info}")
for info in index_info:
assert info['index_param']['metric_type'] == 'BM25'
assert info['index_param']["params"]['bm25_k1'] == bm25_k1
assert info['index_param']["params"]['bm25_b'] == bm25_b
assert info['index_param']['index_type'] == index_type
@pytest.mark.L1
class TestCreateIndexNegative(TestBase):


@@ -101,7 +101,6 @@ class TestCreateImportJob(TestBase):
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],
@@ -192,7 +191,6 @@ class TestCreateImportJob(TestBase):
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],
@@ -285,7 +283,6 @@ class TestCreateImportJob(TestBase):
assert False, "import job timeout"
c = Collection(name)
c.load(_refresh=True)
time.sleep(10)
res = c.query(
expr="",
output_fields=["count(*)"],
@@ -376,6 +373,7 @@ class TestCreateImportJob(TestBase):
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
@@ -456,6 +454,7 @@ class TestCreateImportJob(TestBase):
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
@@ -541,6 +540,7 @@ class TestCreateImportJob(TestBase):
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 2000
# assert import data can be queried
payload = {
@@ -665,6 +665,7 @@ class TestCreateImportJob(TestBase):
time.sleep(10)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == 6000
# assert import data can be queried
payload = {
@@ -915,6 +916,7 @@ class TestImportJobAdvance(TestBase):
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size
# assert import data can be queried
payload = {
@@ -1007,6 +1009,7 @@ class TestCreateImportJobAdvance(TestBase):
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size * task_num
# assert import data can be queried
payload = {
@@ -1096,6 +1099,7 @@ class TestCreateImportJobAdvance(TestBase):
rsp = self.import_job_client.list_import_jobs(payload)
# assert data count
c = Collection(name)
c.load(_refresh=True)
assert c.num_entities == file_nums * batch_size * task_num
# assert import data can be queried
payload = {


@@ -6,7 +6,7 @@ import sys
import json
import time
from utils import constant
from utils.utils import gen_collection_name, get_sorted_distance
from utils.utils import gen_collection_name, get_sorted_distance, patch_faker_text, en_vocabularies_distribution, zh_vocabularies_distribution
from utils.util_log import test_log as logger
import pytest
from base.testbase import TestBase
@@ -20,6 +20,9 @@ Faker.seed(19530)
fake_en = Faker("en_US")
fake_zh = Faker("zh_CN")
patch_faker_text(fake_en, en_vocabularies_distribution)
patch_faker_text(fake_zh, zh_vocabularies_distribution)
@pytest.mark.L0
class TestInsertVector(TestBase):
@@ -1193,14 +1196,108 @@ class TestSearchVector(TestBase):
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['default'])
def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
Full text search over English text: BM25 function on document_content, querying with raw text instead of vectors.
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0
# search data
payload = {
"collectionName": name,
"data": [gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="coo")],
"data": [fake.text().lower() for _ in range(1)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"metricType": "IP",
"params": {
"drop_ratio_search": "0.2",
}
@@ -1211,6 +1308,125 @@ class TestSearchVector(TestBase):
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) > 0
@pytest.mark.parametrize("insert_round", [1])
@pytest.mark.parametrize("auto_id", [True, False])
@pytest.mark.parametrize("is_partition_key", [True, False])
@pytest.mark.parametrize("enable_dynamic_schema", [True])
@pytest.mark.parametrize("nb", [3000])
@pytest.mark.parametrize("dim", [128])
@pytest.mark.parametrize("groupingField", ['user_id', None])
@pytest.mark.parametrize("tokenizer", ['jieba'])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751")
def test_search_vector_for_zh_full_text_search(self, nb, dim, insert_round, auto_id,
is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
"""
Full text search over Chinese text with the jieba tokenizer: BM25 function on document_content, querying with raw text.
"""
# create a collection
name = gen_collection_name()
payload = {
"collectionName": name,
"schema": {
"autoId": auto_id,
"enableDynamicField": enable_dynamic_schema,
"fields": [
{"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
{"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
"elementTypeParams": {}},
{"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
{"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
{"fieldName": "document_content", "dataType": "VarChar",
"elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
"analyzer_params": {
"tokenizer": tokenizer,
},
"enable_match": True}},
{"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
],
"functions": [
{
"name": "bm25_fn",
"type": "BM25",
"inputFieldNames": ["document_content"],
"outputFieldNames": ["sparse_vector"],
"params": {}
}
]
},
"indexParams": [
{"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25",
"params": {"index_type": "SPARSE_INVERTED_INDEX"}}
]
}
rsp = self.collection_client.collection_create(payload)
assert rsp['code'] == 0
rsp = self.collection_client.collection_describe(name)
logger.info(f"rsp: {rsp}")
assert rsp['code'] == 0
if tokenizer == 'default':
fake = fake_en
elif tokenizer == 'jieba':
fake = fake_zh
else:
raise Exception("Invalid tokenizer")
# insert data
for i in range(insert_round):
data = []
for j in range(nb):
idx = i * nb + j
if auto_id:
tmp = {
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
else:
tmp = {
"book_id": idx,
"user_id": idx%100,
"word_count": j,
"book_describe": f"book_{idx}",
"document_content": fake.text().lower(),
}
if enable_dynamic_schema:
tmp.update({f"dynamic_field_{i}": i})
data.append(tmp)
payload = {
"collectionName": name,
"data": data,
}
rsp = self.vector_client.vector_insert(payload)
assert rsp['code'] == 0
assert rsp['data']['insertCount'] == nb
assert rsp['code'] == 0
# search data
payload = {
"collectionName": name,
"data": [fake.text().lower() for _ in range(2)],
"filter": "word_count > 100",
"outputFields": ["*"],
"searchParams": {
"params": {
"drop_ratio_search": "0.2",
}
},
"limit": 500,
}
if groupingField:
payload["groupingField"] = groupingField
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 0
assert len(rsp['data']) > 0
@pytest.mark.parametrize("insert_round", [2])
@pytest.mark.parametrize("auto_id", [True])
@@ -1790,6 +2006,29 @@ class TestSearchVectorNegative(TestBase):
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] == 1802
@pytest.mark.parametrize("invalid_metric_type", ["L2", "IP", "UNSUPPORTED"])
@pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37138")
def test_search_vector_with_invalid_metric_type(self, invalid_metric_type):
"""
Search with an unsupported metricType and expect a non-zero error code.
"""
name = gen_collection_name()
self.name = name
self.init_collection(name, metric_type="COSINE")
# search data
dim = 128
payload = {
"collectionName": name,
"data": [preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()],
"searchParams": {
"metricType": invalid_metric_type
}
}
rsp = self.vector_client.vector_search(payload)
assert rsp['code'] != 0
@pytest.mark.parametrize("limit", [0, 16385])
def test_search_vector_with_invalid_limit(self, limit):
"""


@@ -14,10 +14,74 @@ from sklearn.metrics import pairwise_distances
from collections import Counter
import bm25s
import jieba
fake = Faker()
fake.seed_instance(19530)
rng = np.random.default_rng()
en_vocabularies_distribution = {
"hello": 0.01,
"milvus": 0.01,
"vector": 0.01,
"database": 0.01
}
zh_vocabularies_distribution = {
"你好": 0.01,
"向量": 0.01,
"数据": 0.01,
"": 0.01
}
def patch_faker_text(fake_instance, vocabularies_distribution):
"""
Monkey patch the text() method of a Faker instance to include custom vocabulary.
Each word in vocabularies_distribution has an independent chance to be inserted.
Args:
fake_instance: Faker instance to patch
vocabularies_distribution: Dictionary where:
- key: word to insert
- value: probability (0-1) of inserting this word into each sentence
Example:
vocabularies_distribution = {
"hello": 0.1, # 10% chance to insert "hello" in each sentence
"milvus": 0.1, # 10% chance to insert "milvus" in each sentence
}
"""
original_text = fake_instance.text
def new_text(*args, **kwargs):
sentences = []
# Split original text into sentences
original_sentences = original_text(*args,**kwargs).split('.')
original_sentences = [s.strip() for s in original_sentences if s.strip()]
for base_sentence in original_sentences:
words = base_sentence.split()
# Independently decide whether to insert each word
for word, probability in vocabularies_distribution.items():
if random.random() < probability:
# Choose random position to insert the word
insert_pos = random.randint(0, len(words))
words.insert(insert_pos, word)
# Reconstruct the sentence
base_sentence = ' '.join(words)
# Ensure proper capitalization
base_sentence = base_sentence[0].upper() + base_sentence[1:]
sentences.append(base_sentence)
return '. '.join(sentences) + '.'
# Replace the original text method with our custom one
fake_instance.text = new_text
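# Illustrative usage sketch, not part of this commit: after patching, text()
# still produces ordinary Faker sentences, but each word in the distribution
# has an independent ~1% chance of being spliced into every sentence, so a
# generated corpus always shares some vocabulary with BM25 query text.
def _demo_patch_faker_text():
    demo_fake = Faker("en_US")
    demo_fake.seed_instance(19530)
    patch_faker_text(demo_fake, en_vocabularies_distribution)
    # e.g. "Task purpose milvus nature reveal. Hold hello increase yourself."
    return demo_fake.text()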
def analyze_documents(texts, language="en"):
stopwords = "en"
if language in ["en", "english"]: