From 247f75180f8f9f35404b01969feffded39ee2b6a Mon Sep 17 00:00:00 2001
From: zhuwenxing
Date: Thu, 31 Oct 2024 21:18:23 +0800
Subject: [PATCH] test: add restful cases for full text search and some minor
 fixes (#37148)

/kind improvement

---------

Signed-off-by: zhuwenxing
---
 tests/restful_client_v2/api/milvus.py         |   5 +
 tests/restful_client_v2/base/testbase.py      |  11 +
 .../testcases/test_collection_operations.py   |  57 +++-
 .../testcases/test_index_operation.py         | 189 +++++++++++---
 .../testcases/test_jobs_operation.py          |  10 +-
 .../testcases/test_vector_operations.py       | 245 +++++++++++++++++-
 tests/restful_client_v2/utils/utils.py        |  64 +++++
 7 files changed, 528 insertions(+), 53 deletions(-)

diff --git a/tests/restful_client_v2/api/milvus.py b/tests/restful_client_v2/api/milvus.py
index 9c1dabbdbb..ae94cd959c 100644
--- a/tests/restful_client_v2/api/milvus.py
+++ b/tests/restful_client_v2/api/milvus.py
@@ -334,6 +334,7 @@ class CollectionClient(Requests):
         self.endpoint = endpoint
         self.api_key = token
         self.db_name = None
+        self.name_list = []
         self.headers = self.update_headers()
 
     @classmethod
@@ -435,6 +436,10 @@ class CollectionClient(Requests):
 
     def collection_create(self, payload, db_name="default"):
         time.sleep(1)  # wait for collection created and in case of rate limit
+        c_name = payload.get("collectionName", None)
+        db_name = payload.get("dbName", db_name)
+        self.name_list.append((db_name, c_name))
+
         url = f'{self.endpoint}/v2/vectordb/collections/create'
         if self.db_name is not None:
             payload["dbName"] = self.db_name
diff --git a/tests/restful_client_v2/base/testbase.py b/tests/restful_client_v2/base/testbase.py
index 3c08fea27e..d4556aefeb 100644
--- a/tests/restful_client_v2/base/testbase.py
+++ b/tests/restful_client_v2/base/testbase.py
@@ -50,6 +50,17 @@ class TestBase(Base):
             rsp = self.collection_client.collection_drop(payload)
         except Exception as e:
             logger.error(e)
+        for item in self.collection_client.name_list:
+            db_name = item[0]
+            c_name = item[1]
+            payload = {
+                "collectionName": c_name,
+                "dbName": db_name
+            }
+            try:
+                self.collection_client.collection_drop(payload)
+            except Exception as e:
+                logger.error(e)
 
     @pytest.fixture(scope="function", autouse=True)
     def init_client(self, endpoint, token, minio_host, bucket_name, root_path):
diff --git a/tests/restful_client_v2/testcases/test_collection_operations.py b/tests/restful_client_v2/testcases/test_collection_operations.py
index b08da39925..5f6be5807a 100644
--- a/tests/restful_client_v2/testcases/test_collection_operations.py
+++ b/tests/restful_client_v2/testcases/test_collection_operations.py
@@ -103,7 +103,7 @@ class TestCreateCollection(TestBase):
             "collectionName": name,
             "dimension": dim,
             "metricType": metric_type,
-            "params":{
+            "params": {
                 "enableDynamicField": enable_dynamic_field,
                 "shardsNum": request_shards_num,
                 "consistencyLevel": f"{consistency_level}",
@@ -147,7 +147,7 @@ class TestCreateCollection(TestBase):
     @pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"])
     @pytest.mark.parametrize("consistency_level", ["Strong", "Bounded"])
     @pytest.mark.parametrize("enable_dynamic_field", [True, False])
-    @pytest.mark.parametrize("index_type", ["AUTOINDEX","IVF_SQ8", "HNSW"])
+    @pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"])
     @pytest.mark.parametrize("dim", [128])
     def test_create_collections_with_all_params(self,
                                                 dim,
@@ -179,6 +179,7 @@ class TestCreateCollection(TestBase):
             "FLAT": {},
             "IVF_SQ8": {"nlist": 16384},
             "HNSW": {"M": 16, "efConstruction": 500},
+            "BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5,
"bm25_b": 0.5}, "AUTOINDEX": {} } @@ -197,15 +198,32 @@ class TestCreateCollection(TestBase): {"fieldName": "book_id", "dataType": "Int64", "isPrimary": primary_key_field == "book_id", "elementTypeParams": {}}, {"fieldName": "word_count", "dataType": "Int64", - "isPartitionKey": partition_key_field == "word_count", "isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}}, + "isPartitionKey": partition_key_field == "word_count", + "isClusteringKey": clustering_key_field == "word_count", "elementTypeParams": {}}, {"fieldName": "book_category", "dataType": "Int64", "isPartitionKey": partition_key_field == "book_category", "isClusteringKey": clustering_key_field == "book_category", "elementTypeParams": {}}, {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, + {"fieldName": "document_content", "dataType": "VarChar", + "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True, + "analyzer_params": { + "tokenizer": "default" + }, + "enable_match": True}}, {"fieldName": "json", "dataType": "JSON", "elementTypeParams": {}}, {"fieldName": "int_array", "dataType": "Array", "elementDataType": "Int64", "elementTypeParams": {"max_capacity": "1024"}}, - {"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}} + {"fieldName": "book_intro", "dataType": "FloatVector", "elementTypeParams": {"dim": f"{dim}"}}, + {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"} + ], + "functions": [ + { + "name": "bm25_fn", + "type": "BM25", + "inputFieldNames": ["document_content"], + "outputFieldNames": ["sparse_vector"], + "params": {} + } ] }, "indexParams": [ @@ -214,7 +232,14 @@ class TestCreateCollection(TestBase): "metricType": f"{metric_type}", "indexType": index_type, "params": index_param_map[index_type] - }] + }, + {"fieldName": "sparse_vector", + "indexName": "sparse_vector_index", + "metricType": "BM25", + "indexType": "SPARSE_INVERTED_INDEX", + "params": index_param_map["BM25_SPARSE_INVERTED_INDEX"] + } + ] } logging.info(f"create collection {name} with payload: {payload}") @@ -244,6 +269,7 @@ class TestCreateCollection(TestBase): assert rsp['data']['partitionsNum'] == num_partitions assert rsp['data']['consistencyLevel'] == consistency_level assert ttl_seconds_actual == ttl_seconds + assert len(rsp['data']["functions"]) == len(payload["schema"]["functions"]) # # # check fields properties fields = rsp['data']['fields'] @@ -259,11 +285,16 @@ class TestCreateCollection(TestBase): # check index index_info = [index.to_dict() for index in c.indexes] logger.info(f"index_info: {index_info}") - assert len(index_info) == 1 - assert index_info[0]["index_param"]['metric_type'] == metric_type - assert index_info[0]["index_param"]['index_type'] == index_type - assert index_info[0]["index_param"].get("params", {}) == index_param_map[index_type] - + assert len(index_info) == 2 + for index in index_info: + index_param = index["index_param"] + if index_param["index_type"] == "SPARSE_INVERTED_INDEX": + assert index_param["metric_type"] == "BM25" + assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"] + else: + assert index_param["metric_type"] == metric_type + assert index_param["index_type"] == index_type + assert index_param.get("params", {}) == index_param_map[index_type] @pytest.mark.parametrize("auto_id", [True, False]) @pytest.mark.parametrize("enable_dynamic_field", [True, False]) @@ -686,9 +717,6 @@ class TestCreateCollectionNegative(TestBase): rsp = 
client.collection_create(payload) assert rsp['code'] == 1801 - - - @pytest.mark.parametrize("name", [" ", "test_collection_" * 100, "test collection", "test/collection", "test\collection"]) def test_create_collections_with_invalid_collection_name(self, name): @@ -797,6 +825,7 @@ class TestGetCollectionStats(TestBase): rsp = client.collection_stats(collection_name=name) assert rsp['data']['rowCount'] == nb + @pytest.mark.L0 class TestLoadReleaseCollection(TestBase): @@ -845,6 +874,7 @@ class TestLoadReleaseCollection(TestBase): rsp = client.collection_load_state(collection_name=name) assert rsp['data']['loadState'] == "LoadStateNotLoad" + @pytest.mark.L0 class TestGetCollectionLoadState(TestBase): @@ -1126,6 +1156,7 @@ class TestRenameCollection(TestBase): assert new_name in all_collections assert name not in all_collections + @pytest.mark.L1 class TestCollectionWithAuth(TestBase): def test_drop_collections_with_invalid_api_key(self): diff --git a/tests/restful_client_v2/testcases/test_index_operation.py b/tests/restful_client_v2/testcases/test_index_operation.py index 534684c9bf..c399e62e3c 100644 --- a/tests/restful_client_v2/testcases/test_index_operation.py +++ b/tests/restful_client_v2/testcases/test_index_operation.py @@ -1,28 +1,42 @@ import random from sklearn import preprocessing import numpy as np -import sys -import json import time -from utils import constant -from utils.utils import gen_collection_name +from utils.utils import gen_collection_name, patch_faker_text, en_vocabularies_distribution, \ + zh_vocabularies_distribution from utils.util_log import test_log as logger import pytest from base.testbase import TestBase from utils.utils import gen_vector from pymilvus import ( - FieldSchema, CollectionSchema, DataType, Collection ) +from faker import Faker + +Faker.seed(19530) +fake_en = Faker("en_US") +fake_zh = Faker("zh_CN") + +patch_faker_text(fake_en, en_vocabularies_distribution) +patch_faker_text(fake_zh, zh_vocabularies_distribution) + +index_param_map = { + "FLAT": {}, + "IVF_SQ8": {"nlist": 128}, + "HNSW": {"M": 16, "efConstruction": 200}, + "BM25_SPARSE_INVERTED_INDEX": {"bm25_k1": 0.5, "bm25_b": 0.5}, + "AUTOINDEX": {} +} @pytest.mark.L0 class TestCreateIndex(TestBase): - @pytest.mark.parametrize("metric_type", ["L2"]) - @pytest.mark.parametrize("index_type", ["AUTOINDEX", "HNSW"]) + @pytest.mark.parametrize("metric_type", ["L2", "COSINE", "IP"]) + @pytest.mark.parametrize("index_type", ["AUTOINDEX", "IVF_SQ8", "HNSW"]) @pytest.mark.parametrize("dim", [128]) - def test_index_e2e(self, dim, metric_type, index_type): + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365") + def test_index_default(self, dim, metric_type, index_type): """ target: test create collection method: create a collection with a simple schema @@ -43,38 +57,21 @@ class TestCreateIndex(TestBase): } logger.info(f"create collection {name} with payload: {payload}") rsp = client.collection_create(payload) - # insert data - for i in range(1): - data = [] - for j in range(3000): - tmp = { - "book_id": j, - "word_count": j, - "book_describe": f"book_{j}", - "book_intro": preprocessing.normalize([np.array([random.random() for _ in range(dim)])])[ - 0].tolist(), - } - data.append(tmp) - payload = { - "collectionName": name, - "data": data - } - rsp = self.vector_client.vector_insert(payload) c = Collection(name) c.flush() # list index, expect empty rsp = self.index_client.index_list(name) - # create index payload = { "collectionName": name, - "indexParams": [{"fieldName": 
"book_intro", "indexName": "book_intro_vector", - "metricType": f"{metric_type}"}] + "indexParams": [ + {"fieldName": "book_intro", "indexName": "book_intro_vector", + "metricType": f"{metric_type}", + "indexType": f"{index_type}", + "params": index_param_map[index_type] + } + ] } - if index_type == "HNSW": - payload["indexParams"][0]["params"] = {"index_type": "HNSW", "M": "16", "efConstruction": "200"} - if index_type == "AUTOINDEX": - payload["indexParams"][0]["params"] = {"index_type": "AUTOINDEX"} rsp = self.index_client.index_create(payload) assert rsp['code'] == 0 time.sleep(10) @@ -90,8 +87,19 @@ class TestCreateIndex(TestBase): assert expected_index[i]['fieldName'] == actual_index[i]['fieldName'] assert expected_index[i]['indexName'] == actual_index[i]['indexName'] assert expected_index[i]['metricType'] == actual_index[i]['metricType'] - assert expected_index[i]["params"]['index_type'] == actual_index[i]['indexType'] - + assert expected_index[i]["indexType"] == actual_index[i]['indexType'] + # check index by pymilvus + index_info = [index.to_dict() for index in c.indexes] + logger.info(f"index_info: {index_info}") + for index in index_info: + index_param = index["index_param"] + if index_param["index_type"] == "SPARSE_INVERTED_INDEX": + assert index_param["metric_type"] == "BM25" + assert index_param.get("params", {}) == index_param_map["BM25_SPARSE_INVERTED_INDEX"] + else: + assert index_param["metric_type"] == metric_type + assert index_param["index_type"] == index_type + assert index_param.get("params", {}) == index_param_map[index_type] # drop index for i in range(len(actual_index)): payload = { @@ -241,6 +249,119 @@ class TestCreateIndex(TestBase): assert expected_index[i]['indexName'] == actual_index[i]['indexName'] assert expected_index[i]['params']['index_type'] == actual_index[i]['indexType'] + @pytest.mark.parametrize("insert_round", [1]) + @pytest.mark.parametrize("auto_id", [True]) + @pytest.mark.parametrize("is_partition_key", [True]) + @pytest.mark.parametrize("enable_dynamic_schema", [True]) + @pytest.mark.parametrize("nb", [3000]) + @pytest.mark.parametrize("dim", [128]) + @pytest.mark.parametrize("tokenizer", ['default', 'jieba']) + @pytest.mark.parametrize("index_type", ['SPARSE_INVERTED_INDEX', 'SPARSE_WAND']) + @pytest.mark.parametrize("bm25_k1", [1.2, 1.5]) + @pytest.mark.parametrize("bm25_b", [0.7, 0.5]) + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36365") + def test_create_index_for_full_text_search(self, nb, dim, insert_round, auto_id, is_partition_key, + enable_dynamic_schema, tokenizer, index_type, bm25_k1, bm25_b): + """ + Insert a vector with a simple payload + """ + # create a collection + name = gen_collection_name() + payload = { + "collectionName": name, + "schema": { + "autoId": auto_id, + "enableDynamicField": enable_dynamic_schema, + "fields": [ + {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, + {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, + "elementTypeParams": {}}, + {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, + {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, + {"fieldName": "document_content", "dataType": "VarChar", + "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True, + "analyzer_params": { + "tokenizer": tokenizer, + }, + "enable_match": True}}, + {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}, + ], + "functions": 
+                    {
+                        "name": "bm25_fn",
+                        "type": "BM25",
+                        "inputFieldNames": ["document_content"],
+                        "outputFieldNames": ["sparse_vector"],
+                        "params": {}
+                    }
+                ]
+            },
+        }
+        rsp = self.collection_client.collection_create(payload)
+        assert rsp['code'] == 0
+        rsp = self.collection_client.collection_describe(name)
+        logger.info(f"rsp: {rsp}")
+        assert rsp['code'] == 0
+        if tokenizer == 'default':
+            fake = fake_en
+        elif tokenizer == 'jieba':
+            fake = fake_zh
+        else:
+            raise Exception("Invalid tokenizer")
+
+        # insert data
+        for i in range(insert_round):
+            data = []
+            for j in range(nb):
+                idx = i * nb + j
+                if auto_id:
+                    tmp = {
+                        "user_id": idx % 100,
+                        "word_count": j,
+                        "book_describe": f"book_{idx}",
+                        "document_content": fake.text().lower(),
+                    }
+                else:
+                    tmp = {
+                        "book_id": idx,
+                        "user_id": idx % 100,
+                        "word_count": j,
+                        "book_describe": f"book_{idx}",
+                        "document_content": fake.text().lower(),
+                    }
+                if enable_dynamic_schema:
+                    tmp.update({f"dynamic_field_{i}": i})
+                data.append(tmp)
+            payload = {
+                "collectionName": name,
+                "data": data,
+            }
+            rsp = self.vector_client.vector_insert(payload)
+            assert rsp['code'] == 0
+            assert rsp['data']['insertCount'] == nb
+
+        # create index
+        payload = {
+            "collectionName": name,
+            "indexParams": [
+                {"fieldName": "sparse_vector", "indexName": "sparse_vector",
+                 "metricType": "BM25",
+                 "indexType": index_type,
+                 "params": {"bm25_k1": bm25_k1, "bm25_b": bm25_b}
+                 }
+            ]
+        }
+        rsp = self.index_client.index_create(payload)
+        assert rsp['code'] == 0
+        c = Collection(name)
+        index_info = [index.to_dict() for index in c.indexes]
+        logger.info(f"index_info: {index_info}")
+        for info in index_info:
+            assert info['index_param']['metric_type'] == 'BM25'
+            assert info['index_param']["params"]['bm25_k1'] == bm25_k1
+            assert info['index_param']["params"]['bm25_b'] == bm25_b
+            assert info['index_param']['index_type'] == index_type
+
 
 @pytest.mark.L1
 class TestCreateIndexNegative(TestBase):
diff --git a/tests/restful_client_v2/testcases/test_jobs_operation.py b/tests/restful_client_v2/testcases/test_jobs_operation.py
index 46f058cb3f..9cbe9b9688 100644
--- a/tests/restful_client_v2/testcases/test_jobs_operation.py
+++ b/tests/restful_client_v2/testcases/test_jobs_operation.py
@@ -101,7 +101,6 @@ class TestCreateImportJob(TestBase):
             assert False, "import job timeout"
         c = Collection(name)
         c.load(_refresh=True)
-        time.sleep(10)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -192,7 +191,6 @@ class TestCreateImportJob(TestBase):
             assert False, "import job timeout"
         c = Collection(name)
         c.load(_refresh=True)
-        time.sleep(10)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -285,7 +283,6 @@ class TestCreateImportJob(TestBase):
             assert False, "import job timeout"
         c = Collection(name)
         c.load(_refresh=True)
-        time.sleep(10)
         res = c.query(
             expr="",
             output_fields=["count(*)"],
@@ -376,6 +373,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -456,6 +454,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -541,6 +540,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == 2000
         # assert import data can be queried
         payload = {
@@ -665,6 +665,7 @@ class TestCreateImportJob(TestBase):
         time.sleep(10)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == 6000
         # assert import data can be queried
         payload = {
@@ -915,6 +916,7 @@ class TestImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == file_nums * batch_size
         # assert import data can be queried
         payload = {
@@ -1007,6 +1009,7 @@ class TestCreateImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == file_nums * batch_size * task_num
         # assert import data can be queried
         payload = {
@@ -1096,6 +1099,7 @@ class TestCreateImportJobAdvance(TestBase):
         rsp = self.import_job_client.list_import_jobs(payload)
         # assert data count
         c = Collection(name)
+        c.load(_refresh=True)
         assert c.num_entities == file_nums * batch_size * task_num
         # assert import data can be queried
         payload = {
diff --git a/tests/restful_client_v2/testcases/test_vector_operations.py b/tests/restful_client_v2/testcases/test_vector_operations.py
index 991c12fe49..ecab26be90 100644
--- a/tests/restful_client_v2/testcases/test_vector_operations.py
+++ b/tests/restful_client_v2/testcases/test_vector_operations.py
@@ -6,7 +6,7 @@ import sys
 import json
 import time
 from utils import constant
-from utils.utils import gen_collection_name, get_sorted_distance
+from utils.utils import gen_collection_name, get_sorted_distance, patch_faker_text, en_vocabularies_distribution, zh_vocabularies_distribution
 from utils.util_log import test_log as logger
 import pytest
 from base.testbase import TestBase
@@ -20,6 +20,9 @@ Faker.seed(19530)
 fake_en = Faker("en_US")
 fake_zh = Faker("zh_CN")
 
+patch_faker_text(fake_en, en_vocabularies_distribution)
+patch_faker_text(fake_zh, zh_vocabularies_distribution)
+
 
 @pytest.mark.L0
 class TestInsertVector(TestBase):
@@ -1193,14 +1196,108 @@ class TestSearchVector(TestBase):
         rsp = self.vector_client.vector_search(payload)
         assert rsp['code'] == 0
 
+
+    @pytest.mark.parametrize("insert_round", [1])
+    @pytest.mark.parametrize("auto_id", [True, False])
+    @pytest.mark.parametrize("is_partition_key", [True, False])
+    @pytest.mark.parametrize("enable_dynamic_schema", [True])
+    @pytest.mark.parametrize("nb", [3000])
+    @pytest.mark.parametrize("dim", [128])
+    @pytest.mark.parametrize("groupingField", ['user_id', None])
+    @pytest.mark.parametrize("tokenizer", ['default'])
+    def test_search_vector_for_en_full_text_search(self, nb, dim, insert_round, auto_id,
+                                                   is_partition_key, enable_dynamic_schema, groupingField, tokenizer):
+        """
+        Full text search (BM25) over English documents, using raw text as the search input
+        """
+        # create a collection
+        name = gen_collection_name()
+        payload = {
+            "collectionName": name,
+            "schema": {
+                "autoId": auto_id,
+                "enableDynamicField": enable_dynamic_schema,
+                "fields": [
+                    {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}},
+                    {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key,
+                     "elementTypeParams": {}},
+                    {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}},
+                    {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}},
+                    {"fieldName": "document_content", "dataType": "VarChar",
+                     "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True,
+                                           "analyzer_params": {
+                                               "tokenizer": tokenizer,
+                                           },
+                                           "enable_match": True}},
+                    {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"},
"SparseFloatVector"}, + ], + "functions": [ + { + "name": "bm25_fn", + "type": "BM25", + "inputFieldNames": ["document_content"], + "outputFieldNames": ["sparse_vector"], + "params": {} + } + ] + }, + + "indexParams": [ + {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25", + "params": {"index_type": "SPARSE_INVERTED_INDEX"}} + ] + } + rsp = self.collection_client.collection_create(payload) + assert rsp['code'] == 0 + rsp = self.collection_client.collection_describe(name) + logger.info(f"rsp: {rsp}") + assert rsp['code'] == 0 + if tokenizer == 'default': + fake = fake_en + elif tokenizer == 'jieba': + fake = fake_zh + else: + raise Exception("Invalid tokenizer") + + # insert data + for i in range(insert_round): + data = [] + for j in range(nb): + idx = i * nb + j + if auto_id: + tmp = { + "user_id": idx%100, + "word_count": j, + "book_describe": f"book_{idx}", + "document_content": fake.text().lower(), + } + else: + tmp = { + "book_id": idx, + "user_id": idx%100, + "word_count": j, + "book_describe": f"book_{idx}", + "document_content": fake.text().lower(), + } + if enable_dynamic_schema: + tmp.update({f"dynamic_field_{i}": i}) + data.append(tmp) + payload = { + "collectionName": name, + "data": data, + } + rsp = self.vector_client.vector_insert(payload) + assert rsp['code'] == 0 + assert rsp['data']['insertCount'] == nb + assert rsp['code'] == 0 + # search data payload = { "collectionName": name, - "data": [gen_vector(datatype="SparseFloatVector", dim=dim, sparse_format="coo")], + "data": [fake.text().lower() for _ in range(1)], "filter": "word_count > 100", "outputFields": ["*"], "searchParams": { - "metricType": "IP", "params": { "drop_ratio_search": "0.2", } @@ -1211,6 +1308,125 @@ class TestSearchVector(TestBase): payload["groupingField"] = groupingField rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 0 + assert len(rsp['data']) > 0 + + + @pytest.mark.parametrize("insert_round", [1]) + @pytest.mark.parametrize("auto_id", [True, False]) + @pytest.mark.parametrize("is_partition_key", [True, False]) + @pytest.mark.parametrize("enable_dynamic_schema", [True]) + @pytest.mark.parametrize("nb", [3000]) + @pytest.mark.parametrize("dim", [128]) + @pytest.mark.parametrize("groupingField", ['user_id', None]) + @pytest.mark.parametrize("tokenizer", ['jieba']) + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/36751") + def test_search_vector_for_zh_full_text_search(self, nb, dim, insert_round, auto_id, + is_partition_key, enable_dynamic_schema, groupingField, tokenizer): + """ + Insert a vector with a simple payload + """ + # create a collection + name = gen_collection_name() + payload = { + "collectionName": name, + "schema": { + "autoId": auto_id, + "enableDynamicField": enable_dynamic_schema, + "fields": [ + {"fieldName": "book_id", "dataType": "Int64", "isPrimary": True, "elementTypeParams": {}}, + {"fieldName": "user_id", "dataType": "Int64", "isPartitionKey": is_partition_key, + "elementTypeParams": {}}, + {"fieldName": "word_count", "dataType": "Int64", "elementTypeParams": {}}, + {"fieldName": "book_describe", "dataType": "VarChar", "elementTypeParams": {"max_length": "256"}}, + {"fieldName": "document_content", "dataType": "VarChar", + "elementTypeParams": {"max_length": "1000", "enable_tokenizer": True, + "analyzer_params": { + "tokenizer": tokenizer, + }, + "enable_match": True}}, + {"fieldName": "sparse_vector", "dataType": "SparseFloatVector"}, + ], + "functions": [ + { + "name": "bm25_fn", + "type": 
"BM25", + "inputFieldNames": ["document_content"], + "outputFieldNames": ["sparse_vector"], + "params": {} + } + ] + }, + + "indexParams": [ + {"fieldName": "sparse_vector", "indexName": "sparse_vector", "metricType": "BM25", + "params": {"index_type": "SPARSE_INVERTED_INDEX"}} + ] + } + rsp = self.collection_client.collection_create(payload) + assert rsp['code'] == 0 + rsp = self.collection_client.collection_describe(name) + logger.info(f"rsp: {rsp}") + assert rsp['code'] == 0 + if tokenizer == 'default': + fake = fake_en + elif tokenizer == 'jieba': + fake = fake_zh + else: + raise Exception("Invalid tokenizer") + + # insert data + for i in range(insert_round): + data = [] + for j in range(nb): + idx = i * nb + j + if auto_id: + tmp = { + "user_id": idx%100, + "word_count": j, + "book_describe": f"book_{idx}", + "document_content": fake.text().lower(), + } + else: + tmp = { + "book_id": idx, + "user_id": idx%100, + "word_count": j, + "book_describe": f"book_{idx}", + "document_content": fake.text().lower(), + } + if enable_dynamic_schema: + tmp.update({f"dynamic_field_{i}": i}) + data.append(tmp) + payload = { + "collectionName": name, + "data": data, + } + rsp = self.vector_client.vector_insert(payload) + assert rsp['code'] == 0 + assert rsp['data']['insertCount'] == nb + assert rsp['code'] == 0 + + # search data + payload = { + "collectionName": name, + "data": [fake.text().lower() for _ in range(2)], + "filter": "word_count > 100", + "outputFields": ["*"], + "searchParams": { + "params": { + "drop_ratio_search": "0.2", + } + }, + "limit": 500, + } + if groupingField: + payload["groupingField"] = groupingField + rsp = self.vector_client.vector_search(payload) + assert rsp['code'] == 0 + assert len(rsp['data']) > 0 + + + @pytest.mark.parametrize("insert_round", [2]) @pytest.mark.parametrize("auto_id", [True]) @@ -1790,6 +2006,29 @@ class TestSearchVectorNegative(TestBase): rsp = self.vector_client.vector_search(payload) assert rsp['code'] == 1802 + @pytest.mark.parametrize("invalid_metric_type", ["L2", "IP", "UNSUPPORTED"]) + @pytest.mark.xfail(reason="issue: https://github.com/milvus-io/milvus/issues/37138") + def test_search_vector_with_invalid_metric_type(self, invalid_metric_type): + """ + Search a vector with a simple payload + """ + name = gen_collection_name() + self.name = name + self.init_collection(name, metric_type="COSINE") + + # search data + dim = 128 + payload = { + "collectionName": name, + "data": [preprocessing.normalize([np.array([random.random() for i in range(dim)])])[0].tolist()], + "searchParams": { + "metricType": invalid_metric_type + } + } + rsp = self.vector_client.vector_search(payload) + assert rsp['code'] != 0 + + @pytest.mark.parametrize("limit", [0, 16385]) def test_search_vector_with_invalid_limit(self, limit): """ diff --git a/tests/restful_client_v2/utils/utils.py b/tests/restful_client_v2/utils/utils.py index 0cd6a6e10e..d7fee27f1d 100644 --- a/tests/restful_client_v2/utils/utils.py +++ b/tests/restful_client_v2/utils/utils.py @@ -14,10 +14,74 @@ from sklearn.metrics import pairwise_distances from collections import Counter import bm25s import jieba + + fake = Faker() +fake.seed_instance(19530) rng = np.random.default_rng() +en_vocabularies_distribution = { + "hello": 0.01, + "milvus": 0.01, + "vector": 0.01, + "database": 0.01 +} + +zh_vocabularies_distribution = { + "你好": 0.01, + "向量": 0.01, + "数据": 0.01, + "库": 0.01 +} + + +def patch_faker_text(fake_instance, vocabularies_distribution): + """ + Monkey patch the text() method of a Faker instance to 
+    Each word in vocabularies_distribution has an independent chance to be inserted.
+    Args:
+        fake_instance: Faker instance to patch
+        vocabularies_distribution: Dictionary where:
+            - key: word to insert
+            - value: probability (0-1) of inserting this word into each sentence
+    Example:
+        vocabularies_distribution = {
+            "hello": 0.1,   # 10% chance to insert "hello" in each sentence
+            "milvus": 0.1,  # 10% chance to insert "milvus" in each sentence
+        }
+    """
+    original_text = fake_instance.text
+
+    def new_text(*args, **kwargs):
+        sentences = []
+        # Split original text into sentences
+        original_sentences = original_text(*args, **kwargs).split('.')
+        original_sentences = [s.strip() for s in original_sentences if s.strip()]
+
+        for base_sentence in original_sentences:
+            words = base_sentence.split()
+
+            # Independently decide whether to insert each word
+            for word, probability in vocabularies_distribution.items():
+                if random.random() < probability:
+                    # Choose random position to insert the word
+                    insert_pos = random.randint(0, len(words))
+                    words.insert(insert_pos, word)
+
+            # Reconstruct the sentence
+            base_sentence = ' '.join(words)
+
+            # Ensure proper capitalization
+            base_sentence = base_sentence[0].upper() + base_sentence[1:]
+            sentences.append(base_sentence)
+
+        return '. '.join(sentences) + '.'
+
+    # Replace the original text method with our custom one
+    fake_instance.text = new_text
+
+
 def analyze_documents(texts, language="en"):
     stopwords = "en"
     if language in ["en", "english"]: