diff --git a/tests/python_client/common/code_mapping.py b/tests/python_client/common/code_mapping.py index 74f87583e0..a3264c5799 100644 --- a/tests/python_client/common/code_mapping.py +++ b/tests/python_client/common/code_mapping.py @@ -40,6 +40,7 @@ class IndexErrorMessage(ExceptionsMessage): CheckBitmapOnPK = "create bitmap index on primary key not supported" CheckBitmapCardinality = "failed to check bitmap cardinality limit, should be larger than 0 and smaller than 1000" NotConfigable = "{0} is not configable index param" + InvalidOffsetCache = "invalid offset cache index params" class QueryErrorMessage(ExceptionsMessage): diff --git a/tests/python_client/common/common_func.py b/tests/python_client/common/common_func.py index cb2b88eb77..e41a47baa0 100644 --- a/tests/python_client/common/common_func.py +++ b/tests/python_client/common/common_func.py @@ -1688,6 +1688,10 @@ def get_dim_by_schema(schema=None): return None +def gen_varchar_data(length: int, nb: int): + return ["".join([chr(random.randint(97, 122)) for _ in range(length)]) for _ in range(nb)] + + def gen_data_by_collection_field(field, nb=None, start=None): # if nb is None, return one data, else return a list of data data_type = field.dtype @@ -1726,8 +1730,8 @@ def gen_data_by_collection_field(field, nb=None, start=None): max_length = min(20, max_length-1) length = random.randint(0, max_length) if nb is None: - return "".join([chr(random.randint(97, 122)) for _ in range(length)]) - return ["".join([chr(random.randint(97, 122)) for _ in range(length)]) for _ in range(nb)] + return gen_varchar_data(length=length, nb=1)[0] + return gen_varchar_data(length=length, nb=nb) if data_type == DataType.JSON: if nb is None: return {"name": fake.name(), "address": fake.address()} diff --git a/tests/python_client/testcases/test_index.py b/tests/python_client/testcases/test_index.py index 71c4aa17cd..5253823187 100644 --- a/tests/python_client/testcases/test_index.py +++ 
b/tests/python_client/testcases/test_index.py @@ -2461,6 +2461,81 @@ class TestBitmapIndex(TestcaseBase): # re-build loaded index self.build_multi_index(index_params=index_params) + @pytest.mark.tags(CaseLabel.L1) + @pytest.mark.parametrize("index_obj, field_name", [(DefaultScalarIndexParams.Default, 'INT64_hybrid_index'), + (DefaultScalarIndexParams.INVERTED, 'INT64_inverted'), + (DefaultScalarIndexParams.STL_SORT, 'INT64_stl_sort'), + (DefaultScalarIndexParams.Trie, 'VARCHAR_trie')]) + def test_bitmap_offset_cache_on_not_bitmap_fields(self, request, index_obj, field_name): + """ + target: + 1. alter offset cache on not `BITMAP` index scalar field + method: + 1. build scalar index on scalar field + 2. alter offset cache on scalar index field + expected: + 1. alter index raises expected error + """ + # init params + collection_name, primary_field = f"{request.function.__name__}_{field_name}", 'INT64_pk' + + # create a collection with fields + self.collection_wrap.init_collection( + name=collection_name, + schema=cf.set_collection_schema( + fields=[primary_field, DataType.FLOAT_VECTOR.name, field_name], + field_params={primary_field: FieldParams(is_primary=True).to_dict}, + ) + ) + + # build scalar index on empty collection + index_params = { + **DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name), + **index_obj(field_name) + } + self.build_multi_index(index_params=index_params) + assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys()) + + # enable offset cache and raises error + self.collection_wrap.alter_index( + index_name=field_name, extra_params=AlterIndexParams.index_offset_cache(), + check_task=CheckTasks.err_res, check_items={ct.err_code: 1100, ct.err_msg: iem.InvalidOffsetCache} + ) + + @pytest.mark.tags(CaseLabel.L1) + def test_bitmap_offset_cache_on_vector_field(self, request): + """ + target: + 1. alter offset cache on vector field + method: + 1. build vector index on an empty collection + 2. 
alter offset cache on vector index field + expected: + 1. alter index raises expected error + """ + # init params + collection_name, primary_field = f"{request.function.__name__}", 'INT64_pk' + + # create a collection with fields + self.collection_wrap.init_collection( + name=collection_name, + schema=cf.set_collection_schema( + fields=[primary_field, DataType.FLOAT_VECTOR.name], + field_params={primary_field: FieldParams(is_primary=True).to_dict}, + ) + ) + + # build index on empty collection + index_params = DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name) + self.build_multi_index(index_params=index_params) + assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys()) + + # enable offset cache and raises error + self.collection_wrap.alter_index( + index_name=DataType.FLOAT_VECTOR.name, extra_params=AlterIndexParams.index_offset_cache(), + check_task=CheckTasks.err_res, check_items={ct.err_code: 1100, ct.err_msg: iem.InvalidOffsetCache} + ) + @pytest.mark.tags(CaseLabel.L1) @pytest.mark.parametrize("auto_id", [True, False]) @pytest.mark.parametrize("primary_field", ["int64_pk", "varchar_pk"]) diff --git a/tests/python_client/testcases/test_mix_scenes.py b/tests/python_client/testcases/test_mix_scenes.py index a3610432aa..ac96bc5c1b 100644 --- a/tests/python_client/testcases/test_mix_scenes.py +++ b/tests/python_client/testcases/test_mix_scenes.py @@ -31,7 +31,7 @@ class TestNoIndexDQLExpr(TestCaseClassBase): self._connect(self) # init params - self.primary_field, nb = "int64_pk", 3000 + self.primary_field, self.nb = "int64_pk", 3000 # create a collection with fields self.collection_wrap.init_collection( @@ -49,7 +49,7 @@ class TestNoIndexDQLExpr(TestCaseClassBase): ) # prepare data (> 1024 triggering index building) - self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=nb) + self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb) @pytest.fixture(scope="class", autouse=True) 
def prepare_data(self): @@ -156,6 +156,48 @@ class TestNoIndexDQLExpr(TestCaseClassBase): res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)]) + @pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64']) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + def test_no_index_query_with_int_in(self, range_num, counts, expr_field, limit): + """ + target: + 1. check number operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [random.randint(*range_num) for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count `not in` + 
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + @pytest.mark.xdist_group("TestHybridIndexDQLExpr") class TestHybridIndexDQLExpr(TestCaseClassBase): @@ -173,13 +215,14 @@ class TestHybridIndexDQLExpr(TestCaseClassBase): # init params self.primary_field, self.nb = "int64_pk", 3000 + self.all_fields = [self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name, + DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name, *self().all_scalar_fields] # create a collection with fields self.collection_wrap.init_collection( name=cf.gen_unique_str("test_hybrid_index_dql_expr"), schema=cf.set_collection_schema( - fields=[self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name, - DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name, *self().all_scalar_fields], + fields=self.all_fields, field_params={ self.primary_field: FieldParams(is_primary=True).to_dict, DataType.FLOAT16_VECTOR.name: FieldParams(dim=3).to_dict, @@ -190,7 +233,10 @@ class TestHybridIndexDQLExpr(TestCaseClassBase): ) # prepare data (> 1024 triggering index building) - self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb) + self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={ + 'VARCHAR': cf.gen_varchar_data(3, self.nb), + 'ARRAY_VARCHAR': [cf.gen_varchar_data(length=2, nb=random.randint(0, 10)) for _ in range(self.nb)] + }) @pytest.fixture(scope="class", autouse=True) def prepare_data(self): @@ -280,6 +326,114 @@ class TestHybridIndexDQLExpr(TestCaseClassBase): res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, 
counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)]) + @pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64']) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + def test_hybrid_index_query_with_int_in(self, range_num, counts, expr_field, limit): + """ + target: + 1. check number operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `Hybrid index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [random.randint(*range_num) for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count `not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([1, 3], 50), ([2, 5], 50), ([3, 3], 100)]) + 
@pytest.mark.parametrize("limit", [1, 10, 3000]) + @pytest.mark.parametrize("expr_field", ['VARCHAR']) + def test_hybrid_index_query_with_varchar_in(self, range_num, counts, limit, expr_field): + """ + target: + 1. check varchar operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `Hybrid index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [cf.gen_varchar_data(random.randint(*range_num), 1)[0] for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count `not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("length", [0, 5, 11]) + @pytest.mark.parametrize("expr_obj", [Expr.array_length, Expr.ARRAY_LENGTH]) + @pytest.mark.parametrize("expr_field", 
['ARRAY_VARCHAR']) + def test_hybrid_index_query_array_length_count(self, length, expr_obj, expr_field): + """ + target: + 1. check query with count(*) via expr `array length` + method: + 1. prepare some data and build `Hybrid index` on scalar fields + 2. query with count(*) via expr + 3. check query result + expected: + 1. query response equal to insert nb + """ + expr = Expr.EQ(expr_obj(expr_field).value, length).value + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if len(i) == length]) + + # query count(*) + self.collection_wrap.query(expr=expr, output_fields=['count(*)'], check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + @pytest.mark.tags(CaseLabel.L1) def test_hybrid_index_query_count(self): """ @@ -296,6 +450,25 @@ class TestHybridIndexDQLExpr(TestCaseClassBase): self.collection_wrap.query(expr='', output_fields=['count(*)'], check_task=CheckTasks.check_query_results, check_items={"exp_res": [{"count(*)": self.nb}]}) + @pytest.mark.tags(CaseLabel.L2) + def test_hybrid_index_search_output_fields(self): + """ + target: + 1. check search output fields with Hybrid index built on scalar fields + method: + 1. prepare some data and build `Hybrid index` on scalar fields + 2. search output fields and check result + expected: + 1. 
search output fields with Hybrid index + """ + search_params, vector_field, limit, nq = {"metric_type": "L2", "ef": 32}, DataType.FLOAT16_VECTOR.name, 3, 1 + + self.collection_wrap.search( + cf.gen_vectors(nb=nq, dim=3, vector_data_type=vector_field), vector_field, search_params, limit, + output_fields=['*'], check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), + "limit": limit, "output_fields": self.all_fields}) + @pytest.mark.xdist_group("TestInvertedIndexDQLExpr") class TestInvertedIndexDQLExpr(TestCaseClassBase): @@ -312,14 +485,15 @@ class TestInvertedIndexDQLExpr(TestCaseClassBase): self._connect(self) # init params - self.primary_field, nb = "int64_pk", 3000 + self.primary_field, self.nb = "int64_pk", 3000 + self.all_fields = [self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name, + DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name, *self().all_scalar_fields] # create a collection with fields self.collection_wrap.init_collection( name=cf.gen_unique_str("test_inverted_index_dql_expr"), schema=cf.set_collection_schema( - fields=[self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name, - DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name, *self().all_scalar_fields], + fields=self.all_fields, field_params={ self.primary_field: FieldParams(is_primary=True).to_dict, DataType.FLOAT16_VECTOR.name: FieldParams(dim=3).to_dict, @@ -330,7 +504,10 @@ class TestInvertedIndexDQLExpr(TestCaseClassBase): ) # prepare data (> 1024 triggering index building) - self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=nb) + self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={ + 'VARCHAR': cf.gen_varchar_data(3, self.nb), + 'ARRAY_VARCHAR': [cf.gen_varchar_data(length=2, nb=random.randint(0, 10)) for _ in range(self.nb)] + }) @pytest.fixture(scope="class", autouse=True) def 
prepare_data(self): @@ -420,6 +597,114 @@ class TestInvertedIndexDQLExpr(TestCaseClassBase): res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)]) + @pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64']) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + def test_inverted_index_query_with_int_in(self, range_num, counts, expr_field, limit): + """ + target: + 1. check number operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `INVERTED index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [random.randint(*range_num) for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count 
`not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([1, 3], 50), ([2, 5], 50), ([3, 3], 100)]) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + @pytest.mark.parametrize("expr_field", ['VARCHAR']) + def test_inverted_index_query_with_varchar_in(self, range_num, counts, limit, expr_field): + """ + target: + 1. check varchar operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `INVERTED index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [cf.gen_varchar_data(random.randint(*range_num), 1)[0] for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count `not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, 
range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("length", [0, 5, 11]) + @pytest.mark.parametrize("expr_obj", [Expr.array_length, Expr.ARRAY_LENGTH]) + @pytest.mark.parametrize("expr_field", ['ARRAY_VARCHAR']) + def test_inverted_index_query_array_length_count(self, length, expr_obj, expr_field): + """ + target: + 1. check query with count(*) via expr `array length` + method: + 1. prepare some data and build `INVERTED index` on scalar fields + 2. query with count(*) via expr + 3. check query result + expected: + 1. query response equal to insert nb + """ + expr = Expr.EQ(expr_obj(expr_field).value, length).value + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if len(i) == length]) + + # query count(*) + self.collection_wrap.query(expr=expr, output_fields=['count(*)'], check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + @pytest.mark.xdist_group("TestBitmapIndexDQLExpr") class TestBitmapIndexDQLExpr(TestCaseClassBase): @@ -437,13 +722,14 @@ class TestBitmapIndexDQLExpr(TestCaseClassBase): # init params self.primary_field, self.nb = "int64_pk", 3000 + self.all_fields = [self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name, + DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name, *self().all_scalar_fields] # create a collection with fields self.collection_wrap.init_collection( name=cf.gen_unique_str("test_bitmap_index_dql_expr"), schema=cf.set_collection_schema( - fields=[self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name, - DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name, *self().all_scalar_fields], + fields=self.all_fields, field_params={ self.primary_field: 
FieldParams(is_primary=True).to_dict, DataType.FLOAT16_VECTOR.name: FieldParams(dim=3).to_dict, @@ -454,38 +740,10 @@ class TestBitmapIndexDQLExpr(TestCaseClassBase): ) # prepare data (> 1024 triggering index building) - self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb) - - @pytest.mark.tags(CaseLabel.L2) - @pytest.mark.parametrize("range_num, counts", [ - ([-100, 200], 10), - ([2000, 5000], 10), - ([3000, 4000], 5), - ([0, 0], 1)]) - @pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64']) - @pytest.mark.parametrize("limit", [1, 10, 3000]) - def test_bitmap_index_query_with_int_in(self, range_num, counts, expr_field, limit): - """ - target: - 1. check number operation - method: - 1. prepare some data and build `BITMAP index` on scalar fields - 2. query with the different expr(in) and limit - 3. check query result - expected: - 1. query response equal to min(insert data, limit) - """ - # random set expr list - range_numbers = [random.randint(*range_num) for _ in range(counts)] - - # the total number of inserted data that matches the expression - expr_data = [i for i in self.insert_data.get(expr_field, []) if i in range_numbers] - expr_count = len(expr_data) - - # query - res, _ = self.collection_wrap.query(expr=f"{expr_field} in {range_numbers}", limit=limit, - output_fields=[expr_field]) - assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}, {expr_data}" + self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={ + 'VARCHAR': cf.gen_varchar_data(3, self.nb), + 'ARRAY_VARCHAR': [cf.gen_varchar_data(length=2, nb=random.randint(0, 10)) for _ in range(self.nb)] + }) @pytest.fixture(scope="class", autouse=True) def prepare_data(self): @@ -600,6 +858,114 @@ class TestBitmapIndexDQLExpr(TestCaseClassBase): res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) assert len(res) == min(expr_count, limit), 
f"actual: {len(res)} == expect: {min(expr_count, limit)}" + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)]) + @pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64']) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + def test_bitmap_index_query_with_int_in(self, range_num, counts, expr_field, limit): + """ + target: + 1. check number operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [random.randint(*range_num) for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count `not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + + 
@pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([1, 3], 50), ([2, 5], 50), ([3, 3], 100)]) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + @pytest.mark.parametrize("expr_field", ['VARCHAR']) + def test_bitmap_index_query_with_varchar_in(self, range_num, counts, limit, expr_field): + """ + target: + 1. check varchar operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [cf.gen_varchar_data(random.randint(*range_num), 1)[0] for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count `not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("length", [0, 5, 
11]) + @pytest.mark.parametrize("expr_obj", [Expr.array_length, Expr.ARRAY_LENGTH]) + @pytest.mark.parametrize("expr_field", ['ARRAY_VARCHAR']) + def test_bitmap_index_query_array_length_count(self, length, expr_obj, expr_field): + """ + target: + 1. check query with count(*) via expr `array length` + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. query with count(*) via expr + 3. check query result + expected: + 1. query response equal to insert nb + """ + expr = Expr.EQ(expr_obj(expr_field).value, length).value + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if len(i) == length]) + + # query count(*) + self.collection_wrap.query(expr=expr, output_fields=['count(*)'], check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + @pytest.mark.tags(CaseLabel.L1) def test_bitmap_index_query_count(self): """ @@ -663,6 +1029,25 @@ class TestBitmapIndexDQLExpr(TestCaseClassBase): cf.gen_vectors(nb=1, dim=3, vector_data_type=vector_field), vector_field, search_params, batch_size, expr='INT16 > 15', check_task=CheckTasks.check_search_iterator, check_items={"batch_size": batch_size}) + @pytest.mark.tags(CaseLabel.L2) + def test_bitmap_index_search_output_fields(self): + """ + target: + 1. check search output fields with BITMAP index built on scalar fields + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. search output fields and check result + expected: + 1. 
search output fields with BITMAP index + """ + search_params, vector_field, limit, nq = {"metric_type": "L2", "ef": 32}, DataType.FLOAT16_VECTOR.name, 3, 1 + + self.collection_wrap.search( + cf.gen_vectors(nb=nq, dim=3, vector_data_type=vector_field), vector_field, search_params, limit, + output_fields=['*'], check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), + "limit": limit, "output_fields": self.all_fields}) + @pytest.mark.tags(CaseLabel.L2) def test_bitmap_index_hybrid_search(self): """ @@ -697,7 +1082,7 @@ class TestBitmapIndexDQLExpr(TestCaseClassBase): ] self.collection_wrap.hybrid_search( req_list, RRFRanker(), limit, check_task=CheckTasks.check_search_results, - check_items={"nq": nq, "ids": self.insert_data.get('int64_pk'), "limit": limit}) + check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), "limit": limit}) @pytest.mark.xdist_group("TestBitmapIndexOffsetCacheDQL") @@ -716,12 +1101,13 @@ class TestBitmapIndexOffsetCache(TestCaseClassBase): # init params self.primary_field, self.nb = "int64_pk", 3000 + self.all_fields = [self.primary_field, DataType.FLOAT_VECTOR.name, *self().all_scalar_fields] # create a collection with fields self.collection_wrap.init_collection( name=cf.gen_unique_str("test_bitmap_index_offset_cache"), schema=cf.set_collection_schema( - fields=[self.primary_field, DataType.FLOAT_VECTOR.name, *self().all_scalar_fields], + fields=self.all_fields, field_params={ self.primary_field: FieldParams(is_primary=True).to_dict }, @@ -729,7 +1115,10 @@ class TestBitmapIndexOffsetCache(TestCaseClassBase): ) # prepare data (> 1024 triggering index building) - self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb) + self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={ + 'VARCHAR': cf.gen_varchar_data(3, self.nb), + 'ARRAY_VARCHAR': [cf.gen_varchar_data(length=2, nb=random.randint(0, 10)) for _ in 
range(self.nb)] + }) @pytest.fixture(scope="class", autouse=True) def prepare_data(self): @@ -819,6 +1208,114 @@ class TestBitmapIndexOffsetCache(TestCaseClassBase): res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=['*']) assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)]) + @pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64']) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + def test_bitmap_offset_cache_query_with_int_in(self, range_num, counts, expr_field, limit): + """ + target: + 1. check number operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [random.randint(*range_num) for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), 
f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count `not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([1, 3], 50), ([2, 5], 50), ([3, 3], 100)]) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + @pytest.mark.parametrize("expr_field", ['VARCHAR']) + def test_bitmap_offset_cache_query_with_varchar_in(self, range_num, counts, limit, expr_field): + """ + target: + 1. check varchar operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [cf.gen_varchar_data(random.randint(*range_num), 1)[0] for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # 
count `not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("length", [0, 5, 11]) + @pytest.mark.parametrize("expr_obj", [Expr.array_length, Expr.ARRAY_LENGTH]) + @pytest.mark.parametrize("expr_field", ['ARRAY_VARCHAR']) + def test_bitmap_offset_cache_query_array_length_count(self, length, expr_obj, expr_field): + """ + target: + 1. check query with count(*) via expr `array length` + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. query with count(*) via expr + 3. check query result + expected: + 1. query response equal to insert nb + """ + expr = Expr.EQ(expr_obj(expr_field).value, length).value + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if len(i) == length]) + + # query count(*) + self.collection_wrap.query(expr=expr, output_fields=['count(*)'], check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + @pytest.mark.tags(CaseLabel.L2) def test_bitmap_offset_cache_query_count(self): """ @@ -835,6 +1332,25 @@ class TestBitmapIndexOffsetCache(TestCaseClassBase): self.collection_wrap.query(expr='', output_fields=['count(*)'], check_task=CheckTasks.check_query_results, check_items={"exp_res": [{"count(*)": self.nb}]}) + @pytest.mark.tags(CaseLabel.L2) + def test_bitmap_offset_cache_search_output_fields(self): + """ + target: + 1. check search output fields with BITMAP index built on scalar fields + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. search output fields and check result + expected: + 1. 
search output fields with BITMAP index + """ + search_params, vector_field, limit, nq = {"metric_type": "L2", "ef": 32}, DataType.FLOAT_VECTOR.name, 3, 1 + + self.collection_wrap.search( + cf.gen_vectors(nb=nq, dim=ct.default_dim, vector_data_type=vector_field), + vector_field, search_params, limit, output_fields=['*'], check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), + "limit": limit, "output_fields": self.all_fields}) + @pytest.mark.tags(CaseLabel.L2) def test_bitmap_offset_cache_hybrid_search(self): """ @@ -863,7 +1379,7 @@ class TestBitmapIndexOffsetCache(TestCaseClassBase): ] self.collection_wrap.hybrid_search( req_list, RRFRanker(), limit, check_task=CheckTasks.check_search_results, - check_items={"nq": nq, "ids": self.insert_data.get('int64_pk'), "limit": limit}) + check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), "limit": limit}) @pytest.mark.xdist_group("TestBitmapIndexOffsetCacheDQL") @@ -882,12 +1398,13 @@ class TestBitmapIndexMmap(TestCaseClassBase): # init params self.primary_field, self.nb = "int64_pk", 3000 + self.all_fields = [self.primary_field, DataType.FLOAT_VECTOR.name, *self().all_scalar_fields] # create a collection with fields self.collection_wrap.init_collection( name=cf.gen_unique_str("test_bitmap_index_bitmap"), schema=cf.set_collection_schema( - fields=[self.primary_field, DataType.FLOAT_VECTOR.name, *self().all_scalar_fields], + fields=self.all_fields, field_params={ self.primary_field: FieldParams(is_primary=True).to_dict }, @@ -985,6 +1502,48 @@ class TestBitmapIndexMmap(TestCaseClassBase): res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field]) assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + @pytest.mark.tags(CaseLabel.L2) + @pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)]) + 
@pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64']) + @pytest.mark.parametrize("limit", [1, 10, 3000]) + def test_bitmap_mmap_query_with_int_in(self, range_num, counts, expr_field, limit): + """ + target: + 1. check number operation `in` and `not in`, calculate total number via expr + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. query with the different expr(in, not in) and limit + 3. check query result + expected: + 1. query response equal to min(insert data, limit) + """ + # random set expr list + range_numbers = [random.randint(*range_num) for _ in range(counts)] + + # the total number of inserted data that matches the expression + expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers]) + + # query `in` + res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}" + + # count `in` + self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": expr_count}]}) + + # query `not in` + not_in_count = self.nb - expr_count + res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit, + output_fields=[expr_field]) + assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}" + + # count `not in` + self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'], + check_task=CheckTasks.check_query_results, + check_items={"exp_res": [{"count(*)": not_in_count}]}) + @pytest.mark.tags(CaseLabel.L2) def test_bitmap_mmap_query_count(self): """ @@ -1001,6 +1560,25 @@ class TestBitmapIndexMmap(TestCaseClassBase): self.collection_wrap.query(expr='', output_fields=['count(*)'], 
check_task=CheckTasks.check_query_results, check_items={"exp_res": [{"count(*)": self.nb}]}) + @pytest.mark.tags(CaseLabel.L2) + def test_bitmap_mmap_search_output_fields(self): + """ + target: + 1. check search output fields with BITMAP index built on scalar fields + method: + 1. prepare some data and build `BITMAP index` on scalar fields + 2. search output fields and check result + expected: + 1. search output fields with BITMAP index + """ + search_params, vector_field, limit, nq = {"metric_type": "L2", "ef": 32}, DataType.FLOAT_VECTOR.name, 3, 1 + + self.collection_wrap.search( + cf.gen_vectors(nb=nq, dim=ct.default_dim, vector_data_type=vector_field), + vector_field, search_params, limit, output_fields=['*'], check_task=CheckTasks.check_search_results, + check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), + "limit": limit, "output_fields": self.all_fields}) + @pytest.mark.tags(CaseLabel.L2) def test_bitmap_mmap_hybrid_search(self): """ @@ -1029,7 +1607,7 @@ class TestBitmapIndexMmap(TestCaseClassBase): ] self.collection_wrap.hybrid_search( req_list, RRFRanker(), limit, check_task=CheckTasks.check_search_results, - check_items={"nq": nq, "ids": self.insert_data.get('int64_pk'), "limit": limit}) + check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), "limit": limit}) @pytest.mark.xdist_group("TestIndexUnicodeString")