milvus/tests/python_client/testcases/test_mix_scenes.py
yanliang567 15ce8aedd8
test: Add some tests for group by search support json and dynamic field (#46630)
related issue: #46616


<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
- Core invariant: these tests assume the v2 group-by search
implementation (TestMilvusClientV2Base and pymilvus v2 APIs such as
AnnSearchRequest/WeightedRanker) is functionally correct. The PR
extends coverage to validate group-by semantics over JSON fields and
dynamic fields (see
tests/python_client/milvus_client_v2/test_milvus_client_search_group_by.py,
TestGroupSearch.setup_class and the parametrized group_by_field cases).
- Logic removed/simplified: legacy v1 test scaffolding and duplicated
parametrized fixtures and test permutations were consolidated into
v2-focused suites. TestGroupSearch now inherits TestMilvusClientV2Base,
and the old TestGroupSearch/TestcaseBase patterns and large blocks in
test_mix_scenes were removed to avoid redundant fixture permutations
and duplicate assertions, reusing the shared helpers in common_func
(e.g., gen_scalar_field, gen_row_data_by_schema) and the common_type
constants.
- Why this does NOT introduce data loss or behavior regression: only
test code, test helpers, and test imports were changed; no
production/server code was altered. The helper changes are
backward-compatible: gen_scalar_field forces primary key nullable=False
and only affects test data generation paths in
tests/python_client/common/common_func.py, and
get_field_dtype_by_field_name now accepts schema dicts/ORM schemas and
is used only by tests to choose vector generation. Collection creation
and insertion in the tests use the same CollectionSchema/FieldSchema
paths, so production storage/serialization logic is untouched.
- New capability (test addition): adds v2 test coverage for group-by
search over JSON and dynamic fields plus related scenarios (pagination,
strict/non-strict group_size, min/max group constraints, multi-field
group-by, and binary vector cases), implemented in
tests/python_client/milvus_client_v2/test_milvus_client_search_group_by.py
to address issue #46616 (an illustrative sketch follows this note).
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
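
For orientation, here is a minimal sketch of the kind of v2 group-by call the new
tests exercise, written against the pymilvus MilvusClient API. The collection name,
field names, vector dimension, and the JSON-path form of `group_by_field` are
illustrative assumptions for this note, not code taken from the PR:

```python
from pymilvus import MilvusClient

# Hypothetical setup: a loaded collection with a 128-dim float vector field
# "embedding", a JSON field "json_field", and dynamic fields enabled.
client = MilvusClient(uri="http://localhost:19530")

res = client.search(
    collection_name="group_by_demo",            # hypothetical collection name
    data=[[0.1] * 128],                         # one query vector (dim assumed: 128)
    anns_field="embedding",
    search_params={"metric_type": "L2", "params": {"ef": 64}},
    limit=10,
    group_by_field="json_field['color']",       # assumed JSON-path group-by syntax
    group_size=3,                               # request up to 3 hits per group
    strict_group_size=False,                    # non-strict: a group may return fewer hits
    output_fields=["json_field", "tag"],        # "tag" stands in for a dynamic-field key
)
# MilvusClient.search returns one hit list per query vector.
for hits in res:
    for hit in hits:
        print(hit["id"], hit["distance"], hit["entity"])
```

Grouping by a plain dynamic-field key (e.g. group_by_field="tag") would follow the
same pattern; only the field reference changes.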

---------

Signed-off-by: yanliang567 <yanliang.qiao@zilliz.com>
2025-12-31 11:03:21 +08:00


import random
import re
import math # do not remove `math`
import pytest
import numpy as np
import pandas as pd
from pymilvus import DataType, AnnSearchRequest, RRFRanker, WeightedRanker
from common.common_type import CaseLabel, CheckTasks
from common import common_type as ct
from common import common_func as cf
from common import common_params as cp
from common.code_mapping import QueryErrorMessage as qem
from common.common_params import (
FieldParams, MetricType, DefaultVectorIndexParams, DefaultScalarIndexParams, Expr, AlterIndexParams
)
from base.client_base import TestcaseBase, TestCaseClassBase
from utils.util_log import test_log as log


@pytest.mark.xdist_group("TestNoIndexDQLExpr")
class TestNoIndexDQLExpr(TestCaseClassBase):
"""
Verify DQL requests when scalar fields are not indexed
Author: Ting.Wang
"""
def setup_class(self):
super().setup_class(self)
# connect to server before testing
self._connect(self)
# init params
self.primary_field, self.nb = "int64_pk", 10000
# create a collection with fields
self.collection_wrap.init_collection(
name=cf.gen_unique_str("test_no_index_dql_expr"),
schema=cf.set_collection_schema(
fields=[self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name,
DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name,
'VARCHAR_1', *self().all_scalar_fields],
field_params={
self.primary_field: FieldParams(is_primary=True).to_dict,
DataType.FLOAT16_VECTOR.name: FieldParams(dim=3).to_dict,
DataType.BFLOAT16_VECTOR.name: FieldParams(dim=6).to_dict,
DataType.BINARY_VECTOR.name: FieldParams(dim=16).to_dict
},
)
)
# prepare data (> 1024 triggering index building)
self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={
'VARCHAR_1': cf.gen_varchar_data(1, self.nb)
})
@pytest.fixture(scope="class", autouse=True)
def prepare_data(self):
for d in cf.iter_insert_list_data(list(self.insert_data.values()), batch=3000, total_len=self.nb):
self.collection_wrap.insert(data=d, check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build vectors index
index_params = {
**DefaultVectorIndexParams.IVF_SQ8(DataType.FLOAT16_VECTOR.name),
**DefaultVectorIndexParams.IVF_FLAT(DataType.BFLOAT16_VECTOR.name),
**DefaultVectorIndexParams.SPARSE_WAND(DataType.SPARSE_FLOAT_VECTOR.name),
**DefaultVectorIndexParams.BIN_IVF_FLAT(DataType.BINARY_VECTOR.name)
}
self.build_multi_index(index_params=index_params)
assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys())
# load collection
self.collection_wrap.load()
def check_query_res(self, res, expr_field: str) -> list:
""" Return the rows whose value does not match the inserted data for their primary key,
falling back to a containment check when primary key values are not unique """
real_data = {x[0]: x[1] for x in zip(self.insert_data.get(self.primary_field),
self.insert_data.get(expr_field))}
if len(real_data) != len(self.insert_data.get(self.primary_field)):
log.warning("[TestNoIndexDQLExpr] The primary key values are not unique, " +
"only check whether the res value is within the inserted data")
return [(r.get(self.primary_field), r.get(expr_field)) for r in res if
r.get(expr_field) not in self.insert_data.get(expr_field)]
return [(r[self.primary_field], r[expr_field], real_data[r[self.primary_field]]) for r in res if
r[expr_field] != real_data[r[self.primary_field]]]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, output_fields", [
(Expr.In(Expr.MOD('INT8', 13).subset, [0, 1, 2]).value, ['INT8']),
(Expr.Nin(Expr.MOD('INT16', 100).subset, [10, 20, 30, 40]).value, ['INT16']),
])
def test_no_index_query_with_invalid_expr(self, expr, output_fields):
"""
target:
1. check invalid expr
method:
1. prepare some data
2. query with the invalid expr
expected:
1. raises expected error
"""
# query
self.collection_wrap.query(expr=expr, check_task=CheckTasks.err_res,
check_items={ct.err_code: 1100, ct.err_msg: qem.ParseExpressionFailed})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_modulo_expression(['int64_pk', 'INT8', 'INT16', 'INT32', 'INT64']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_no_index_query_with_modulo(self, expr, expr_field, limit):
"""
target:
1. check modulo expression
method:
1. prepare some data
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
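# eval trick: substitute each inserted value for the field name and map the modulo term onto cf.parse_fmod(...) before evaluating the predicate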
expr_count = len([i for i in self.insert_data.get(expr_field, []) if
eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field, rex",
cf.gen_varchar_expression(['VARCHAR']) + cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_no_index_query_with_string(self, expr, expr_field, limit, rex):
"""
target:
1. check string expression
method:
1. prepare some data
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if re.search(rex, i) is not None])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_l, expr_field_l, rex_l", cf.gen_varchar_expression(['VARCHAR']))
@pytest.mark.parametrize("expr_r, expr_field_r, rex_r", cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("expr_obj, op", [(Expr.AND, 'and'), (Expr.OR, 'or')])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_no_index_query_with_mix_string(
self, expr_l, expr_field_l, rex_l, expr_r, expr_field_r, rex_r, expr_obj, op, limit):
"""
target:
1. check mix string fields expression
method:
1. prepare some data
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len(cf.count_match_expr(self.insert_data.get(expr_field_l, []), rex_l, op,
self.insert_data.get(expr_field_r, []), rex_r))
# query
res, _ = self.collection_wrap.query(expr=expr_obj(f"({expr_l})", f"({expr_r})").value, limit=limit,
output_fields=[expr_field_l, expr_field_r])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field_l) == []
assert self.check_query_res(res=res, expr_field=expr_field_r) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_number_operation(['INT8', 'INT16', 'INT32', 'INT64', 'FLOAT', 'DOUBLE']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_no_index_query_with_operation(self, expr, expr_field, limit):
"""
target:
1. check number operation
method:
1. prepare some data
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if eval(expr.replace(expr_field, str(i)))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)])
@pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64'])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_no_index_query_with_int_in(self, range_num, counts, expr_field, limit):
"""
target:
1. check number operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [random.randint(*range_num) for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})


@pytest.mark.xdist_group("TestHybridIndexDQLExpr")
class TestHybridIndexDQLExpr(TestCaseClassBase):
"""
Verify DQL requests with Hybrid index built on scalar fields
Author: Ting.Wang
"""
def setup_class(self):
super().setup_class(self)
# connect to server before testing
self._connect(self)
# init params
self.primary_field, self.nb = "int64_pk", 10000
self.all_fields = [self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name,
DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name,
'VARCHAR_1', *self().all_scalar_fields]
# create a collection with fields
self.collection_wrap.init_collection(
name=cf.gen_unique_str("test_hybrid_index_dql_expr"),
schema=cf.set_collection_schema(
fields=self.all_fields,
field_params={
self.primary_field: FieldParams(is_primary=True).to_dict,
DataType.FLOAT16_VECTOR.name: FieldParams(dim=3).to_dict,
DataType.BFLOAT16_VECTOR.name: FieldParams(dim=6).to_dict,
DataType.BINARY_VECTOR.name: FieldParams(dim=16).to_dict
},
)
)
# prepare data (> 1024 triggering index building)
self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={
'VARCHAR': cf.gen_varchar_data(3, self.nb),
'VARCHAR_1': cf.gen_varchar_data(1, self.nb),
'ARRAY_VARCHAR': [cf.gen_varchar_data(length=2, nb=random.randint(0, 10)) for _ in range(self.nb)]
})
@pytest.fixture(scope="class", autouse=True)
def prepare_data(self):
for d in cf.iter_insert_list_data(list(self.insert_data.values()), batch=3000, total_len=self.nb):
self.collection_wrap.insert(data=d, check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build `Hybrid index`
index_params = {
**DefaultVectorIndexParams.DISKANN(DataType.FLOAT16_VECTOR.name),
**DefaultVectorIndexParams.IVF_SQ8(DataType.BFLOAT16_VECTOR.name),
**DefaultVectorIndexParams.SPARSE_INVERTED_INDEX(DataType.SPARSE_FLOAT_VECTOR.name),
**DefaultVectorIndexParams.BIN_IVF_FLAT(DataType.BINARY_VECTOR.name),
# build Hybrid index
**DefaultScalarIndexParams.list_default([self.primary_field, 'VARCHAR_1'] + self.all_index_scalar_fields)
}
self.build_multi_index(index_params=index_params)
assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys())
# load collection
self.collection_wrap.load()
def check_query_res(self, res, expr_field: str) -> list:
""" Return the rows whose value does not match the inserted data for their primary key,
falling back to a containment check when primary key values are not unique """
real_data = {x[0]: x[1] for x in zip(self.insert_data.get(self.primary_field),
self.insert_data.get(expr_field))}
if len(real_data) != len(self.insert_data.get(self.primary_field)):
log.warning("[TestHybridIndexDQLExpr] The primary key values are not unique, " +
"only check whether the res value is within the inserted data")
return [(r.get(self.primary_field), r.get(expr_field)) for r in res if
r.get(expr_field) not in self.insert_data.get(expr_field)]
return [(r[self.primary_field], r[expr_field], real_data[r[self.primary_field]]) for r in res if
r[expr_field] != real_data[r[self.primary_field]]]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_modulo_expression(['int64_pk', 'INT8', 'INT16', 'INT32', 'INT64']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_hybrid_index_query_with_modulo(self, expr, expr_field, limit):
"""
target:
1. check modulo expression
method:
1. prepare some data and build `Hybrid index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if
eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field, rex",
cf.gen_varchar_expression(['VARCHAR']) + cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_hybrid_index_query_with_string(self, expr, expr_field, limit, rex):
"""
target:
1. check string expression
method:
1. prepare some data and build `Hybrid index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if re.search(rex, i) is not None])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_l, expr_field_l, rex_l", cf.gen_varchar_expression(['VARCHAR']))
@pytest.mark.parametrize("expr_r, expr_field_r, rex_r", cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("expr_obj, op", [(Expr.AND, 'and'), (Expr.OR, 'or')])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_hybrid_index_query_with_mix_string(
self, expr_l, expr_field_l, rex_l, expr_r, expr_field_r, rex_r, expr_obj, op, limit):
"""
target:
1. check mix string fields expression
method:
1. prepare some data
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len(cf.count_match_expr(self.insert_data.get(expr_field_l, []), rex_l, op,
self.insert_data.get(expr_field_r, []), rex_r))
# query
res, _ = self.collection_wrap.query(expr=expr_obj(f"({expr_l})", f"({expr_r})").value, limit=limit,
output_fields=[expr_field_l, expr_field_r])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field_l) == []
assert self.check_query_res(res=res, expr_field=expr_field_r) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_number_operation(['INT8', 'INT16', 'INT32', 'INT64', 'FLOAT', 'DOUBLE']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_hybrid_index_query_with_operation(self, expr, expr_field, limit):
"""
target:
1. check number operation
method:
1. prepare some data and build `Hybrid index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if eval(expr.replace(expr_field, str(i)))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)])
@pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64'])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_hybrid_index_query_with_int_in(self, range_num, counts, expr_field, limit):
"""
target:
1. check number operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `Hybrid index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [random.randint(*range_num) for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([1, 3], 50), ([2, 5], 50), ([3, 3], 100)])
@pytest.mark.parametrize("limit", [1, 10, 3000])
@pytest.mark.parametrize("expr_field", ['VARCHAR'])
def test_hybrid_index_query_with_varchar_in(self, range_num, counts, limit, expr_field):
"""
target:
1. check varchar operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `Hybrid index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [cf.gen_varchar_data(random.randint(*range_num), 1)[0] for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("length", [0, 5, 11])
@pytest.mark.parametrize("expr_obj", [Expr.array_length, Expr.ARRAY_LENGTH])
@pytest.mark.parametrize("expr_field", ['ARRAY_VARCHAR'])
def test_hybrid_index_query_array_length_count(self, length, expr_obj, expr_field):
"""
target:
1. check query with count(*) via expr `array length`
method:
1. prepare some data and build `Hybrid index` on scalar fields
2. query with count(*) via expr
3. check query result
expected:
1. query response equal to insert nb
"""
expr = Expr.EQ(expr_obj(expr_field).value, length).value
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if len(i) == length])
# query count(*)
self.collection_wrap.query(expr=expr, output_fields=['count(*)'], check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
@pytest.mark.tags(CaseLabel.L1)
def test_hybrid_index_query_count(self):
"""
target:
1. check query with count(*)
method:
1. prepare some data and build `Hybrid index` on scalar fields
2. query with count(*)
3. check query result
expected:
1. query response equal to insert nb
"""
# query count(*)
self.collection_wrap.query(expr='', output_fields=['count(*)'], check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": self.nb}]})
@pytest.mark.tags(CaseLabel.L1)
def test_hybrid_index_search_output_fields(self):
"""
target:
1. check search output fields with Hybrid index built on scalar fields
method:
1. prepare some data and build `Hybrid index` on scalar fields
2. search output fields and check result
expected:
1. search output fields with Hybrid index
"""
search_params, vector_field, limit, nq = {"metric_type": "L2", "ef": 32}, DataType.FLOAT16_VECTOR, 3, 1
self.collection_wrap.search(
cf.gen_vectors(nb=nq, dim=3, vector_data_type=vector_field), vector_field.name, search_params, limit,
output_fields=['*'], check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field),
"limit": limit, "output_fields": self.all_fields})


@pytest.mark.xdist_group("TestInvertedIndexDQLExpr")
class TestInvertedIndexDQLExpr(TestCaseClassBase):
"""
Verify DQL requests with INVERTED index built on scalar fields
Author: Ting.Wang
"""
def setup_class(self):
super().setup_class(self)
# connect to server before testing
self._connect(self)
# init params
self.primary_field, self.nb = "int64_pk", 10000
self.all_fields = [self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name,
DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name,
'VARCHAR_1', *self().all_scalar_fields]
# create a collection with fields
self.collection_wrap.init_collection(
name=cf.gen_unique_str("test_inverted_index_dql_expr"),
schema=cf.set_collection_schema(
fields=self.all_fields,
field_params={
self.primary_field: FieldParams(is_primary=True).to_dict,
DataType.FLOAT16_VECTOR.name: FieldParams(dim=3).to_dict,
DataType.BFLOAT16_VECTOR.name: FieldParams(dim=6).to_dict,
DataType.BINARY_VECTOR.name: FieldParams(dim=16).to_dict
},
)
)
# prepare data (> 1024 triggering index building)
self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={
'VARCHAR': cf.gen_varchar_data(3, self.nb),
'VARCHAR_1': cf.gen_varchar_data(1, self.nb),
'ARRAY_VARCHAR': [cf.gen_varchar_data(length=2, nb=random.randint(0, 10)) for _ in range(self.nb)]
})
@pytest.fixture(scope="class", autouse=True)
def prepare_data(self):
for d in cf.iter_insert_list_data(list(self.insert_data.values()), batch=3000, total_len=self.nb):
self.collection_wrap.insert(data=d, check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build `INVERTED index`
index_params = {
**DefaultVectorIndexParams.IVF_FLAT(DataType.FLOAT16_VECTOR.name),
**DefaultVectorIndexParams.HNSW(DataType.BFLOAT16_VECTOR.name),
**DefaultVectorIndexParams.SPARSE_WAND(DataType.SPARSE_FLOAT_VECTOR.name),
**DefaultVectorIndexParams.BIN_FLAT(DataType.BINARY_VECTOR.name),
# build INVERTED index
**DefaultScalarIndexParams.list_inverted(
[self.primary_field, 'VARCHAR_1'] + self.inverted_support_dtype_names)
}
self.build_multi_index(index_params=index_params)
assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys())
# load collection
self.collection_wrap.load()
def check_query_res(self, res, expr_field: str) -> list:
""" Return the rows whose value does not match the inserted data for their primary key,
falling back to a containment check when primary key values are not unique """
real_data = {x[0]: x[1] for x in zip(self.insert_data.get(self.primary_field),
self.insert_data.get(expr_field))}
if len(real_data) != len(self.insert_data.get(self.primary_field)):
log.warning("[TestInvertedIndexDQLExpr] The primary key values are not unique, " +
"only check whether the res value is within the inserted data")
return [(r.get(self.primary_field), r.get(expr_field)) for r in res if
r.get(expr_field) not in self.insert_data.get(expr_field)]
return [(r[self.primary_field], r[expr_field], real_data[r[self.primary_field]]) for r in res if
r[expr_field] != real_data[r[self.primary_field]]]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_modulo_expression(['int64_pk', 'INT8', 'INT16', 'INT32', 'INT64']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_inverted_index_query_with_modulo(self, expr, expr_field, limit):
"""
target:
1. check modulo expression
method:
1. prepare some data and build `INVERTED index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if
eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field, rex",
cf.gen_varchar_expression(['VARCHAR']) + cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_inverted_index_query_with_string(self, expr, expr_field, limit, rex):
"""
target:
1. check string expression
method:
1. prepare some data and build `INVERTED index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if re.search(rex, i) is not None])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_l, expr_field_l, rex_l", cf.gen_varchar_expression(['VARCHAR']))
@pytest.mark.parametrize("expr_r, expr_field_r, rex_r", cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("expr_obj, op", [(Expr.AND, 'and'), (Expr.OR, 'or')])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_inverted_index_query_with_mix_string(
self, expr_l, expr_field_l, rex_l, expr_r, expr_field_r, rex_r, expr_obj, op, limit):
"""
target:
1. check mix string fields expression
method:
1. prepare some data
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len(cf.count_match_expr(self.insert_data.get(expr_field_l, []), rex_l, op,
self.insert_data.get(expr_field_r, []), rex_r))
# query
res, _ = self.collection_wrap.query(expr=expr_obj(f"({expr_l})", f"({expr_r})").value, limit=limit,
output_fields=[expr_field_l, expr_field_r])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field_l) == []
assert self.check_query_res(res=res, expr_field=expr_field_r) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_number_operation(['INT8', 'INT16', 'INT32', 'INT64', 'FLOAT', 'DOUBLE']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_inverted_index_query_with_operation(self, expr, expr_field, limit):
"""
target:
1. check number operation
method:
1. prepare some data and build `INVERTED index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if eval(expr.replace(expr_field, str(i)))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)])
@pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64'])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_inverted_index_query_with_int_in(self, range_num, counts, expr_field, limit):
"""
target:
1. check number operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `INVERTED index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [random.randint(*range_num) for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([1, 3], 50), ([2, 5], 50), ([3, 3], 100)])
@pytest.mark.parametrize("limit", [1, 10, 3000])
@pytest.mark.parametrize("expr_field", ['VARCHAR'])
def test_inverted_index_query_with_varchar_in(self, range_num, counts, limit, expr_field):
"""
target:
1. check varchar operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `INVERTED index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [cf.gen_varchar_data(random.randint(*range_num), 1)[0] for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("length", [0, 5, 11])
@pytest.mark.parametrize("expr_obj", [Expr.array_length, Expr.ARRAY_LENGTH])
@pytest.mark.parametrize("expr_field", ['ARRAY_VARCHAR'])
def test_inverted_index_query_array_length_count(self, length, expr_obj, expr_field):
"""
target:
1. check query with count(*) via expr `array length`
method:
1. prepare some data and build `INVERTED index` on scalar fields
2. query with count(*) via expr
3. check query result
expected:
1. query response equal to insert nb
"""
expr = Expr.EQ(expr_obj(expr_field).value, length).value
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if len(i) == length])
# query count(*)
self.collection_wrap.query(expr=expr, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})


@pytest.mark.xdist_group("TestBitmapIndexDQLExpr")
class TestBitmapIndexDQLExpr(TestCaseClassBase):
"""
Verify DQL requests with BITMAP index built on scalar fields
Author: Ting.Wang
"""
def setup_class(self):
super().setup_class(self)
# connect to server before testing
self._connect(self)
# init params
self.primary_field, self.nb = "int64_pk", 10000
self.all_fields = [self.primary_field, DataType.FLOAT16_VECTOR.name, DataType.BFLOAT16_VECTOR.name,
DataType.SPARSE_FLOAT_VECTOR.name, DataType.BINARY_VECTOR.name,
"VARCHAR_1", *self().all_scalar_fields]
# create a collection with fields
self.collection_wrap.init_collection(
name=cf.gen_unique_str("test_bitmap_index_dql_expr"),
schema=cf.set_collection_schema(
fields=self.all_fields,
field_params={
self.primary_field: FieldParams(is_primary=True).to_dict,
DataType.FLOAT16_VECTOR.name: FieldParams(dim=3).to_dict,
DataType.BFLOAT16_VECTOR.name: FieldParams(dim=6).to_dict,
DataType.BINARY_VECTOR.name: FieldParams(dim=16).to_dict
},
)
)
# prepare data (> 1024 triggering index building)
self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={
'VARCHAR': cf.gen_varchar_data(3, self.nb),
'VARCHAR_1': cf.gen_varchar_data(1, self.nb),
'ARRAY_VARCHAR': [cf.gen_varchar_data(length=2, nb=random.randint(0, 10)) for _ in range(self.nb)]
})
@pytest.fixture(scope="class", autouse=True)
def prepare_data(self):
for d in cf.iter_insert_list_data(list(self.insert_data.values()), batch=3000, total_len=self.nb):
self.collection_wrap.insert(data=d, check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build `BITMAP index`
index_params = {
**DefaultVectorIndexParams.HNSW(DataType.FLOAT16_VECTOR.name),
**DefaultVectorIndexParams.DISKANN(DataType.BFLOAT16_VECTOR.name),
**DefaultVectorIndexParams.SPARSE_WAND(DataType.SPARSE_FLOAT_VECTOR.name),
**DefaultVectorIndexParams.BIN_IVF_FLAT(DataType.BINARY_VECTOR.name),
# build BITMAP index
**DefaultScalarIndexParams.list_bitmap(["VARCHAR_1"] + self.bitmap_support_dtype_names)
}
self.build_multi_index(index_params=index_params)
assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys())
# load collection
self.collection_wrap.load()
def check_query_res(self, res, expr_field: str) -> list:
""" Return the rows whose value does not match the inserted data for their primary key,
falling back to a containment check when primary key values are not unique """
real_data = {x[0]: x[1] for x in zip(self.insert_data.get(self.primary_field),
self.insert_data.get(expr_field))}
if len(real_data) != len(self.insert_data.get(self.primary_field)):
log.warning("[TestBitmapIndexDQLExpr] The primary key values are not unique, " +
"only check whether the res value is within the inserted data")
return [(r.get(self.primary_field), r.get(expr_field)) for r in res if
r.get(expr_field) not in self.insert_data.get(expr_field)]
return [(r[self.primary_field], r[expr_field], real_data[r[self.primary_field]]) for r in res if
r[expr_field] != real_data[r[self.primary_field]]]
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_index_query_with_invalid_array_params(self):
"""
target:
1. check query with invalid array params
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different wrong expr
3. check query result error
expected:
1. query response check error
"""
# query
self.collection_wrap.query(
expr=Expr.array_contains_any('ARRAY_VARCHAR', [['a', 'b']]).value, limit=1, check_task=CheckTasks.err_res,
check_items={ct.err_code: 1100, ct.err_msg: qem.ParseExpressionFailed})
self.collection_wrap.query(
expr=Expr.array_contains_all('ARRAY_VARCHAR', [['a', 'b']]).value, limit=1, check_task=CheckTasks.err_res,
check_items={ct.err_code: 1100, ct.err_msg: qem.ParseExpressionFailed})
self.collection_wrap.query(
expr=Expr.array_contains('ARRAY_VARCHAR', [['a', 'b']]).value, limit=1, check_task=CheckTasks.err_res,
check_items={ct.err_code: 1100, ct.err_msg: qem.ParseExpressionFailed})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field", cf.gen_modulo_expression(['INT8', 'INT16', 'INT32', 'INT64']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_bitmap_index_query_with_modulo(self, expr, expr_field, limit):
"""
target:
1. check modulo expression
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if
eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field, rex",
cf.gen_varchar_expression(['VARCHAR']) + cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_bitmap_index_query_with_string(self, expr, expr_field, limit, rex):
"""
target:
1. check string expression
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if re.search(rex, i) is not None])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_l, expr_field_l, rex_l", cf.gen_varchar_expression(['VARCHAR']))
@pytest.mark.parametrize("expr_r, expr_field_r, rex_r", cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("expr_obj, op", [(Expr.AND, 'and'), (Expr.OR, 'or')])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_bitmap_index_query_with_mix_string(
self, expr_l, expr_field_l, rex_l, expr_r, expr_field_r, rex_r, expr_obj, op, limit):
"""
target:
1. check mix string fields expression
method:
1. prepare some data
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len(cf.count_match_expr(self.insert_data.get(expr_field_l, []), rex_l, op,
self.insert_data.get(expr_field_r, []), rex_r))
# query
res, _ = self.collection_wrap.query(expr=expr_obj(f"({expr_l})", f"({expr_r})").value, limit=limit,
output_fields=[expr_field_l, expr_field_r])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field_l) == []
assert self.check_query_res(res=res, expr_field=expr_field_r) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_number_operation(['INT8', 'INT16', 'INT32', 'INT64', 'FLOAT', 'DOUBLE']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_bitmap_index_query_with_operation(self, expr, expr_field, limit):
"""
target:
1. check number operation
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if eval(expr.replace(expr_field, str(i)))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)])
@pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64'])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_bitmap_index_query_with_int_in(self, range_num, counts, expr_field, limit):
"""
target:
1. check number operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [random.randint(*range_num) for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([1, 3], 50), ([2, 5], 50), ([3, 3], 100)])
@pytest.mark.parametrize("limit", [1, 10, 3000])
@pytest.mark.parametrize("expr_field", ['VARCHAR'])
def test_bitmap_index_query_with_varchar_in(self, range_num, counts, limit, expr_field):
"""
target:
1. check varchar operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [cf.gen_varchar_data(random.randint(*range_num), 1)[0] for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("length", [0, 5, 11])
@pytest.mark.parametrize("expr_obj", [Expr.array_length, Expr.ARRAY_LENGTH])
@pytest.mark.parametrize("expr_field", ['ARRAY_VARCHAR'])
def test_bitmap_index_query_array_length_count(self, length, expr_obj, expr_field):
"""
target:
1. check query with count(*) via expr `array length`
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with count(*) via expr
3. check query result
expected:
1. query response equal to insert nb
"""
expr = Expr.EQ(expr_obj(expr_field).value, length).value
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if len(i) == length])
# query count(*)
self.collection_wrap.query(expr=expr, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_index_query_count(self):
"""
target:
1. check query with count(*)
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with count(*)
3. check query result
expected:
1. query response equal to insert nb
"""
# query count(*)
self.collection_wrap.query(expr='', output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": self.nb}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("limit", [10, 1000])
@pytest.mark.parametrize("group_by_field", ['INT8', 'INT16', 'INT32', 'INT64', 'BOOL', 'VARCHAR'])
@pytest.mark.parametrize(
"dim, search_params, vector_field",
[(3, {"metric_type": MetricType.L2, "ef": 32}, DataType.FLOAT16_VECTOR),
(1000, {"metric_type": MetricType.IP, "drop_ratio_search": 0.2}, DataType.SPARSE_FLOAT_VECTOR)])
def test_bitmap_index_search_group_by(self, limit, group_by_field, dim, search_params, vector_field):
"""
target:
1. check search group by with BITMAP index built on scalar fields
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. search group by scalar fields and check result
expected:
1. search group by with BITMAP index
"""
res, _ = self.collection_wrap.search(cf.gen_vectors(nb=1, dim=dim, vector_data_type=vector_field), vector_field.name,
search_params, limit, group_by_field=group_by_field,
output_fields=[group_by_field])
output_values = [i.fields for r in res for i in r]
# check output field
assert len([True for i in output_values if set(i.keys()) != {group_by_field}]) == 0, f"res: {output_values}"
# check `group_by_field` field values are unique
values = [v for i in output_values for k, v in i.items()]
assert len(values) == len(set(values)), f"values: {values}, output_values:{output_values}"
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("batch_size", [10, 1000])
def test_bitmap_index_search_iterator(self, batch_size):
"""
target:
1. check search iterator with BITMAP index built on scalar fields
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. search iterator and check result
expected:
1. search iterator with BITMAP index
"""
ef = 32 if batch_size <= 32 else batch_size # ef must be larger than or equal to batch size
search_params, vector_field = {"metric_type": "L2", "ef": ef}, DataType.FLOAT16_VECTOR
self.collection_wrap.search_iterator(
cf.gen_vectors(nb=1, dim=3, vector_data_type=vector_field), vector_field.name, search_params, batch_size,
expr='INT16 > 15', check_task=CheckTasks.check_search_iterator, check_items={"batch_size": batch_size})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_index_search_output_fields(self):
"""
target:
1. check search output fields with BITMAP index built on scalar fields
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. search output fields and check result
expected:
1. search output fields with BITMAP index
"""
search_params, vector_field, limit, nq = {"metric_type": "L2", "ef": 32}, DataType.FLOAT16_VECTOR, 3, 1
self.collection_wrap.search(
cf.gen_vectors(nb=nq, dim=3, vector_data_type=vector_field), vector_field.name, search_params, limit,
output_fields=['*'], check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field),
"limit": limit, "output_fields": self.all_fields})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_index_hybrid_search(self):
"""
target:
1. check hybrid search with expr
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. hybrid search with expr
expected:
1. hybrid search with expr
"""
nq, limit = 10, 10
vectors = cf.gen_field_values(self.collection_wrap.schema, nb=nq)
req_list = [
AnnSearchRequest(
data=vectors.get(DataType.FLOAT16_VECTOR.name), anns_field=DataType.FLOAT16_VECTOR.name,
param={"metric_type": MetricType.L2, "ef": 32}, limit=limit,
expr=Expr.In('INT64', [i for i in range(10, 30)]).value
),
AnnSearchRequest(
data=vectors.get(DataType.BFLOAT16_VECTOR.name), anns_field=DataType.BFLOAT16_VECTOR.name,
param={"metric_type": MetricType.L2, "search_list": 30}, limit=limit,
expr=Expr.OR(Expr.GT(Expr.SUB('INT8', 30).subset, 10), Expr.LIKE('VARCHAR', 'a%')).value
),
AnnSearchRequest(
data=vectors.get(DataType.SPARSE_FLOAT_VECTOR.name), anns_field=DataType.SPARSE_FLOAT_VECTOR.name,
param={"metric_type": MetricType.IP, "drop_ratio_search": 0.2}, limit=limit),
AnnSearchRequest(
data=vectors.get(DataType.BINARY_VECTOR.name), anns_field=DataType.BINARY_VECTOR.name,
param={"metric_type": MetricType.JACCARD, "nprobe": 128}, limit=limit)
]
self.collection_wrap.hybrid_search(
req_list, RRFRanker(), limit, check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), "limit": limit})


@pytest.mark.xdist_group("TestBitmapIndexOffsetCacheDQL")
class TestBitmapIndexOffsetCache(TestCaseClassBase):
"""
Verify DQL requests with BITMAP index built on scalar fields and indexoffsetcache enabled via alter_index
Author: Ting.Wang
"""
def setup_class(self):
super().setup_class(self)
# connect to server before testing
self._connect(self)
# init params
self.primary_field, self.nb = "int64_pk", 3000
self.all_fields = [self.primary_field, DataType.FLOAT_VECTOR.name, 'VARCHAR_1', *self().all_scalar_fields]
# create a collection with fields
self.collection_wrap.init_collection(
name=cf.gen_unique_str("test_bitmap_index_offset_cache"),
schema=cf.set_collection_schema(
fields=self.all_fields,
field_params={
self.primary_field: FieldParams(is_primary=True).to_dict
},
)
)
# prepare data (> 1024 triggering index building)
self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={
'VARCHAR': cf.gen_varchar_data(3, self.nb),
'VARCHAR_1': cf.gen_varchar_data(1, self.nb),
'ARRAY_VARCHAR': [cf.gen_varchar_data(length=2, nb=random.randint(0, 10)) for _ in range(self.nb)]
})
@pytest.fixture(scope="class", autouse=True)
def prepare_data(self):
self.collection_wrap.insert(data=list(self.insert_data.values()), check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build `BITMAP index`
index_params = {
**DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name),
# build BITMAP index
**DefaultScalarIndexParams.list_bitmap(['VARCHAR_1'] + self.bitmap_support_dtype_names)
}
self.build_multi_index(index_params=index_params)
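        # the keys of index_params are field names, so after building, every listed field
        # is expected to carry exactly one index (verified by the assertion below)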
assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys())
# enable offset cache
for index_name in ['VARCHAR_1'] + self.bitmap_support_dtype_names:
self.collection_wrap.alter_index(index_name=index_name, extra_params=AlterIndexParams.index_offset_cache())
# load collection
self.collection_wrap.load()
def check_query_res(self, res, expr_field: str) -> list:
""" Ensure that primary key field values are unique """
real_data = {x[0]: x[1] for x in zip(self.insert_data.get(self.primary_field),
self.insert_data.get(expr_field))}
if len(real_data) != len(self.insert_data.get(self.primary_field)):
log.warning("[TestBitmapIndexOffsetCache] The primary key values are not unique, " +
"only check whether the res value is within the inserted data")
return [(r.get(self.primary_field), r.get(expr_field)) for r in res if
r.get(expr_field) not in self.insert_data.get(expr_field)]
return [(r[self.primary_field], r[expr_field], real_data[r[self.primary_field]]) for r in res if
r[expr_field] != real_data[r[self.primary_field]]]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field", cf.gen_modulo_expression(['INT8', 'INT16', 'INT32', 'INT64']))
@pytest.mark.parametrize("limit", [1, 10])
def test_bitmap_offset_cache_query_with_modulo(self, expr, expr_field, limit):
"""
target:
1. check modulo expression
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
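        # (the field name is replaced with each inserted value and '%' with ',' so the
        # server-side modulo expr can be re-evaluated client-side through cf.parse_fmod)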
expr_count = len([i for i in self.insert_data.get(expr_field, []) if
eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=['*'])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field, rex",
cf.gen_varchar_expression(['VARCHAR']) + cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("limit", [1, 10])
def test_bitmap_offset_cache_query_with_string(self, expr, expr_field, limit, rex):
"""
target:
1. check string expression
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if re.search(rex, i) is not None])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=['*'])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr_l, expr_field_l, rex_l", cf.gen_varchar_expression(['VARCHAR']))
@pytest.mark.parametrize("expr_r, expr_field_r, rex_r", cf.gen_varchar_operation(['VARCHAR_1']))
@pytest.mark.parametrize("expr_obj, op", [(Expr.AND, 'and'), (Expr.OR, 'or')])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_bitmap_offset_cache_query_with_mix_string(
self, expr_l, expr_field_l, rex_l, expr_r, expr_field_r, rex_r, expr_obj, op, limit):
"""
target:
1. check mix string fields expression
method:
1. prepare some data
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len(cf.count_match_expr(self.insert_data.get(expr_field_l, []), rex_l, op,
self.insert_data.get(expr_field_r, []), rex_r))
# query
res, _ = self.collection_wrap.query(expr=expr_obj(f"({expr_l})", f"({expr_r})").value, limit=limit,
output_fields=[expr_field_l, expr_field_r])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field_l) == []
assert self.check_query_res(res=res, expr_field=expr_field_r) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_number_operation(['INT8', 'INT16', 'INT32', 'INT64', 'FLOAT', 'DOUBLE']))
@pytest.mark.parametrize("limit", [1, 10])
def test_bitmap_offset_cache_query_with_operation(self, expr, expr_field, limit):
"""
target:
1. check number operation
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if eval(expr.replace(expr_field, str(i)))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=['*'])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)])
@pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64'])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_bitmap_offset_cache_query_with_int_in(self, range_num, counts, expr_field, limit):
"""
target:
1. check number operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [random.randint(*range_num) for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([1, 3], 50), ([2, 5], 50), ([3, 3], 100)])
@pytest.mark.parametrize("limit", [1, 10, 3000])
@pytest.mark.parametrize("expr_field", ['VARCHAR'])
def test_bitmap_offset_cache_query_with_varchar_in(self, range_num, counts, limit, expr_field):
"""
target:
1. check varchar operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [cf.gen_varchar_data(random.randint(*range_num), 1)[0] for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("length", [0, 5, 11])
@pytest.mark.parametrize("expr_obj", [Expr.array_length, Expr.ARRAY_LENGTH])
@pytest.mark.parametrize("expr_field", ['ARRAY_VARCHAR'])
def test_bitmap_offset_cache_query_array_length_count(self, length, expr_obj, expr_field):
"""
target:
1. check query with count(*) via expr `array length`
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with count(*) via expr
3. check query result
expected:
            1. count(*) equals the number of inserted arrays whose length matches the expr
"""
expr = Expr.EQ(expr_obj(expr_field).value, length).value
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if len(i) == length])
# query count(*)
self.collection_wrap.query(expr=expr, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_offset_cache_query_count(self):
"""
target:
1. check query with count(*)
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with count(*)
3. check query result
expected:
1. query response equal to insert nb
"""
# query count(*)
self.collection_wrap.query(expr='', output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": self.nb}]})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_offset_cache_search_output_fields(self):
"""
target:
1. check search output fields with BITMAP index built on scalar fields
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. search output fields and check result
expected:
            1. search returns all requested output fields for each hit
"""
search_params, vector_field, limit, nq = {"metric_type": "L2", "ef": 32}, DataType.FLOAT_VECTOR, 3, 1
self.collection_wrap.search(
cf.gen_vectors(nb=nq, dim=ct.default_dim, vector_data_type=vector_field),
vector_field.name, search_params, limit, output_fields=['*'], check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field),
"limit": limit, "output_fields": self.all_fields})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_offset_cache_hybrid_search(self):
"""
target:
1. check hybrid search with expr
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. hybrid search with expr
expected:
            1. hybrid search with expr succeeds and returns the expected nq and limit
"""
nq, limit = 10, 10
vectors = cf.gen_field_values(self.collection_wrap.schema, nb=nq)
req_list = [
AnnSearchRequest(
data=vectors.get(DataType.FLOAT_VECTOR.name), anns_field=DataType.FLOAT_VECTOR.name,
param={"metric_type": MetricType.L2, "ef": 32}, limit=limit,
expr=Expr.In('INT64', [i for i in range(10, 30)]).value
),
AnnSearchRequest(
data=vectors.get(DataType.FLOAT_VECTOR.name), anns_field=DataType.FLOAT_VECTOR.name,
param={"metric_type": MetricType.L2, "ef": 32}, limit=limit,
expr=Expr.OR(Expr.GT(Expr.SUB('INT8', 30).subset, 10), Expr.LIKE('VARCHAR', 'a%')).value
)
]
self.collection_wrap.hybrid_search(
req_list, RRFRanker(), limit, check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), "limit": limit})
@pytest.mark.xdist_group("TestBitmapIndexOffsetCacheDQL")
class TestBitmapIndexMmap(TestCaseClassBase):
"""
    Scalar fields build BITMAP index, then alter the index to enable mmap
Author: Ting.Wang
"""
def setup_class(self):
super().setup_class(self)
# connect to server before testing
self._connect(self)
# init params
self.primary_field, self.nb = "int64_pk", 3000
self.all_fields = [self.primary_field, DataType.FLOAT_VECTOR.name, *self().all_scalar_fields]
# create a collection with fields
self.collection_wrap.init_collection(
name=cf.gen_unique_str("test_bitmap_index_bitmap"),
schema=cf.set_collection_schema(
fields=self.all_fields,
field_params={
self.primary_field: FieldParams(is_primary=True).to_dict
},
)
)
# prepare data (> 1024 triggering index building)
self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb)
@pytest.fixture(scope="class", autouse=True)
def prepare_data(self):
self.collection_wrap.insert(data=list(self.insert_data.values()), check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build `BITMAP index`
index_params = {
**DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name),
# build BITMAP index
**DefaultScalarIndexParams.list_bitmap(self.bitmap_support_dtype_names)
}
self.build_multi_index(index_params=index_params)
assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys())
# enable mmap
for index_name in self.bitmap_support_dtype_names:
self.collection_wrap.alter_index(index_name=index_name, extra_params=AlterIndexParams.index_mmap())
# load collection
self.collection_wrap.load()
def check_query_res(self, res, expr_field: str) -> list:
""" Ensure that primary key field values are unique """
real_data = {x[0]: x[1] for x in zip(self.insert_data.get(self.primary_field),
self.insert_data.get(expr_field))}
if len(real_data) != len(self.insert_data.get(self.primary_field)):
log.warning("[TestBitmapIndexMmap] The primary key values are not unique, " +
"only check whether the res value is within the inserted data")
return [(r.get(self.primary_field), r.get(expr_field)) for r in res if
r.get(expr_field) not in self.insert_data.get(expr_field)]
return [(r[self.primary_field], r[expr_field], real_data[r[self.primary_field]]) for r in res if
r[expr_field] != real_data[r[self.primary_field]]]
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field", cf.gen_modulo_expression(['INT8', 'INT16', 'INT32', 'INT64']))
@pytest.mark.parametrize("limit", [1, 10])
def test_bitmap_mmap_query_with_modulo(self, expr, expr_field, limit):
"""
target:
1. check modulo expression
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if
eval('cf.parse_fmod' + expr.replace(expr_field, str(i)).replace('%', ','))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field, rex", cf.gen_varchar_expression(['VARCHAR']))
@pytest.mark.parametrize("limit", [1, 10])
def test_bitmap_mmap_query_with_string(self, expr, expr_field, limit, rex):
"""
target:
1. check string expression
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if re.search(rex, i) is not None])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize(
"expr, expr_field", cf.gen_number_operation(['INT8', 'INT16', 'INT32', 'INT64', 'FLOAT', 'DOUBLE']))
@pytest.mark.parametrize("limit", [1, 10])
def test_bitmap_mmap_query_with_operation(self, expr, expr_field, limit):
"""
target:
1. check number operation
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if eval(expr.replace(expr_field, str(i)))])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("range_num, counts", [([-100, 200], 10), ([2000, 5000], 10), ([3000, 4000], 5)])
@pytest.mark.parametrize("expr_field", ['INT8', 'INT16', 'INT32', 'INT64'])
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_bitmap_mmap_query_with_int_in(self, range_num, counts, expr_field, limit):
"""
target:
1. check number operation `in` and `not in`, calculate total number via expr
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr(in, not in) and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# random set expr list
range_numbers = [random.randint(*range_num) for _ in range(counts)]
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if i in range_numbers])
# query `in`
res, _ = self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `in`
self.collection_wrap.query(expr=Expr.In(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": expr_count}]})
# query `not in`
not_in_count = self.nb - expr_count
res, _ = self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, limit=limit,
output_fields=[expr_field])
assert len(res) == min(not_in_count, limit), f"actual: {len(res)} == expect: {min(not_in_count, limit)}"
# check query response data
assert self.check_query_res(res=res, expr_field=expr_field) == []
# count `not in`
self.collection_wrap.query(expr=Expr.Nin(expr_field, range_numbers).value, output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": not_in_count}]})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_mmap_query_count(self):
"""
target:
1. check query with count(*)
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with count(*)
3. check query result
expected:
1. query response equal to insert nb
"""
# query count(*)
self.collection_wrap.query(expr='', output_fields=['count(*)'],
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [{"count(*)": self.nb}]})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_mmap_search_output_fields(self):
"""
target:
1. check search output fields with BITMAP index built on scalar fields
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. search output fields and check result
expected:
            1. search returns all requested output fields for each hit
"""
search_params, vector_field, limit, nq = {"metric_type": "L2", "ef": 32}, DataType.FLOAT_VECTOR, 3, 1
self.collection_wrap.search(
cf.gen_vectors(nb=nq, dim=ct.default_dim, vector_data_type=vector_field),
vector_field.name, search_params, limit, output_fields=['*'], check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field),
"limit": limit, "output_fields": self.all_fields})
@pytest.mark.tags(CaseLabel.L1)
def test_bitmap_mmap_hybrid_search(self):
"""
target:
1. check hybrid search with expr
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. hybrid search with expr
expected:
            1. hybrid search with expr succeeds and returns the expected nq and limit
"""
nq, limit = 10, 10
vectors = cf.gen_field_values(self.collection_wrap.schema, nb=nq)
req_list = [
AnnSearchRequest(
data=vectors.get(DataType.FLOAT_VECTOR.name), anns_field=DataType.FLOAT_VECTOR.name,
param={"metric_type": MetricType.L2, "ef": 32}, limit=limit,
expr=Expr.In('INT64', [i for i in range(10, 30)]).value
),
AnnSearchRequest(
data=vectors.get(DataType.FLOAT_VECTOR.name), anns_field=DataType.FLOAT_VECTOR.name,
param={"metric_type": MetricType.L2, "ef": 32}, limit=limit,
expr=Expr.OR(Expr.GT(Expr.SUB('INT8', 30).subset, 10), Expr.LIKE('VARCHAR', 'a%')).value
)
]
self.collection_wrap.hybrid_search(
req_list, RRFRanker(), limit, check_task=CheckTasks.check_search_results,
check_items={"nq": nq, "ids": self.insert_data.get(self.primary_field), "limit": limit})
@pytest.mark.xdist_group("TestIndexUnicodeString")
class TestIndexUnicodeString(TestCaseClassBase):
"""
    Scalar fields build BITMAP and INVERTED indexes, and verify queries on Unicode strings
Author: Ting.Wang
"""
def setup_class(self):
super().setup_class(self)
# connect to server before testing
self._connect(self)
# init params
self.primary_field, self.nb = "int64_pk", 3000
# create a collection with fields
self.collection_wrap.init_collection(
name=cf.gen_unique_str("test_index_unicode_string"),
schema=cf.set_collection_schema(
fields=[self.primary_field, DataType.FLOAT_VECTOR.name,
f"{DataType.VARCHAR.name}_BITMAP", f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}_BITMAP",
f"{DataType.VARCHAR.name}_INVERTED", f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}_INVERTED",
f"{DataType.VARCHAR.name}_NoIndex", f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}_NoIndex"],
field_params={
self.primary_field: FieldParams(is_primary=True).to_dict
},
)
)
# prepare data (> 1024 triggering index building)
# insert unicode string
self.insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=self.nb, default_values={
f"{DataType.VARCHAR.name}_BITMAP": cf.gen_unicode_string_batch(nb=self.nb, string_len=30),
f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}_BITMAP": cf.gen_unicode_string_array_batch(
nb=self.nb, string_len=1, max_capacity=100),
f"{DataType.VARCHAR.name}_INVERTED": cf.gen_unicode_string_batch(nb=self.nb, string_len=30),
f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}_INVERTED": cf.gen_unicode_string_array_batch(
nb=self.nb, string_len=1, max_capacity=100),
f"{DataType.VARCHAR.name}_NoIndex": cf.gen_unicode_string_batch(nb=self.nb, string_len=30),
f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}_NoIndex": cf.gen_unicode_string_array_batch(
nb=self.nb, string_len=1, max_capacity=100),
})
@pytest.fixture(scope="class", autouse=True)
def prepare_data(self):
self.collection_wrap.insert(data=list(self.insert_data.values()), check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build scalar index
index_params = {
**DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name),
# build BITMAP index
**DefaultScalarIndexParams.list_bitmap([f"{DataType.VARCHAR.name}_BITMAP",
f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}_BITMAP"]),
# build INVERTED index
**DefaultScalarIndexParams.list_inverted([f"{DataType.VARCHAR.name}_INVERTED",
f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}_INVERTED"])
}
self.build_multi_index(index_params=index_params)
assert sorted([n.field_name for n in self.collection_wrap.indexes]) == sorted(index_params.keys())
# load collection
self.collection_wrap.load()
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("expr, expr_field, rex",
cf.gen_varchar_unicode_expression(['VARCHAR_BITMAP', 'VARCHAR_INVERTED']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_index_unicode_string_query(self, expr, expr_field, limit, rex):
"""
target:
1. check string expression
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
expr_count = len([i for i in self.insert_data.get(expr_field, []) if re.search(rex, i) is not None])
# query
res, _ = self.collection_wrap.query(expr=expr, limit=limit, output_fields=[expr_field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
@pytest.mark.tags(CaseLabel.L1)
@pytest.mark.parametrize("obj", cf.gen_varchar_unicode_expression_array(
['ARRAY_VARCHAR_BITMAP', 'ARRAY_VARCHAR_INVERTED', 'ARRAY_VARCHAR_NoIndex']))
@pytest.mark.parametrize("limit", [1, 10, 3000])
def test_index_unicode_string_array_query(self, limit, obj):
"""
target:
1. check string expression
method:
1. prepare some data and build `BITMAP index` on scalar fields
2. query with the different expr and limit
3. check query result
expected:
1. query response equal to min(insert data, limit)
"""
# the total number of inserted data that matches the expression
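        # obj.rex is a python expression template: each inserted array is formatted into it
        # and eval'ed to count matches client-side before comparing with the query result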
expr_count = len([i for i in self.insert_data.get(obj.field, []) if eval(obj.rex.format(str(i)))])
# query
res, _ = self.collection_wrap.query(expr=obj.field_expr, limit=limit, output_fields=[obj.field])
assert len(res) == min(expr_count, limit), f"actual: {len(res)} == expect: {min(expr_count, limit)}"
class TestMixScenes(TestcaseBase):
"""
Testing cross-combination scenarios
Author: Ting.Wang
"""
@pytest.mark.tags(CaseLabel.L2)
def test_bitmap_upsert_and_delete(self, request):
"""
target:
1. upsert data and query returns the updated data
method:
1. create a collection with scalar fields
2. insert some data and build BITMAP index
3. query the data of the specified primary key value
4. upsert the specified primary key value
            5. re-query and check the data equals the updated data
6. delete the specified primary key value
7. re-query and check result is []
expected:
1. check whether the upsert and delete data is effective
"""
# init params
collection_name, primary_field, nb = f"{request.function.__name__}", "int64_pk", 3000
# scalar fields
scalar_fields, expr = [DataType.INT64.name, f"{DataType.ARRAY.name}_{DataType.VARCHAR.name}"], 'int64_pk == 10'
# connect to server before testing
self._connect()
# create a collection with fields that can build `BITMAP` index
self.collection_wrap.init_collection(
name=collection_name,
schema=cf.set_collection_schema(
fields=[primary_field, DataType.FLOAT_VECTOR.name, *scalar_fields],
field_params={primary_field: FieldParams(is_primary=True).to_dict},
)
)
# prepare data (> 1024 triggering index building)
insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=nb)
self.collection_wrap.insert(data=list(insert_data.values()), check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
        # build `BITMAP` index
self.build_multi_index(index_params={
**DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name),
**DefaultScalarIndexParams.list_bitmap(scalar_fields)
})
# load collection
self.collection_wrap.load()
# query before upsert
expected_res = [{k: v[10] for k, v in insert_data.items() if k != DataType.FLOAT_VECTOR.name}]
self.collection_wrap.query(expr=expr, output_fields=scalar_fields,
check_task=CheckTasks.check_query_results,
check_items={"exp_res": expected_res,
"pk_name": primary_field})
# upsert int64_pk = 10
upsert_data = cf.gen_field_values(self.collection_wrap.schema, nb=1,
default_values={primary_field: [10]}, start_id=10)
self.collection_wrap.upsert(data=list(upsert_data.values()))
# re-query
expected_upsert_res = [{k: v[0] for k, v in upsert_data.items() if k != DataType.FLOAT_VECTOR.name}]
self.collection_wrap.query(expr=expr, output_fields=scalar_fields,
check_task=CheckTasks.check_query_results,
check_items={"exp_res": expected_upsert_res,
"pk_name": primary_field})
# delete int64_pk = 10
self.collection_wrap.delete(expr=expr)
# re-query
self.collection_wrap.query(expr=expr, output_fields=scalar_fields,
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [],
"pk_name": primary_field})
@pytest.mark.tags(CaseLabel.L2)
def test_bitmap_offset_cache_and_mmap(self, request):
"""
target:
1. enable offset cache and mmap at the same time to verify DQL & DML operations
method:
1. create a collection with scalar fields
2. insert some data and build BITMAP index
            3. alter all BITMAP-indexed fields: enable offset cache and mmap
            4. load collection
            5. query with a primary key value that does not exist
            6. upsert a row with that primary key value
            7. re-query and check the data equals the upserted data
8. delete the upserted primary key value
9. re-query and check result is []
10. search with compound expressions and check result
expected:
1. DQL & DML operations are successful and the results are as expected
"""
# init params
collection_name, primary_field, nb = f"{request.function.__name__}", "int64_pk", 3000
scalar_fields, expr = [primary_field, *self.bitmap_support_dtype_names], 'int64_pk == 33333'
# connect to server before testing
self._connect()
# create a collection with fields that can build `BITMAP` index
self.collection_wrap.init_collection(
name=collection_name,
schema=cf.set_collection_schema(
fields=[primary_field, DataType.FLOAT_VECTOR.name, *self.bitmap_support_dtype_names],
field_params={primary_field: FieldParams(is_primary=True).to_dict},
)
)
# prepare data (> 1024 triggering index building)
insert_data = cf.gen_field_values(self.collection_wrap.schema, nb=nb)
self.collection_wrap.insert(data=list(insert_data.values()), check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build `BITMAP` index
self.build_multi_index(index_params={
**DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name),
**DefaultScalarIndexParams.list_bitmap(self.bitmap_support_dtype_names)
})
# enable offset cache and mmap
for index_name in self.bitmap_support_dtype_names:
self.collection_wrap.alter_index(index_name=index_name, extra_params=AlterIndexParams.index_offset_cache())
self.collection_wrap.alter_index(index_name=index_name, extra_params=AlterIndexParams.index_mmap())
# load collection
self.collection_wrap.load()
# query before upsert
self.collection_wrap.query(expr=expr, output_fields=scalar_fields,
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [],
"pk_name": primary_field})
# upsert int64_pk = 33333
upsert_data = cf.gen_field_values(self.collection_wrap.schema, nb=1,
default_values={primary_field: [33333]}, start_id=33333)
self.collection_wrap.upsert(data=list(upsert_data.values()))
# re-query
expected_upsert_res = [{k: v[0] for k, v in upsert_data.items() if k != DataType.FLOAT_VECTOR.name}]
self.collection_wrap.query(expr=expr, output_fields=scalar_fields,
check_task=CheckTasks.check_query_results,
check_items={"exp_res": expected_upsert_res,
"pk_name": primary_field})
# delete int64_pk = 33333
self.collection_wrap.delete(expr=expr)
# re-query
self.collection_wrap.query(expr=expr, output_fields=scalar_fields,
check_task=CheckTasks.check_query_results,
check_items={"exp_res": [],
"pk_name": primary_field})
# search
expr_left, expr_right = Expr.GT(Expr.SUB('INT64', 37).subset, 13).value, Expr.LIKE('VARCHAR', '%a').value
expr, rex, nq, limit = Expr.AND(expr_left, expr_right).value, r'.*a$', 10, 10
# counts data match expr
counts = sum([eval(expr_left.replace('INT64', str(i))) and re.search(rex, j) is not None for i, j in
zip(insert_data.get('INT64', []), insert_data.get('VARCHAR', []))])
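        # when nothing matches the compound expr, skip hit checking since an empty search
        # result is the expected outcome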
check_task = None if counts == 0 else CheckTasks.check_search_results
self.collection_wrap.search(
data=cf.gen_field_values(self.collection_wrap.schema, nb=nq).get(DataType.FLOAT_VECTOR.name),
anns_field=DataType.FLOAT_VECTOR.name, param={"metric_type": MetricType.L2, "ef": 32}, limit=limit,
expr=expr, output_fields=scalar_fields, check_task=check_task,
check_items={"nq": nq, "ids": insert_data.get(primary_field), "limit": min(limit, counts),
"output_fields": scalar_fields})
@pytest.mark.tags(CaseLabel.L2)
@pytest.mark.parametrize("scalar_field, data_type, expr_data", [('INT64', 'int', 3), ('VARCHAR', 'str', '3')])
def test_bitmap_partition_keys(self, request, scalar_field, data_type, expr_data):
"""
target:
1. build BITMAP index on partition key field
method:
            1. create a collection with a scalar field that enables partition key
            2. insert some data and build BITMAP index
            3. load collection
            4. query via partition key field expr
        expected:
            1. index build and query succeed, and every returned value equals the expr value
"""
# init params
collection_name, primary_field, nb = f"{request.function.__name__}_{scalar_field}", "int64_pk", 10000
# connect to server before testing
self._connect()
# create a collection with fields that can build `BITMAP` index
self.collection_wrap.init_collection(
name=collection_name,
schema=cf.set_collection_schema(
fields=[primary_field, DataType.FLOAT_VECTOR.name, scalar_field],
field_params={primary_field: FieldParams(is_primary=True).to_dict,
scalar_field: FieldParams(is_partition_key=True).to_dict},
)
)
# prepare data (> 1024 triggering index building)
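        # partition-key values are generated as int/str within [1, 4], so the later query
        # expr (== 3 / == "3") is expected to match existing rows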
self.collection_wrap.insert(data=cf.gen_values(self.collection_wrap.schema, nb=nb, default_values={
scalar_field: [eval(f"{data_type}({random.randint(1, 4)})") for _ in range(nb)]
}), check_task=CheckTasks.check_insert_result)
# flush collection, segment sealed
self.collection_wrap.flush()
# build `BITMAP` index
self.build_multi_index(index_params={
**DefaultVectorIndexParams.HNSW(DataType.FLOAT_VECTOR.name),
**DefaultScalarIndexParams.BITMAP(scalar_field)
})
# load collection
self.collection_wrap.load()
# query
expr = f'{scalar_field} == {expr_data}' if scalar_field == 'INT64' else f'{scalar_field} == "{expr_data}"'
res, _ = self.collection_wrap.query(expr=expr, output_fields=[scalar_field], limit=100)
assert set([r.get(scalar_field) for r in res]) == {expr_data}