feat: use namespace skip index when search (#44888)

issue: #44011

---------

Signed-off-by: sunby <sunbingyi1992@gmail.com>
This commit is contained in:
Bingyi Sun 2025-10-23 12:04:04 +08:00 committed by GitHub
parent 6077178553
commit 52270701ce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 117 additions and 13 deletions

View File

@ -32,7 +32,7 @@
namespace milvus {
using std::string;
const std::string namespace_field_name = "$namespace_id";
std::shared_ptr<Schema>
Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
auto schema = std::make_shared<Schema>();
@ -58,6 +58,9 @@ Schema::ParseFrom(const milvus::proto::schema::CollectionSchema& schema_proto) {
"repetitive dynamic field");
schema->set_dynamic_field_id(field_id);
}
if (child.name() == namespace_field_name) {
schema->set_namespace_field_id(field_id);
}
};
for (const milvus::proto::schema::FieldSchema& child :

View File

@ -213,11 +213,21 @@ class Schema {
this->dynamic_field_id_opt_ = field_id;
}
// Remembers `field_id` as the id of this schema's namespace field.
void
set_namespace_field_id(FieldId field_id) {
    namespace_field_id_opt_ = field_id;
}
void
set_schema_version(uint64_t version) {
this->schema_version_ = version;
}
std::optional<FieldId>
get_namespace_field_id() const {
return this->namespace_field_id_opt_;
}
uint64_t
get_schema_version() const {
return this->schema_version_;
@ -348,6 +358,7 @@ class Schema {
std::optional<FieldId> primary_field_id_opt_;
std::optional<FieldId> dynamic_field_id_opt_;
std::optional<FieldId> namespace_field_id_opt_;
// field partial load list
// work as hint now

View File

@ -107,6 +107,13 @@ class PhyConjunctFilterExpr : public Expr {
return true;
}
// Hands a copy of the namespace skip callback to every input expression of
// this conjunction. The conjunction itself does not store the callback.
void
SetNamespaceSkipFunc(SkipNamespaceFunc skip_namespace_func) override {
    for (const auto& child : inputs_) {
        child->SetNamespaceSkipFunc(skip_namespace_func);
    }
}
std::string
ToString() const {
if (!input_order_.empty()) {

View File

@ -667,8 +667,55 @@ RewriteConjunctExpr(std::shared_ptr<milvus::exec::PhyConjunctFilterExpr>& expr,
}
}
inline void
SetNamespaceSkipIndex(std::shared_ptr<PhyConjunctFilterExpr> conjunct_expr,
ExecContext* context) {
auto schema = context->get_query_context()->get_segment()->get_schema();
auto namespace_field_id = schema.get_namespace_field_id();
auto inputs = conjunct_expr->GetInputsRef();
std::shared_ptr<PhyUnaryRangeFilterExpr> namespace_expr = nullptr;
for (const auto& input : inputs) {
auto unary = std::dynamic_pointer_cast<PhyUnaryRangeFilterExpr>(input);
if (!unary) {
continue;
}
if (unary->GetColumnInfo().value().field_id_ ==
namespace_field_id.value() &&
unary->GetOpType() == proto::plan::OpType::Equal) {
namespace_expr = unary;
}
}
if (!namespace_expr) {
return;
}
auto namespace_field_meta = schema[namespace_field_id.value()];
auto& skip_index =
context->get_query_context()->get_segment()->GetSkipIndex();
if (namespace_field_meta.get_data_type() == DataType::INT64) {
auto skip_namespace_func = [&](int64_t chunk_id) -> bool {
return skip_index.CanSkipUnaryRange<int64_t>(
namespace_field_id.value(),
chunk_id,
proto::plan::OpType::Equal,
namespace_expr->GetLogicalExpr()->GetValue().int64_val());
};
namespace_expr->SetNamespaceSkipFunc(skip_namespace_func);
} else {
auto skip_namespace_func = [&](int64_t chunk_id) -> bool {
return skip_index.CanSkipUnaryRange<std::string>(
namespace_field_id.value(),
chunk_id,
proto::plan::OpType::Equal,
namespace_expr->GetLogicalExpr()->GetValue().string_val());
};
namespace_expr->SetNamespaceSkipFunc(skip_namespace_func);
}
}
inline void
OptimizeCompiledExprs(ExecContext* context, const std::vector<ExprPtr>& exprs) {
auto schema = context->get_query_context()->get_segment()->get_schema();
auto namespace_field_id = schema.get_namespace_field_id();
std::chrono::high_resolution_clock::time_point start =
std::chrono::high_resolution_clock::now();
for (const auto& expr : exprs) {
@ -680,6 +727,9 @@ OptimizeCompiledExprs(ExecContext* context, const std::vector<ExprPtr>& exprs) {
bool has_heavy_operation = false;
ReorderConjunctExpr(conjunct_expr, context, has_heavy_operation);
LOG_DEBUG("after reorder filter expression: {}", expr->ToString());
if (namespace_field_id.has_value()) {
SetNamespaceSkipIndex(conjunct_expr, context);
}
}
}
std::chrono::high_resolution_clock::time_point end =

View File

@ -126,6 +126,12 @@ class Expr {
return inputs_;
}
using SkipNamespaceFunc = std::function<bool(int64_t chunk_id)>;
// Stores the callback used to decide whether a chunk can be skipped for the
// namespace field. Subclasses may override this to forward the callback.
virtual void
SetNamespaceSkipFunc(SkipNamespaceFunc skip_namespace_func) {
    namespace_skip_func_.emplace(std::move(skip_namespace_func));
}
protected:
DataType type_;
std::vector<std::shared_ptr<Expr>> inputs_;
@ -135,6 +141,10 @@ class Expr {
// whether we have offset input and do expr filtering on these data
// default is false which means we will do expr filtering on the total segment data
bool has_offset_input_ = false;
// Callback that reports whether a chunk can be skipped based on the namespace field.
// It is std::nullopt when no namespace skip function has been set (e.g. there is no namespace field).
// TODO: for an expression like `f1 > 1 and f2 > 2`, the skip function of f1 could also be used when evaluating f2.
std::optional<SkipNamespaceFunc> namespace_skip_func_;
};
using ExprPtr = std::shared_ptr<milvus::exec::Expr>;
@ -364,7 +374,9 @@ class SegmentExpr : public Expr {
auto pw = segment_->get_batch_views<T>(
op_ctx_, field_id_, 0, current_data_chunk_pos_, need_size);
auto views_info = pw.get();
if (!skip_func || !skip_func(skip_index, field_id_, 0)) {
if ((!skip_func || !skip_func(skip_index, field_id_, 0)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(0))) {
// first is the raw data, second is valid_data
// use valid_data to see if raw data is null
if constexpr (NeedSegmentOffsets) {
@ -419,7 +431,9 @@ class SegmentExpr : public Expr {
auto pw =
segment_->get_views_by_offsets<T>(op_ctx_, field_id_, 0, *input);
auto [data_vec, valid_data] = pw.get();
if (!skip_func || !skip_func(skip_index, field_id_, 0)) {
if ((!skip_func || !skip_func(skip_index, field_id_, 0)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(0))) {
func(data_vec.data(),
valid_data.data(),
nullptr,
@ -478,7 +492,9 @@ class SegmentExpr : public Expr {
auto valid_result = index_ptr->IsNotNull();
auto batch_size = input->size();
if (!skip_func || !skip_func(skip_index, field_id_, 0)) {
if ((!skip_func || !skip_func(skip_index, field_id_, 0)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(0))) {
for (auto i = 0; i < batch_size; ++i) {
auto offset = (*input)[i];
auto raw = index_ptr->Reverse_Lookup(offset);
@ -544,8 +560,10 @@ class SegmentExpr : public Expr {
chunk_id,
{int32_t(chunk_offset)});
auto [data_vec, valid_data] = pw.get();
if (!skip_func ||
!skip_func(skip_index, field_id_, chunk_id)) {
if ((!skip_func ||
!skip_func(skip_index, field_id_, chunk_id)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(chunk_id))) {
func.template operator()<FilterType::random>(
data_vec.data(),
valid_data.data(),
@ -577,8 +595,10 @@ class SegmentExpr : public Expr {
if (valid_data != nullptr) {
valid_data += chunk_offset;
}
if (!skip_func ||
!skip_func(skip_index, field_id_, chunk_id)) {
if ((!skip_func ||
!skip_func(skip_index, field_id_, chunk_id)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(chunk_id))) {
func.template operator()<FilterType::random>(
data,
valid_data,
@ -607,7 +627,9 @@ class SegmentExpr : public Expr {
auto chunk = pw.get();
const T* data = chunk.data();
const bool* valid_data = chunk.valid_data();
if (!skip_func || !skip_func(skip_index, field_id_, 0)) {
if ((!skip_func || !skip_func(skip_index, field_id_, 0)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(0))) {
func.template operator()<FilterType::random>(data,
valid_data,
input->data(),
@ -633,7 +655,10 @@ class SegmentExpr : public Expr {
if (valid_data != nullptr) {
valid_data += chunk_offset;
}
if (!skip_func || !skip_func(skip_index, field_id_, chunk_id)) {
if ((!skip_func ||
!skip_func(skip_index, field_id_, chunk_id)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(chunk_id))) {
func.template operator()<FilterType::random>(
data,
valid_data,
@ -698,8 +723,9 @@ class SegmentExpr : public Expr {
if (valid_data != nullptr) {
valid_data += data_pos;
}
if (!skip_func || !skip_func(skip_index, field_id_, i)) {
if ((!skip_func || !skip_func(skip_index, field_id_, i)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(i))) {
const T* data = chunk.data() + data_pos;
if constexpr (NeedSegmentOffsets) {
@ -784,7 +810,9 @@ class SegmentExpr : public Expr {
segment_offsets_array[j] = static_cast<int32_t>(offset);
}
auto& skip_index = segment_->GetSkipIndex();
if (!skip_func || !skip_func(skip_index, field_id_, i)) {
if ((!skip_func || !skip_func(skip_index, field_id_, i)) &&
(!namespace_skip_func_.has_value() ||
!namespace_skip_func_.value()(i))) {
bool is_seal = false;
if constexpr (std::is_same_v<T, std::string_view> ||
std::is_same_v<T, Json> ||

View File

@ -431,6 +431,11 @@ class UnaryRangeFilterExpr : public ITypeFilterExpr {
}
}
// Returns a copy of the value this unary range filter compares against.
const proto::plan::GenericValue
GetValue() const {
    return this->val_;
}
public:
const ColumnInfo column_;
const proto::plan::OpType op_type_;