enhance: remove timestamp filter for search_ids to optimize performance (#44634)

#44352

Signed-off-by: luzhang <luzhang@zilliz.com>
Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
zhagnlu 2025-10-17 16:10:01 +08:00 committed by GitHub
parent 9f2937fd0f
commit ae19c93c14
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 102 additions and 36 deletions

View File

@ -196,12 +196,8 @@ PhyTermFilterExpr::InitPkCacheOffset() {
} }
} }
auto seg_offsets = segment_->search_ids(*id_array, query_timestamp_);
cached_bits_.resize(active_count_, false); cached_bits_.resize(active_count_, false);
for (const auto& offset : seg_offsets) { segment_->search_ids(cached_bits_, *id_array);
auto _offset = (int64_t)offset.get();
cached_bits_[_offset] = true;
}
cached_bits_inited_ = true; cached_bits_inited_ = true;
} }

View File

@ -1038,6 +1038,84 @@ ChunkedSegmentSealedImpl::search_pk(milvus::OpContext* op_ctx,
}); });
} }
// Set bitset[offset] = true for every row whose primary key equals one of
// `pks`. No timestamp/MVCC filtering is applied here: callers (e.g. the
// term-filter expression path) are expected to have already applied MVCC
// visibility, so this is a pure pk -> offset lookup.
//
// @param bitset output bitset, indexed by segment row offset; only matching
//               offsets are set, other bits are left untouched.
// @param pks    candidate primary-key values (INT64 or VARCHAR variants).
void
ChunkedSegmentSealedImpl::search_pks(BitsetType& bitset,
                                     const std::vector<PkType>& pks) const {
    BitsetTypeView bitset_view(bitset);
    // Segment not sorted by pk: fall back to the insert record's pk lookup,
    // one equality probe per candidate key.
    if (!is_sorted_by_pk_) {
        for (auto& pk : pks) {
            insert_record_.search_pk_range(
                pk, proto::plan::OpType::Equal, bitset_view);
        }
        return;
    }
    auto pk_field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
    AssertInfo(pk_field_id.get() != -1, "Primary key is -1");
    auto pk_column = get_column(pk_field_id);
    AssertInfo(pk_column != nullptr, "primary key column not loaded");
    // Pin every chunk up front so the raw data pointers below stay valid
    // for the duration of the scan.
    auto all_chunk_pins = pk_column->GetAllChunks(nullptr);
    switch (schema_->get_fields().at(pk_field_id).get_data_type()) {
        case DataType::INT64: {
            auto num_chunk = pk_column->num_chunks();
            for (int i = 0; i < num_chunk; ++i) {
                auto pw = all_chunk_pins[i];
                auto src =
                    reinterpret_cast<const int64_t*>(pw.get()->RawData());
                auto chunk_row_num = pk_column->chunk_row_nums(i);
                // Chunk-invariant base offset: hoisted out of the per-pk
                // loop (the VARCHAR branch already does this).
                auto num_rows_until_chunk =
                    pk_column->GetNumRowsUntilChunk(i);
                for (size_t j = 0; j < pks.size(); j++) {
                    auto target = std::get<int64_t>(pks[j]);
                    // The segment is sorted by pk, so binary-search for the
                    // first occurrence, then walk forward over duplicates.
                    auto it =
                        std::lower_bound(src, src + chunk_row_num, target);
                    for (; it != src + chunk_row_num && *it == target; ++it) {
                        auto offset = it - src + num_rows_until_chunk;
                        bitset[offset] = true;
                    }
                }
            }
            break;
        }
        case DataType::VARCHAR: {
            auto num_chunk = pk_column->num_chunks();
            for (int i = 0; i < num_chunk; ++i) {
                // TODO @xiaocai2333, @sunby: chunk need to record the min/max.
                auto num_rows_until_chunk = pk_column->GetNumRowsUntilChunk(i);
                auto pw = all_chunk_pins[i];
                auto string_chunk = static_cast<StringChunk*>(pw.get());
                for (size_t j = 0; j < pks.size(); ++j) {
                    auto& target = std::get<std::string>(pks[j]);
                    // binary_search_string returns the first matching row
                    // within the chunk, or -1 when absent; walk forward over
                    // duplicates of the same key.
                    auto offset = string_chunk->binary_search_string(target);
                    for (; offset != -1 && offset < string_chunk->RowNums() &&
                           string_chunk->operator[](offset) == target;
                         ++offset) {
                        auto segment_offset = offset + num_rows_until_chunk;
                        bitset[segment_offset] = true;
                    }
                }
            }
            break;
        }
        default: {
            ThrowInfo(
                DataTypeInvalid,
                fmt::format(
                    "unsupported type {}",
                    schema_->get_fields().at(pk_field_id).get_data_type()));
        }
    }
}
void void
ChunkedSegmentSealedImpl::search_batch_pks( ChunkedSegmentSealedImpl::search_batch_pks(
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
@ -2221,9 +2299,9 @@ ChunkedSegmentSealedImpl::GetFieldDataType(milvus::FieldId field_id) const {
return field_meta.get_data_type(); return field_meta.get_data_type();
} }
std::vector<SegOffset> void
ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array, ChunkedSegmentSealedImpl::search_ids(BitsetType& bitset,
Timestamp timestamp) const { const IdArray& id_array) const {
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1)); auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
AssertInfo(field_id.get() != -1, "Primary key is -1"); AssertInfo(field_id.get() != -1, "Primary key is -1");
auto& field_meta = schema_->operator[](field_id); auto& field_meta = schema_->operator[](field_id);
@ -2232,16 +2310,7 @@ ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array,
std::vector<PkType> pks(ids_size); std::vector<PkType> pks(ids_size);
ParsePksFromIDs(pks, data_type, id_array); ParsePksFromIDs(pks, data_type, id_array);
std::vector<SegOffset> res_offsets; this->search_pks(bitset, pks);
res_offsets.reserve(pks.size());
this->search_batch_pks(
pks,
[=](const size_t idx) { return timestamp; },
true,
[&](const SegOffset offset, const Timestamp ts) {
res_offsets.push_back(offset);
});
return std::move(res_offsets);
} }
SegcoreError SegcoreError

View File

@ -248,6 +248,9 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
return true; return true;
} }
void
search_pks(BitsetType& bitset, const std::vector<PkType>& pks) const;
void void
search_batch_pks( search_batch_pks(
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
@ -478,8 +481,8 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
return system_ready_count_ == 1; return system_ready_count_ == 1;
} }
std::vector<SegOffset> void
search_ids(const IdArray& id_array, Timestamp timestamp) const override; search_ids(BitsetType& bitset, const IdArray& id_array) const override;
void void
LoadVecIndex(const LoadIndexInfo& info); LoadVecIndex(const LoadIndexInfo& info);

View File

@ -1164,9 +1164,9 @@ SegmentGrowingImpl::bulk_subscript(milvus::OpContext* op_ctx,
} }
} }
std::vector<SegOffset> void
SegmentGrowingImpl::search_ids(const IdArray& id_array, SegmentGrowingImpl::search_ids(BitsetType& bitset,
Timestamp timestamp) const { const IdArray& id_array) const {
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1)); auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
AssertInfo(field_id.get() != -1, "Primary key is -1"); AssertInfo(field_id.get() != -1, "Primary key is -1");
auto& field_meta = schema_->operator[](field_id); auto& field_meta = schema_->operator[](field_id);
@ -1175,15 +1175,11 @@ SegmentGrowingImpl::search_ids(const IdArray& id_array,
std::vector<PkType> pks(ids_size); std::vector<PkType> pks(ids_size);
ParsePksFromIDs(pks, data_type, id_array); ParsePksFromIDs(pks, data_type, id_array);
std::vector<SegOffset> res_offsets; BitsetTypeView bitset_view(bitset);
res_offsets.reserve(pks.size());
for (auto& pk : pks) { for (auto& pk : pks) {
auto segOffsets = insert_record_.search_pk(pk, timestamp); insert_record_.search_pk_range(
for (auto offset : segOffsets) { pk, proto::plan::OpType::Equal, bitset_view);
res_offsets.push_back(offset);
} }
}
return std::move(res_offsets);
} }
std::string std::string

View File

@ -375,8 +375,8 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t ins_barrier, int64_t ins_barrier,
Timestamp timestamp) const override; Timestamp timestamp) const override;
std::vector<SegOffset> void
search_ids(const IdArray& id_array, Timestamp timestamp) const override; search_ids(BitsetType& bitset, const IdArray& id_array) const override;
bool bool
HasIndex(FieldId field_id) const { HasIndex(FieldId field_id) const {

View File

@ -434,12 +434,14 @@ class SegmentInternalInterface : public SegmentInterface {
/** /**
* search offset by possible pk values and mvcc timestamp * search offset by possible pk values and mvcc timestamp
* *
* @param bitset The final bitset after id array filtering,
* `false` means that the entity will be filtered out.
* @param id_array possible pk values * @param id_array possible pk values
* @param timestamp mvcc timestamp * this interface is used for internal expression calculation,
* @return all the hit entries in vector of offsets * so there is no need for a timestamp parameter; the mvcc node guarantees the timestamp is already filtered.
*/ */
virtual std::vector<SegOffset> virtual void
search_ids(const IdArray& id_array, Timestamp timestamp) const = 0; search_ids(BitsetType& bitset, const IdArray& id_array) const = 0;
/** /**
* Apply timestamp filtering on bitset, the query can't see an entity whose * Apply timestamp filtering on bitset, the query can't see an entity whose