enhance: Utilize search_batch_pks for search_ids of PkTerm (#43751)

Related to #43660

---------

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
This commit is contained in:
congqixia 2025-08-07 14:19:40 +08:00 committed by GitHub
parent b8fe8aed53
commit b6199acb05
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 56 additions and 72 deletions

View File

@ -34,9 +34,13 @@ PhyJsonContainsFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
return; return;
} }
if (expr_->op_ == proto::plan::JSONContainsExpr_JSONOp_ContainsAll) { if (expr_->op_ == proto::plan::JSONContainsExpr_JSONOp_ContainsAll) {
result = std::make_shared<ColumnVector>(TargetBitmap(real_batch_size, true), TargetBitmap(real_batch_size, true)); result = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, true),
TargetBitmap(real_batch_size, true));
} else { } else {
result = std::make_shared<ColumnVector>(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true)); result = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
} }
MoveCursor(); MoveCursor();
return; return;

View File

@ -191,8 +191,7 @@ PhyTermFilterExpr::InitPkCacheOffset() {
} }
} }
auto [uids, seg_offsets] = auto seg_offsets = segment_->search_ids(*id_array, query_timestamp_);
segment_->search_ids(*id_array, query_timestamp_);
cached_bits_.resize(active_count_, false); cached_bits_.resize(active_count_, false);
for (const auto& offset : seg_offsets) { for (const auto& offset : seg_offsets) {
auto _offset = (int64_t)offset.get(); auto _offset = (int64_t)offset.get();
@ -540,8 +539,7 @@ PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() {
if (!arg_inited_) { if (!arg_inited_) {
arg_set_ = std::make_shared<SetElement<ValueType>>(expr_->vals_); arg_set_ = std::make_shared<SetElement<ValueType>>(expr_->vals_);
if constexpr (std::is_same_v<GetType, double>) { if constexpr (std::is_same_v<GetType, double>) {
arg_set_float_ = arg_set_float_ = std::make_shared<SetElement<float>>(expr_->vals_);
std::make_shared<SetElement<float>>(expr_->vals_);
} }
arg_inited_ = true; arg_inited_ = true;
} }

View File

@ -916,17 +916,18 @@ ChunkedSegmentSealedImpl::search_pk(const PkType& pk,
void void
ChunkedSegmentSealedImpl::search_batch_pks( ChunkedSegmentSealedImpl::search_batch_pks(
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
const Timestamp* timestamps, const std::function<Timestamp(const size_t idx)>& get_timestamp,
bool include_same_ts, bool include_same_ts,
const std::function<void(const SegOffset offset, const Timestamp ts)>& const std::function<void(const SegOffset offset, const Timestamp ts)>&
callback) const { callback) const {
// handle unsorted case // handle unsorted case
if (!is_sorted_by_pk_) { if (!is_sorted_by_pk_) {
for (size_t i = 0; i < pks.size(); i++) { for (size_t i = 0; i < pks.size(); i++) {
auto offsets = insert_record_.search_pk( auto timestamp = get_timestamp(i);
pks[i], timestamps[i], include_same_ts); auto offsets =
insert_record_.search_pk(pks[i], timestamp, include_same_ts);
for (auto offset : offsets) { for (auto offset : offsets) {
callback(offset, timestamps[i]); callback(offset, timestamp);
} }
} }
return; return;
@ -956,7 +957,7 @@ ChunkedSegmentSealedImpl::search_batch_pks(
for (size_t j = 0; j < pks.size(); j++) { for (size_t j = 0; j < pks.size(); j++) {
// get int64 pks // get int64 pks
auto target = std::get<int64_t>(pks[j]); auto target = std::get<int64_t>(pks[j]);
auto timestamp = timestamps[j]; auto timestamp = get_timestamp(j);
auto it = std::lower_bound( auto it = std::lower_bound(
src, src,
src + chunk_row_num, src + chunk_row_num,
@ -988,7 +989,7 @@ ChunkedSegmentSealedImpl::search_batch_pks(
for (size_t j = 0; j < pks.size(); ++j) { for (size_t j = 0; j < pks.size(); ++j) {
// get varchar pks // get varchar pks
auto& target = std::get<std::string>(pks[j]); auto& target = std::get<std::string>(pks[j]);
auto timestamp = timestamps[j]; auto timestamp = get_timestamp(j);
auto offset = string_chunk->binary_search_string(target); auto offset = string_chunk->binary_search_string(target);
for (; offset != -1 && offset < string_chunk->RowNums() && for (; offset != -1 && offset < string_chunk->RowNums() &&
string_chunk->operator[](offset) == target; string_chunk->operator[](offset) == target;
@ -1142,7 +1143,11 @@ ChunkedSegmentSealedImpl::ChunkedSegmentSealedImpl(
const Timestamp* timestamps, const Timestamp* timestamps,
std::function<void(const SegOffset offset, const Timestamp ts)> std::function<void(const SegOffset offset, const Timestamp ts)>
callback) { callback) {
this->search_batch_pks(pks, timestamps, false, callback); this->search_batch_pks(
pks,
[&](const size_t idx) { return timestamps[idx]; },
false,
callback);
}, },
segment_id) { segment_id) {
auto mcm = storage::MmapManager::GetInstance().GetMmapChunkManager(); auto mcm = storage::MmapManager::GetInstance().GetMmapChunkManager();
@ -1752,7 +1757,7 @@ ChunkedSegmentSealedImpl::GetFieldDataType(milvus::FieldId field_id) const {
return field_meta.get_data_type(); return field_meta.get_data_type();
} }
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>> std::vector<SegOffset>
ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array, ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array,
Timestamp timestamp) const { Timestamp timestamp) const {
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1)); auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
@ -1763,37 +1768,16 @@ ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array,
std::vector<PkType> pks(ids_size); std::vector<PkType> pks(ids_size);
ParsePksFromIDs(pks, data_type, id_array); ParsePksFromIDs(pks, data_type, id_array);
auto res_id_arr = std::make_unique<IdArray>();
std::vector<SegOffset> res_offsets; std::vector<SegOffset> res_offsets;
res_offsets.reserve(pks.size()); res_offsets.reserve(pks.size());
for (auto& pk : pks) { this->search_batch_pks(
std::vector<SegOffset> pk_offsets; pks,
if (!is_sorted_by_pk_) { [=](const size_t idx) { return timestamp; },
pk_offsets = insert_record_.search_pk(pk, timestamp); true,
} else { [&](const SegOffset offset, const Timestamp ts) {
pk_offsets = search_pk(pk, timestamp);
}
for (auto offset : pk_offsets) {
switch (data_type) {
case DataType::INT64: {
res_id_arr->mutable_int_id()->add_data(
std::get<int64_t>(pk));
break;
}
case DataType::VARCHAR: {
res_id_arr->mutable_str_id()->add_data(
std::get<std::string>(std::move(pk)));
break;
}
default: {
ThrowInfo(DataTypeInvalid,
fmt::format("unsupported type {}", data_type));
}
}
res_offsets.push_back(offset); res_offsets.push_back(offset);
} });
} return std::move(res_offsets);
return {std::move(res_id_arr), std::move(res_offsets)};
} }
SegcoreError SegcoreError

View File

@ -207,7 +207,7 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
void void
search_batch_pks( search_batch_pks(
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
const Timestamp* timestamps, const std::function<Timestamp(const size_t idx)>& get_timestamp,
bool include_same_ts, bool include_same_ts,
const std::function<void(const SegOffset offset, const Timestamp ts)>& const std::function<void(const SegOffset offset, const Timestamp ts)>&
callback) const; callback) const;
@ -410,7 +410,7 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
return system_ready_count_ == 1; return system_ready_count_ == 1;
} }
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>> std::vector<SegOffset>
search_ids(const IdArray& id_array, Timestamp timestamp) const override; search_ids(const IdArray& id_array, Timestamp timestamp) const override;
void void

View File

@ -118,7 +118,9 @@ class DeletedRecord {
} }
} }
search_pk_func_( search_pk_func_(
pks, timestamps, [&](SegOffset offset, Timestamp delete_ts) { pks,
timestamps,
[&](const SegOffset offset, const Timestamp delete_ts) {
auto row_id = offset.get(); auto row_id = offset.get();
// if already deleted, no need to add new record // if already deleted, no need to add new record
if (deleted_mask_.size() > row_id && deleted_mask_[row_id]) { if (deleted_mask_.size() > row_id && deleted_mask_[row_id]) {

View File

@ -1106,7 +1106,7 @@ SegmentGrowingImpl::bulk_subscript(SystemFieldType system_type,
} }
} }
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>> std::vector<SegOffset>
SegmentGrowingImpl::search_ids(const IdArray& id_array, SegmentGrowingImpl::search_ids(const IdArray& id_array,
Timestamp timestamp) const { Timestamp timestamp) const {
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1)); auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
@ -1117,32 +1117,15 @@ SegmentGrowingImpl::search_ids(const IdArray& id_array,
std::vector<PkType> pks(ids_size); std::vector<PkType> pks(ids_size);
ParsePksFromIDs(pks, data_type, id_array); ParsePksFromIDs(pks, data_type, id_array);
auto res_id_arr = std::make_unique<IdArray>();
std::vector<SegOffset> res_offsets; std::vector<SegOffset> res_offsets;
res_offsets.reserve(pks.size()); res_offsets.reserve(pks.size());
for (auto& pk : pks) { for (auto& pk : pks) {
auto segOffsets = insert_record_.search_pk(pk, timestamp); auto segOffsets = insert_record_.search_pk(pk, timestamp);
for (auto offset : segOffsets) { for (auto offset : segOffsets) {
switch (data_type) {
case DataType::INT64: {
res_id_arr->mutable_int_id()->add_data(
std::get<int64_t>(pk));
break;
}
case DataType::VARCHAR: {
res_id_arr->mutable_str_id()->add_data(
std::get<std::string>(std::move(pk)));
break;
}
default: {
ThrowInfo(DataTypeInvalid,
fmt::format("unsupported type {}", data_type));
}
}
res_offsets.push_back(offset); res_offsets.push_back(offset);
} }
} }
return {std::move(res_id_arr), std::move(res_offsets)}; return std::move(res_offsets);
} }
std::string std::string

View File

@ -340,7 +340,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t ins_barrier, int64_t ins_barrier,
Timestamp timestamp) const override; Timestamp timestamp) const override;
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>> std::vector<SegOffset>
search_ids(const IdArray& id_array, Timestamp timestamp) const override; search_ids(const IdArray& id_array, Timestamp timestamp) const override;
bool bool

View File

@ -440,7 +440,14 @@ class SegmentInternalInterface : public SegmentInterface {
virtual int64_t virtual int64_t
get_active_count(Timestamp ts) const = 0; get_active_count(Timestamp ts) const = 0;
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>> /**
* search offset by possible pk values and mvcc timestamp
*
* @param id_array possible pk values
* @param timestamp mvcc timestamp
* @return all the hit entries in vector of offsets
*/
virtual std::vector<SegOffset>
search_ids(const IdArray& id_array, Timestamp timestamp) const = 0; search_ids(const IdArray& id_array, Timestamp timestamp) const = 0;
/** /**

View File

@ -45,7 +45,8 @@ TEST(DeleteMVCC, common_case) {
[&insert_record]( [&insert_record](
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
const Timestamp* timestamps, const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) { std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) { for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i]; auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp); auto offsets = insert_record.search_pk(pks[i], timestamp);
@ -170,7 +171,8 @@ TEST(DeleteMVCC, delete_exist_duplicate_pks) {
[&insert_record]( [&insert_record](
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
const Timestamp* timestamps, const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) { std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) { for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i]; auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp); auto offsets = insert_record.search_pk(pks[i], timestamp);
@ -294,7 +296,8 @@ TEST(DeleteMVCC, snapshot) {
[&insert_record]( [&insert_record](
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
const Timestamp* timestamps, const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) { std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) { for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i]; auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp); auto offsets = insert_record.search_pk(pks[i], timestamp);
@ -351,7 +354,8 @@ TEST(DeleteMVCC, insert_after_snapshot) {
[&insert_record]( [&insert_record](
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
const Timestamp* timestamps, const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) { std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) { for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i]; auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp); auto offsets = insert_record.search_pk(pks[i], timestamp);
@ -455,7 +459,8 @@ TEST(DeleteMVCC, perform) {
[&insert_record]( [&insert_record](
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
const Timestamp* timestamps, const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) { std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) { for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i]; auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp); auto offsets = insert_record.search_pk(pks[i], timestamp);

View File

@ -93,7 +93,8 @@ TEST(Util, GetDeleteBitmap) {
[&insert_record]( [&insert_record](
const std::vector<PkType>& pks, const std::vector<PkType>& pks,
const Timestamp* timestamps, const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) { std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) { for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i]; auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp); auto offsets = insert_record.search_pk(pks[i], timestamp);