enhance: Utilize search_batch_pks for search_ids of PkTerm (#43751)

Related to #43660

---------

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
This commit is contained in:
congqixia 2025-08-07 14:19:40 +08:00 committed by GitHub
parent b8fe8aed53
commit b6199acb05
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 56 additions and 72 deletions

View File

@@ -34,9 +34,13 @@ PhyJsonContainsFilterExpr::Eval(EvalCtx& context, VectorPtr& result) {
return;
}
if (expr_->op_ == proto::plan::JSONContainsExpr_JSONOp_ContainsAll) {
result = std::make_shared<ColumnVector>(TargetBitmap(real_batch_size, true), TargetBitmap(real_batch_size, true));
result = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, true),
TargetBitmap(real_batch_size, true));
} else {
result = std::make_shared<ColumnVector>(TargetBitmap(real_batch_size, false), TargetBitmap(real_batch_size, true));
result = std::make_shared<ColumnVector>(
TargetBitmap(real_batch_size, false),
TargetBitmap(real_batch_size, true));
}
MoveCursor();
return;

View File

@@ -191,8 +191,7 @@ PhyTermFilterExpr::InitPkCacheOffset() {
}
}
auto [uids, seg_offsets] =
segment_->search_ids(*id_array, query_timestamp_);
auto seg_offsets = segment_->search_ids(*id_array, query_timestamp_);
cached_bits_.resize(active_count_, false);
for (const auto& offset : seg_offsets) {
auto _offset = (int64_t)offset.get();
@@ -540,8 +539,7 @@ PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() {
if (!arg_inited_) {
arg_set_ = std::make_shared<SetElement<ValueType>>(expr_->vals_);
if constexpr (std::is_same_v<GetType, double>) {
arg_set_float_ =
std::make_shared<SetElement<float>>(expr_->vals_);
arg_set_float_ = std::make_shared<SetElement<float>>(expr_->vals_);
}
arg_inited_ = true;
}

View File

@@ -916,17 +916,18 @@ ChunkedSegmentSealedImpl::search_pk(const PkType& pk,
void
ChunkedSegmentSealedImpl::search_batch_pks(
const std::vector<PkType>& pks,
const Timestamp* timestamps,
const std::function<Timestamp(const size_t idx)>& get_timestamp,
bool include_same_ts,
const std::function<void(const SegOffset offset, const Timestamp ts)>&
callback) const {
// handle unsorted case
if (!is_sorted_by_pk_) {
for (size_t i = 0; i < pks.size(); i++) {
auto offsets = insert_record_.search_pk(
pks[i], timestamps[i], include_same_ts);
auto timestamp = get_timestamp(i);
auto offsets =
insert_record_.search_pk(pks[i], timestamp, include_same_ts);
for (auto offset : offsets) {
callback(offset, timestamps[i]);
callback(offset, timestamp);
}
}
return;
@@ -956,7 +957,7 @@ ChunkedSegmentSealedImpl::search_batch_pks(
for (size_t j = 0; j < pks.size(); j++) {
// get int64 pks
auto target = std::get<int64_t>(pks[j]);
auto timestamp = timestamps[j];
auto timestamp = get_timestamp(j);
auto it = std::lower_bound(
src,
src + chunk_row_num,
@@ -988,7 +989,7 @@ ChunkedSegmentSealedImpl::search_batch_pks(
for (size_t j = 0; j < pks.size(); ++j) {
// get varchar pks
auto& target = std::get<std::string>(pks[j]);
auto timestamp = timestamps[j];
auto timestamp = get_timestamp(j);
auto offset = string_chunk->binary_search_string(target);
for (; offset != -1 && offset < string_chunk->RowNums() &&
string_chunk->operator[](offset) == target;
@@ -1142,7 +1143,11 @@ ChunkedSegmentSealedImpl::ChunkedSegmentSealedImpl(
const Timestamp* timestamps,
std::function<void(const SegOffset offset, const Timestamp ts)>
callback) {
this->search_batch_pks(pks, timestamps, false, callback);
this->search_batch_pks(
pks,
[&](const size_t idx) { return timestamps[idx]; },
false,
callback);
},
segment_id) {
auto mcm = storage::MmapManager::GetInstance().GetMmapChunkManager();
@@ -1752,7 +1757,7 @@ ChunkedSegmentSealedImpl::GetFieldDataType(milvus::FieldId field_id) const {
return field_meta.get_data_type();
}
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
std::vector<SegOffset>
ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array,
Timestamp timestamp) const {
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
@@ -1763,37 +1768,16 @@ ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array,
std::vector<PkType> pks(ids_size);
ParsePksFromIDs(pks, data_type, id_array);
auto res_id_arr = std::make_unique<IdArray>();
std::vector<SegOffset> res_offsets;
res_offsets.reserve(pks.size());
for (auto& pk : pks) {
std::vector<SegOffset> pk_offsets;
if (!is_sorted_by_pk_) {
pk_offsets = insert_record_.search_pk(pk, timestamp);
} else {
pk_offsets = search_pk(pk, timestamp);
}
for (auto offset : pk_offsets) {
switch (data_type) {
case DataType::INT64: {
res_id_arr->mutable_int_id()->add_data(
std::get<int64_t>(pk));
break;
}
case DataType::VARCHAR: {
res_id_arr->mutable_str_id()->add_data(
std::get<std::string>(std::move(pk)));
break;
}
default: {
ThrowInfo(DataTypeInvalid,
fmt::format("unsupported type {}", data_type));
}
}
this->search_batch_pks(
pks,
[=](const size_t idx) { return timestamp; },
true,
[&](const SegOffset offset, const Timestamp ts) {
res_offsets.push_back(offset);
}
}
return {std::move(res_id_arr), std::move(res_offsets)};
});
return std::move(res_offsets);
}
SegcoreError

View File

@@ -207,7 +207,7 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
void
search_batch_pks(
const std::vector<PkType>& pks,
const Timestamp* timestamps,
const std::function<Timestamp(const size_t idx)>& get_timestamp,
bool include_same_ts,
const std::function<void(const SegOffset offset, const Timestamp ts)>&
callback) const;
@@ -410,7 +410,7 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
return system_ready_count_ == 1;
}
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
std::vector<SegOffset>
search_ids(const IdArray& id_array, Timestamp timestamp) const override;
void

View File

@@ -118,7 +118,9 @@ class DeletedRecord {
}
}
search_pk_func_(
pks, timestamps, [&](SegOffset offset, Timestamp delete_ts) {
pks,
timestamps,
[&](const SegOffset offset, const Timestamp delete_ts) {
auto row_id = offset.get();
// if already deleted, no need to add new record
if (deleted_mask_.size() > row_id && deleted_mask_[row_id]) {

View File

@@ -1106,7 +1106,7 @@ SegmentGrowingImpl::bulk_subscript(SystemFieldType system_type,
}
}
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
std::vector<SegOffset>
SegmentGrowingImpl::search_ids(const IdArray& id_array,
Timestamp timestamp) const {
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
@@ -1117,32 +1117,15 @@ SegmentGrowingImpl::search_ids(const IdArray& id_array,
std::vector<PkType> pks(ids_size);
ParsePksFromIDs(pks, data_type, id_array);
auto res_id_arr = std::make_unique<IdArray>();
std::vector<SegOffset> res_offsets;
res_offsets.reserve(pks.size());
for (auto& pk : pks) {
auto segOffsets = insert_record_.search_pk(pk, timestamp);
for (auto offset : segOffsets) {
switch (data_type) {
case DataType::INT64: {
res_id_arr->mutable_int_id()->add_data(
std::get<int64_t>(pk));
break;
}
case DataType::VARCHAR: {
res_id_arr->mutable_str_id()->add_data(
std::get<std::string>(std::move(pk)));
break;
}
default: {
ThrowInfo(DataTypeInvalid,
fmt::format("unsupported type {}", data_type));
}
}
res_offsets.push_back(offset);
}
}
return {std::move(res_id_arr), std::move(res_offsets)};
return std::move(res_offsets);
}
std::string

View File

@@ -340,7 +340,7 @@ class SegmentGrowingImpl : public SegmentGrowing {
int64_t ins_barrier,
Timestamp timestamp) const override;
std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
std::vector<SegOffset>
search_ids(const IdArray& id_array, Timestamp timestamp) const override;
bool

View File

@@ -440,7 +440,14 @@ class SegmentInternalInterface : public SegmentInterface {
virtual int64_t
get_active_count(Timestamp ts) const = 0;
virtual std::pair<std::unique_ptr<IdArray>, std::vector<SegOffset>>
/**
* search offset by possible pk values and mvcc timestamp
*
* @param id_array possible pk values
* @param timestamp mvcc timestamp
* @return all the hit entries in vector of offsets
*/
virtual std::vector<SegOffset>
search_ids(const IdArray& id_array, Timestamp timestamp) const = 0;
/**

View File

@@ -45,7 +45,8 @@ TEST(DeleteMVCC, common_case) {
[&insert_record](
const std::vector<PkType>& pks,
const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) {
std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp);
@@ -170,7 +171,8 @@ TEST(DeleteMVCC, delete_exist_duplicate_pks) {
[&insert_record](
const std::vector<PkType>& pks,
const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) {
std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp);
@@ -294,7 +296,8 @@ TEST(DeleteMVCC, snapshot) {
[&insert_record](
const std::vector<PkType>& pks,
const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) {
std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp);
@@ -351,7 +354,8 @@ TEST(DeleteMVCC, insert_after_snapshot) {
[&insert_record](
const std::vector<PkType>& pks,
const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) {
std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp);
@@ -455,7 +459,8 @@ TEST(DeleteMVCC, perform) {
[&insert_record](
const std::vector<PkType>& pks,
const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) {
std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp);

View File

@@ -93,7 +93,8 @@ TEST(Util, GetDeleteBitmap) {
[&insert_record](
const std::vector<PkType>& pks,
const Timestamp* timestamps,
std::function<void(SegOffset offset, Timestamp ts)> cb) {
std::function<void(const SegOffset offset, const Timestamp ts)>
cb) {
for (size_t i = 0; i < pks.size(); ++i) {
auto timestamp = timestamps[i];
auto offsets = insert_record.search_pk(pks[i], timestamp);