mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
enhance: remove timestamp filter for search_ids to optimize performance (#44634)
#44352 Signed-off-by: luzhang <luzhang@zilliz.com> Co-authored-by: luzhang <luzhang@zilliz.com>
This commit is contained in:
parent
9f2937fd0f
commit
ae19c93c14
@ -196,12 +196,8 @@ PhyTermFilterExpr::InitPkCacheOffset() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto seg_offsets = segment_->search_ids(*id_array, query_timestamp_);
|
|
||||||
cached_bits_.resize(active_count_, false);
|
cached_bits_.resize(active_count_, false);
|
||||||
for (const auto& offset : seg_offsets) {
|
segment_->search_ids(cached_bits_, *id_array);
|
||||||
auto _offset = (int64_t)offset.get();
|
|
||||||
cached_bits_[_offset] = true;
|
|
||||||
}
|
|
||||||
cached_bits_inited_ = true;
|
cached_bits_inited_ = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1038,6 +1038,84 @@ ChunkedSegmentSealedImpl::search_pk(milvus::OpContext* op_ctx,
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
ChunkedSegmentSealedImpl::search_pks(BitsetType& bitset,
|
||||||
|
const std::vector<PkType>& pks) const {
|
||||||
|
BitsetTypeView bitset_view(bitset);
|
||||||
|
if (!is_sorted_by_pk_) {
|
||||||
|
for (auto& pk : pks) {
|
||||||
|
insert_record_.search_pk_range(
|
||||||
|
pk, proto::plan::OpType::Equal, bitset_view);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto pk_field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
|
||||||
|
AssertInfo(pk_field_id.get() != -1, "Primary key is -1");
|
||||||
|
auto pk_column = get_column(pk_field_id);
|
||||||
|
AssertInfo(pk_column != nullptr, "primary key column not loaded");
|
||||||
|
|
||||||
|
auto all_chunk_pins = pk_column->GetAllChunks(nullptr);
|
||||||
|
switch (schema_->get_fields().at(pk_field_id).get_data_type()) {
|
||||||
|
case DataType::INT64: {
|
||||||
|
auto num_chunk = pk_column->num_chunks();
|
||||||
|
for (int i = 0; i < num_chunk; ++i) {
|
||||||
|
auto pw = all_chunk_pins[i];
|
||||||
|
auto src =
|
||||||
|
reinterpret_cast<const int64_t*>(pw.get()->RawData());
|
||||||
|
auto chunk_row_num = pk_column->chunk_row_nums(i);
|
||||||
|
for (size_t j = 0; j < pks.size(); j++) {
|
||||||
|
// get int64 pks
|
||||||
|
auto target = std::get<int64_t>(pks[j]);
|
||||||
|
auto it = std::lower_bound(
|
||||||
|
src,
|
||||||
|
src + chunk_row_num,
|
||||||
|
target,
|
||||||
|
[](const int64_t& elem, const int64_t& value) {
|
||||||
|
return elem < value;
|
||||||
|
});
|
||||||
|
auto num_rows_until_chunk =
|
||||||
|
pk_column->GetNumRowsUntilChunk(i);
|
||||||
|
for (; it != src + chunk_row_num && *it == target; ++it) {
|
||||||
|
auto offset = it - src + num_rows_until_chunk;
|
||||||
|
bitset[offset] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case DataType::VARCHAR: {
|
||||||
|
auto num_chunk = pk_column->num_chunks();
|
||||||
|
for (int i = 0; i < num_chunk; ++i) {
|
||||||
|
// TODO @xiaocai2333, @sunby: chunk need to record the min/max.
|
||||||
|
auto num_rows_until_chunk = pk_column->GetNumRowsUntilChunk(i);
|
||||||
|
auto pw = all_chunk_pins[i];
|
||||||
|
auto string_chunk = static_cast<StringChunk*>(pw.get());
|
||||||
|
for (size_t j = 0; j < pks.size(); ++j) {
|
||||||
|
// get varchar pks
|
||||||
|
auto& target = std::get<std::string>(pks[j]);
|
||||||
|
auto offset = string_chunk->binary_search_string(target);
|
||||||
|
for (; offset != -1 && offset < string_chunk->RowNums() &&
|
||||||
|
string_chunk->operator[](offset) == target;
|
||||||
|
++offset) {
|
||||||
|
auto segment_offset = offset + num_rows_until_chunk;
|
||||||
|
bitset[segment_offset] = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
default: {
|
||||||
|
ThrowInfo(
|
||||||
|
DataTypeInvalid,
|
||||||
|
fmt::format(
|
||||||
|
"unsupported type {}",
|
||||||
|
schema_->get_fields().at(pk_field_id).get_data_type()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
ChunkedSegmentSealedImpl::search_batch_pks(
|
ChunkedSegmentSealedImpl::search_batch_pks(
|
||||||
const std::vector<PkType>& pks,
|
const std::vector<PkType>& pks,
|
||||||
@ -2221,9 +2299,9 @@ ChunkedSegmentSealedImpl::GetFieldDataType(milvus::FieldId field_id) const {
|
|||||||
return field_meta.get_data_type();
|
return field_meta.get_data_type();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<SegOffset>
|
void
|
||||||
ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array,
|
ChunkedSegmentSealedImpl::search_ids(BitsetType& bitset,
|
||||||
Timestamp timestamp) const {
|
const IdArray& id_array) const {
|
||||||
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
|
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
|
||||||
AssertInfo(field_id.get() != -1, "Primary key is -1");
|
AssertInfo(field_id.get() != -1, "Primary key is -1");
|
||||||
auto& field_meta = schema_->operator[](field_id);
|
auto& field_meta = schema_->operator[](field_id);
|
||||||
@ -2232,16 +2310,7 @@ ChunkedSegmentSealedImpl::search_ids(const IdArray& id_array,
|
|||||||
std::vector<PkType> pks(ids_size);
|
std::vector<PkType> pks(ids_size);
|
||||||
ParsePksFromIDs(pks, data_type, id_array);
|
ParsePksFromIDs(pks, data_type, id_array);
|
||||||
|
|
||||||
std::vector<SegOffset> res_offsets;
|
this->search_pks(bitset, pks);
|
||||||
res_offsets.reserve(pks.size());
|
|
||||||
this->search_batch_pks(
|
|
||||||
pks,
|
|
||||||
[=](const size_t idx) { return timestamp; },
|
|
||||||
true,
|
|
||||||
[&](const SegOffset offset, const Timestamp ts) {
|
|
||||||
res_offsets.push_back(offset);
|
|
||||||
});
|
|
||||||
return std::move(res_offsets);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SegcoreError
|
SegcoreError
|
||||||
|
|||||||
@ -248,6 +248,9 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
search_pks(BitsetType& bitset, const std::vector<PkType>& pks) const;
|
||||||
|
|
||||||
void
|
void
|
||||||
search_batch_pks(
|
search_batch_pks(
|
||||||
const std::vector<PkType>& pks,
|
const std::vector<PkType>& pks,
|
||||||
@ -478,8 +481,8 @@ class ChunkedSegmentSealedImpl : public SegmentSealed {
|
|||||||
return system_ready_count_ == 1;
|
return system_ready_count_ == 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<SegOffset>
|
void
|
||||||
search_ids(const IdArray& id_array, Timestamp timestamp) const override;
|
search_ids(BitsetType& bitset, const IdArray& id_array) const override;
|
||||||
|
|
||||||
void
|
void
|
||||||
LoadVecIndex(const LoadIndexInfo& info);
|
LoadVecIndex(const LoadIndexInfo& info);
|
||||||
|
|||||||
@ -1164,9 +1164,9 @@ SegmentGrowingImpl::bulk_subscript(milvus::OpContext* op_ctx,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<SegOffset>
|
void
|
||||||
SegmentGrowingImpl::search_ids(const IdArray& id_array,
|
SegmentGrowingImpl::search_ids(BitsetType& bitset,
|
||||||
Timestamp timestamp) const {
|
const IdArray& id_array) const {
|
||||||
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
|
auto field_id = schema_->get_primary_field_id().value_or(FieldId(-1));
|
||||||
AssertInfo(field_id.get() != -1, "Primary key is -1");
|
AssertInfo(field_id.get() != -1, "Primary key is -1");
|
||||||
auto& field_meta = schema_->operator[](field_id);
|
auto& field_meta = schema_->operator[](field_id);
|
||||||
@ -1175,15 +1175,11 @@ SegmentGrowingImpl::search_ids(const IdArray& id_array,
|
|||||||
std::vector<PkType> pks(ids_size);
|
std::vector<PkType> pks(ids_size);
|
||||||
ParsePksFromIDs(pks, data_type, id_array);
|
ParsePksFromIDs(pks, data_type, id_array);
|
||||||
|
|
||||||
std::vector<SegOffset> res_offsets;
|
BitsetTypeView bitset_view(bitset);
|
||||||
res_offsets.reserve(pks.size());
|
|
||||||
for (auto& pk : pks) {
|
for (auto& pk : pks) {
|
||||||
auto segOffsets = insert_record_.search_pk(pk, timestamp);
|
insert_record_.search_pk_range(
|
||||||
for (auto offset : segOffsets) {
|
pk, proto::plan::OpType::Equal, bitset_view);
|
||||||
res_offsets.push_back(offset);
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return std::move(res_offsets);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
|
|||||||
@ -375,8 +375,8 @@ class SegmentGrowingImpl : public SegmentGrowing {
|
|||||||
int64_t ins_barrier,
|
int64_t ins_barrier,
|
||||||
Timestamp timestamp) const override;
|
Timestamp timestamp) const override;
|
||||||
|
|
||||||
std::vector<SegOffset>
|
void
|
||||||
search_ids(const IdArray& id_array, Timestamp timestamp) const override;
|
search_ids(BitsetType& bitset, const IdArray& id_array) const override;
|
||||||
|
|
||||||
bool
|
bool
|
||||||
HasIndex(FieldId field_id) const {
|
HasIndex(FieldId field_id) const {
|
||||||
|
|||||||
@ -434,12 +434,14 @@ class SegmentInternalInterface : public SegmentInterface {
|
|||||||
/**
|
/**
|
||||||
* search offset by possible pk values and mvcc timestamp
|
* search offset by possible pk values and mvcc timestamp
|
||||||
*
|
*
|
||||||
|
* @param bitset The final bitset after id array filtering,
|
||||||
|
* `false` means that the entity will be filtered out.
|
||||||
* @param id_array possible pk values
|
* @param id_array possible pk values
|
||||||
* @param timestamp mvcc timestamp
|
* this interface is used for internal expression calculation,
|
||||||
* @return all the hit entries in vector of offsets
|
* so no timestamp parameter is needed; the mvcc node guarantees that the timestamp has already been filtered.
|
||||||
*/
|
*/
|
||||||
virtual std::vector<SegOffset>
|
virtual void
|
||||||
search_ids(const IdArray& id_array, Timestamp timestamp) const = 0;
|
search_ids(BitsetType& bitset, const IdArray& id_array) const = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Apply timestamp filtering on bitset, the query can't see an entity whose
|
* Apply timestamp filtering on bitset, the query can't see an entity whose
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user