fix: [2.3] Reduce duplicate PKs in segcore (#34267) (#35291)

issue: https://github.com/milvus-io/milvus/issues/34247

pr: https://github.com/milvus-io/milvus/pull/34267

Signed-off-by: bigsheeper <yihao.dai@zilliz.com>
This commit is contained in:
yihao.dai 2024-08-07 14:30:17 +08:00 committed by GitHub
parent b59ba81349
commit 52fcd3f48b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 11 additions and 8 deletions

View File

@@ -112,6 +112,10 @@ class OffsetOrderedMap : public OffsetMap {
bool false_filtered_out) const override {
std::shared_lock<std::shared_mutex> lck(mtx_);
if (limit == Unlimited || limit == NoLimit) {
limit = map_.size();
}
// TODO: we can't retrieve pk by offset very conveniently.
// Selectivity should be done outside.
return find_first_by_index(limit, bitset, false_filtered_out);
@@ -128,15 +132,15 @@ class OffsetOrderedMap : public OffsetMap {
if (!false_filtered_out) {
cnt = size - bitset.count();
}
if (limit == Unlimited || limit == NoLimit) {
limit = cnt;
}
limit = std::min(limit, cnt);
std::vector<int64_t> seg_offsets;
seg_offsets.reserve(limit);
auto it = map_.begin();
for (; hit_num < limit && it != map_.end(); it++) {
for (auto seg_offset : it->second) {
// Offsets in the growing segment are ordered by timestamp,
// so traverse from back to front to obtain the latest offset.
for (int i = it->second.size() - 1; i >= 0; --i) {
auto seg_offset = it->second[i];
if (seg_offset >= size) {
// Frequently concurrent insert/query will cause this case.
continue;
@@ -145,9 +149,8 @@ class OffsetOrderedMap : public OffsetMap {
if (!(bitset[seg_offset] ^ false_filtered_out)) {
seg_offsets.push_back(seg_offset);
hit_num++;
if (hit_num >= limit) {
break;
}
// PK hit, no need to continue traversing offsets with the same PK.
break;
}
}
}

View File

@@ -680,7 +680,7 @@ TEST(CApiTest, DeleteRepeatedPksFromGrowingSegment) {
auto suc = query_result->ParseFromArray(retrieve_result.proto_blob,
retrieve_result.proto_size);
ASSERT_TRUE(suc);
ASSERT_EQ(query_result->ids().int_id().data().size(), 6);
ASSERT_EQ(query_result->ids().int_id().data().size(), 3);
DeleteRetrieveResult(&retrieve_result);
// delete data pks = {1, 2, 3}