From 52fcd3f48b0150bb3a91c93f55ee77d74ba12161 Mon Sep 17 00:00:00 2001 From: "yihao.dai" Date: Wed, 7 Aug 2024 14:30:17 +0800 Subject: [PATCH] fix: [2.3] Reduce duplicate PKs in segcore (#34267) (#35291) issue: https://github.com/milvus-io/milvus/issues/34247 pr: https://github.com/milvus-io/milvus/pull/34267 Signed-off-by: bigsheeper --- internal/core/src/segcore/InsertRecord.h | 17 ++++++++++------- internal/core/unittest/test_c_api.cpp | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/internal/core/src/segcore/InsertRecord.h b/internal/core/src/segcore/InsertRecord.h index a9d76f334b..efc49f7863 100644 --- a/internal/core/src/segcore/InsertRecord.h +++ b/internal/core/src/segcore/InsertRecord.h @@ -112,6 +112,10 @@ class OffsetOrderedMap : public OffsetMap { bool false_filtered_out) const override { std::shared_lock lck(mtx_); + if (limit == Unlimited || limit == NoLimit) { + limit = map_.size(); + } + // TODO: we can't retrieve pk by offset very conveniently. // Selectivity should be done outside. return find_first_by_index(limit, bitset, false_filtered_out); @@ -128,15 +132,15 @@ class OffsetOrderedMap : public OffsetMap { if (!false_filtered_out) { cnt = size - bitset.count(); } - if (limit == Unlimited || limit == NoLimit) { - limit = cnt; - } limit = std::min(limit, cnt); std::vector seg_offsets; seg_offsets.reserve(limit); auto it = map_.begin(); for (; hit_num < limit && it != map_.end(); it++) { - for (auto seg_offset : it->second) { + // Offsets in the growing segment are ordered by timestamp, + // so traverse from back to front to obtain the latest offset. + for (int i = it->second.size() - 1; i >= 0; --i) { + auto seg_offset = it->second[i]; if (seg_offset >= size) { // Frequently concurrent insert/query will cause this case. continue; @@ -145,9 +149,8 @@ class OffsetOrderedMap : public OffsetMap { if (!(bitset[seg_offset] ^ false_filtered_out)) { seg_offsets.push_back(seg_offset); hit_num++; - if (hit_num >= limit) { - break; - } + // PK hit, no need to continue traversing offsets with the same PK. + break; } } } diff --git a/internal/core/unittest/test_c_api.cpp b/internal/core/unittest/test_c_api.cpp index d8fcd0f03c..61fe16ebc8 100644 --- a/internal/core/unittest/test_c_api.cpp +++ b/internal/core/unittest/test_c_api.cpp @@ -680,7 +680,7 @@ TEST(CApiTest, DeleteRepeatedPksFromGrowingSegment) { auto suc = query_result->ParseFromArray(retrieve_result.proto_blob, retrieve_result.proto_size); ASSERT_TRUE(suc); - ASSERT_EQ(query_result->ids().int_id().data().size(), 6); + ASSERT_EQ(query_result->ids().int_id().data().size(), 3); DeleteRetrieveResult(&retrieve_result); // delete data pks = {1, 2, 3}