mirror of
https://gitee.com/milvus-io/milvus.git
synced 2025-12-07 01:28:27 +08:00
fix: non-init seg_offset for growing raw-data when doing groupby (#34748)
related: #34713
Signed-off-by: MrPresent-Han <chun.han@gmail.com>
Co-authored-by: MrPresent-Han <chun.han@gmail.com>
This commit is contained in:
parent
e4e18cb8c3
commit
ed057e6fce
@ -106,8 +106,11 @@ struct VectorIterator {
|
|||||||
int idx = 0;
|
int idx = 0;
|
||||||
for (auto& iter : iterators_) {
|
for (auto& iter : iterators_) {
|
||||||
if (iter->HasNext()) {
|
if (iter->HasNext()) {
|
||||||
|
auto origin_pair = iter->Next();
|
||||||
|
origin_pair.first = convert_to_segment_offset(
|
||||||
|
origin_pair.first, idx);
|
||||||
auto off_dis_pair =
|
auto off_dis_pair =
|
||||||
std::make_shared<OffsetDisPair>(iter->Next(), idx++);
|
std::make_shared<OffsetDisPair>(origin_pair, idx++);
|
||||||
heap_.push(off_dis_pair);
|
heap_.push(off_dis_pair);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -650,91 +650,91 @@ TEST(GroupBY, Reduce) {
|
|||||||
DeleteSegment(c_segment_2);
|
DeleteSegment(c_segment_2);
|
||||||
}
|
}
|
||||||
|
|
||||||
//TEST(GroupBY, GrowingRawData) {
|
TEST(GroupBY, GrowingRawData) {
|
||||||
// //0. set up growing segment
|
//0. set up growing segment
|
||||||
// int dim = 128;
|
int dim = 128;
|
||||||
// uint64_t seed = 512;
|
uint64_t seed = 512;
|
||||||
// auto schema = std::make_shared<Schema>();
|
auto schema = std::make_shared<Schema>();
|
||||||
// auto metric_type = knowhere::metric::L2;
|
auto metric_type = knowhere::metric::L2;
|
||||||
// auto int64_field_id = schema->AddDebugField("int64", DataType::INT64);
|
auto int64_field_id = schema->AddDebugField("int64", DataType::INT64);
|
||||||
// auto int32_field_id = schema->AddDebugField("int32", DataType::INT32);
|
auto int32_field_id = schema->AddDebugField("int32", DataType::INT32);
|
||||||
// auto vec_field_id = schema->AddDebugField(
|
auto vec_field_id = schema->AddDebugField(
|
||||||
// "embeddings", DataType::VECTOR_FLOAT, 128, metric_type);
|
"embeddings", DataType::VECTOR_FLOAT, 128, metric_type);
|
||||||
// schema->set_primary_field_id(int64_field_id);
|
schema->set_primary_field_id(int64_field_id);
|
||||||
//
|
|
||||||
// auto config = SegcoreConfig::default_config();
|
auto config = SegcoreConfig::default_config();
|
||||||
// config.set_chunk_rows(128);
|
config.set_chunk_rows(128);
|
||||||
// config.set_enable_interim_segment_index(
|
config.set_enable_interim_segment_index(
|
||||||
// false); //no growing index, test brute force
|
false); //no growing index, test brute force
|
||||||
// auto segment_growing = CreateGrowingSegment(schema, nullptr, 1, config);
|
auto segment_growing = CreateGrowingSegment(schema, nullptr, 1, config);
|
||||||
// auto segment_growing_impl =
|
auto segment_growing_impl =
|
||||||
// dynamic_cast<SegmentGrowingImpl*>(segment_growing.get());
|
dynamic_cast<SegmentGrowingImpl*>(segment_growing.get());
|
||||||
//
|
|
||||||
// //1. prepare raw data in growing segment
|
//1. prepare raw data in growing segment
|
||||||
// int64_t rows_per_batch = 512;
|
int64_t rows_per_batch = 512;
|
||||||
// int n_batch = 3;
|
int n_batch = 3;
|
||||||
// for (int i = 0; i < n_batch; i++) {
|
for (int i = 0; i < n_batch; i++) {
|
||||||
// auto data_set =
|
auto data_set =
|
||||||
// DataGen(schema, rows_per_batch, 42, 0, 8, 10, false, false);
|
DataGen(schema, rows_per_batch, 42, 0, 8, 10, false, false);
|
||||||
// auto offset = segment_growing_impl->PreInsert(rows_per_batch);
|
auto offset = segment_growing_impl->PreInsert(rows_per_batch);
|
||||||
// segment_growing_impl->Insert(offset,
|
segment_growing_impl->Insert(offset,
|
||||||
// rows_per_batch,
|
rows_per_batch,
|
||||||
// data_set.row_ids_.data(),
|
data_set.row_ids_.data(),
|
||||||
// data_set.timestamps_.data(),
|
data_set.timestamps_.data(),
|
||||||
// data_set.raw_);
|
data_set.raw_);
|
||||||
// }
|
}
|
||||||
//
|
|
||||||
// //2. Search group by
|
//2. Search group by
|
||||||
// auto num_queries = 10;
|
auto num_queries = 10;
|
||||||
// auto topK = 100;
|
auto topK = 100;
|
||||||
// int group_size = 1;
|
int group_size = 1;
|
||||||
// const char* raw_plan = R"(vector_anns: <
|
const char* raw_plan = R"(vector_anns: <
|
||||||
// field_id: 102
|
field_id: 102
|
||||||
// query_info: <
|
query_info: <
|
||||||
// topk: 100
|
topk: 100
|
||||||
// metric_type: "L2"
|
metric_type: "L2"
|
||||||
// search_params: "{\"ef\": 10}"
|
search_params: "{\"ef\": 10}"
|
||||||
// group_by_field_id: 101
|
group_by_field_id: 101
|
||||||
// group_size: 1
|
group_size: 1
|
||||||
// >
|
>
|
||||||
// placeholder_tag: "$0"
|
placeholder_tag: "$0"
|
||||||
//
|
|
||||||
// >)";
|
>)";
|
||||||
// auto plan_str = translate_text_plan_to_binary_plan(raw_plan);
|
auto plan_str = translate_text_plan_to_binary_plan(raw_plan);
|
||||||
// auto plan =
|
auto plan =
|
||||||
// CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size());
|
CreateSearchPlanByExpr(*schema, plan_str.data(), plan_str.size());
|
||||||
// auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, seed);
|
auto ph_group_raw = CreatePlaceholderGroup(num_queries, dim, seed);
|
||||||
// auto ph_group =
|
auto ph_group =
|
||||||
// ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
|
ParsePlaceholderGroup(plan.get(), ph_group_raw.SerializeAsString());
|
||||||
// auto search_result =
|
auto search_result =
|
||||||
// segment_growing_impl->Search(plan.get(), ph_group.get(), 1L << 63);
|
segment_growing_impl->Search(plan.get(), ph_group.get(), 1L << 63);
|
||||||
// CheckGroupBySearchResult(*search_result, topK, num_queries, true);
|
CheckGroupBySearchResult(*search_result, topK, num_queries, true);
|
||||||
//
|
|
||||||
// auto& group_by_values = search_result->group_by_values_.value();
|
auto& group_by_values = search_result->group_by_values_.value();
|
||||||
// int size = group_by_values.size();
|
int size = group_by_values.size();
|
||||||
// ASSERT_EQ(size, 640);
|
ASSERT_EQ(size, 640);
|
||||||
// //as the number of data is 512 and repeated count is 8, the group number is 64 for every query
|
//as the number of data is 512 and repeated count is 8, the group number is 64 for every query
|
||||||
// //and the total group number should be 640
|
//and the total group number should be 640
|
||||||
// int expected_group_count = 64;
|
int expected_group_count = 64;
|
||||||
// int idx = 0;
|
int idx = 0;
|
||||||
// for (int i = 0; i < num_queries; i++) {
|
for (int i = 0; i < num_queries; i++) {
|
||||||
// std::unordered_set<int32_t> i32_set;
|
std::unordered_set<int32_t> i32_set;
|
||||||
// float lastDistance = 0.0;
|
float lastDistance = 0.0;
|
||||||
// for (int j = 0; j < expected_group_count; j++) {
|
for (int j = 0; j < expected_group_count; j++) {
|
||||||
// if (std::holds_alternative<int32_t>(group_by_values[idx])) {
|
if (std::holds_alternative<int32_t>(group_by_values[idx])) {
|
||||||
// int32_t g_val = std::get<int32_t>(group_by_values[idx]);
|
int32_t g_val = std::get<int32_t>(group_by_values[idx]);
|
||||||
// ASSERT_FALSE(
|
ASSERT_FALSE(
|
||||||
// i32_set.count(g_val) >
|
i32_set.count(g_val) >
|
||||||
// 0); //as the group_size is 1, there should not be any duplication for group_by value
|
0); //as the group_size is 1, there should not be any duplication for group_by value
|
||||||
// i32_set.insert(g_val);
|
i32_set.insert(g_val);
|
||||||
// auto distance = search_result->distances_.at(idx);
|
auto distance = search_result->distances_.at(idx);
|
||||||
// ASSERT_TRUE(lastDistance <= distance);
|
ASSERT_TRUE(lastDistance <= distance);
|
||||||
// lastDistance = distance;
|
lastDistance = distance;
|
||||||
// }
|
}
|
||||||
// idx++;
|
idx++;
|
||||||
// }
|
}
|
||||||
// }
|
}
|
||||||
//}
|
}
|
||||||
|
|
||||||
TEST(GroupBY, GrowingIndex) {
|
TEST(GroupBY, GrowingIndex) {
|
||||||
//0. set up growing segment
|
//0. set up growing segment
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user