fix: storage v2 write mmap file per field per cell (#42180)
Each cell of a field should be written to its own mmap file, rather than writing all cells of the field into a single mmap file.

related: #39173

Signed-off-by: shaoting-huang <shaoting.huang@zilliz.com>
This commit is contained in:
parent
6e16653597
commit
b136f85ca0
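
The layout change can be summarized as follows: before this fix, every field of a column group was backed by one mmap file at mmap_dir_path/<segment_id>/<field_id>, and each cell (chunk) of the field was appended to that file at a running offset; after the fix, each cell gets its own file at mmap_dir_path/<segment_id>/<field_id>/<cid> and is always written from offset 0. Below is a minimal sketch of the two path schemes, using the same identifiers that appear in the diff but with hypothetical example values; it is an illustration, not code from the translator itself.

#include <cstdint>
#include <filesystem>
#include <iostream>
#include <string>

int main() {
    // Hypothetical example values; in the translator these come from
    // column_group_info_.mmap_dir_path, segment_id_, field_id and cid.
    std::string mmap_dir_path = "/var/lib/milvus/mmap";
    int64_t segment_id = 449;
    int64_t field_id = 101;
    int64_t cid = 3;  // cell (chunk) id within the column group

    // Old scheme: one file per field; all cells appended at file_offsets[i].
    auto per_field = std::filesystem::path(mmap_dir_path) /
                     std::to_string(segment_id) / std::to_string(field_id);

    // New scheme: one file per field per cell; each cell written from offset 0.
    auto per_cell = per_field / std::to_string(cid);

    std::cout << per_field << "\n" << per_cell << "\n";
    return 0;
}
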
@@ -82,12 +82,6 @@ GroupChunkTranslator::GroupChunkTranslator(
}

GroupChunkTranslator::~GroupChunkTranslator() {
    for (auto chunk : group_chunks_) {
        if (chunk != nullptr) {
            // let the GroupChunk to be deleted by the unique_ptr
            auto chunk_ptr = std::unique_ptr<GroupChunk>(chunk);
        }
    }
}

size_t
@@ -166,77 +160,31 @@ GroupChunkTranslator::get_cells(const std::vector<cachinglayer::cid_t>& cids) {
    LOG_INFO("segment {} submits load fields {} task to thread pool",
             segment_id_,
             field_id_list_.ToString());
    if (!use_mmap_) {
        load_column_group_in_memory();
    } else {
        load_column_group_in_mmap();
    }

    for (auto cid : cids) {
        AssertInfo(group_chunks_[cid] != nullptr,
                   "GroupChunkTranslator::get_cells failed to load cell {} of "
                   "CacheSlot {}.",
                   cid,
                   key_);
        cells.emplace_back(
            cid, std::unique_ptr<milvus::GroupChunk>(group_chunks_[cid]));
        group_chunks_[cid] = nullptr;
    std::shared_ptr<milvus::ArrowDataWrapper> r;
    int64_t cid_idx = 0;
    int64_t total_tables = 0;
    while (column_group_info_.arrow_reader_channel->pop(r)) {
        for (const auto& table : r->arrow_tables) {
            AssertInfo(cid_idx < cids.size(),
                       "Number of tables exceed number of cids ({})",
                       cids.size());
            auto cid = cids[cid_idx++];
            cells.emplace_back(cid, load_group_chunk(table, cid));
            total_tables++;
        }
    }
    AssertInfo(total_tables == cids.size(),
               "Number of tables ({}) does not match number of cids ({})",
               total_tables,
               cids.size());
    return cells;
}
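
The core behavioral change in get_cells is that cells are now built directly from the Arrow tables streamed off the reader channel, one GroupChunk per cell id, instead of being staged in the group_chunks_ member and handed out afterwards. The following is a simplified, self-contained sketch of that streaming pattern, with a std::queue standing in for the reader channel and plain strings standing in for GroupChunk; all names here are illustrative, not the translator's actual types.

#include <cstdint>
#include <queue>
#include <string>
#include <utility>
#include <vector>

using cid_t = int64_t;

// Stand-in for one decoded arrow::Table popped off the channel.
struct FakeTable { std::string payload; };

// Build "cells" in the order the tables arrive: the i-th table becomes the
// cell for cids[i], mirroring the assertion in get_cells that the number of
// tables must equal the number of requested cids.
std::vector<std::pair<cid_t, std::string>>
build_cells(std::queue<FakeTable>& channel, const std::vector<cid_t>& cids) {
    std::vector<std::pair<cid_t, std::string>> cells;
    size_t cid_idx = 0;
    while (!channel.empty()) {
        FakeTable table = std::move(channel.front());
        channel.pop();
        // One table -> one cell; running past cids.size() throws, like the
        // AssertInfo in the real code.
        cells.emplace_back(cids.at(cid_idx++), std::move(table.payload));
    }
    return cells;
}
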
void
GroupChunkTranslator::load_column_group_in_memory() {
    std::vector<size_t> row_counts(field_id_list_.size(), 0);
    std::shared_ptr<milvus::ArrowDataWrapper> r;
    std::vector<std::string> files;
    std::vector<size_t> file_offsets;
    while (column_group_info_.arrow_reader_channel->pop(r)) {
        for (const auto& table : r->arrow_tables) {
            process_batch(table, files, file_offsets, row_counts);
        }
    }
}

void
GroupChunkTranslator::load_column_group_in_mmap() {
    std::vector<std::string> files;
    std::vector<size_t> file_offsets;
    std::vector<size_t> row_counts;

    // Initialize files and offsets
    for (size_t i = 0; i < field_id_list_.size(); ++i) {
        auto field_id = field_id_list_.Get(i);
        auto filepath =
            std::filesystem::path(column_group_info_.mmap_dir_path) /
            std::to_string(segment_id_) / std::to_string(field_id);
        auto dir = filepath.parent_path();
        std::filesystem::create_directories(dir);
        files.push_back(filepath.string());
        file_offsets.push_back(0);
        row_counts.push_back(0);
    }

    std::shared_ptr<milvus::ArrowDataWrapper> r;
    while (column_group_info_.arrow_reader_channel->pop(r)) {
        for (const auto& table : r->arrow_tables) {
            process_batch(table, files, file_offsets, row_counts);
        }
    }
    for (size_t i = 0; i < files.size(); ++i) {
        auto ok = unlink(files[i].c_str());
        AssertInfo(ok == 0,
                   fmt::format("failed to unlink mmap data file {}, err: {}",
                               files[i].c_str(),
                               strerror(errno)));
    }
}

void
GroupChunkTranslator::process_batch(const std::shared_ptr<arrow::Table>& table,
                                    const std::vector<std::string>& files,
                                    std::vector<size_t>& file_offsets,
                                    std::vector<size_t>& row_counts) {
std::unique_ptr<milvus::GroupChunk>
GroupChunkTranslator::load_group_chunk(
    const std::shared_ptr<arrow::Table>& table,
    const milvus::cachinglayer::cid_t cid) {
    // Create chunks for each field in this batch
    std::unordered_map<FieldId, std::shared_ptr<Chunk>> chunks;
    // Iterate through field_id_list to get field_id and create chunk
@@ -263,25 +211,35 @@ GroupChunkTranslator::process_batch(const std::shared_ptr<arrow::Table>& table,
            chunk = create_chunk(field_meta, dim, array_vec);
        } else {
            // Mmap mode
            int flags = O_RDWR;
            if (file_offsets[i] == 0) {
                // First write to this file, create and truncate
                flags |= O_CREAT | O_TRUNC;
            }
            auto file = File::Open(files[i], flags);
            // should seek to the file offset before writing
            file.Seek(file_offsets[i], SEEK_SET);
            chunk =
                create_chunk(field_meta, dim, file, file_offsets[i], array_vec);
            file_offsets[i] += chunk->Size();
            auto filepath =
                std::filesystem::path(column_group_info_.mmap_dir_path) /
                std::to_string(segment_id_) / std::to_string(field_id) /
                std::to_string(cid);

            LOG_INFO(
                "storage v2 segment {} mmaping field {} chunk {} to path {}",
                segment_id_,
                field_id,
                cid,
                filepath.string());

            std::filesystem::create_directories(filepath.parent_path());

            auto file =
                File::Open(filepath.string(), O_CREAT | O_TRUNC | O_RDWR);
            auto chunk = create_chunk(field_meta, dim, file, 0, array_vec);
            auto ok = unlink(filepath.c_str());
            AssertInfo(
                ok == 0,
                fmt::format(
                    "storage v2 failed to unlink mmap data file {}, err: {}",
                    filepath.c_str(),
                    strerror(errno)));
        }

        row_counts[i] += chunk->RowNums();
        chunks[fid] = std::move(chunk);
    }
    // Create GroupChunk from chunks and store in results
    auto group_chunk = std::make_unique<milvus::GroupChunk>(chunks);
    group_chunks_.emplace_back(group_chunk.release());
    return std::make_unique<milvus::GroupChunk>(chunks);
}

}  // namespace milvus::segcore::storagev2translator
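
Both the old and the new mmap paths unlink the backing file as soon as the chunk has been created over it. That is safe because an mmap'ed file's data remains reachable through the mapping until it is unmapped, even after its directory entry is gone, so the per-cell file does not linger on disk beyond the chunk's lifetime. Here is a minimal standalone sketch of that write-then-unlink pattern using plain POSIX calls and a hypothetical file name; it does not use the translator's File or create_chunk helpers.

#include <cassert>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main() {
    const char* path = "/tmp/per_cell_chunk_demo";  // hypothetical per-cell file
    const char data[] = "chunk payload";

    // Write the cell's payload into its own file, starting at offset 0.
    int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0600);
    assert(fd >= 0);
    ssize_t written = write(fd, data, sizeof(data));
    assert(written == static_cast<ssize_t>(sizeof(data)));

    // Map the file; the mapping, not the directory entry, keeps the data alive.
    void* addr = mmap(nullptr, sizeof(data), PROT_READ, MAP_SHARED, fd, 0);
    assert(addr != MAP_FAILED);

    // Remove the name immediately, as the translator does; the mapped pages
    // stay valid until munmap, after which the kernel reclaims the space.
    assert(unlink(path) == 0);
    close(fd);

    assert(std::strcmp(static_cast<const char*>(addr), "chunk payload") == 0);
    munmap(addr, sizeof(data));
    return 0;
}
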
@@ -72,17 +72,9 @@ class GroupChunkTranslator
    }

 private:
    void
    load_column_group_in_memory();

    void
    load_column_group_in_mmap();

    void
    process_batch(const std::shared_ptr<arrow::Table>& table,
                  const std::vector<std::string>& files,
                  std::vector<size_t>& file_offsets,
                  std::vector<size_t>& row_counts);
    std::unique_ptr<milvus::GroupChunk>
    load_group_chunk(const std::shared_ptr<arrow::Table>& table,
                     const milvus::cachinglayer::cid_t cid);

    int64_t segment_id_;
    std::string key_;

@@ -96,7 +88,6 @@ class GroupChunkTranslator
    ChunkedSegmentSealedImpl* chunked_segment_;
    std::unique_ptr<milvus::segcore::InsertRecord<true>> ir_;
    GroupCTMeta meta_;
    std::vector<milvus::GroupChunk*> group_chunks_;
    int64_t timestamp_offet_;
    bool use_mmap_;
};