From 16b785e149baf22c9eff6c2b71e5189c5a5320d3 Mon Sep 17 00:00:00 2001 From: Jiquan Long Date: Wed, 28 Feb 2024 16:45:00 +0800 Subject: [PATCH] enhance: optimize the memory usage and speed up loading variable length data (#30787) /kind improvement this removes the 1x copying while loading variable length data, also avoids constructing std::string, which could lead to memory fragmentation --------- Signed-off-by: yah01 Signed-off-by: longjiquan Co-authored-by: yah01 --- internal/core/src/common/Json.h | 10 ++++++ internal/core/src/mmap/Column.h | 28 ++++++++++------ .../core/src/segcore/SegmentSealedImpl.cpp | 32 +++++-------------- .../unittest/test_disk_file_manager_test.cpp | 1 - 4 files changed, 37 insertions(+), 34 deletions(-) diff --git a/internal/core/src/common/Json.h b/internal/core/src/common/Json.h index 640dc03724..8fb7b23ca0 100644 --- a/internal/core/src/common/Json.h +++ b/internal/core/src/common/Json.h @@ -157,11 +157,21 @@ class Json { return dom_doc().at_pointer(pointer).get_array(); } + size_t + size() const { + return data_.size(); + } + std::string_view data() const { return data_; } + const char* + c_str() const { + return data_.data(); + } + private: std::optional own_data_{}; // this could be empty, then the Json will be just s view on bytes diff --git a/internal/core/src/mmap/Column.h b/internal/core/src/mmap/Column.h index fd965c35a7..ce7c085b63 100644 --- a/internal/core/src/mmap/Column.h +++ b/internal/core/src/mmap/Column.h @@ -209,10 +209,13 @@ class ColumnBase { if (data_ != nullptr) { std::memcpy(data, data_, size_); if (munmap(data_, cap_size_ + padding_)) { + auto err = errno; + munmap(data, new_size + padding_); + AssertInfo( false, "failed to unmap while expanding: {}, old_map_size={}", - strerror(errno), + strerror(err), cap_size_ + padding_); } } @@ -307,10 +310,14 @@ class VariableColumn : public ColumnBase { } void - Append(const char* data, size_t size) { - indices_.emplace_back(size_); - size_ += size; - load_buf_.emplace(data, size); + Append(FieldDataPtr chunk) { + for (auto i = 0; i < chunk->get_num_rows(); i++) { + auto data = static_cast(chunk->RawValue(i)); + + indices_.emplace_back(size_); + size_ += data->size(); + } + load_buf_.emplace(std::move(chunk)); } void @@ -328,11 +335,14 @@ class VariableColumn : public ColumnBase { Expand(total_size); while (!load_buf_.empty()) { - auto data = std::move(load_buf_.front()); + auto chunk = std::move(load_buf_.front()); load_buf_.pop(); - std::copy_n(data.data(), data.length(), data_ + size_); - size_ += data.length(); + for (auto i = 0; i < chunk->get_num_rows(); i++) { + auto data = static_cast(chunk->RawValue(i)); + std::copy_n(data->c_str(), data->size(), data_ + size_); + size_ += data->size(); + } } } @@ -352,7 +362,7 @@ class VariableColumn : public ColumnBase { private: // loading states - std::queue load_buf_{}; + std::queue load_buf_{}; std::vector indices_{}; diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 8329af9bd6..cd0367cedf 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -390,18 +390,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) { num_rows, field_meta); FieldDataPtr field_data; while (data.channel->pop(field_data)) { - for (auto i = 0; i < field_data->get_num_rows(); i++) { - auto str = static_cast( - field_data->RawValue(i)); - auto str_size = str->size(); - var_column->Append(str->data(), str_size); - field_data_size += str_size; - - // we stores the offset for each string, so there is a additional uint64_t for each string - stats_.mem_size += str_size + sizeof(uint64_t); - } + var_column->Append(std::move(field_data)); } var_column->Seal(); + field_data_size = var_column->ByteSize(); + stats_.mem_size += var_column->ByteSize(); LoadStringSkipIndex(field_id, 0, *var_column); column = std::move(var_column); break; @@ -412,22 +405,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) { num_rows, field_meta); FieldDataPtr field_data; while (data.channel->pop(field_data)) { - for (auto i = 0; i < field_data->get_num_rows(); i++) { - auto padded_string = - static_cast( - field_data->RawValue(i)) - ->data(); - auto padded_string_size = padded_string.size(); - var_column->Append(padded_string.data(), - padded_string_size); - field_data_size += padded_string_size; - - // we stores the offset for each JSON, so there is a additional uint64_t for each JSON - stats_.mem_size += - padded_string_size + sizeof(uint64_t); - } + var_column->Append(std::move(field_data)); } var_column->Seal(); + stats_.mem_size += var_column->ByteSize(); + field_data_size = var_column->ByteSize(); column = std::move(var_column); break; } @@ -443,6 +425,8 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) { var_column->Append(*array); // we stores the offset for each array element, so there is a additional uint64_t for each array element + field_data_size = + array->byte_size() + sizeof(uint64_t); stats_.mem_size += array->byte_size() + sizeof(uint64_t); } diff --git a/internal/core/unittest/test_disk_file_manager_test.cpp b/internal/core/unittest/test_disk_file_manager_test.cpp index ea1777ae49..7fa7a4263e 100644 --- a/internal/core/unittest/test_disk_file_manager_test.cpp +++ b/internal/core/unittest/test_disk_file_manager_test.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include