enhance: optimize memory usage and speed up loading of variable-length data (#30787)

/kind improvement This removes one extra copy (1x the data size) made while loading variable-length data, and also avoids constructing std::string objects, which could lead to memory fragmentation. --------- Signed-off-by: yah01 <yah2er0ne@outlook.com> Signed-off-by: longjiquan <jiquan.long@zilliz.com> Co-authored-by: yah01 <yah2er0ne@outlook.com>
2024-12-02 03:48:37 +08:00 · 2024-02-28 16:45:00 +08:00 · 2024-02-28 16:45:00 +08:00 · 16b785e149
commit 16b785e149
parent 4459078e0b
4 changed files with 37 additions and 34 deletions
--- a/internal/core/src/common/Json.h
+++ b/internal/core/src/common/Json.h
@ -157,11 +157,21 @@ class Json {
        return dom_doc().at_pointer(pointer).get_array();
    }

+    size_t  // length of data_, the same buffer that data() exposes
+    size() const {
+        return data_.size();  // forwards directly to data_.size()
+    }
+
    // Returns data_ as a std::string_view over the JSON bytes.
    std::string_view
    data() const {
        return data_;
    }

+    const char*  // raw pointer to the first byte of data_
+    c_str() const {
+        return data_.data();  // NOTE(review): despite the name, null termination is not established here; pair with size() — confirm
+    }
+
 private:
    std::optional<simdjson::padded_string>
        own_data_{};  // this could be empty, then the Json will be just a view on bytes
--- a/internal/core/src/mmap/Column.h
+++ b/internal/core/src/mmap/Column.h
@ -209,10 +209,13 @@ class ColumnBase {
        if (data_ != nullptr) {
            std::memcpy(data, data_, size_);
            if (munmap(data_, cap_size_ + padding_)) {
+                auto err = errno;
+                munmap(data, new_size + padding_);
+
                AssertInfo(
                    false,
                    "failed to unmap while expanding: {}, old_map_size={}",
-                    strerror(errno),
+                    strerror(err),
                    cap_size_ + padding_);
            }
        }
@ -307,10 +310,14 @@ class VariableColumn : public ColumnBase {
    }

    // Records a start offset for every row of a chunk and queues the chunk in load_buf_;
    // row bytes are not copied in this method.
    void
-    Append(const char* data, size_t size) {
+    Append(FieldDataPtr chunk) {
+        for (auto i = 0; i < chunk->get_num_rows(); i++) {
+            auto data = static_cast<const T*>(chunk->RawValue(i));  // row i interpreted as the element type T
+
            indices_.emplace_back(size_);
-        size_ += size;
-        load_buf_.emplace(data, size);
+            size_ += data->size();  // running byte total; it is also the offset recorded for the next row
+        }
+        load_buf_.emplace(std::move(chunk));  // store the chunk handle itself rather than a copy of its bytes
    }

    void
@ -328,11 +335,14 @@ class VariableColumn : public ColumnBase {
            Expand(total_size);

            while (!load_buf_.empty()) {
-                auto data = std::move(load_buf_.front());
+                auto chunk = std::move(load_buf_.front());
                load_buf_.pop();

-                std::copy_n(data.data(), data.length(), data_ + size_);
-                size_ += data.length();
+                for (auto i = 0; i < chunk->get_num_rows(); i++) {
+                    auto data = static_cast<const T*>(chunk->RawValue(i));
+                    std::copy_n(data->c_str(), data->size(), data_ + size_);
+                    size_ += data->size();
+                }
            }
        }

@ -352,7 +362,7 @@ class VariableColumn : public ColumnBase {

 private:
    // loading states
-    std::queue<std::string> load_buf_{};
+    std::queue<FieldDataPtr> load_buf_{};

    std::vector<uint64_t> indices_{};

--- a/internal/core/src/segcore/SegmentSealedImpl.cpp
+++ b/internal/core/src/segcore/SegmentSealedImpl.cpp
@ -390,18 +390,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
                            num_rows, field_meta);
                    FieldDataPtr field_data;
                    while (data.channel->pop(field_data)) {
-                        for (auto i = 0; i < field_data->get_num_rows(); i++) {
-                            auto str = static_cast<const std::string*>(
-                                field_data->RawValue(i));
-                            auto str_size = str->size();
-                            var_column->Append(str->data(), str_size);
-                            field_data_size += str_size;
-
-                            // we stores the offset for each string, so there is a additional uint64_t for each string
-                            stats_.mem_size += str_size + sizeof(uint64_t);
-                        }
+                        var_column->Append(std::move(field_data));
                    }
                    var_column->Seal();
+                    field_data_size = var_column->ByteSize();
+                    stats_.mem_size += var_column->ByteSize();
                    LoadStringSkipIndex(field_id, 0, *var_column);
                    column = std::move(var_column);
                    break;
@ -412,22 +405,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
                            num_rows, field_meta);
                    FieldDataPtr field_data;
                    while (data.channel->pop(field_data)) {
-                        for (auto i = 0; i < field_data->get_num_rows(); i++) {
-                            auto padded_string =
-                                static_cast<const milvus::Json*>(
-                                    field_data->RawValue(i))
-                                    ->data();
-                            auto padded_string_size = padded_string.size();
-                            var_column->Append(padded_string.data(),
-                                               padded_string_size);
-                            field_data_size += padded_string_size;
-
-                            // we stores the offset for each JSON, so there is a additional uint64_t for each JSON
-                            stats_.mem_size +=
-                                padded_string_size + sizeof(uint64_t);
-                        }
+                        var_column->Append(std::move(field_data));
                    }
                    var_column->Seal();
+                    stats_.mem_size += var_column->ByteSize();
+                    field_data_size = var_column->ByteSize();
                    column = std::move(var_column);
                    break;
                }
@ -443,6 +425,8 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
                            var_column->Append(*array);

                            // we store the offset for each array element, so there is an additional uint64_t for each array element
+                            field_data_size +=  // accumulate per element; plain '=' kept only the last element's size
+                                array->byte_size() + sizeof(uint64_t);
                            stats_.mem_size +=
                                array->byte_size() + sizeof(uint64_t);
                        }
--- a/internal/core/unittest/test_disk_file_manager_test.cpp
+++ b/internal/core/unittest/test_disk_file_manager_test.cpp
@ -19,7 +19,6 @@
 #include <gtest/gtest.h>
 #include <cstdint>
 #include <memory>
-#include <memory_resource>
 #include <string>
 #include <fstream>
 #include <vector>