enhance: optimize the memory usage and speed up loading variable length data (#30787)

/kind improvement
This removes one copy while loading variable-length data and also
avoids constructing std::string objects, which could lead to memory
fragmentation.

---------

Signed-off-by: yah01 <yah2er0ne@outlook.com>
Signed-off-by: longjiquan <jiquan.long@zilliz.com>
Co-authored-by: yah01 <yah2er0ne@outlook.com>
This commit is contained in:
Jiquan Long 2024-02-28 16:45:00 +08:00 committed by GitHub
parent 4459078e0b
commit 16b785e149
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 37 additions and 34 deletions

View File

@ -157,11 +157,21 @@ class Json {
return dom_doc().at_pointer(pointer).get_array();
}
// Returns the size reported by the underlying data_ member.
// NOTE(review): presumably the byte length of the serialized JSON text;
// data_ is declared outside this view — confirm against its type.
size_t
size() const {
return data_.size();
}
// Returns the contents of data_ as a std::string_view.
// The result is a non-owning view, so it must not be used after the
// storage behind data_ goes away.
std::string_view
data() const {
return data_;
}
// Returns the raw pointer obtained from data_.data().
// NOTE(review): despite the name, nothing visible here guarantees the
// buffer is null-terminated; pair it with size() unless the type of
// data_ is confirmed to provide a terminator.
const char*
c_str() const {
return data_.data();
}
private:
std::optional<simdjson::padded_string>
own_data_{}; // this could be empty, in which case the Json is just a view on bytes

View File

@ -209,10 +209,13 @@ class ColumnBase {
if (data_ != nullptr) {
std::memcpy(data, data_, size_);
if (munmap(data_, cap_size_ + padding_)) {
auto err = errno;
munmap(data, new_size + padding_);
AssertInfo(
false,
"failed to unmap while expanding: {}, old_map_size={}",
strerror(errno),
strerror(err),
cap_size_ + padding_);
}
}
@ -307,10 +310,14 @@ class VariableColumn : public ColumnBase {
}
void
Append(const char* data, size_t size) {
Append(FieldDataPtr chunk) {
for (auto i = 0; i < chunk->get_num_rows(); i++) {
auto data = static_cast<const T*>(chunk->RawValue(i));
indices_.emplace_back(size_);
size_ += size;
load_buf_.emplace(data, size);
size_ += data->size();
}
load_buf_.emplace(std::move(chunk));
}
void
@ -328,11 +335,14 @@ class VariableColumn : public ColumnBase {
Expand(total_size);
while (!load_buf_.empty()) {
auto data = std::move(load_buf_.front());
auto chunk = std::move(load_buf_.front());
load_buf_.pop();
std::copy_n(data.data(), data.length(), data_ + size_);
size_ += data.length();
for (auto i = 0; i < chunk->get_num_rows(); i++) {
auto data = static_cast<const T*>(chunk->RawValue(i));
std::copy_n(data->c_str(), data->size(), data_ + size_);
size_ += data->size();
}
}
}
@ -352,7 +362,7 @@ class VariableColumn : public ColumnBase {
private:
// loading states
std::queue<std::string> load_buf_{};
std::queue<FieldDataPtr> load_buf_{};
std::vector<uint64_t> indices_{};

View File

@ -390,18 +390,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
num_rows, field_meta);
FieldDataPtr field_data;
while (data.channel->pop(field_data)) {
for (auto i = 0; i < field_data->get_num_rows(); i++) {
auto str = static_cast<const std::string*>(
field_data->RawValue(i));
auto str_size = str->size();
var_column->Append(str->data(), str_size);
field_data_size += str_size;
// we store the offset for each string, so there is an additional uint64_t for each string
stats_.mem_size += str_size + sizeof(uint64_t);
}
var_column->Append(std::move(field_data));
}
var_column->Seal();
field_data_size = var_column->ByteSize();
stats_.mem_size += var_column->ByteSize();
LoadStringSkipIndex(field_id, 0, *var_column);
column = std::move(var_column);
break;
@ -412,22 +405,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
num_rows, field_meta);
FieldDataPtr field_data;
while (data.channel->pop(field_data)) {
for (auto i = 0; i < field_data->get_num_rows(); i++) {
auto padded_string =
static_cast<const milvus::Json*>(
field_data->RawValue(i))
->data();
auto padded_string_size = padded_string.size();
var_column->Append(padded_string.data(),
padded_string_size);
field_data_size += padded_string_size;
// we store the offset for each JSON, so there is an additional uint64_t for each JSON
stats_.mem_size +=
padded_string_size + sizeof(uint64_t);
}
var_column->Append(std::move(field_data));
}
var_column->Seal();
stats_.mem_size += var_column->ByteSize();
field_data_size = var_column->ByteSize();
column = std::move(var_column);
break;
}
@ -443,6 +425,8 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
var_column->Append(*array);
// we store the offset for each array element, so there is an additional uint64_t for each array element
field_data_size =
array->byte_size() + sizeof(uint64_t);
stats_.mem_size +=
array->byte_size() + sizeof(uint64_t);
}

View File

@ -19,7 +19,6 @@
#include <gtest/gtest.h>
#include <cstdint>
#include <memory>
#include <memory_resource>
#include <string>
#include <fstream>
#include <vector>