mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-02 03:48:37 +08:00
enhance: optimize the memory usage and speed up loading variable length data (#30787)
/kind improvement This removes one extra copy while loading variable-length data and avoids constructing std::string objects, which could lead to memory fragmentation. --------- Signed-off-by: yah01 <yah2er0ne@outlook.com> Signed-off-by: longjiquan <jiquan.long@zilliz.com> Co-authored-by: yah01 <yah2er0ne@outlook.com>
This commit is contained in:
parent
4459078e0b
commit
16b785e149
@ -157,11 +157,21 @@ class Json {
|
||||
return dom_doc().at_pointer(pointer).get_array();
|
||||
}
|
||||
|
||||
// Returns data_.size(), i.e. the length of the buffer this Json exposes
// through data(). NOTE(review): presumably a byte count -- the declaration
// of data_ is not visible in this hunk; confirm.
size_t
|
||||
size() const {
|
||||
return data_.size();
|
||||
}
|
||||
|
||||
// Returns data_ as a std::string_view. The result is a non-owning view, so
// it must not be used after the storage behind data_ goes away.
std::string_view
|
||||
data() const {
|
||||
return data_;
|
||||
}
|
||||
|
||||
// Returns the raw pointer data_.data() to the first byte of the buffer.
// NOTE(review): despite the name, nothing visible here guarantees the bytes
// are null-terminated -- pair the pointer with size() unless confirmed.
const char*
|
||||
c_str() const {
|
||||
return data_.data();
|
||||
}
|
||||
|
||||
private:
|
||||
std::optional<simdjson::padded_string>
|
||||
own_data_{}; // this could be empty; in that case the Json is just a view on bytes
|
||||
|
@ -209,10 +209,13 @@ class ColumnBase {
|
||||
if (data_ != nullptr) {
|
||||
std::memcpy(data, data_, size_);
|
||||
if (munmap(data_, cap_size_ + padding_)) {
|
||||
auto err = errno;
|
||||
munmap(data, new_size + padding_);
|
||||
|
||||
AssertInfo(
|
||||
false,
|
||||
"failed to unmap while expanding: {}, old_map_size={}",
|
||||
strerror(errno),
|
||||
strerror(err),
|
||||
cap_size_ + padding_);
|
||||
}
|
||||
}
|
||||
@ -307,10 +310,14 @@ class VariableColumn : public ColumnBase {
|
||||
}
|
||||
|
||||
void
|
||||
Append(const char* data, size_t size) {
|
||||
Append(FieldDataPtr chunk) {
|
||||
for (auto i = 0; i < chunk->get_num_rows(); i++) {
|
||||
auto data = static_cast<const T*>(chunk->RawValue(i));
|
||||
|
||||
indices_.emplace_back(size_);
|
||||
size_ += size;
|
||||
load_buf_.emplace(data, size);
|
||||
size_ += data->size();
|
||||
}
|
||||
load_buf_.emplace(std::move(chunk));
|
||||
}
|
||||
|
||||
void
|
||||
@ -328,11 +335,14 @@ class VariableColumn : public ColumnBase {
|
||||
Expand(total_size);
|
||||
|
||||
while (!load_buf_.empty()) {
|
||||
auto data = std::move(load_buf_.front());
|
||||
auto chunk = std::move(load_buf_.front());
|
||||
load_buf_.pop();
|
||||
|
||||
std::copy_n(data.data(), data.length(), data_ + size_);
|
||||
size_ += data.length();
|
||||
for (auto i = 0; i < chunk->get_num_rows(); i++) {
|
||||
auto data = static_cast<const T*>(chunk->RawValue(i));
|
||||
std::copy_n(data->c_str(), data->size(), data_ + size_);
|
||||
size_ += data->size();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -352,7 +362,7 @@ class VariableColumn : public ColumnBase {
|
||||
|
||||
private:
|
||||
// loading states
|
||||
std::queue<std::string> load_buf_{};
|
||||
std::queue<FieldDataPtr> load_buf_{};
|
||||
|
||||
std::vector<uint64_t> indices_{};
|
||||
|
||||
|
@ -390,18 +390,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
|
||||
num_rows, field_meta);
|
||||
FieldDataPtr field_data;
|
||||
while (data.channel->pop(field_data)) {
|
||||
for (auto i = 0; i < field_data->get_num_rows(); i++) {
|
||||
auto str = static_cast<const std::string*>(
|
||||
field_data->RawValue(i));
|
||||
auto str_size = str->size();
|
||||
var_column->Append(str->data(), str_size);
|
||||
field_data_size += str_size;
|
||||
|
||||
// we store the offset for each string, so there is an additional uint64_t for each string
|
||||
stats_.mem_size += str_size + sizeof(uint64_t);
|
||||
}
|
||||
var_column->Append(std::move(field_data));
|
||||
}
|
||||
var_column->Seal();
|
||||
field_data_size = var_column->ByteSize();
|
||||
stats_.mem_size += var_column->ByteSize();
|
||||
LoadStringSkipIndex(field_id, 0, *var_column);
|
||||
column = std::move(var_column);
|
||||
break;
|
||||
@ -412,22 +405,11 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
|
||||
num_rows, field_meta);
|
||||
FieldDataPtr field_data;
|
||||
while (data.channel->pop(field_data)) {
|
||||
for (auto i = 0; i < field_data->get_num_rows(); i++) {
|
||||
auto padded_string =
|
||||
static_cast<const milvus::Json*>(
|
||||
field_data->RawValue(i))
|
||||
->data();
|
||||
auto padded_string_size = padded_string.size();
|
||||
var_column->Append(padded_string.data(),
|
||||
padded_string_size);
|
||||
field_data_size += padded_string_size;
|
||||
|
||||
// we store the offset for each JSON, so there is an additional uint64_t for each JSON
|
||||
stats_.mem_size +=
|
||||
padded_string_size + sizeof(uint64_t);
|
||||
}
|
||||
var_column->Append(std::move(field_data));
|
||||
}
|
||||
var_column->Seal();
|
||||
stats_.mem_size += var_column->ByteSize();
|
||||
field_data_size = var_column->ByteSize();
|
||||
column = std::move(var_column);
|
||||
break;
|
||||
}
|
||||
@ -443,6 +425,8 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
|
||||
var_column->Append(*array);
|
||||
|
||||
// we store the offset for each array element, so there is an additional uint64_t for each array element
|
||||
field_data_size =
|
||||
array->byte_size() + sizeof(uint64_t);
|
||||
stats_.mem_size +=
|
||||
array->byte_size() + sizeof(uint64_t);
|
||||
}
|
||||
|
@ -19,7 +19,6 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <memory_resource>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
|
Loading…
Reference in New Issue
Block a user