mirror of
https://gitee.com/milvus-io/milvus.git
synced 2024-12-02 03:48:37 +08:00
enhance: Refine code for get_deleted_bitmap (#36819)
issue: #33744 Check whether the PK is truly sorted in the debug mode. --------- Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
This commit is contained in:
parent
4926021c02
commit
86687bd8ed
@ -12,3 +12,11 @@
|
|||||||
|
|
||||||
add_source_at_current_directory_recursively()
|
add_source_at_current_directory_recursively()
|
||||||
add_library(milvus_segcore OBJECT ${SOURCE_FILES})
|
add_library(milvus_segcore OBJECT ${SOURCE_FILES})
|
||||||
|
|
||||||
|
# Enable the primary-key ordering check only for Debug builds.
#
# The C++ sources gate the check with `#ifdef CHECK_SORTED`, which is true for
# ANY definition of the macro, including `-DCHECK_SORTED=OFF`. The macro must
# therefore be left undefined when the check is disabled, instead of being
# defined with an OFF value. The CHECK_SORTED CMake variable is still set in
# both branches so that any other CMake logic reading it keeps working.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    set(CHECK_SORTED ON)
    add_definitions(-DCHECK_SORTED)
else()
    set(CHECK_SORTED OFF)
endif()
|
@ -54,6 +54,41 @@
|
|||||||
|
|
||||||
namespace milvus::segcore {
|
namespace milvus::segcore {
|
||||||
|
|
||||||
|
// ASSERT_COLUMN_ORDERED(data_type, column)
//
// Debug-only sanity check that a primary-key column is stored in
// non-decreasing order.
//
//   data_type  DataType of the primary key; INT64 and VARCHAR are handled,
//              any other value raises PanicInfo(DataTypeInvalid, ...).
//   column     shared pointer to the column; it is down-cast to
//              SingleChunkColumn (INT64) or
//              SingleChunkVariableColumn<std::string> (VARCHAR).
//
// The whole check is built on assert(), so it is compiled in only when
// CHECK_SORTED is defined AND assertions are active (NDEBUG not defined).
// Testing NDEBUG as well keeps the check out of builds where CHECK_SORTED is
// defined but assert() expands to nothing (e.g. `-DCHECK_SORTED=OFF`, which
// still satisfies a plain `#ifdef`).
//
// The body is wrapped in do { ... } while (0) so the macro behaves as a single
// statement and can be followed by ';' inside unbraced if/else branches.
#if defined(CHECK_SORTED) && !defined(NDEBUG)
#define ASSERT_COLUMN_ORDERED(data_type, column)                               \
    do {                                                                       \
        switch (data_type) {                                                   \
            case DataType::INT64: {                                            \
                auto col =                                                     \
                    std::dynamic_pointer_cast<SingleChunkColumn>(column);      \
                assert(col && "INT64 pk column is not a SingleChunkColumn");   \
                auto pks = reinterpret_cast<const int64_t*>(col->Data());      \
                const auto rows = static_cast<size_t>(col->NumRows());         \
                for (size_t i = 1; i < rows; ++i) {                            \
                    assert(pks[i - 1] <= pks[i] &&                             \
                           "INT64 Column is not ordered!");                    \
                }                                                              \
                break;                                                         \
            }                                                                  \
            case DataType::VARCHAR: {                                          \
                auto col = std::dynamic_pointer_cast<                          \
                    SingleChunkVariableColumn<std::string>>(column);           \
                assert(col &&                                                  \
                       "VARCHAR pk column is not a SingleChunkVariableColumn"); \
                auto pks = col->Views();                                       \
                const auto rows = static_cast<size_t>(col->NumRows());         \
                for (size_t i = 1; i < rows; ++i) {                            \
                    assert(pks[i - 1] <= pks[i] &&                             \
                           "VARCHAR Column is not ordered!");                  \
                }                                                              \
                break;                                                         \
            }                                                                  \
            default: {                                                         \
                PanicInfo(DataTypeInvalid,                                     \
                          fmt::format("unsupported primary key data type {}",  \
                                      data_type));                             \
            }                                                                  \
        }                                                                      \
    } while (0)
#else
#define ASSERT_COLUMN_ORDERED(data_type, column) ((void)0)
#endif
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
set_bit(BitsetType& bitset, FieldId field_id, bool flag = true) {
|
set_bit(BitsetType& bitset, FieldId field_id, bool flag = true) {
|
||||||
auto pos = field_id.get() - START_USER_FIELDID;
|
auto pos = field_id.get() - START_USER_FIELDID;
|
||||||
@ -458,11 +493,15 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
|
|||||||
// set pks to offset
|
// set pks to offset
|
||||||
// if the segments are already sorted by pk, there is no need to build a pk offset index.
|
// if the segments are already sorted by pk, there is no need to build a pk offset index.
|
||||||
// it can directly perform a binary search on the pk column.
|
// it can directly perform a binary search on the pk column.
|
||||||
if (schema_->get_primary_field_id() == field_id && !is_sorted_by_pk_) {
|
if (schema_->get_primary_field_id() == field_id) {
|
||||||
AssertInfo(field_id.get() != -1, "Primary key is -1");
|
if (!is_sorted_by_pk_) {
|
||||||
AssertInfo(insert_record_.empty_pks(), "already exists");
|
AssertInfo(field_id.get() != -1, "Primary key is -1");
|
||||||
insert_record_.insert_pks(data_type, column);
|
AssertInfo(insert_record_.empty_pks(), "already exists");
|
||||||
insert_record_.seal_pks();
|
insert_record_.insert_pks(data_type, column);
|
||||||
|
insert_record_.seal_pks();
|
||||||
|
} else {
|
||||||
|
ASSERT_COLUMN_ORDERED(data_type, column);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool use_temp_index = false;
|
bool use_temp_index = false;
|
||||||
@ -889,74 +928,6 @@ SegmentSealedImpl::search_pk(const PkType& pk, int64_t insert_barrier) const {
|
|||||||
return pk_offsets;
|
return pk_offsets;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<DeletedRecord::TmpBitmap>
|
|
||||||
SegmentSealedImpl::get_deleted_bitmap_s(int64_t del_barrier,
|
|
||||||
int64_t insert_barrier,
|
|
||||||
DeletedRecord& delete_record,
|
|
||||||
Timestamp query_timestamp) const {
|
|
||||||
// if insert_barrier and del_barrier have not changed, use cache data directly
|
|
||||||
bool hit_cache = false;
|
|
||||||
int64_t old_del_barrier = 0;
|
|
||||||
auto current = delete_record.clone_lru_entry(
|
|
||||||
insert_barrier, del_barrier, old_del_barrier, hit_cache);
|
|
||||||
if (hit_cache) {
|
|
||||||
return current;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto bitmap = current->bitmap_ptr;
|
|
||||||
|
|
||||||
int64_t start, end;
|
|
||||||
if (del_barrier < old_del_barrier) {
|
|
||||||
// in this case, ts of delete record[current_del_barrier : old_del_barrier] > query_timestamp
|
|
||||||
// so these deletion records do not take effect in query/search
|
|
||||||
// so bitmap corresponding to those pks in delete record[current_del_barrier:old_del_barrier] will be reset to 0
|
|
||||||
// for example, current_del_barrier = 2, query_time = 120, the bitmap will be reset to [0, 1, 1, 0, 0, 0, 0, 0]
|
|
||||||
start = del_barrier;
|
|
||||||
end = old_del_barrier;
|
|
||||||
} else {
|
|
||||||
// the cache is not enough, so update bitmap using new pks in delete record[old_del_barrier:current_del_barrier]
|
|
||||||
// for example, current_del_barrier = 4, query_time = 300, bitmap will be updated to [0, 1, 1, 0, 1, 1, 0, 0]
|
|
||||||
start = old_del_barrier;
|
|
||||||
end = del_barrier;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Avoid invalid calculations when there are a lot of repeated delete pks
|
|
||||||
std::unordered_map<PkType, Timestamp> delete_timestamps;
|
|
||||||
for (auto del_index = start; del_index < end; ++del_index) {
|
|
||||||
auto pk = delete_record.pks()[del_index];
|
|
||||||
auto timestamp = delete_record.timestamps()[del_index];
|
|
||||||
|
|
||||||
delete_timestamps[pk] = timestamp > delete_timestamps[pk]
|
|
||||||
? timestamp
|
|
||||||
: delete_timestamps[pk];
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto& [pk, timestamp] : delete_timestamps) {
|
|
||||||
auto segOffsets = search_pk(pk, insert_barrier);
|
|
||||||
for (auto offset : segOffsets) {
|
|
||||||
int64_t insert_row_offset = offset.get();
|
|
||||||
|
|
||||||
// The deletion record does not take effect in search/query,
|
|
||||||
// and reset bitmap to 0
|
|
||||||
if (timestamp > query_timestamp) {
|
|
||||||
bitmap->reset(insert_row_offset);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// Insert after delete with same pk, delete will not take effect on this insert record,
|
|
||||||
// and reset bitmap to 0
|
|
||||||
if (insert_record_.timestamps_[offset.get()] >= timestamp) {
|
|
||||||
bitmap->reset(insert_row_offset);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// insert data corresponding to the insert_row_offset will be ignored in search/query
|
|
||||||
bitmap->set(insert_row_offset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
delete_record.insert_lru_entry(current);
|
|
||||||
return current;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
SegmentSealedImpl::mask_with_delete(BitsetTypeView& bitset,
|
SegmentSealedImpl::mask_with_delete(BitsetTypeView& bitset,
|
||||||
int64_t ins_barrier,
|
int64_t ins_barrier,
|
||||||
@ -968,16 +939,16 @@ SegmentSealedImpl::mask_with_delete(BitsetTypeView& bitset,
|
|||||||
|
|
||||||
auto bitmap_holder = std::shared_ptr<DeletedRecord::TmpBitmap>();
|
auto bitmap_holder = std::shared_ptr<DeletedRecord::TmpBitmap>();
|
||||||
|
|
||||||
if (!is_sorted_by_pk_) {
|
auto search_fn = [this](const PkType& pk, int64_t barrier) {
|
||||||
bitmap_holder = get_deleted_bitmap(del_barrier,
|
return this->search_pk(pk, barrier);
|
||||||
ins_barrier,
|
};
|
||||||
deleted_record_,
|
bitmap_holder = get_deleted_bitmap(del_barrier,
|
||||||
insert_record_,
|
ins_barrier,
|
||||||
timestamp);
|
deleted_record_,
|
||||||
} else {
|
insert_record_,
|
||||||
bitmap_holder = get_deleted_bitmap_s(
|
timestamp,
|
||||||
del_barrier, ins_barrier, deleted_record_, timestamp);
|
is_sorted_by_pk_,
|
||||||
}
|
search_fn);
|
||||||
|
|
||||||
if (!bitmap_holder || !bitmap_holder->bitmap_ptr) {
|
if (!bitmap_holder || !bitmap_holder->bitmap_ptr) {
|
||||||
return;
|
return;
|
||||||
|
@ -121,12 +121,6 @@ class SegmentSealedImpl : public SegmentSealed {
|
|||||||
std::vector<SegOffset>
|
std::vector<SegOffset>
|
||||||
search_pk(const PkType& pk, int64_t insert_barrier) const;
|
search_pk(const PkType& pk, int64_t insert_barrier) const;
|
||||||
|
|
||||||
std::shared_ptr<DeletedRecord::TmpBitmap>
|
|
||||||
get_deleted_bitmap_s(int64_t del_barrier,
|
|
||||||
int64_t insert_barrier,
|
|
||||||
DeletedRecord& delete_record,
|
|
||||||
Timestamp query_timestamp) const;
|
|
||||||
|
|
||||||
std::unique_ptr<DataArray>
|
std::unique_ptr<DataArray>
|
||||||
get_vector(FieldId field_id,
|
get_vector(FieldId field_id,
|
||||||
const int64_t* ids,
|
const int64_t* ids,
|
||||||
|
@ -110,11 +110,15 @@ MergeDataArray(std::vector<MergeBase>& merge_bases,
|
|||||||
|
|
||||||
template <bool is_sealed>
|
template <bool is_sealed>
|
||||||
std::shared_ptr<DeletedRecord::TmpBitmap>
|
std::shared_ptr<DeletedRecord::TmpBitmap>
|
||||||
get_deleted_bitmap(int64_t del_barrier,
|
get_deleted_bitmap(
|
||||||
int64_t insert_barrier,
|
int64_t del_barrier,
|
||||||
DeletedRecord& delete_record,
|
int64_t insert_barrier,
|
||||||
const InsertRecord<is_sealed>& insert_record,
|
DeletedRecord& delete_record,
|
||||||
Timestamp query_timestamp) {
|
const InsertRecord<is_sealed>& insert_record,
|
||||||
|
Timestamp query_timestamp,
|
||||||
|
bool is_sorted_by_pk = false,
|
||||||
|
const std::function<std::vector<SegOffset>(const PkType&, int64_t)>&
|
||||||
|
search_fn = nullptr) {
|
||||||
// if insert_barrier and del_barrier have not changed, use cache data directly
|
// if insert_barrier and del_barrier have not changed, use cache data directly
|
||||||
bool hit_cache = false;
|
bool hit_cache = false;
|
||||||
int64_t old_del_barrier = 0;
|
int64_t old_del_barrier = 0;
|
||||||
@ -153,7 +157,9 @@ get_deleted_bitmap(int64_t del_barrier,
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (auto& [pk, timestamp] : delete_timestamps) {
|
for (auto& [pk, timestamp] : delete_timestamps) {
|
||||||
auto segOffsets = insert_record.search_pk(pk, insert_barrier);
|
auto segOffsets = is_sorted_by_pk
|
||||||
|
? search_fn(pk, insert_barrier)
|
||||||
|
: insert_record.search_pk(pk, insert_barrier);
|
||||||
for (auto offset : segOffsets) {
|
for (auto offset : segOffsets) {
|
||||||
int64_t insert_row_offset = offset.get();
|
int64_t insert_row_offset = offset.get();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user