enhance: Refine code for get_deleted_bitmap (#36819)

issue: #33744 

Check whether the PK column is truly sorted when building in debug mode.

---------

Signed-off-by: Cai Zhang <cai.zhang@zilliz.com>
This commit is contained in:
cai.zhang 2024-10-28 15:19:30 +08:00 committed by GitHub
parent 4926021c02
commit 86687bd8ed
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 74 additions and 95 deletions

View File

@ -12,3 +12,11 @@
add_source_at_current_directory_recursively() add_source_at_current_directory_recursively()
add_library(milvus_segcore OBJECT ${SOURCE_FILES}) add_library(milvus_segcore OBJECT ${SOURCE_FILES})
# Enable the primary-key ordering check only for Debug builds.
# The C++ sources gate the check with `#ifdef CHECK_SORTED`, which is true
# whenever the macro is defined, whatever its value. Passing
# -DCHECK_SORTED=OFF would therefore still enable the check, so the macro
# must be left undefined for every non-Debug build type.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
    add_definitions(-DCHECK_SORTED)
endif()

View File

@ -54,6 +54,41 @@
namespace milvus::segcore { namespace milvus::segcore {
#ifdef CHECK_SORTED
/*
 * ASSERT_COLUMN_ORDERED(data_type, column)
 *
 * Sanity check that a primary-key column is stored in non-decreasing order.
 * Supports INT64 keys (read through SingleChunkColumn::Data()) and VARCHAR
 * keys (read through SingleChunkVariableColumn<std::string>::Views()); any
 * other data type raises DataTypeInvalid via PanicInfo.
 *
 * The ordering itself is verified with assert(), so it only fires when
 * NDEBUG is not defined. The whole check is compiled in whenever
 * CHECK_SORTED is defined, regardless of the value it is defined to.
 *
 * The body is wrapped in do { ... } while (0) so the macro behaves as a
 * single statement after if/else. Comments inside the body must use the
 * block form because of the line continuations.
 */
#define ASSERT_COLUMN_ORDERED(data_type, column)                               \
    do {                                                                       \
        switch (data_type) {                                                   \
            case DataType::INT64: {                                            \
                auto col =                                                     \
                    std::dynamic_pointer_cast<SingleChunkColumn>((column));    \
                AssertInfo(col != nullptr,                                     \
                           "INT64 primary key column is not a "                \
                           "SingleChunkColumn");                               \
                auto pks = reinterpret_cast<const int64_t*>(col->Data());      \
                const auto num_rows = static_cast<int64_t>(col->NumRows());    \
                for (int64_t i = 1; i < num_rows; ++i) {                       \
                    assert(pks[i - 1] <= pks[i] &&                             \
                           "INT64 Column is not ordered!");                    \
                }                                                              \
                break;                                                         \
            }                                                                  \
            case DataType::VARCHAR: {                                          \
                auto col = std::dynamic_pointer_cast<                          \
                    SingleChunkVariableColumn<std::string>>((column));         \
                AssertInfo(col != nullptr,                                     \
                           "VARCHAR primary key column is not a "              \
                           "SingleChunkVariableColumn<std::string>");          \
                auto pks = col->Views();                                       \
                const auto num_rows = static_cast<int64_t>(col->NumRows());    \
                for (int64_t i = 1; i < num_rows; ++i) {                       \
                    assert(pks[i - 1] <= pks[i] &&                             \
                           "VARCHAR Column is not ordered!");                  \
                }                                                              \
                break;                                                         \
            }                                                                  \
            default: {                                                         \
                PanicInfo(                                                     \
                    DataTypeInvalid,                                           \
                    fmt::format("unsupported primary key data type {}",        \
                                (data_type)));                                 \
            }                                                                  \
        }                                                                      \
    } while (0)
#else
/* Ordering check disabled: expands to a no-op expression statement. */
#define ASSERT_COLUMN_ORDERED(data_type, column) ((void)0)
#endif
static inline void static inline void
set_bit(BitsetType& bitset, FieldId field_id, bool flag = true) { set_bit(BitsetType& bitset, FieldId field_id, bool flag = true) {
auto pos = field_id.get() - START_USER_FIELDID; auto pos = field_id.get() - START_USER_FIELDID;
@ -458,11 +493,15 @@ SegmentSealedImpl::LoadFieldData(FieldId field_id, FieldDataInfo& data) {
// set pks to offset // set pks to offset
// if the segments are already sorted by pk, there is no need to build a pk offset index. // if the segments are already sorted by pk, there is no need to build a pk offset index.
// it can directly perform a binary search on the pk column. // it can directly perform a binary search on the pk column.
if (schema_->get_primary_field_id() == field_id && !is_sorted_by_pk_) { if (schema_->get_primary_field_id() == field_id) {
AssertInfo(field_id.get() != -1, "Primary key is -1"); if (!is_sorted_by_pk_) {
AssertInfo(insert_record_.empty_pks(), "already exists"); AssertInfo(field_id.get() != -1, "Primary key is -1");
insert_record_.insert_pks(data_type, column); AssertInfo(insert_record_.empty_pks(), "already exists");
insert_record_.seal_pks(); insert_record_.insert_pks(data_type, column);
insert_record_.seal_pks();
} else {
ASSERT_COLUMN_ORDERED(data_type, column);
}
} }
bool use_temp_index = false; bool use_temp_index = false;
@ -889,74 +928,6 @@ SegmentSealedImpl::search_pk(const PkType& pk, int64_t insert_barrier) const {
return pk_offsets; return pk_offsets;
} }
// Produce the delete bitmap for this sealed segment, resolving each deleted
// primary key to row offsets through search_pk().
//
// del_barrier     - number of delete records to take into account
// insert_barrier  - number of inserted rows to take into account
// delete_record   - source of deleted pks/timestamps and owner of the LRU
//                   bitmap cache that is read and refreshed here
// query_timestamp - deletes newer than this are not applied
//
// Returns the cached entry untouched on a cache hit; otherwise returns the
// cloned entry after patching its bitmap and re-inserting it into the cache.
std::shared_ptr<DeletedRecord::TmpBitmap>
SegmentSealedImpl::get_deleted_bitmap_s(int64_t del_barrier,
                                        int64_t insert_barrier,
                                        DeletedRecord& delete_record,
                                        Timestamp query_timestamp) const {
    // Nothing to recompute when both barriers match the cached entry.
    bool cache_hit = false;
    int64_t cached_del_barrier = 0;
    auto entry = delete_record.clone_lru_entry(
        insert_barrier, del_barrier, cached_del_barrier, cache_hit);
    if (cache_hit) {
        return entry;
    }

    auto bits = entry->bitmap_ptr;

    // Only the delete records between the cached barrier and the requested
    // one need to be revisited:
    //  * requested < cached: those records are newer than the query and must
    //    be cleared again from the bitmap;
    //  * otherwise: the cache is behind and the additional records must be
    //    applied on top of it.
    const bool barrier_moved_back = del_barrier < cached_del_barrier;
    const int64_t first = barrier_moved_back ? del_barrier : cached_del_barrier;
    const int64_t last = barrier_moved_back ? cached_del_barrier : del_barrier;

    // Collapse repeated deletes of the same pk down to the newest timestamp
    // so that every pk is looked up only once.
    std::unordered_map<PkType, Timestamp> newest_delete_ts;
    for (int64_t idx = first; idx < last; ++idx) {
        auto pk = delete_record.pks()[idx];
        auto ts = delete_record.timestamps()[idx];
        auto& newest = newest_delete_ts[pk];
        if (ts > newest) {
            newest = ts;
        }
    }

    for (auto& [pk, delete_ts] : newest_delete_ts) {
        for (auto seg_offset : search_pk(pk, insert_barrier)) {
            const int64_t row = seg_offset.get();
            // The delete does not hide the row when it is newer than the
            // query, or when the row was inserted at or after the delete.
            const bool delete_ignored =
                delete_ts > query_timestamp ||
                insert_record_.timestamps_[seg_offset.get()] >= delete_ts;
            if (delete_ignored) {
                bits->reset(row);
            } else {
                // Row is hidden from search/query.
                bits->set(row);
            }
        }
    }

    delete_record.insert_lru_entry(entry);
    return entry;
}
void void
SegmentSealedImpl::mask_with_delete(BitsetTypeView& bitset, SegmentSealedImpl::mask_with_delete(BitsetTypeView& bitset,
int64_t ins_barrier, int64_t ins_barrier,
@ -968,16 +939,16 @@ SegmentSealedImpl::mask_with_delete(BitsetTypeView& bitset,
auto bitmap_holder = std::shared_ptr<DeletedRecord::TmpBitmap>(); auto bitmap_holder = std::shared_ptr<DeletedRecord::TmpBitmap>();
if (!is_sorted_by_pk_) { auto search_fn = [this](const PkType& pk, int64_t barrier) {
bitmap_holder = get_deleted_bitmap(del_barrier, return this->search_pk(pk, barrier);
ins_barrier, };
deleted_record_, bitmap_holder = get_deleted_bitmap(del_barrier,
insert_record_, ins_barrier,
timestamp); deleted_record_,
} else { insert_record_,
bitmap_holder = get_deleted_bitmap_s( timestamp,
del_barrier, ins_barrier, deleted_record_, timestamp); is_sorted_by_pk_,
} search_fn);
if (!bitmap_holder || !bitmap_holder->bitmap_ptr) { if (!bitmap_holder || !bitmap_holder->bitmap_ptr) {
return; return;

View File

@ -121,12 +121,6 @@ class SegmentSealedImpl : public SegmentSealed {
std::vector<SegOffset> std::vector<SegOffset>
search_pk(const PkType& pk, int64_t insert_barrier) const; search_pk(const PkType& pk, int64_t insert_barrier) const;
// Delete-bitmap builder that resolves deleted primary keys to row offsets
// via search_pk(); mask_with_delete() calls it when the segment is sorted
// by primary key.
// Returns the bitmap entry obtained from delete_record's LRU cache, updated
// for the given delete/insert barriers and query timestamp.
std::shared_ptr<DeletedRecord::TmpBitmap>
get_deleted_bitmap_s(int64_t del_barrier,
int64_t insert_barrier,
DeletedRecord& delete_record,
Timestamp query_timestamp) const;
std::unique_ptr<DataArray> std::unique_ptr<DataArray>
get_vector(FieldId field_id, get_vector(FieldId field_id,
const int64_t* ids, const int64_t* ids,

View File

@ -110,11 +110,15 @@ MergeDataArray(std::vector<MergeBase>& merge_bases,
template <bool is_sealed> template <bool is_sealed>
std::shared_ptr<DeletedRecord::TmpBitmap> std::shared_ptr<DeletedRecord::TmpBitmap>
get_deleted_bitmap(int64_t del_barrier, get_deleted_bitmap(
int64_t insert_barrier, int64_t del_barrier,
DeletedRecord& delete_record, int64_t insert_barrier,
const InsertRecord<is_sealed>& insert_record, DeletedRecord& delete_record,
Timestamp query_timestamp) { const InsertRecord<is_sealed>& insert_record,
Timestamp query_timestamp,
bool is_sorted_by_pk = false,
const std::function<std::vector<SegOffset>(const PkType&, int64_t)>&
search_fn = nullptr) {
// if insert_barrier and del_barrier have not changed, use cache data directly // if insert_barrier and del_barrier have not changed, use cache data directly
bool hit_cache = false; bool hit_cache = false;
int64_t old_del_barrier = 0; int64_t old_del_barrier = 0;
@ -153,7 +157,9 @@ get_deleted_bitmap(int64_t del_barrier,
} }
for (auto& [pk, timestamp] : delete_timestamps) { for (auto& [pk, timestamp] : delete_timestamps) {
auto segOffsets = insert_record.search_pk(pk, insert_barrier); auto segOffsets = is_sorted_by_pk
? search_fn(pk, insert_barrier)
: insert_record.search_pk(pk, insert_barrier);
for (auto offset : segOffsets) { for (auto offset : segOffsets) {
int64_t insert_row_offset = offset.get(); int64_t insert_row_offset = offset.get();