Delete only if primary keys exist (#25292)

Signed-off-by: yah01 <yang.cen@zilliz.com>
This commit is contained in:
yah01 2023-09-20 19:03:25 +08:00 committed by GitHub
parent 16b35e07b3
commit 93e2eb78c9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 82 additions and 5 deletions

View File

@ -42,6 +42,9 @@ class OffsetMap {
public: public:
virtual ~OffsetMap() = default; virtual ~OffsetMap() = default;
// Returns true when `pk` is present in this offset map, false otherwise.
// Pure membership test: unlike find(), it does not return the offsets.
virtual bool
contain(const PkType& pk) const = 0;
virtual std::vector<int64_t> virtual std::vector<int64_t>
find(const PkType& pk) const = 0; find(const PkType& pk) const = 0;
@ -65,6 +68,11 @@ class OffsetMap {
template <typename T> template <typename T>
class OffsetOrderedMap : public OffsetMap { class OffsetOrderedMap : public OffsetMap {
public: public:
// Reports whether map_ holds an entry keyed by the `T` value carried in `pk`.
// The key is extracted with std::get<T>, exactly as find() does.
bool
contain(const PkType& pk) const override {
    const auto& key = std::get<T>(pk);
    const auto pos = map_.find(key);
    return pos != map_.end();
}
std::vector<int64_t> std::vector<int64_t>
find(const PkType& pk) const override { find(const PkType& pk) const override {
auto offset_vector = map_.find(std::get<T>(pk)); auto offset_vector = map_.find(std::get<T>(pk));
@ -138,6 +146,19 @@ class OffsetOrderedMap : public OffsetMap {
template <typename T> template <typename T>
class OffsetOrderedArray : public OffsetMap { class OffsetOrderedArray : public OffsetMap {
public: public:
// Reports whether array_ holds at least one entry whose key equals the `T`
// value carried in `pk`.
//
// std::lower_bound yields the first entry whose key is NOT LESS than the
// target, so a non-end iterator only proves that some key >= target exists.
// The key at that position must also compare equal, otherwise every absent
// key smaller than the largest stored key would be reported as present.
//
// Assumes array_ is ordered by key, as std::lower_bound requires.
// NOTE(review): find() calls check_search() before searching; this lookup
// does not - confirm whether the same precondition should be enforced here.
bool
contain(const PkType& pk) const override {
    const T& target = std::get<T>(pk);
    auto it =
        std::lower_bound(array_.begin(),
                         array_.end(),
                         target,
                         [](const std::pair<T, int64_t>& elem,
                            const T& value) { return elem.first < value; });
    return it != array_.end() && it->first == target;
}
std::vector<int64_t> std::vector<int64_t>
find(const PkType& pk) const override { find(const PkType& pk) const override {
check_search(); check_search();
@ -355,6 +376,11 @@ struct InsertRecord {
} }
} }
// Reports whether `pk` is known to the pk -> offset index (pk2offset_).
// NOTE(review): search_pk() takes a shared lock on shared_mutex_, while this
// lookup takes none - confirm that reading pk2offset_ unlocked is intended.
bool
contain(const PkType& pk) const {
    const bool present = pk2offset_->contain(pk);
    return present;
}
std::vector<SegOffset> std::vector<SegOffset>
search_pk(const PkType& pk, Timestamp timestamp) const { search_pk(const PkType& pk, Timestamp timestamp) const {
std::shared_lock lck(shared_mutex_); std::shared_lock lck(shared_mutex_);

View File

@ -235,6 +235,15 @@ SegmentGrowingImpl::Delete(int64_t reserved_begin,
std::vector<PkType> pks(size); std::vector<PkType> pks(size);
ParsePksFromIDs(pks, field_meta.get_data_type(), *ids); ParsePksFromIDs(pks, field_meta.get_data_type(), *ids);
// Filter out deletions whose primary key is not known to insert_record_.
// std::remove_if only moves the surviving keys to the front of `pks`; the
// vector itself is not shrunk, so `size` is reset to the surviving count.
// NOTE(review): only `pks` is compacted here. The ordering step below pairs a
// Timestamp with each key by index - confirm the timestamps are still
// index-aligned with the surviving keys after this filtering.
auto end = std::remove_if(pks.begin(), pks.end(), [&](const PkType& pk) {
return !insert_record_.contain(pk);
});
size = end - pks.begin();
// No surviving keys: return success early.
if (size == 0) {
return SegcoreError::success();
}
// step 1: sort timestamp // step 1: sort timestamp
std::vector<std::tuple<Timestamp, PkType>> ordering(size); std::vector<std::tuple<Timestamp, PkType>> ordering(size);
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {

View File

@ -28,7 +28,7 @@
#include "InsertRecord.h" #include "InsertRecord.h"
#include "SealedIndexingRecord.h" #include "SealedIndexingRecord.h"
#include "SegmentGrowing.h" #include "SegmentGrowing.h"
#include "common/Types.h"
#include "common/EasyAssert.h" #include "common/EasyAssert.h"
#include "query/PlanNode.h" #include "query/PlanNode.h"
#include "common/IndexMeta.h" #include "common/IndexMeta.h"
@ -47,6 +47,11 @@ class SegmentGrowingImpl : public SegmentGrowing {
const Timestamp* timestamps, const Timestamp* timestamps,
const InsertData* insert_data) override; const InsertData* insert_data) override;
// Segment-level membership test for a primary key; forwards to
// insert_record_.contain().
bool
Contain(const PkType& pk) const override {
    const bool found = insert_record_.contain(pk);
    return found;
}
// TODO: add id into delete log, possibly bitmap // TODO: add id into delete log, possibly bitmap
SegcoreError SegcoreError
Delete(int64_t reserved_offset, Delete(int64_t reserved_offset,

View File

@ -47,6 +47,9 @@ class SegmentInterface {
virtual void virtual void
FillTargetEntry(const query::Plan* plan, SearchResult& results) const = 0; FillTargetEntry(const query::Plan* plan, SearchResult& results) const = 0;
// Returns true when the segment knows the primary key `pk`.
// Both the growing and sealed implementations answer this from their
// insert record.
virtual bool
Contain(const PkType& pk) const = 0;
virtual std::unique_ptr<SearchResult> virtual std::unique_ptr<SearchResult>
Search(const query::Plan* Plan, Search(const query::Plan* Plan,
const query::PlaceholderGroup* placeholder_group) const = 0; const query::PlaceholderGroup* placeholder_group) const = 0;

View File

@ -14,6 +14,7 @@
#include <fcntl.h> #include <fcntl.h>
#include <fmt/core.h> #include <fmt/core.h>
#include <algorithm>
#include <cstdint> #include <cstdint>
#include <filesystem> #include <filesystem>
#include <memory> #include <memory>
@ -1136,6 +1137,15 @@ SegmentSealedImpl::Delete(int64_t reserved_offset, // deprecated
std::vector<PkType> pks(size); std::vector<PkType> pks(size);
ParsePksFromIDs(pks, field_meta.get_data_type(), *ids); ParsePksFromIDs(pks, field_meta.get_data_type(), *ids);
// Filter out deletions whose primary key is not known to insert_record_.
// std::remove_if only moves the surviving keys to the front of `pks`; the
// vector itself is not shrunk, so `size` is reset to the surviving count.
// NOTE(review): only `pks` is compacted here. The ordering step below pairs a
// Timestamp with each key by index - confirm the timestamps are still
// index-aligned with the surviving keys after this filtering.
auto end = std::remove_if(pks.begin(), pks.end(), [&](const PkType& pk) {
return !insert_record_.contain(pk);
});
size = end - pks.begin();
// No surviving keys: return success early.
if (size == 0) {
return SegcoreError::success();
}
// step 1: sort timestamp // step 1: sort timestamp
std::vector<std::tuple<Timestamp, PkType>> ordering(size); std::vector<std::tuple<Timestamp, PkType>> ordering(size);
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {

View File

@ -58,6 +58,11 @@ class SegmentSealedImpl : public SegmentSealed {
bool bool
HasFieldData(FieldId field_id) const override; HasFieldData(FieldId field_id) const override;
// Segment-level membership test for a primary key; forwards to
// insert_record_.contain().
bool
Contain(const PkType& pk) const override {
    const bool found = insert_record_.contain(pk);
    return found;
}
void void
LoadFieldData(FieldId field_id, FieldDataInfo& data) override; LoadFieldData(FieldId field_id, FieldDataInfo& data) override;
void void

View File

@ -21,6 +21,7 @@
#include "segcore/Collection.h" #include "segcore/Collection.h"
#include "segcore/SegmentGrowingImpl.h" #include "segcore/SegmentGrowingImpl.h"
#include "segcore/SegmentSealedImpl.h" #include "segcore/SegmentSealedImpl.h"
#include "segcore/Utils.h"
#include "storage/FieldData.h" #include "storage/FieldData.h"
#include "storage/Util.h" #include "storage/Util.h"
#include "mmap/Types.h" #include "mmap/Types.h"

View File

@ -120,6 +120,12 @@ AddFieldDataInfoForSealed(CSegmentInterface c_segment,
CLoadFieldDataInfo c_load_field_data_info); CLoadFieldDataInfo c_load_field_data_info);
////////////////////////////// interfaces for SegmentInterface ////////////////////////////// ////////////////////////////// interfaces for SegmentInterface //////////////////////////////
// C API: primary-key existence check against a segment.
// NOTE(review): the implementation is not visible here. Presumably `raw_ids`
// points to a serialized ID list of `size` bytes and `results` receives one
// flag per ID, so the caller must size `results` accordingly - confirm
// against the definition before relying on this.
CStatus
ExistPk(CSegmentInterface c_segment,
const uint8_t* raw_ids,
const uint64_t size,
bool* results);
CStatus CStatus
Delete(CSegmentInterface c_segment, Delete(CSegmentInterface c_segment,
int64_t reserved_offset, int64_t reserved_offset,

View File

@ -12,6 +12,7 @@
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <array>
#include <boost/format.hpp> #include <boost/format.hpp>
#include <chrono> #include <chrono>
#include <iostream> #include <iostream>
@ -19,8 +20,10 @@
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include "boost/container/vector.hpp"
#include "common/LoadInfo.h" #include "common/LoadInfo.h"
#include "common/Types.h" #include "common/Types.h"
#include "common/type_c.h"
#include "index/IndexFactory.h" #include "index/IndexFactory.h"
#include "knowhere/comp/index_param.h" #include "knowhere/comp/index_param.h"
#include "pb/plan.pb.h" #include "pb/plan.pb.h"
@ -28,6 +31,7 @@
#include "segcore/Collection.h" #include "segcore/Collection.h"
#include "segcore/Reduce.h" #include "segcore/Reduce.h"
#include "segcore/reduce_c.h" #include "segcore/reduce_c.h"
#include "segcore/segment_c.h"
#include "test_utils/DataGen.h" #include "test_utils/DataGen.h"
#include "test_utils/PbHelper.h" #include "test_utils/PbHelper.h"
#include "test_utils/indexbuilder_test_utils.h" #include "test_utils/indexbuilder_test_utils.h"
@ -1151,7 +1155,7 @@ TEST(CApiTest, GetDeletedCountTest) {
// TODO: assert(deleted_count == len(delete_row_ids)) // TODO: assert(deleted_count == len(delete_row_ids))
auto deleted_count = GetDeletedCount(segment); auto deleted_count = GetDeletedCount(segment);
ASSERT_EQ(deleted_count, delete_row_ids.size()); ASSERT_EQ(deleted_count, 0);
DeleteCollection(collection); DeleteCollection(collection);
DeleteSegment(segment); DeleteSegment(segment);

View File

@ -31,10 +31,18 @@ TEST(Growing, DeleteCount) {
int64_t c = 10; int64_t c = 10;
auto offset = 0; auto offset = 0;
auto dataset = DataGen(schema, c);
auto pks = dataset.get_col<int64_t>(pk);
segment->Insert(offset,
c,
dataset.row_ids_.data(),
dataset.timestamps_.data(),
dataset.raw_);
Timestamp begin_ts = 100; Timestamp begin_ts = 100;
auto tss = GenTss(c, begin_ts); auto tss = GenTss(c, begin_ts);
auto pks = GenPKs(c, 0); auto del_pks = GenPKs(pks.begin(), pks.end());
auto status = segment->Delete(offset, c, pks.get(), tss.data()); auto status = segment->Delete(offset, c, del_pks.get(), tss.data());
ASSERT_TRUE(status.ok()); ASSERT_TRUE(status.ok());
auto cnt = segment->get_deleted_count(); auto cnt = segment->get_deleted_count();

View File

@ -1061,7 +1061,7 @@ TEST(Sealed, DeleteCount) {
ASSERT_TRUE(status.ok()); ASSERT_TRUE(status.ok());
auto cnt = segment->get_deleted_count(); auto cnt = segment->get_deleted_count();
ASSERT_EQ(cnt, c); ASSERT_EQ(cnt, 0);
} }
TEST(Sealed, RealCount) { TEST(Sealed, RealCount) {