Merge branch 'dev_performance' of https://gitee.com/zoujia_cm/he3pg into dev_performance

This commit is contained in:
zoujia 2023-05-19 15:58:27 +08:00
commit 369dcb62f5
71 changed files with 7220 additions and 2008 deletions

View File

@ -0,0 +1,107 @@
# 部署
## 1 启动原生PG作为主
### 1.1 PG14.2源码编译安装
```shell
./configure --enable-depend --enable-cassert --enable-debug CFLAGS="-ggdb -O0" --prefix=/home/postgres/psql14_pg
make && make install
```
其中configure选项参考[CONFIGURE-OPTIONS](https://www.postgresql.org/docs/current/install-procedure.html#CONFIGURE-OPTIONS)
### 1.2 初始化数据
```shell
cd /home/postgres/psql14_pg
./bin/initdb -D /home/postgres/pgdata_14
```
### 1.3 修改配置文件
```shell
vim /home/postgres/pgdata_14/postgresql.conf
port=15432
wal_level = replica
wal_recycle=off
```
修改访问控制文件
```shell
vim /home/postgres/pgdata_14/pg_hba.conf
host repl all 0.0.0.0/0 trust
```
### 1.4 启动服务
```shell
./bin/pg_ctl -D /home/postgres/pgdata_14 start -l logfile
```
### 1.5 创建流复制用户
```shell
./bin/psql -h127.0.0.1 -p15432
postgres=# CREATE ROLE repl login replication encrypted password 'repl';
```
## 2 启动He3DB作为备
### 2.1 编译安装PG He3DB
```shell
# 编译需要依赖静态库 he3pg/src/backend/storage/file/librust_log.a
./configure --enable-depend --enable-cassert --enable-debug CFLAGS="-ggdb -O0" --prefix=/home/postgres/psqlhe3_mirror
make && make install
```
### 2.2 从主备份数据
```shell
cd /home/postgres/psqlhe3_mirror
./bin/pg_basebackup -h 127.0.0.1 -p 15432 -U repl -R -Fp -Xs -Pv -D /home/postgres/pgdata_mirror
```
### 2.3 修改postgresql.conf配置
```shell
vim /home/postgres/pgdata_mirror/postgresql.conf
# 在配置文件末尾添加以下配置
primary_conninfo = 'application_name=pushstandby user=repl host=127.0.0.1 port=15432 sslmode=disable sslcompression=0 gssencmode=disable target_session_attrs=any'
hot_standby=on
port = 5434
push_standby=on
wal_recycle=off
fsync=off
wal_keep_size=10000
full_page_writes=off
he3mirror=true
```
### 2.4 启动服务
```shell
./bin/pg_ctl -D /home/postgres/pgdata_mirror start -l logfile
```
## 3 验证
### 3.1 连接主库插入新数据
```shell
./bin/psql -h127.0.0.1 -p15432
postgres=# create table "t1" (id int);
CREATE TABLE
postgres=# insert into t1 values(1);
INSERT 0 1
```
### 3.2 备机验证数据
```shell
./bin/psql -h127.0.0.1 -p5434
postgres=# select * from t1;
id
----
1
(1 row)
```
### 3.3 连接主库插入新数据
```shell
./bin/psql -h127.0.0.1 -p15432
postgres=# insert into t1 values(2);
INSERT 0 1
```
### 3.4 备机验证数据
```shell
./bin/psql -h127.0.0.1 -p5434
postgres=# select * from t1;
id
----
1
2
(2 rows)
```

View File

@ -66,12 +66,15 @@ func runArchive(cmd *cobra.Command, args []string) {
// archive wal kv
fmt.Printf("archive wal kv!\n")
//0600000000000000010000000000000000
retStartString := fmt.Sprintf("06%s%s", archive_start_time_line, archive_start_lsn)
retEndString := "06ffffffffffffffffffffffffffffffff"
for id := 0; id < 8; id++ {
//06000000000000000100000000000000070000000000000000
//因为加了个id字段目前不能跨时间线备份
retStartString := fmt.Sprintf("06%s000000000000000%d%s", archive_start_time_line, id, archive_start_lsn)
//retEndString := fmt.Sprintf("06ffffffffffffffff000000000000000%dffffffffffffffff", id)
retEndString := fmt.Sprintf("06%s000000000000000%dffffffffffffffff", archive_start_time_line, id)
retStart := make([]byte, 17)
retEnd := make([]byte, 17)
retStart := make([]byte, 25)
retEnd := make([]byte, 25)
index := 0
for i := 0; i < len(retStartString); i += 2 {
value, _ := strconv.ParseUint(retStartString[i:i+2], 16, 8)
@ -104,6 +107,7 @@ func runArchive(cmd *cobra.Command, args []string) {
}
wlCount--
}
}
wg.Wait()
client.Close()

View File

@ -122,7 +122,7 @@ $(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport
LIBS += $(libpq)
librust_log = -L$(top_builddir)/src/backend/storage/file/ -lrust_log -lstdc++ -lm -ldl -lpthread -lfuse3 -Wl,-gc-section
LIBS += $(librust_log)
libglib = -L/usr/lib/x86_64-linux-gnu/ -lglib-2.0 -I/usr/include/glib-2.0/ -I/usr/lib/x86_64-linux-gnu/glib-2.0/include/ -lpthread
libglib = -L/usr/lib/x86_64-linux-gnu/ -lglib-2.0 -I/usr/include/glib-2.0/ -I/usr/lib/x86_64-linux-gnu/glib-2.0/include/ -lpthread -llmdb
LIBS += $(libglib)
postgres.o: $(OBJS)
$(CC) $(LDREL) $(call expand_subsys,$^) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@

View File

@ -37,7 +37,8 @@ OBJS = \
xlogutils.o \
pagehashqueue.o \
ringbuffer.o \
pthreadpool.o
pthreadpool.o \
pg_mirror.o
include $(top_srcdir)/src/backend/common.mk

View File

@ -382,12 +382,12 @@ CleanLogIndexMain(int argc, char *argv[])
* Loop forever
*/
SetProcessingMode(NormalProcessing);
uint64_t pushStandbyPoint = 0;
uint64_t pushStandbyPrePoint = 0;
uint64_t preLastBlkStartLsn = 0;
uint64_t preLastBlkEndLsn = 0;
uint64_t preLastStartLsn = 0;
uint64_t preLastEndLsn = 0;
XLogRecPtr pushStandbyPoint = 0;
XLogRecPtr pushStandbyPrePoint = 0;
XLogRecPtr preLastBlkStartLsn = 0;
XLogRecPtr preLastBlkEndLsn = 0;
XLogRecPtr preLastStartLsn = 0;
XLogRecPtr preLastEndLsn = 0;
for (;;)
{
/* Clear any already-pending wakeups */
@ -416,7 +416,7 @@ CleanLogIndexMain(int argc, char *argv[])
int pageNum = 0;
while(next!=NULL) {
addFileKey(&next->tag.tag);
next = tagList->next;
next = next->next;
pageNum++;
}
FreeTagNode(tagList);
@ -689,6 +689,9 @@ BufferTag* QueuePushPage(void) {
if (ready!= 0 && gpushpos < gpos) {
return &(PageHashQueueShmem->gtag[gpushpos]->tag);
} else {
if (gpushpos < gpos) {
elog(ERROR,"QueuePushPage gpushpos %d < gpos %d",gpushpos,gpos);
}
return NULL;
}
}
@ -698,7 +701,7 @@ void ProcFlushBufferToDisk(BufferTag*tag) {
RBM_NORMAL);
if (!BufferIsValid(buffer))
{
elog(ERROR,"ProcFlushBufferToDisk is invalid rel %d,flk %d,blk %d",tag->rnode.relNode,tag->forkNum,tag->blockNum);
elog(FATAL,"ProcFlushBufferToDisk is invalid rel %d,flk %d,blk %d",tag->rnode.relNode,tag->forkNum,tag->blockNum);
pg_atomic_fetch_add_u32(&PageHashQueueShmem->taskNum,1);
return;
}

View File

@ -0,0 +1,742 @@
#include "access/pg_mirror.h"
#include "postgres.h"
#include "access/xlogrecord.h"
#include "access/heapam_xlog.h"
#include "access/nbtxlog.h"
#include "access/gistxlog.h"
#include "access/spgxlog.h"
#include "access/brin_xlog.h"
#include "assert.h"
#include "common/controldata_utils.h"
#include "miscadmin.h"
/*
 * Number of bytes left between 'endptr' and the end of the XLOG_BLCKSZ-sized
 * page that contains it; evaluates to 0 when 'endptr' sits exactly on a page
 * boundary.  Note that 'endptr' is evaluated more than once.
 */
#define INSERT_FREESPACE_MIRROR(endptr) \
(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
/* Control file data; installed by readControlFile() or setControlFile(). */
static ControlFileData *ControlFile = NULL;
/* WAL segment size in bytes; defaults to 16MB, overwritten by readControlFile(). */
static int WalSegSz = 16777216;
/*
 * Insert-position bookkeeping for the generated PostgreSQL-format WAL stream.
 * Original note: multiple block records are merged into one record.
 * CurrBytePos / PrevBytePos are read and advanced by ReserveXLogWalSwitch()
 * and ReserveXLogWalInsertLocation().
 */
typedef struct XLogHe3ToPg {
uint64 CurrBytePos;
uint64 PrevBytePos;
}XLogHe3ToPg;
/* Single file-wide conversion state. */
static XLogHe3ToPg g_walHe3ToPg;
/*
 * ReConvertMainData
 *
 * Rewrite the rmgr-specific "main data" of a He3 WAL record into the layout
 * described by the corresponding "old" structure (xl_old_*, *old*), copying
 * the fields the old structure has.  For record types that are not listed
 * below the main data is copied through unchanged.
 *
 * sRecord   - source record header; only xl_rmid and xl_info are read
 * sMainData - start of the source main data
 * sLen      - in: length of the source main data (never modified)
 * dMainData - destination buffer; the caller must size it for the result
 * dLen      - out: number of bytes written to dMainData
 *
 * BRIN records are matched on (info & XLOG_BRIN_OPMASK), like brin_redo does,
 * so that inserts/updates flagged with XLOG_BRIN_INIT_PAGE are converted too.
 * The previous exact comparison silently skipped those records.
 */
static void ReConvertMainData(XLogRecord* sRecord, char*sMainData, uint32_t*sLen, char* dMainData, uint32_t* dLen) {
    RmgrId rmid = sRecord->xl_rmid;
    /* rmgr-private part of xl_info (the XLR_INFO_MASK bits are stripped) */
    uint8 info = (sRecord->xl_info & ~XLR_INFO_MASK);
    bool hasChange = false;

    switch(rmid) {
        case RM_HEAP2_ID:
        {
            if ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP2_VISIBLE) {
                xl_heap_visible *xlrec = (xl_heap_visible *)sMainData;
                xl_old_heap_visible xlrecOld;
                xlrecOld.cutoff_xid = xlrec->cutoff_xid;
                xlrecOld.flags = xlrec->flags;
                *dLen = sizeof(xl_old_heap_visible);
                memcpy(dMainData,&xlrecOld,*dLen);
                hasChange = true;
            }
            break;
        }
        case RM_HEAP_ID:
        {
            if (((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_UPDATE) ||
                ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_HOT_UPDATE)) {
                xl_heap_update *xlrec = (xl_heap_update *)sMainData;
                xl_old_heap_update xlrecOld;
                xlrecOld.old_xmax = xlrec->old_xmax;
                xlrecOld.old_offnum = xlrec->old_offnum;
                xlrecOld.old_infobits_set = xlrec->old_infobits_set;
                xlrecOld.flags = xlrec->flags;
                xlrecOld.new_xmax = xlrec->new_xmax;
                xlrecOld.new_offnum = xlrec->new_offnum;
                *dLen = sizeof(xl_old_heap_update);
                memcpy(dMainData,&xlrecOld,*dLen);
                hasChange = true;
            }
            break;
        }
        case RM_BTREE_ID:
        {
            if (info == XLOG_BTREE_SPLIT_L || info == XLOG_BTREE_SPLIT_R) {
                xl_btree_split *xlrec = (xl_btree_split *)sMainData;
                xl_old_btree_split xlrecOld;
                xlrecOld.level = xlrec->level;
                xlrecOld.firstrightoff = xlrec->firstrightoff;
                xlrecOld.newitemoff = xlrec->newitemoff;
                xlrecOld.postingoff = xlrec->postingoff;
                *dLen = sizeof(xl_old_btree_split);
                memcpy(dMainData,&xlrecOld,*dLen);
                hasChange = true;
            }
            break;
        }
        case RM_GIST_ID:
        {
            if (info == XLOG_GIST_PAGE_SPLIT) {
                gistxlogPageSplit *xlrec = (gistxlogPageSplit *)sMainData;
                gistoldxlogPageSplit xlrecOld;
                xlrecOld.origrlink = xlrec->origrlink;
                xlrecOld.orignsn = xlrec->orignsn;
                xlrecOld.origleaf = xlrec->origleaf;
                xlrecOld.npage = xlrec->npage;
                xlrecOld.markfollowright = xlrec->markfollowright;
                *dLen = sizeof(gistoldxlogPageSplit);
                memcpy(dMainData,&xlrecOld,*dLen);
                hasChange = true;
            }
            break;
        }
        case RM_SPGIST_ID:
        {
            if (info == XLOG_SPGIST_ADD_LEAF) {
                spgxlogAddLeaf *xlrec = (spgxlogAddLeaf *)sMainData;
                spgoldxlogAddLeaf xlrecOld;
                xlrecOld.newPage = xlrec->newPage;
                xlrecOld.storesNulls = xlrec->storesNulls;
                xlrecOld.offnumLeaf = xlrec->offnumLeaf;
                xlrecOld.offnumHeadLeaf = xlrec->offnumHeadLeaf;
                xlrecOld.offnumParent = xlrec->offnumParent;
                xlrecOld.nodeI = xlrec->nodeI;
                *dLen = sizeof(spgoldxlogAddLeaf);
                memcpy(dMainData,&xlrecOld,*dLen);
                hasChange = true;
            } else if (info == XLOG_SPGIST_MOVE_LEAFS) {
                spgxlogMoveLeafs *xlrec = (spgxlogMoveLeafs *)sMainData;
                spgoldxlogMoveLeafs xlrecOld;
                xlrecOld.nMoves = xlrec->nMoves;
                xlrecOld.newPage = xlrec->newPage;
                xlrecOld.replaceDead = xlrec->replaceDead;
                xlrecOld.storesNulls = xlrec->storesNulls;
                xlrecOld.offnumParent = xlrec->offnumParent;
                xlrecOld.nodeI = xlrec->nodeI;
                xlrecOld.stateSrc = xlrec->stateSrc;
                /* fixed part first, then the variable-length offsets tail */
                *dLen = SizeOfOldSpgxlogMoveLeafs;
                memcpy(dMainData,&xlrecOld,*dLen);
                memcpy(dMainData+*dLen,xlrec->offsets,*sLen-SizeOfSpgxlogMoveLeafs);
                *dLen += *sLen-SizeOfSpgxlogMoveLeafs;
                hasChange = true;
            } else if (info == XLOG_SPGIST_ADD_NODE) {
                spgxlogAddNode *xlrec = (spgxlogAddNode *)sMainData;
                spgoldxlogAddNode xlrecOld;
                xlrecOld.offnum = xlrec->offnum;
                xlrecOld.offnumNew = xlrec->offnumNew;
                xlrecOld.newPage = xlrec->newPage;
                xlrecOld.parentBlk = xlrec->parentBlk;
                xlrecOld.offnumParent = xlrec->offnumParent;
                xlrecOld.nodeI = xlrec->nodeI;
                xlrecOld.stateSrc = xlrec->stateSrc;
                *dLen = sizeof(spgoldxlogAddNode);
                memcpy(dMainData,&xlrecOld,*dLen);
                hasChange = true;
            } else if (info == XLOG_SPGIST_PICKSPLIT) {
                spgxlogPickSplit *xlrec = (spgxlogPickSplit *)sMainData;
                spgoldxlogPickSplit xlrecOld;
                xlrecOld.isRootSplit = xlrec->isRootSplit;
                xlrecOld.nDelete = xlrec->nDelete;
                xlrecOld.nInsert = xlrec->nInsert;
                xlrecOld.initSrc = xlrec->initSrc;
                xlrecOld.initDest = xlrec->initDest;
                xlrecOld.offnumInner = xlrec->offnumInner;
                xlrecOld.initInner = xlrec->initInner;
                xlrecOld.storesNulls = xlrec->storesNulls;
                xlrecOld.innerIsParent = xlrec->innerIsParent;
                xlrecOld.offnumParent = xlrec->offnumParent;
                xlrecOld.nodeI = xlrec->nodeI;
                xlrecOld.stateSrc = xlrec->stateSrc;
                /* fixed part first, then the variable-length offsets tail */
                *dLen = SizeOfOldSpgxlogPickSplit;
                memcpy(dMainData,&xlrecOld,*dLen);
                memcpy(dMainData+*dLen,xlrec->offsets,*sLen-SizeOfSpgxlogPickSplit);
                *dLen += *sLen-SizeOfSpgxlogPickSplit;
                hasChange = true;
            }
            break;
        }
        case RM_BRIN_ID:
        {
            /*
             * Mask off XLOG_BRIN_INIT_PAGE: the operation code lives in the
             * XLOG_BRIN_OPMASK bits only.
             */
            uint8 brinOp = info & XLOG_BRIN_OPMASK;

            if (brinOp == XLOG_BRIN_INSERT) {
                xl_brin_insert *xlrec = (xl_brin_insert *)sMainData;
                xl_old_brin_insert xlrecOld;
                xlrecOld.heapBlk = xlrec->heapBlk;
                /* extra information needed to update the revmap */
                xlrecOld.pagesPerRange = xlrec->pagesPerRange;
                xlrecOld.offnum = xlrec->offnum;
                *dLen = sizeof(xl_old_brin_insert);
                memcpy(dMainData,&xlrecOld,*dLen);
                hasChange = true;
            } else if (brinOp == XLOG_BRIN_UPDATE) {
                xl_brin_update *xlrec = (xl_brin_update *) sMainData;
                xl_old_brin_update xlrecUpdate;
                xl_brin_insert *xlrecInsert = &xlrec->insert;
                xl_old_brin_insert xlrecOld;
                xlrecOld.heapBlk = xlrecInsert->heapBlk;
                /* extra information needed to update the revmap */
                xlrecOld.pagesPerRange = xlrecInsert->pagesPerRange;
                xlrecOld.offnum = xlrecInsert->offnum;
                /* offset number of old tuple on old page */
                xlrecUpdate.oldOffnum = xlrec->oldOffnum;
                xlrecUpdate.insert = xlrecOld;
                *dLen = sizeof(xl_old_brin_update);
                memcpy(dMainData,&xlrecUpdate,*dLen);
                hasChange = true;
            }
            break;
        }
        default:
        {
            break;
        }
    }

    /* Nothing to translate: pass the main data through byte for byte. */
    if (hasChange == false) {
        *dLen = *sLen;
        memcpy(dMainData,sMainData,*dLen);
    }
}
/*
 * XlogHe3ToPg
 *
 * Merge the n He3 records in newRecord[] into a single record in the "old"
 * (OldXLogRecord) layout written at oldRecord.
 *
 * Output layout, as produced below: OldXLogRecord header, then for every
 * source record its block header (block id renumbered 0..k), then the
 * origin / top-level-xid / main-data headers taken from the LAST source
 * record only, then each block's image and data, then the converted main
 * data.  xl_xid, xl_info and xl_rmid come from newRecord[0]; xl_prev and
 * xl_crc are not set here.
 *
 * Returns the total length written, which is also stored in
 * oldRecord->xl_tot_len.
 *
 * NOTE(review): d_main_data is a fixed 8192-byte stack buffer and the
 * destination size is not known here; neither is bounds-checked.  Callers
 * are presumably expected to guarantee the sizes -- confirm.
 *
 * Fixes relative to the previous version:
 *  - bimg_info was located with sizeof(bimg_len), which is the size of the
 *    whole bimg_len[] array rather than of one uint16 field, so the
 *    compression/hole flags were read from an unrelated offset.  The field
 *    is now located with offsetof(XLogRecordBlockImageHeader, bimg_info).
 *  - an unrecognised block_id no longer spins forever; the header walk is
 *    abandoned and the consistency assert below reports the bad record.
 *  - 16-bit header fields are fetched with memcpy instead of casting a
 *    possibly unaligned char pointer.
 */
static int XlogHe3ToPg(XLogRecord*newRecord[],int n, OldXLogRecord*oldRecord) {
    oldRecord->xl_xid = newRecord[0]->xl_xid;
    oldRecord->xl_info = newRecord[0]->xl_info;
    oldRecord->xl_rmid = newRecord[0]->xl_rmid;
    char d_main_data[8192];
    int dPos = 0;
    char* dst = (char*)oldRecord;
    dPos += sizeof(OldXLogRecord);
    uint32_t d_main_data_len = 0;
    uint32 main_data_len = 0;
    uint8_t blkNum = 0;
    bool hasblk = false;
    char*img_ptr[XLR_MAX_BLOCK_ID + 1] = {0};
    char*data_ptr[XLR_MAX_BLOCK_ID + 1] = {0};
    uint16_t bimg_len[XLR_MAX_BLOCK_ID + 1] = {0};
    uint16_t data_len[XLR_MAX_BLOCK_ID + 1] = {0};

    for(int i = 0;i<n;i++) {
        int sPos = 0;
        char* src = (char*)newRecord[i];
        /* bytes after the fixed header; headers are consumed until only payload is left */
        uint32 remaining = newRecord[i]->xl_tot_len - sizeof(XLogRecord);
        uint32 datatotal = 0;
        sPos += sizeof(XLogRecord);

        while(remaining > datatotal) {
            uint8_t block_id = *(src + sPos);
            if (block_id == XLR_BLOCK_ID_DATA_SHORT) {
                sPos += sizeof(block_id);
                remaining -= sizeof(block_id);
                if (i == n-1) {
                    memcpy(dst + dPos,&block_id,sizeof(block_id));
                    dPos += sizeof(block_id);
                }
                main_data_len = *((uint8_t*)(src + sPos));
                /* main_data_len is stored as one byte for XLR_BLOCK_ID_DATA_SHORT */
                uint8 d_len;
                if (i == n-1) {
                    /* main data follows this header plus the block image and block data */
                    ReConvertMainData(newRecord[i],src + sPos + sizeof(d_len)+bimg_len[blkNum]+data_len[blkNum],&main_data_len,d_main_data,&d_main_data_len);
                    d_len = d_main_data_len;
                    memcpy(dst + dPos,&d_len,sizeof(d_len));
                    dPos += sizeof(d_len);
                }
                sPos += sizeof(d_len);
                remaining -= sizeof(d_len);
                datatotal += main_data_len;
                break;
            } else if (block_id == XLR_BLOCK_ID_DATA_LONG) {
                sPos += sizeof(block_id);
                remaining -= sizeof(block_id);
                if (i == n-1) {
                    memcpy((dst + dPos),&block_id,sizeof(block_id));
                    dPos += sizeof(block_id);
                }
                memcpy(&main_data_len,src + sPos,sizeof(uint32));
                if (i == n-1) {
                    ReConvertMainData(newRecord[i],src + sPos + sizeof(main_data_len)+bimg_len[blkNum]+data_len[blkNum],&main_data_len,d_main_data,&d_main_data_len);
                    if (d_main_data_len > 255) {
                        memcpy(dst + dPos,&d_main_data_len,sizeof(d_main_data_len));
                        dPos += sizeof(d_main_data_len);
                    } else {
                        /* converted data now fits a short header: patch the id just written */
                        *(dst + dPos - 1) = XLR_BLOCK_ID_DATA_SHORT;
                        uint8_t d_len = d_main_data_len;
                        memcpy(dst + dPos,&d_len,sizeof(d_len));
                        dPos += sizeof(d_len);
                    }
                }
                sPos += sizeof(main_data_len);
                remaining -= sizeof(main_data_len);
                datatotal += main_data_len;
                break;
            } else if (block_id == XLR_BLOCK_ID_ORIGIN) {
                sPos += sizeof(block_id);
                remaining -= sizeof(block_id);
                if (i == n-1) {
                    memcpy(dst + dPos,&block_id,sizeof(block_id));
                    dPos += sizeof(block_id);
                    memcpy(dst + dPos,src+sPos,sizeof(RepOriginId));
                    dPos += sizeof(RepOriginId);
                }
                sPos += sizeof(RepOriginId);
                remaining -= sizeof(RepOriginId);
            } else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) {
                sPos += sizeof(block_id);
                remaining -= sizeof(block_id);
                if (i == n - 1) {
                    memcpy(dst + dPos,&block_id,sizeof(block_id));
                    dPos += sizeof(block_id);
                    memcpy(dst + dPos,src+sPos,sizeof(TransactionId));
                    dPos += sizeof(TransactionId);
                }
                sPos += sizeof(TransactionId);
                remaining -= sizeof(TransactionId);
            } else if (block_id <= XLR_MAX_BLOCK_ID) {
                memcpy(dst + dPos, src + sPos, SizeOfXLogRecordBlockHeader);
                uint8_t fork_flags = *(src + sPos + sizeof(block_id));
                /* renumber the block reference inside the merged record */
                *(dst + dPos) = blkNum;
                hasblk = true;
                memcpy(&data_len[blkNum], src + sPos + sizeof(block_id) + sizeof(fork_flags), sizeof(uint16_t));
                datatotal += data_len[blkNum];
                sPos += SizeOfXLogRecordBlockHeader;
                dPos += SizeOfXLogRecordBlockHeader;
                remaining -= SizeOfXLogRecordBlockHeader;
                if ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0) {
                    uint8_t bimg_info;

                    /* image length is the first field of the image header */
                    memcpy(&bimg_len[blkNum], src + sPos, sizeof(uint16_t));
                    datatotal += bimg_len[blkNum];
                    bimg_info = *((uint8_t*)(src + sPos + offsetof(XLogRecordBlockImageHeader, bimg_info)));
                    memcpy(dst + dPos, src + sPos, SizeOfXLogRecordBlockImageHeader);
                    sPos += SizeOfXLogRecordBlockImageHeader;
                    dPos += SizeOfXLogRecordBlockImageHeader;
                    remaining -= SizeOfXLogRecordBlockImageHeader;
                    if ((bimg_info & BKPIMAGE_IS_COMPRESSED) != 0) {
                        if ((bimg_info & BKPIMAGE_HAS_HOLE) != 0) {
                            memcpy(dst + dPos, src + sPos, SizeOfXLogRecordBlockCompressHeader);
                            sPos += SizeOfXLogRecordBlockCompressHeader;
                            dPos += SizeOfXLogRecordBlockCompressHeader;
                            remaining -= SizeOfXLogRecordBlockCompressHeader;
                        }
                    }
                }
                if (!(fork_flags & BKPBLOCK_SAME_REL)) {
                    memcpy(dst + dPos, src + sPos, sizeof(RelFileNode));
                    sPos += sizeof(RelFileNode);
                    dPos += sizeof(RelFileNode);
                    remaining -= sizeof(RelFileNode);
                }
                memcpy(dst + dPos, src + sPos, sizeof(BlockNumber));
                sPos += sizeof(BlockNumber);
                dPos += sizeof(BlockNumber);
                remaining -= sizeof(BlockNumber);
            } else {
                /*
                 * Unknown header id: nothing was consumed, so looping again
                 * would never terminate.  Stop walking; the assert below
                 * then flags the record as inconsistent.
                 */
                printf("invalid block_id %u",block_id);
                break;
            }
        }
        assert(remaining == datatotal);

        /* Remember where this record's block image / block data live. */
        if (bimg_len[blkNum] != 0 ) {
            img_ptr[blkNum] = src + sPos;
            sPos += bimg_len[blkNum];
        }
        if (data_len[blkNum] != 0) {
            data_ptr[blkNum] = src + sPos;
            sPos += data_len[blkNum];
        }
        if (hasblk == true) {
            blkNum++;
        }
        sPos += main_data_len;
        assert(sPos == newRecord[i]->xl_tot_len);
    }

    /* Payload section: per-block image then data, in block order. */
    int idx = 0;
    while(idx < blkNum) {
        if (bimg_len[idx] != 0) {
            memcpy(dst + dPos, img_ptr[idx], bimg_len[idx]);
            dPos += bimg_len[idx];
        }
        if (data_len[idx] != 0){
            memcpy(dst + dPos, data_ptr[idx], data_len[idx]);
            dPos += data_len[idx];
        }
        idx++;
    }

    /* Converted main data always comes last. */
    memcpy(dst + dPos, d_main_data, d_main_data_len);
    dPos += d_main_data_len;
    oldRecord->xl_tot_len = dPos;
    return dPos;
}
/*
 * Bytes of one DEFAULT_XLOG_SEG_SIZE segment that are available for record
 * data: every page minus a short page header, minus the extra size of the
 * long header (SizeOfXLogLongPHD - SizeOfXLogShortPHD) counted once.
 * NOTE(review): computed from DEFAULT_XLOG_SEG_SIZE, not from WalSegSz.
 */
static int OldUsableBytesInSegment =
(DEFAULT_XLOG_SEG_SIZE / XLOG_BLCKSZ * (XLOG_BLCKSZ - SizeOfXLogShortPHD)) -
(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
/*
 * Map a "usable byte position" (a count of record bytes that excludes page
 * headers) to the XLogRecPtr at which a record STARTING at that position
 * lives.  A position that falls on a page boundary is reported just after
 * the page header.
 */
static XLogRecPtr
OldXLogBytePosToRecPtr(uint64 bytepos)
{
    uint64      segno = bytepos / OldUsableBytesInSegment;
    uint64      rest = bytepos % OldUsableBytesInSegment;
    uint32      offsetInSeg;
    XLogRecPtr  recptr;

    if (rest >= XLOG_BLCKSZ - SizeOfXLogLongPHD)
    {
        uint64      wholePages;

        /* skip the first page of the segment, which carries the long header */
        rest -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
        wholePages = rest / (XLOG_BLCKSZ - SizeOfXLogShortPHD);
        rest = rest % (XLOG_BLCKSZ - SizeOfXLogShortPHD);

        offsetInSeg = XLOG_BLCKSZ;
        offsetInSeg += wholePages * XLOG_BLCKSZ + rest + SizeOfXLogShortPHD;
    }
    else
    {
        /* still inside the first page of the segment */
        offsetInSeg = rest + SizeOfXLogLongPHD;
    }

    XLogSegNoOffsetToRecPtr(segno, offsetInSeg, DEFAULT_XLOG_SEG_SIZE, recptr);
    return recptr;
}
/*
 * Like OldXLogBytePosToRecPtr(), but for the END of a record: when the
 * position falls exactly on a page boundary the returned pointer is the
 * boundary itself rather than the first byte after the next page header.
 */
static XLogRecPtr
OldXLogBytePosToEndRecPtr(uint64 bytepos)
{
    uint64      segno = bytepos / OldUsableBytesInSegment;
    uint64      rest = bytepos % OldUsableBytesInSegment;
    uint32      offsetInSeg;
    XLogRecPtr  recptr;

    if (rest >= XLOG_BLCKSZ - SizeOfXLogLongPHD)
    {
        uint64      wholePages;

        /* skip the first page of the segment, which carries the long header */
        rest -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
        wholePages = rest / (XLOG_BLCKSZ - SizeOfXLogShortPHD);
        rest = rest % (XLOG_BLCKSZ - SizeOfXLogShortPHD);

        offsetInSeg = XLOG_BLCKSZ;
        offsetInSeg += wholePages * XLOG_BLCKSZ + rest;
        /* only step over the page header when we are past the boundary */
        if (rest != 0)
            offsetInSeg += SizeOfXLogShortPHD;
    }
    else if (rest == 0)
    {
        /* exactly at the start of the segment */
        offsetInSeg = 0;
    }
    else
    {
        /* inside the first page of the segment */
        offsetInSeg = rest + SizeOfXLogLongPHD;
    }

    XLogSegNoOffsetToRecPtr(segno, offsetInSeg, DEFAULT_XLOG_SEG_SIZE, recptr);
    return recptr;
}
/*
 * Inverse of the byte-position mappings: convert an XLogRecPtr back into a
 * "usable byte position" by subtracting every page header that precedes it
 * (one long header on the first page of the segment, short ones after).
 */
static uint64
OldXLogRecPtrToBytePos(XLogRecPtr ptr)
{
    uint64      segno;
    uint32      pageInSeg;
    uint32      inPage;
    uint64      pos;
    size_t      headerLen;

    XLByteToSeg(ptr, segno, DEFAULT_XLOG_SEG_SIZE);
    pageInSeg = (XLogSegmentOffset(ptr, DEFAULT_XLOG_SEG_SIZE)) / XLOG_BLCKSZ;
    inPage = ptr % XLOG_BLCKSZ;

    /* usable bytes contributed by all earlier segments */
    pos = segno * OldUsableBytesInSegment;

    if (pageInSeg == 0)
    {
        headerLen = SizeOfXLogLongPHD;
    }
    else
    {
        headerLen = SizeOfXLogShortPHD;
        /* the first page, then the full pages before the current one */
        pos += XLOG_BLCKSZ - SizeOfXLogLongPHD;
        pos += (pageInSeg - 1) * (XLOG_BLCKSZ - SizeOfXLogShortPHD);
    }

    /* a pointer inside a page must lie beyond that page's header */
    if (inPage > 0)
    {
        Assert(inPage >= headerLen);
        pos += inPage - headerLen;
    }

    return pos;
}
/*
 * Reserve space for a header-only record that consumes the remainder of the
 * current DEFAULT_XLOG_SEG_SIZE segment.
 *
 * If the current insert position already sits on a segment boundary nothing
 * is reserved: *StartPos and *EndPos are both set to that boundary, *PrevPtr
 * is left untouched and false is returned.  Otherwise the global insert
 * state is advanced to the next segment boundary, the three out-parameters
 * are filled in and true is returned.
 */
static bool
ReserveXLogWalSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
    XLogHe3ToPg *state = &g_walHe3ToPg;
    uint32      hdrLen = MAXALIGN(SizeOfOldXLogRecord);
    uint64      firstByte = state->CurrBytePos;
    uint64      lastByte;
    uint64      prevByte;
    uint32      tailLen;
    XLogRecPtr  here = OldXLogBytePosToEndRecPtr(firstByte);

    /* Already at a segment boundary: there is nothing to fill. */
    if (XLogSegmentOffset(here, DEFAULT_XLOG_SEG_SIZE) == 0)
    {
        *EndPos = *StartPos = here;
        return false;
    }

    lastByte = firstByte + hdrLen;
    prevByte = state->PrevBytePos;

    *StartPos = OldXLogBytePosToRecPtr(firstByte);
    *EndPos = OldXLogBytePosToEndRecPtr(lastByte);

    /* Stretch the reservation to the end of the segment if it stops short. */
    tailLen = DEFAULT_XLOG_SEG_SIZE - XLogSegmentOffset(*EndPos, DEFAULT_XLOG_SEG_SIZE);
    if (tailLen != DEFAULT_XLOG_SEG_SIZE)
    {
        *EndPos += tailLen;
        lastByte = OldXLogRecPtrToBytePos(*EndPos);
    }

    state->CurrBytePos = lastByte;
    state->PrevBytePos = firstByte;

    *PrevPtr = OldXLogBytePosToRecPtr(prevByte);

    /* The mappings must round-trip and the end must be a segment boundary. */
    Assert(XLogSegmentOffset(*EndPos, DEFAULT_XLOG_SEG_SIZE) == 0);
    Assert(OldXLogRecPtrToBytePos(*EndPos) == lastByte);
    Assert(OldXLogRecPtrToBytePos(*StartPos) == firstByte);
    Assert(OldXLogRecPtrToBytePos(*PrevPtr) == prevByte);

    return true;
}
/*
 * Reserve MAXALIGN(size) usable bytes at the current insert position of the
 * global conversion state and advance that state past them.
 *
 * The state is kept as "usable byte positions" (page headers excluded), so
 * reserving is a plain addition; the positions are then translated into
 * XLogRecPtrs for the caller:
 *   *StartPos - where the new record starts
 *   *EndPos   - where it ends
 *   *PrevPtr  - start of the previously reserved record
 */
static void
ReserveXLogWalInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                             XLogRecPtr *PrevPtr)
{
    XLogHe3ToPg *state = &g_walHe3ToPg;
    uint64      firstByte;
    uint64      lastByte;
    uint64      prevByte;

    size = MAXALIGN(size);

    /* Every record reserved here must carry data beyond the bare header. */
    Assert(size > SizeOfOldXLogRecord);

    prevByte = state->PrevBytePos;
    firstByte = state->CurrBytePos;
    lastByte = firstByte + size;

    state->PrevBytePos = firstByte;
    state->CurrBytePos = lastByte;

    *StartPos = OldXLogBytePosToRecPtr(firstByte);
    *EndPos = OldXLogBytePosToEndRecPtr(lastByte);
    *PrevPtr = OldXLogBytePosToRecPtr(prevByte);

    /* Both directions of the position mapping have to agree. */
    Assert(OldXLogRecPtrToBytePos(*StartPos) == firstByte);
    Assert(OldXLogRecPtrToBytePos(*EndPos) == lastByte);
    Assert(OldXLogRecPtrToBytePos(*PrevPtr) == prevByte);
}
/*
 * CopyXLogRecordToPgWAL
 *
 * Serialise one converted record into dBuf as it would appear in a WAL
 * file for the LSN range [StartPos, EndPos): a page header is emitted in
 * front of the record when StartPos is the first usable byte of a page,
 * continuation page headers are inserted whenever the record crosses a page
 * boundary, and the result is padded to MAXALIGN.
 *
 * write_len - number of record bytes expected to be written (asserted)
 * rechdr    - record to copy (rechdr->xl_tot_len bytes); may be NULL
 * StartPos  - LSN of the first record byte
 * EndPos    - expected LSN after the padded record; a mismatch is reported
 *             on stdout
 * dBuf      - output buffer; the caller must provide enough room
 * dLen      - out: number of bytes produced in dBuf
 *
 * ControlFile must have been installed (readControlFile/setControlFile)
 * before calling; it supplies the timeline and system identifier.
 *
 * Changes relative to the previous version:
 *  - every page header is zero-filled before its fields are set, so
 *    xlp_rem_len and header padding no longer carry whatever dBuf held;
 *  - MAXALIGN padding after the record is zeroed (it is part of *dLen);
 *  - the record pointer is converted to char * with an explicit cast.
 */
static void CopyXLogRecordToPgWAL(int write_len,OldXLogRecord* rechdr,XLogRecPtr StartPos, XLogRecPtr EndPos,
                                  char*dBuf,int* dLen) {
    char       *currpos = dBuf;
    XLogRecPtr  CurrPos = StartPos;
    int         freespace;
    int         written = 0;
    int         extra_space;
    XLogPageHeader page;
    XLogLongPageHeader longpage;

    if (CurrPos % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
        XLogSegmentOffset(CurrPos, DEFAULT_XLOG_SEG_SIZE) > XLOG_BLCKSZ) {
        /* record starts right behind a short page header: emit that header */
        page = (XLogPageHeader)currpos;
        memset(page, 0, SizeOfXLogShortPHD);
        page->xlp_magic = XLOG_PAGE_MAGIC;
        page->xlp_info = 0;
        page->xlp_tli = ControlFile->checkPointCopy.ThisTimeLineID;
        page->xlp_pageaddr = CurrPos - (CurrPos % XLOG_BLCKSZ);
        currpos += SizeOfXLogShortPHD;
    }
    else if (CurrPos % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
             XLogSegmentOffset(CurrPos, DEFAULT_XLOG_SEG_SIZE) < XLOG_BLCKSZ) {
        /* record starts right behind the long header of a segment's first page */
        page = (XLogPageHeader)currpos;
        memset(page, 0, SizeOfXLogLongPHD);
        page->xlp_magic = XLOG_PAGE_MAGIC;
        page->xlp_info = XLP_LONG_HEADER;
        page->xlp_tli = ControlFile->checkPointCopy.ThisTimeLineID;
        page->xlp_pageaddr = CurrPos - (CurrPos % XLOG_BLCKSZ);
        longpage = (XLogLongPageHeader) page;
        longpage->xlp_sysid = ControlFile->system_identifier;
        longpage->xlp_seg_size = WalSegSz;
        longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
        currpos += SizeOfXLogLongPHD;
    }

    freespace = INSERT_FREESPACE_MIRROR(CurrPos);
    Assert(freespace >= sizeof(uint32));

    /* Copy record data */
    if (rechdr != NULL) {
        char   *rdata_data = (char *) rechdr;
        int     rdata_len = rechdr->xl_tot_len;

        while (rdata_len > freespace)
        {
            bool    atSegStart;

            /* fill the rest of the current page */
            Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
            memcpy(currpos, rdata_data, freespace);
            rdata_data += freespace;
            rdata_len -= freespace;
            written += freespace;
            CurrPos += freespace;
            currpos += freespace;

            /* start the next page with a continuation header */
            atSegStart = (XLogSegmentOffset(CurrPos, DEFAULT_XLOG_SEG_SIZE) == 0);
            page = (XLogPageHeader) currpos;
            memset(page, 0, atSegStart ? SizeOfXLogLongPHD : SizeOfXLogShortPHD);
            page->xlp_tli = ControlFile->checkPointCopy.ThisTimeLineID;
            page->xlp_magic = XLOG_PAGE_MAGIC;
            page->xlp_pageaddr = CurrPos - (CurrPos % XLOG_BLCKSZ);
            page->xlp_rem_len = write_len - written;
            page->xlp_info = XLP_FIRST_IS_CONTRECORD;
            if (atSegStart) {
                CurrPos += SizeOfXLogLongPHD;
                currpos += SizeOfXLogLongPHD;
                page->xlp_info |= XLP_LONG_HEADER;
                longpage = (XLogLongPageHeader) page;
                longpage->xlp_sysid = ControlFile->system_identifier;
                longpage->xlp_seg_size = WalSegSz;
                longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
            } else {
                CurrPos += SizeOfXLogShortPHD;
                currpos += SizeOfXLogShortPHD;
            }
            freespace = INSERT_FREESPACE_MIRROR(CurrPos);
        }

        /* the remainder fits on the current page */
        Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
        memcpy(currpos, rdata_data, rdata_len);
        currpos += rdata_len;
        CurrPos += rdata_len;
        written += rdata_len;
    }
    Assert(written == write_len);

    /* Pad to MAXALIGN; the padding is part of the output, so define it. */
    extra_space = MAXALIGN64(CurrPos) - CurrPos;
    CurrPos = MAXALIGN64(CurrPos);
    if (CurrPos != EndPos)
        printf("ERROR space reserved for WAL record does not match what was written");
    memset(currpos, 0, extra_space);
    currpos += extra_space;

    *dLen = (int)(currpos - dBuf);
}
void readControlFile(char*pathstr) {
bool crc_ok;
ControlFile = get_controlfile(pathstr,&crc_ok);
if (!crc_ok)
printf(_("WARNING: Calculated CRC checksum does not match value stored in file.\n"
"Either the file is corrupt, or it has a different layout than this program\n"
"is expecting. The results below are untrustworthy.\n\n"));
/* set wal segment size */
WalSegSz = ControlFile->xlog_seg_size;
}
/*
 * Install an already loaded control file as the file-wide ControlFile.
 * The pointer is stored as-is (no copy is made).
 * NOTE(review): unlike readControlFile(), this does not update WalSegSz,
 * which therefore keeps its previous value -- confirm this is intended.
 */
void setControlFile(ControlFileData *cfile) {
ControlFile = cfile;
}
/*
 * ArrayXlogHe3ToPg
 *
 * Convert a buffer of consecutive He3 WAL records into PostgreSQL-format
 * WAL bytes.
 *
 * The input is consumed in groups: zero or more records whose 'mtr' flag is
 * false followed by one record whose flag is set.  Each complete group is
 * merged into one old-format record (XlogHe3ToPg), given an insert location,
 * CRC'ed and serialised into dBuf (CopyXLogRecordToPgWAL).
 *
 * sBuf/sLen - source records and their total length in bytes
 * dBuf      - output buffer; the caller must provide enough room
 * dLen      - out: number of bytes produced in dBuf
 * startLsn  - in/out: set to the first record's start LSN if it is 0 on entry
 * endLsn    - out: end LSN of the last record produced
 *
 * Returns the number of source bytes that belonged to fully converted
 * groups; a trailing incomplete or malformed group is left unconsumed.
 *
 * Changes relative to the previous version:
 *  - a record header is only dereferenced when it lies completely inside
 *    sBuf, and a record whose xl_tot_len is smaller than the header or
 *    larger than the remaining input ends the conversion instead of being
 *    read past the buffer (or looping forever on a zero length);
 *  - a group can no longer overflow newRecord[];
 *  - the scratch buffer is reused for every group instead of appending to
 *    it, because each record is copied to dBuf before the next one is built;
 *  - pointer conversions are written with explicit casts.
 *
 * NOTE(review): a single merged record larger than the 32kB scratch buffer
 * would still overflow it; XlogHe3ToPg has no way to report that.
 */
int ArrayXlogHe3ToPg(char*sBuf,int sLen, char*dBuf,int* dLen,uint64 *startLsn,uint64 *endLsn) {
    XLogRecord *one = (XLogRecord*)sBuf;
    /* 32kB scratch space for one converted record */
    static char tBuf[32768];
    int MtrLen = 0;
    int iLen = 0;
    int oLen = 0;

    *dLen = 0;
    while (iLen < sLen) {
        XLogRecord *newRecord[XLR_MAX_BLOCK_ID + 1];
        int n = 0;
        bool complete = false;

        /* Collect one group, validating every record before touching it. */
        while (!complete) {
            int avail = sLen - iLen;

            if (n > XLR_MAX_BLOCK_ID ||
                avail < (int) sizeof(XLogRecord) ||
                one->xl_tot_len < sizeof(XLogRecord) ||
                one->xl_tot_len > (uint32) avail) {
                break;
            }
            /* the record flagged 'mtr' closes the group */
            complete = (one->mtr != false);
            newRecord[n++] = one;
            iLen += one->xl_tot_len;
            one = (XLogRecord*)(((char*)one) + one->xl_tot_len);
        }
        if (!complete) {
            break;
        }

        OldXLogRecord* rechdr = (OldXLogRecord*)tBuf;
        XlogHe3ToPg(newRecord,n,rechdr);

        uint64 StartPos,EndPos;
        XLogRecPtr reduceV = 0;
        if (g_walHe3ToPg.PrevBytePos == 0) {
            /* first record ever: seed the insert state from its start LSN */
            uint64 xl_prev = newRecord[0]->xl_end - newRecord[0]->xl_tot_len;
            g_walHe3ToPg.PrevBytePos = g_walHe3ToPg.CurrBytePos = xl_prev;
            (void) ReserveXLogWalSwitch(&StartPos,&EndPos,&xl_prev);
            g_walHe3ToPg.PrevBytePos = g_walHe3ToPg.CurrBytePos;
            reduceV = 1;
        }
        ReserveXLogWalInsertLocation(rechdr->xl_tot_len,&StartPos,&EndPos,&rechdr->xl_prev);

        /* for pg check: make a checkpoint record point at itself */
        if (rechdr->xl_rmid == RM_XLOG_ID &&
            (rechdr->xl_info == XLOG_CHECKPOINT_SHUTDOWN || rechdr->xl_info == XLOG_CHECKPOINT_ONLINE)) {
            CheckPoint*cp = (CheckPoint*)(((char*)rechdr)+SizeOfOldXLogRecord + SizeOfXLogRecordDataHeaderShort);
            cp->redo = StartPos;
            rechdr->xl_prev = rechdr->xl_prev-reduceV;
        }

        /* CRC covers the payload first, then the header up to xl_crc. */
        pg_crc32c rdata_crc;
        INIT_CRC32C(rdata_crc);
        COMP_CRC32C(rdata_crc, ((char*)rechdr) + SizeOfOldXLogRecord, rechdr->xl_tot_len - SizeOfOldXLogRecord);
        COMP_CRC32C(rdata_crc, rechdr, offsetof(OldXLogRecord, xl_crc));
        FIN_CRC32C(rdata_crc);
        rechdr->xl_crc = rdata_crc;

        CopyXLogRecordToPgWAL(rechdr->xl_tot_len,rechdr,StartPos,EndPos,dBuf+*dLen,&oLen);
        if (*startLsn == 0) {
            *startLsn = StartPos;
        }
        *endLsn = EndPos;
        *dLen += oLen;
        MtrLen = iLen;
    }
    return MtrLen;
}

View File

@ -18,9 +18,9 @@ static void getWalFunc(gpointer data, gpointer user_data) {
//elem->status = STARTSTATUS;
int r;
clock_t start = clock();
r = batchRead((uint8_t *) elem->data, ThisTimeLineID, elem->startLsn, walStoreToLocal);
r = batchRead((uint8_t *) elem->data, ThisTimeLineID, elem->startLsn, elem->endLsn, walStoreToLocal);
clock_t end = clock();
printf("====LSN %X/%X==pid %d==len %d===time %u\n",LSN_FORMAT_ARGS(elem->startLsn),pthread_self(),r,end-start);
//printf("====LSN %X/%X==pid %d==len %d===time %u\n",LSN_FORMAT_ARGS(elem->startLsn),pthread_self(),r,end-start);
elem->dataLen = r;
if (r > sizeof(XLogRecord)) {
XLogRecord* record = ((XLogRecord*)elem->data);
@ -42,7 +42,11 @@ int initPthreadPool(void) {
return -1;
}
//default 8 thread read
if(he3mirror){
gpool = g_thread_pool_new(getWalFunc,NULL,1,FALSE,NULL);
}else{
gpool = g_thread_pool_new(getWalFunc,NULL,8,FALSE,NULL);
}
elog(LOG,"thread pool max threads is %d,num thread is %d",
g_thread_pool_get_max_threads(gpool),g_thread_pool_get_num_threads(gpool));
return 0;

View File

@ -29,29 +29,29 @@ static PGconn *connToPushStandby = NULL;
pid_t startupPid = 0;
static bool ConnectPushStandbyDB() {
char *err;
const char *keys[] = {"dbname","user","password","host","port",NULL};
const char *vals[] = {"postgres","repl","123456","127.0.0.1","15431",NULL};
connToPushStandby = PQconnectdbParams(keys, vals, false);
if (PQstatus(connToPushStandby) == CONNECTION_BAD)
{
err = pchomp(PQerrorMessage(connToPushStandby));
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("push standby could not connect to the push standby server: %s", err)));
return false;
}
return true;
// static bool ConnectPushStandbyDB() {
// char *err;
// const char *keys[] = {"dbname","user","password","host","port",NULL};
// const char *vals[] = {"postgres","repl","123456","100.73.36.123","15431",NULL};
// connToPushStandby = PQconnectdbParams(keys, vals, false);
// if (PQstatus(connToPushStandby) == CONNECTION_BAD)
// {
// err = pchomp(PQerrorMessage(connToPushStandby));
// ereport(ERROR,
// (errcode(ERRCODE_CONNECTION_FAILURE),
// errmsg("push standby could not connect to the push standby server: %s", err)));
// return false;
// }
// return true;
}
// }
static bool ConnectPrimaryDB() {
char *err;
char conninfo[maxconnlen];
const char *keys[] = {"dbname","user","password","host","port",NULL};
const char *vals[] = {"postgres","repl","123456","127.0.0.1","15432",NULL};
// const char *keys[] = {"dbname","user","password","host","port",NULL};
// const char *vals[] = {"postgres","repl","123456","100.73.36.123","15432",NULL};
strlcpy(conninfo, (char *) PrimaryConnInfo, maxconnlen);
/* Establish the connection to the primary for query Min Lsn*/
/*
@ -59,7 +59,8 @@ static bool ConnectPrimaryDB() {
* URI), and pass some extra options.
*/
/* Note we do not want libpq to re-expand the dbname parameter */
pushconn = PQconnectdbParams(keys, vals, true);
pushconn = PQconnectdb(conninfo);
// pushconn = PQconnectdbParams(keys, vals, true);
if (PQstatus(pushconn) == CONNECTION_BAD)
{
err = pchomp(PQerrorMessage(pushconn));
@ -71,57 +72,58 @@ static bool ConnectPrimaryDB() {
return true;
}
static bool ConnectPrimaryDB4ReplyLSN() {
char *err;
char conninfo[maxconnlen];
const char *keys[] = {"dbname","user","password","host","port",NULL};
const char *vals[] = {"postgres","postgres","","127.0.0.1","15432",NULL};
strlcpy(conninfo, (char *) PrimaryConnInfo, maxconnlen);
/* Establish the connection to the primary for query Min Lsn*/
/*
* We use the expand_dbname parameter to process the connection string (or
* URI), and pass some extra options.
*/
/* Note we do not want libpq to re-expand the dbname parameter */
pushconn = PQconnectdbParams(keys, vals, true);
if (PQstatus(pushconn) == CONNECTION_BAD)
{
err = pchomp(PQerrorMessage(pushconn));
ereport(WARNING,
(errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("push standby could not connect to the primary server: %s", err)));
return false;
}
return true;
}
// static bool ConnectPrimaryDB4ReplyLSN() {
// char *err;
// char conninfo[maxconnlen];
// const char *keys[] = {"dbname","user","password","host","port",NULL};
// const char *vals[] = {"postgres","postgres","","100.73.36.123","15432",NULL};
// strlcpy(conninfo, (char *) PrimaryConnInfo, maxconnlen);
// /* Establish the connection to the primary for query Min Lsn*/
// /*
// * We use the expand_dbname parameter to process the connection string (or
// * URI), and pass some extra options.
// */
// /* Note we do not want libpq to re-expand the dbname parameter */
// pushconn = PQconnectdbParams(keys, vals, true);
// if (PQstatus(pushconn) == CONNECTION_BAD)
// {
// err = pchomp(PQerrorMessage(pushconn));
// ereport(WARNING,
// (errcode(ERRCODE_CONNECTION_FAILURE),
// errmsg("push standby could not connect to the primary server: %s", err)));
// return false;
// }
// return true;
// }
/*
 * QueryPushLsn
 *		Run "select pg_last_wal_replay_lsn()" over the connToPushStandby
 *		connection and return the reported LSN.
 *
 * The connection is opened on demand through ConnectPushStandbyDB() and is
 * always closed again (and the handle reset to NULL) before returning, so
 * every successful call performs a full connect/query/disconnect cycle.
 *
 * Returns InvalidXLogRecPtr when the connection cannot be established or
 * when the query does not yield exactly one row.
 *
 * NOTE(review): the buffer allocated by initStringInfo() is never released
 * explicitly in this function.
 */
XLogRecPtr QueryPushLsn()
{
StringInfoData cmd;
XLogRecPtr replylsn = InvalidXLogRecPtr;
char *replyptr;
initStringInfo(&cmd);
appendStringInfoString(&cmd,"select pg_last_wal_replay_lsn()");
replylsn = InvalidXLogRecPtr;
/* Connect lazily; give up if the push standby is unreachable. */
if (connToPushStandby == NULL) {
if (ConnectPushStandbyDB() == false) {
return InvalidXLogRecPtr;
}
}
PGresult *pgres = NULL;
pgres = PQexec(connToPushStandby, cmd.data);
/* Only a single-row result is accepted; anything else leaves replylsn invalid. */
if (PQresultStatus(pgres) == PGRES_TUPLES_OK && PQntuples(pgres) == 1) {
replyptr = PQgetvalue(pgres, 0, 0);
/* NOTE(review): flag is filled in by pg_lsn_in_internal() but never checked here. */
bool flag;
replylsn = pg_lsn_in_internal(replyptr,&flag);
}
/* The connection is dropped after every query. */
PQfinish(connToPushStandby);
connToPushStandby = NULL;
PQclear(pgres);
return replylsn;
// XLogRecPtr QueryPushLsn()
// {
// StringInfoData cmd;
// XLogRecPtr replylsn = InvalidXLogRecPtr;
// char *replyptr;
// initStringInfo(&cmd);
// appendStringInfoString(&cmd,"select pg_last_wal_replay_lsn()");
// replylsn = InvalidXLogRecPtr;
// if (connToPushStandby == NULL) {
// if (ConnectPushStandbyDB() == false) {
// return InvalidXLogRecPtr;
// }
// }
// PGresult *pgres = NULL;
// pgres = PQexec(connToPushStandby, cmd.data);
// if (PQresultStatus(pgres) == PGRES_TUPLES_OK && PQntuples(pgres) == 1) {
// replyptr = PQgetvalue(pgres, 0, 0);
// bool flag;
// replylsn = pg_lsn_in_internal(replyptr,&flag);
}
// }
// PQfinish(connToPushStandby);
// connToPushStandby = NULL;
// PQclear(pgres);
// return replylsn;
// }
XLogRecPtr QueryPushChkpointLsn(void)
{
@ -202,50 +204,50 @@ XLogRecPtr QueryMinLsn(XLogRecPtr lsn)
return replylsn;
}
/*
 * QueryReplyLsn
 *		Ask the primary (via pushconn) for the smallest replay_lsn among the
 *		pg_stat_replication entries whose application_name is not
 *		"pushstandby", and combine it with the caller-supplied lsn.
 *
 * lsn: candidate LSN supplied by the caller; may be InvalidXLogRecPtr.
 *
 * Returns:
 *	- InvalidXLogRecPtr if the connection cannot be opened or the query
 *	  fails (in the latter case pushconn is closed and reset to NULL);
 *	- lsn when the query produced no usable replay_lsn, or when lsn is valid
 *	  and lower than the queried value;
 *	- otherwise the queried replay_lsn.
 *
 * NOTE(review): appname, state and syncstate are assigned but never read,
 * and the buffer allocated by initStringInfo() is never released explicitly
 * in this function.
 */
XLogRecPtr QueryReplyLsn(XLogRecPtr lsn)
{
StringInfoData cmd;
XLogRecPtr replylsn;
PGresult *pgres = NULL;
char *appname;
char *state;
char *syncstate;
char *replyptr;
replylsn = InvalidXLogRecPtr;
/* Connect lazily on first use or after a previous failure reset the handle. */
if (pushconn == NULL) {
if (ConnectPrimaryDB4ReplyLSN() == false) {
return InvalidXLogRecPtr;
}
}
// XLogRecPtr QueryReplyLsn(XLogRecPtr lsn)
// {
// StringInfoData cmd;
// XLogRecPtr replylsn;
// PGresult *pgres = NULL;
// char *appname;
// char *state;
// char *syncstate;
// char *replyptr;
// replylsn = InvalidXLogRecPtr;
// if (pushconn == NULL) {
// if (ConnectPrimaryDB4ReplyLSN() == false) {
// return InvalidXLogRecPtr;
// }
// }
/* "order by replay_lsn limit 1" selects the row with the lowest replay_lsn. */
initStringInfo(&cmd);
appendStringInfoString(&cmd, "SELECT t.application_name, t.replay_lsn, t.state, t.sync_state FROM pg_catalog.pg_stat_replication t WHERE t.application_name <> \'");
appendStringInfoString(&cmd, "pushstandby");
appendStringInfoString(&cmd, "\' order by t.replay_lsn limit 1");
// initStringInfo(&cmd);
// appendStringInfoString(&cmd, "SELECT t.application_name, t.replay_lsn, t.state, t.sync_state FROM pg_catalog.pg_stat_replication t WHERE t.application_name <> \'");
// appendStringInfoString(&cmd, "pushstandby");
// appendStringInfoString(&cmd, "\' order by t.replay_lsn limit 1");
pgres = PQexec(pushconn, cmd.data);
if (PQresultStatus(pgres) == PGRES_TUPLES_OK && PQntuples(pgres) == 1) {
appname = PQgetvalue(pgres, 0, 0);
replyptr = PQgetvalue(pgres, 0, 1);
/* NOTE(review): flag is filled in by pg_lsn_in_internal() but never checked here. */
bool flag;
replylsn = pg_lsn_in_internal(replyptr,&flag);
//replylsn = atol(replyptr);
state = PQgetvalue(pgres, 0, 2);
syncstate = PQgetvalue(pgres, 0, 3);
}
else if (PQresultStatus(pgres) == PGRES_BAD_RESPONSE ||
PQresultStatus(pgres) == PGRES_NONFATAL_ERROR ||
PQresultStatus(pgres) == PGRES_FATAL_ERROR)
{
/* Query failed: drop the connection so the next call reconnects. */
PQfinish(pushconn);
pushconn = NULL;
PQclear(pgres);
return InvalidXLogRecPtr;
}
//elog(LOG,"appnamelsn: %x: replylsn %x",lsn,replylsn);
/*
 * && binds tighter than ||, so this reads as
 * (lsn is valid AND lsn < replylsn) OR (replylsn is invalid):
 * the caller's lsn wins when it is the smaller valid value, or when the
 * query gave nothing usable.
 */
if (lsn !=InvalidXLogRecPtr && lsn < replylsn||replylsn == InvalidXLogRecPtr) {
replylsn = lsn;
}
PQclear(pgres);
return replylsn;
}
// pgres = PQexec(pushconn, cmd.data);
// if (PQresultStatus(pgres) == PGRES_TUPLES_OK && PQntuples(pgres) == 1) {
// appname = PQgetvalue(pgres, 0, 0);
// replyptr = PQgetvalue(pgres, 0, 1);
// bool flag;
// replylsn = pg_lsn_in_internal(replyptr,&flag);
// //replylsn = atol(replyptr);
// state = PQgetvalue(pgres, 0, 2);
// syncstate = PQgetvalue(pgres, 0, 3);
// }
// else if (PQresultStatus(pgres) == PGRES_BAD_RESPONSE ||
// PQresultStatus(pgres) == PGRES_NONFATAL_ERROR ||
// PQresultStatus(pgres) == PGRES_FATAL_ERROR)
// {
// PQfinish(pushconn);
// pushconn = NULL;
// PQclear(pgres);
// return InvalidXLogRecPtr;
// }
// //elog(LOG,"appnamelsn: %x: replylsn %x",lsn,replylsn);
// if (lsn !=InvalidXLogRecPtr && lsn < replylsn||replylsn == InvalidXLogRecPtr) {
// replylsn = lsn;
// }
// PQclear(pgres);
// return replylsn;
// }

View File

@ -24,6 +24,7 @@ wal_batch_t *ring_buffer_queue(ring_buffer_t *buffer, wal_batch_t data) {
return NULL;
}
buffer->buffer[buffer->head_index].startLsn = data.startLsn;
buffer->buffer[buffer->head_index].endLsn = data.endLsn;
pg_atomic_exchange_u32(&buffer->buffer[buffer->head_index].status,(uint32_t)UNKOWNSTATUS);
curWal = &buffer->buffer[buffer->head_index];
buffer->head_index = ((buffer->head_index + 1) & RING_BUFFER_MASK(buffer));
@ -53,7 +54,7 @@ uint8_t ring_buffer_dequeue_arr(ring_buffer_t *buffer, uint32 size) {
SpinLockRelease(&buffer->mutex);
return 0;
}
ring_buffer_size_t pos = 0;
ring_buffer_size_t pos = buffer->tail_index;
for(uint32 i = 0;i<size;i++) {
pg_atomic_exchange_u32(&buffer->buffer[pos].status,(uint32_t)UNKOWNSTATUS);
pos = ((pos+1) & RING_BUFFER_MASK(buffer));
@ -122,10 +123,19 @@ void InitRingBufferSpace(void) {
}
int walRecordQuery(char**buffer,int* curpos,int* maxspace,uint64 lsn) {
if (gRingBufferManger->maxIdx == 0) {
ring_buffer_size_t maxIdx = gRingBufferManger->maxIdx;
if (maxIdx == 0) {
return -1;
}
int low = 0,high = gRingBufferManger->maxIdx, mid = 0;
ring_buffer_size_t tailIdx = gRingBufferManger->tail_index;
int low = tailIdx,high = ((tailIdx+maxIdx) & RING_BUFFER_MASK(gRingBufferManger)), mid = 0;
if (low > high) {
if (gRingBufferManger->buffer[gRingBufferManger->buffer_mask].startLsn + gRingBufferManger->buffer[gRingBufferManger->buffer_mask].dataLen > lsn) {
high = gRingBufferManger->buffer_mask+1;
} else {
low = 0;
}
}
if (gRingBufferManger->buffer[high-1].startLsn == 0) {
high -= 2;
} else {
@ -169,7 +179,7 @@ int walRecordQuery(char**buffer,int* curpos,int* maxspace,uint64 lsn) {
free(*buffer);
*buffer = ptr;
}
memcpy(*buffer,record,xllen);
memcpy(*buffer+*curpos,record,xllen);
*curpos += xllen;
}
return xllen;

File diff suppressed because it is too large Load Diff

View File

@ -33,6 +33,13 @@
#include "storage/proc.h"
#include "storage/spin.h"
#include "utils/memutils.h"
#include "access/heapam_xlog.h"
#include "access/nbtxlog.h"
#include "access/nbtree.h"
#include "access/gistxlog.h"
#include "access/gist_private.h"
#include "access/spgxlog.h"
#include "access/brin_xlog.h"
/* Buffer size required to store a compressed version of backup block image */
#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ)
@ -69,10 +76,12 @@ static int max_registered_block_id = 0; /* highest block_id + 1 currently
int group_total_len;
int grouo_rec_count;
int grouo_rec_cur_count;
XLogRecord *grouphead[XLR_MAX_BLOCK_ID + 1];
int grouplens[XLR_MAX_BLOCK_ID + 1];
XLogRecData groupRecData[XLR_MAX_BLOCK_ID + 1];
XLogRecPtr groupEndLsn[XLR_MAX_BLOCK_ID + 1];
/*
* A chain of XLogRecDatas to hold the "main data" of a WAL record, registered
@ -552,6 +561,459 @@ XLogInsert(RmgrId rmid, uint8 info)
return EndPos;
}
/*
 * Static rdata nodes used by DecodeXLogRecordAssemble() when chaining the
 * payload of a re-assembled record.  For each block id, slot [0] points at
 * the block image and slot [1] at the block data.
 */
static XLogRecData g_bkp_rdatas[XLR_MAX_BLOCK_ID + 1][2];
/* Single rdata node that points at the (converted) main data of the record. */
static XLogRecData g_main_data;
/*
 * extendMainData
 *		Make sure state->main_data can hold the current main data plus a
 *		small amount of slack for a converted (larger) record header.
 *
 * When the buffer has to grow, the first main_data_len bytes of the old
 * buffer are carried over to the new one, so this really is an "extend":
 * callers do not lose the payload they are about to rewrite.  The new
 * buffer is allocated before the old one is released and before
 * main_data_bufsz is updated, so a failed allocation leaves the reader
 * state consistent.
 *
 * Note that the buffer may move: pointers into the old state->main_data
 * must not be used after this call.
 */
static void extendMainData(XLogReaderState *state) {
	/* Headroom for the fields added by the new record layouts. */
	int			extendSize = 64;

	if (state->main_data_len + extendSize > state->main_data_bufsz)
	{
		/*
		 * main_data_bufsz must be MAXALIGN'ed.  In many xlog record
		 * types, we omit trailing struct padding on-disk to save a few
		 * bytes; but compilers may generate accesses to the xlog struct
		 * that assume that padding bytes are present.  If the palloc
		 * request is not large enough to include such padding bytes then
		 * we'll get valgrind complaints due to otherwise-harmless fetches
		 * of the padding bytes.
		 *
		 * In addition, force the initial request to be reasonably large
		 * so that we don't waste time with lots of trips through this
		 * stanza.  BLCKSZ / 2 seems like a good compromise choice.
		 */
		size_t		newSize = MAXALIGN(Max(state->main_data_len + extendSize,
										   BLCKSZ / 2));
		char	   *newBuf = palloc(newSize);

		if (state->main_data)
		{
			/* Preserve the payload that is already in the buffer. */
			if (state->main_data_len > 0)
				memcpy(newBuf, state->main_data, state->main_data_len);
			pfree(state->main_data);
		}
		state->main_data = newBuf;
		state->main_data_bufsz = newSize;
	}
}
/*
 * convertMainData
 *		Rewrite the main data of a decoded record from the "old" struct
 *		layout (xl_old_* / *old* types) into the current layout, in place in
 *		state->main_data.
 *
 * The new layouts carry extra location fields (block numbers, relfilenode,
 * root flag) which are filled in from the decoded block references in
 * state->blocks[].  Record types not listed below are left untouched.
 *
 * state:  decoded record; main_data / main_data_len are updated.
 * record: original record header; only xl_rmid and xl_info are read.
 *
 * In every branch the old struct is read completely into a local copy
 * before extendMainData() is called, because that call may replace the
 * state->main_data buffer that xlrec points into.
 */
static void convertMainData(XLogReaderState *state, OldXLogRecord *record) {
RmgrId rmid = record->xl_rmid;
/* Strip the XLR_INFO_MASK bits, leaving the rmgr-specific opcode bits. */
uint8 info = (record->xl_info & ~XLR_INFO_MASK);
switch(rmid) {
case RM_HEAP2_ID:
{
/* Visibility-map update: add rnode/blkno taken from block reference 1. */
if ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP2_VISIBLE) {
xl_old_heap_visible *xlrec = (xl_old_heap_visible *) XLogRecGetData(state);
xl_heap_visible xlrecNew;
xlrecNew.rnode = state->blocks[1].rnode;
xlrecNew.blkno = state->blocks[1].blkno;
xlrecNew.cutoff_xid = xlrec->cutoff_xid;
xlrecNew.flags = xlrec->flags;
extendMainData(state);
state->main_data_len = sizeof(xl_heap_visible);
memcpy(state->main_data,&xlrecNew,state->main_data_len);
}
break;
}
case RM_HEAP_ID:
{
/* (HOT) update: add new/old block numbers and the relfilenode. */
if (((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_UPDATE) ||
((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_HOT_UPDATE)) {
xl_old_heap_update *xlrec = (xl_old_heap_update *) XLogRecGetData(state);
xl_heap_update xlrecNew;
xlrecNew.old_xmax = xlrec->old_xmax;
xlrecNew.old_offnum = xlrec->old_offnum;
xlrecNew.old_infobits_set = xlrec->old_infobits_set;
xlrecNew.flags = xlrec->flags;
xlrecNew.new_xmax = xlrec->new_xmax;
xlrecNew.new_offnum = xlrec->new_offnum;
xlrecNew.newblk = state->blocks[0].blkno;
/* With a single block reference, old and new tuple share block 0. */
if(state->max_block_id == 0){
xlrecNew.oldblk = state->blocks[0].blkno;
} else{
xlrecNew.oldblk = state->blocks[1].blkno;
}
xlrecNew.rnode = state->blocks[0].rnode;
extendMainData(state);
state->main_data_len = sizeof(xl_heap_update);
memcpy(state->main_data,&xlrecNew,state->main_data_len);
}
break;
}
case RM_BTREE_ID:
{
/* Page split: add the page numbers of block references 0, 1 and (if present) 2. */
if (info == XLOG_BTREE_SPLIT_L || info == XLOG_BTREE_SPLIT_R) {
xl_old_btree_split *xlrec = (xl_old_btree_split *) XLogRecGetData(state);
xl_btree_split xlrecNew;
xlrecNew.level = xlrec->level;
xlrecNew.firstrightoff = xlrec->firstrightoff;
xlrecNew.newitemoff = xlrec->newitemoff;
xlrecNew.postingoff = xlrec->postingoff;
xlrecNew.origpagenumber = state->blocks[0].blkno;
xlrecNew.rightpagenumber = state->blocks[1].blkno;
/* Block reference 2 is optional; P_NONE marks its absence. */
if (!XLogRecGetBlockTag(state, 2, NULL, NULL, &xlrecNew.spagenumber))
xlrecNew.spagenumber = P_NONE;
extendMainData(state);
state->main_data_len = sizeof(xl_btree_split);
memcpy(state->main_data,&xlrecNew,state->main_data_len);
}
break;
}
case RM_GIST_ID:
{
/* Page split: add the isroot flag derived from block reference 1. */
if (info == XLOG_GIST_PAGE_SPLIT) {
gistoldxlogPageSplit *xlrec = (gistoldxlogPageSplit *) XLogRecGetData(state);
gistxlogPageSplit xlrecNew;
xlrecNew.markfollowright = xlrec->markfollowright;
xlrecNew.npage = xlrec->npage;
xlrecNew.origleaf = xlrec->origleaf;
xlrecNew.orignsn = xlrec->orignsn;
xlrecNew.origrlink = xlrec->origrlink;
xlrecNew.isroot = false;
if (xlrec->npage > 0) {
if (state->blocks[1].blkno == GIST_ROOT_BLKNO) {
xlrecNew.isroot = true;
}
}
extendMainData(state);
state->main_data_len = sizeof(gistxlogPageSplit);
memcpy(state->main_data,&xlrecNew,state->main_data_len);
}
break;
}
case RM_SPGIST_ID:
{
if (info == XLOG_SPGIST_ADD_LEAF) {
/* Add-leaf: add the leaf block number from block reference 0. */
spgoldxlogAddLeaf *xlrec = (spgoldxlogAddLeaf *) XLogRecGetData(state);
spgxlogAddLeaf xlrecNew;
xlrecNew.newPage = xlrec->newPage; /* init dest page? */
xlrecNew.storesNulls = xlrec->storesNulls; /* page is in the nulls tree? */
xlrecNew.offnumLeaf = xlrec->offnumLeaf; /* offset where leaf tuple gets placed */
xlrecNew.offnumHeadLeaf = xlrec->offnumHeadLeaf; /* offset of head tuple in chain, if any */
xlrecNew.offnumParent = xlrec->offnumParent; /* where the parent downlink is, if any */
xlrecNew.nodeI = xlrec->nodeI;
xlrecNew.blknoLeaf = state->blocks[0].blkno;
extendMainData(state);
state->main_data_len = sizeof(spgxlogAddLeaf);
memcpy(state->main_data,&xlrecNew,state->main_data_len);
} else if (info == XLOG_SPGIST_MOVE_LEAFS) {
/* Move-leafs: variable-length record; the header grows, the tail is kept. */
spgoldxlogMoveLeafs *xlrec = (spgoldxlogMoveLeafs *) XLogRecGetData(state);
spgxlogMoveLeafs xlrecNew;
xlrecNew.nMoves = xlrec->nMoves; /* number of tuples moved from source page */
xlrecNew.newPage = xlrec->newPage; /* init dest page? */
xlrecNew.replaceDead = xlrec->replaceDead; /* are we replacing a DEAD source tuple? */
xlrecNew.storesNulls = xlrec->storesNulls; /* pages are in the nulls tree? */
/* where the parent downlink is */
xlrecNew.offnumParent = xlrec->offnumParent;
xlrecNew.nodeI = xlrec->nodeI;
xlrecNew.stateSrc = xlrec->stateSrc;
/* for he3pg */
xlrecNew.blknoDst = state->blocks[1].blkno;
/*----------
* data follows:
* array of deleted tuple numbers, length nMoves
* array of inserted tuple numbers, length nMoves + 1 or 1
* list of leaf tuples, length nMoves + 1 or 1 (unaligned!)
*
* Note: if replaceDead is true then there is only one inserted tuple
* number and only one leaf tuple in the data, because we are not copying
* the dead tuple from the source
*----------
*/
/*
 * Stash everything that follows the old header before extendMainData(),
 * which may replace the buffer; then write new header + saved tail and
 * grow main_data_len by the header size difference.
 */
char* tmp = palloc(state->main_data_len-SizeOfOldSpgxlogMoveLeafs);
memcpy(tmp,state->main_data+SizeOfOldSpgxlogMoveLeafs,state->main_data_len-SizeOfOldSpgxlogMoveLeafs);
extendMainData(state);
memcpy(state->main_data,&xlrecNew,SizeOfSpgxlogMoveLeafs);
memcpy(state->main_data + SizeOfSpgxlogMoveLeafs, tmp, state->main_data_len-SizeOfOldSpgxlogMoveLeafs);
state->main_data_len += SizeOfSpgxlogMoveLeafs-SizeOfOldSpgxlogMoveLeafs;
pfree(tmp);
} else if (info == XLOG_SPGIST_ADD_NODE) {
/* Add-node: add the block number of block reference 0. */
spgoldxlogAddNode *xlrec = (spgoldxlogAddNode *) XLogRecGetData(state);
spgxlogAddNode xlrecNew;
xlrecNew.offnum = xlrec->offnum;
/*
* Offset of the new tuple, on the new page (on backup block 1). Invalid,
* if we overwrote the old tuple in the original page).
*/
xlrecNew.offnumNew = xlrec->offnumNew;
xlrecNew.newPage = xlrec->newPage; /* init new page? */
/*----
* Where is the parent downlink? parentBlk indicates which page it's on,
* and offnumParent is the offset within the page. The possible values for
* parentBlk are:
*
* 0: parent == original page
* 1: parent == new page
* 2: parent == different page (blk ref 2)
* -1: parent not updated
*----
*/
xlrecNew.parentBlk = xlrec->parentBlk;
xlrecNew.offnumParent = xlrec->offnumParent; /* offset within the parent page */
xlrecNew.nodeI = xlrec->nodeI;
xlrecNew.blkno1 = state->blocks[0].blkno;
xlrecNew.stateSrc = xlrec->stateSrc;
extendMainData(state);
state->main_data_len = sizeof(spgxlogAddNode);
memcpy(state->main_data,&xlrecNew,state->main_data_len);
} else if (info == XLOG_SPGIST_PICKSPLIT) {
/* Pick-split: variable-length record; add the inner page's block number. */
spgoldxlogPickSplit *xlrec = (spgoldxlogPickSplit *) XLogRecGetData(state);
spgxlogPickSplit xlrecNew;
xlrecNew.isRootSplit = xlrec->isRootSplit;
xlrecNew.nDelete = xlrec->nDelete; /* n to delete from Src */
xlrecNew.nInsert = xlrec->nInsert; /* n to insert on Src and/or Dest */
xlrecNew.initSrc = xlrec->initSrc; /* re-init the Src page? */
xlrecNew.initDest = xlrec->initDest; /* re-init the Dest page? */
/* for he3pg */
xlrecNew.blknoInner = state->blocks[2].blkno;
/* where to put new inner tuple */
xlrecNew.offnumInner = xlrec->offnumInner;
xlrecNew.initInner = xlrec->initInner; /* re-init the Inner page? */
xlrecNew.storesNulls = xlrec->storesNulls; /* pages are in the nulls tree? */
/* where the parent downlink is, if any */
xlrecNew.innerIsParent = xlrec->innerIsParent; /* is parent the same as inner page? */
xlrecNew.offnumParent = xlrec->offnumParent;
xlrecNew.nodeI = xlrec->nodeI;
xlrecNew.stateSrc = xlrec->stateSrc;
/*----------
* data follows:
* array of deleted tuple numbers, length nDelete
* array of inserted tuple numbers, length nInsert
* array of page selector bytes for inserted tuples, length nInsert
* new inner tuple (unaligned!)
* list of leaf tuples, length nInsert (unaligned!)
*----------
*/
/* Same save-tail / rewrite-header / restore-tail sequence as for MOVE_LEAFS. */
char* tmp = palloc(state->main_data_len-SizeOfOldSpgxlogPickSplit);
memcpy(tmp,state->main_data+SizeOfOldSpgxlogPickSplit,state->main_data_len-SizeOfOldSpgxlogPickSplit);
extendMainData(state);
memcpy(state->main_data,&xlrecNew,SizeOfSpgxlogPickSplit);
memcpy(state->main_data + SizeOfSpgxlogPickSplit, tmp, state->main_data_len-SizeOfOldSpgxlogPickSplit);
state->main_data_len += SizeOfSpgxlogPickSplit-SizeOfOldSpgxlogPickSplit;
pfree(tmp);
}
break;
}
case RM_BRIN_ID:
{
if (info == XLOG_BRIN_INSERT) {
/* Insert: add the block number of block reference 0. */
xl_old_brin_insert *xlrec = (xl_old_brin_insert *) XLogRecGetData(state);
xl_brin_insert xlrecNew;
xlrecNew.heapBlk = xlrec->heapBlk;
/* extra information needed to update the revmap */
xlrecNew.pagesPerRange = xlrec->pagesPerRange;
xlrecNew.block0 = state->blocks[0].blkno;
/* offset number in the main page to insert the tuple to. */
xlrecNew.offnum = xlrec->offnum;
extendMainData(state);
state->main_data_len = sizeof(xl_brin_insert);
memcpy(state->main_data,&xlrecNew,state->main_data_len);
} else if ( info == XLOG_BRIN_UPDATE) {
/* Update: convert the embedded insert record the same way, then wrap it. */
xl_old_brin_update *xlrec = (xl_old_brin_update *) XLogRecGetData(state);
xl_brin_update xlrecUpdate;
xl_old_brin_insert *xlrecInsert = &xlrec->insert;
xl_brin_insert xlrecNew;
xlrecNew.heapBlk = xlrecInsert->heapBlk;
/* extra information needed to update the revmap */
xlrecNew.pagesPerRange = xlrecInsert->pagesPerRange;
xlrecNew.block0 = state->blocks[0].blkno;
/* offset number in the main page to insert the tuple to. */
xlrecNew.offnum = xlrecInsert->offnum;
xlrecUpdate.oldOffnum = xlrec->oldOffnum;
xlrecUpdate.insert = xlrecNew;
extendMainData(state);
state->main_data_len = sizeof(xl_brin_update);
memcpy(state->main_data,&xlrecUpdate,state->main_data_len);
}
break;
}
default:
/* All other resource managers keep their main data as-is. */
break;
}
}
/*
 * DecodeXLogRecordAssemble
 *		Re-assemble an already decoded record into a group of records, one
 *		per block reference (or a single record when the source record has
 *		no block references).
 *
 * For each output record i the function fills:
 *	- groupRecData[i]: head of the rdata chain (header bytes in hdr_scratch,
 *	  followed by optional block image, block data and main data nodes);
 *	- grouphead[i]:    pointer to the XLogRecord header inside hdr_scratch;
 *	- grouplens[i]:    total length of the record;
 * and accumulates group_total_len / grouo_rec_count.  The last record of the
 * group is flagged with mtr = true.
 *
 * state:       decoded source record (blocks[], main data, origin, xid).
 * record:      original record header; xl_rmid, xl_info and xl_xid are read.
 * RedoRecPtr:  not referenced in this function body.
 * doPageWrites: not referenced in this function body.
 * fpw_lsn:     output; always set to InvalidXLogRecPtr.
 * num_fpi:     in/out; incremented once for every block that has an image.
 *
 * Returns a pointer to groupRecData[0].
 *
 * NOTE(review): every block id in 0..max_block_id is emitted without
 * testing whether that block reference is actually in use -- confirm that
 * gaps in the block id sequence cannot occur here.
 */
XLogRecData *DecodeXLogRecordAssemble(XLogReaderState *state, OldXLogRecord *record,
XLogRecPtr RedoRecPtr, bool doPageWrites,
XLogRecPtr *fpw_lsn, int *num_fpi)
{
/*
* Make an rdata chain containing all the data portions of all block
* references. This includes the data for full-page images. Also append
* the headers for the block references in the scratch buffer.
*/
RmgrId rmid = record->xl_rmid;
uint8 info = record->xl_info;
*fpw_lsn = InvalidXLogRecPtr;
int block_id;
XLogRecord *rechdr = NULL;
/* Reset the per-group bookkeeping. */
group_total_len = 0;
grouo_rec_count = 0;
grouo_rec_cur_count = 0;
/* A record without block references still produces exactly one output record. */
int maxidx = (state->max_block_id < 0 ? 1:state->max_block_id+1);
/* Main data is converted only once, on the first record that needs it. */
bool isDone = false;
for (block_id = 0; block_id < maxidx; block_id++)
{
XLogRecData* rdt;
uint32 total_len;
total_len = 0;
pg_crc32c rdata_crc;
XLogRecData *rdt_datas_last;
char *scratch;
// char linkkey[36];
groupRecData[grouo_rec_count].next = NULL;
rdt_datas_last = &groupRecData[grouo_rec_count];
/* Each output record owns a SINGLE_SCRATCH_SIZE slice of hdr_scratch. */
scratch = hdr_scratch + grouo_rec_count * SINGLE_SCRATCH_SIZE;
groupRecData[grouo_rec_count].data = scratch;
/*group_total_len+=HEADER_SCRATCH_SIZE;*/
grouphead[grouo_rec_count]=(XLogRecord *)scratch;
/* The record begins with the fixed-size header */
rechdr = (XLogRecord *)scratch;
scratch += SizeOfXLogRecord;
if (state->max_block_id >= 0) {
DecodedBkpBlock *blkbuf = &state->blocks[block_id];
XLogRecData* bkp_rdatas = g_bkp_rdatas[block_id];
XLogRecordBlockHeader bkpb;
XLogRecordBlockImageHeader bimg;
XLogRecordBlockCompressHeader cbimg = {0};
/* Every output record carries a single block, always written with id 0. */
bkpb.id = 0;
bkpb.fork_flags = blkbuf->flags;
bkpb.data_length = blkbuf->data_len;
//total_len += bkpb.data_length;
/* Ok, copy the header to the scratch buffer */
memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader);
scratch += SizeOfXLogRecordBlockHeader;
if (blkbuf->has_image) {
/* Image header goes to scratch; the image bytes are chained as rdata. */
bimg.bimg_info = blkbuf->bimg_info;
bimg.hole_offset = blkbuf->hole_offset;
bimg.length = blkbuf->bimg_len;
memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader);
scratch += SizeOfXLogRecordBlockImageHeader;
rdt_datas_last->next = &bkp_rdatas[0];
rdt_datas_last = rdt_datas_last->next;
bkp_rdatas[0].data = blkbuf->bkp_image;
bkp_rdatas[0].len = blkbuf->bimg_len;
/* The compress header is written only for compressed images with a hole. */
if (bimg.bimg_info & BKPIMAGE_IS_COMPRESSED) {
cbimg.hole_length = blkbuf->hole_length;
if (bimg.bimg_info & BKPIMAGE_HAS_HOLE) {
memcpy(scratch, &cbimg,
SizeOfXLogRecordBlockCompressHeader);
scratch += SizeOfXLogRecordBlockCompressHeader;
}
}
total_len += bimg.length;
*num_fpi += 1;
}
if (blkbuf->has_data) {
/* Block data is chained as rdata after the image (if any). */
rdt_datas_last->next = &bkp_rdatas[1];
rdt_datas_last = rdt_datas_last->next;
bkp_rdatas[1].data = blkbuf->data;
bkp_rdatas[1].len = blkbuf->data_len;
total_len += blkbuf->data_len;
}
/* The relfilenode and block number are always written in full. */
memcpy(scratch, &blkbuf->rnode, sizeof(RelFileNode));
scratch += sizeof(RelFileNode);
memcpy(scratch, &blkbuf->blkno, sizeof(BlockNumber));
scratch += sizeof(BlockNumber);
}
/* Optional replication origin, repeated in every record of the group. */
if (state->record_origin != InvalidRepOriginId) {
*(scratch++) = (char)XLR_BLOCK_ID_ORIGIN;
memcpy(scratch, &state->record_origin, sizeof(RepOriginId));
scratch += sizeof(RepOriginId);
}
/* Optional top-level transaction id, repeated in every record of the group. */
if (state->toplevel_xid != InvalidTransactionId) {
*(scratch++) = (char)XLR_BLOCK_ID_TOPLEVEL_XID;
memcpy(scratch, &state->toplevel_xid, sizeof(TransactionId));
scratch += sizeof(TransactionId);
}
if (state->main_data_len > 0) {
/* All records of the group share the single g_main_data node as chain tail. */
rdt_datas_last->next = &g_main_data;
rdt_datas_last = &g_main_data;
if (isDone == false) {
/* First use: convert old-format main data, then point g_main_data at it. */
convertMainData(state,record);
g_main_data.data = state->main_data;
g_main_data.len = state->main_data_len;
isDone = true;
}
/* Long form (4-byte length) above 255 bytes, short form (1-byte length) otherwise. */
if (state->main_data_len > 255) {
*(scratch++) = (char)XLR_BLOCK_ID_DATA_LONG;
memcpy(scratch, &state->main_data_len, sizeof(uint32));
scratch += sizeof(uint32);
} else {
*(scratch++) = (char)XLR_BLOCK_ID_DATA_SHORT;
*(scratch++) = (uint8)state->main_data_len;
}
total_len += state->main_data_len;
}
rdt_datas_last->next = NULL;
/* Header length = bytes written to this record's scratch slice. */
groupRecData[grouo_rec_count].len = scratch - groupRecData[grouo_rec_count].data;
total_len += groupRecData[grouo_rec_count].len;
grouplens[grouo_rec_count] = total_len;
/*
* In CopyXLogRecordToWAL we need freespace >= sizeof(uint32), so the
* size of an xlog record must be MAXALIGN'ed.
*/
/*total_len=MAXALIGN(total_len);*/
/*
* Calculate CRC of the data
*
* Note that the record header isn't added into the CRC initially since we
* don't know the prev-link yet. Thus, the CRC will represent the CRC of
* the whole record in the order: rdata, then backup blocks, then record
* header.
*/
INIT_CRC32C(rdata_crc);
COMP_CRC32C(rdata_crc, groupRecData[grouo_rec_count].data + SizeOfXLogRecord, groupRecData[grouo_rec_count].len - SizeOfXLogRecord);
rdt = groupRecData[grouo_rec_count].next;
for (; rdt != NULL; rdt = rdt->next)
COMP_CRC32C(rdata_crc, rdt->data, rdt->len);
/*
* Fill in the fields in the record header. Prev-link is filled in later,
* once we know where in the WAL the record will be inserted. The CRC does
* not include the record header yet.
*/
rechdr->xl_xid = record->xl_xid;
rechdr->xl_tot_len = total_len;
rechdr->xl_info = info;
rechdr->xl_rmid = rmid;
rechdr->xl_prev = InvalidXLogRecPtr;
rechdr->xl_crc = rdata_crc;
rechdr->blocknum = block_id;
rechdr->mtr = false;
group_total_len += total_len;
grouo_rec_count++;
}
/* rechdr still points at the last record built: mark it as the end of the group. */
rechdr->mtr = true;
return &groupRecData[0];
}
/*
* Assemble a WAL record from the registered data and buffers into an
* XLogRecData chain, ready for insertion with XLogInsertRecord().

File diff suppressed because it is too large Load Diff

View File

@ -632,13 +632,15 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
else
{
/* hm, page doesn't exist in file */
/*if (mode == RBM_NORMAL)
if(!he3mirror){
if (mode == RBM_NORMAL && EnableHotStandby != false)
{
log_invalid_page(rnode, forknum, blkno, false);
return InvalidBuffer;
}*/
}
if (mode == RBM_NORMAL_NO_LOG)
return InvalidBuffer;
}
/* OK to extend the file */
/* we do this in recovery only - no rel-extension lock needed */
Assert(InRecovery);
@ -666,7 +668,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
}
}
if (mode == RBM_NORMAL)
if (!he3mirror && mode == RBM_NORMAL)
{
/* check that page has been initialized */
Page page = (Page) BufferGetPage(buffer);
@ -989,38 +991,38 @@ XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
// }
/*
 * wal_segment_open
 *		XLogReaderRoutine->segment_open callback for WAL segment files in the
 *		local pg_wal directory.
 *
 * Builds the path of segment nextSegNo on timeline *tli_p, opens it
 * read-only and stores the descriptor in state->seg.ws_file.  Raises ERROR
 * if the file cannot be opened, with a dedicated message when the segment
 * no longer exists.
 */
void
wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo,
				 TimeLineID *tli_p)
{
	char		segpath[MAXPGPATH];
	TimeLineID	timeline = *tli_p;

	XLogFilePath(segpath, timeline, nextSegNo, state->segcxt.ws_segsize);
	state->seg.ws_file = BasicOpenFile(segpath, O_RDONLY | PG_BINARY);

	if (state->seg.ws_file < 0)
	{
		/* errno from the failed open is still intact for the report below. */
		if (errno != ENOENT)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m",
							segpath)));
		else
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("requested WAL segment %s has already been removed",
							segpath)));
	}
}
/*
 * wal_segment_close
 *		Stock XLogReaderRoutine->segment_close callback: close the currently
 *		open WAL segment and mark the reader as having no open file.
 */
void
wal_segment_close(XLogReaderState *state)
{
	int			segfd = state->seg.ws_file;

	/* The result of close() is deliberately ignored, as in the stock callback. */
	close(segfd);
	state->seg.ws_file = -1;
}
/*
* XLogReaderRoutine->page_read callback for reading local xlog files
@ -1158,7 +1160,7 @@ XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum,
int
read_local_xlog_batch(XLogReaderState *state,
int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
XLogRecPtr targetRecPtr, int reqLen, char *cur_page)
{
XLogRecPtr read_upto,
loc;

View File

@ -37,6 +37,7 @@
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
#include "postmaster/walwriter.h"
#include "postmaster/secondbuffer.h"
#include "replication/walreceiver.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

View File

@ -44,8 +44,7 @@ OBJS = \
pg_subscription.o \
pg_type.o \
storage.o \
toasting.o \
pg_hot_data.o
toasting.o
include $(top_srcdir)/src/backend/common.mk
@ -70,7 +69,7 @@ CATALOG_HEADERS := \
pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \
pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \
pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \
pg_subscription_rel.h pg_stat_share_storage.h pg_hot_data.h
pg_subscription_rel.h pg_stat_share_storage.h
GENERATED_HEADERS := $(CATALOG_HEADERS:%.h=%_d.h) schemapg.h system_fk_info.h

View File

@ -40,7 +40,6 @@
#include "catalog/pg_stat_share_storage.h"
#include "catalog/pg_tablespace.h"
#include "catalog/pg_type.h"
#include "catalog/pg_hot_data.h"
#include "miscadmin.h"
#include "storage/fd.h"
#include "utils/fmgroids.h"
@ -248,7 +247,6 @@ IsSharedRelation(Oid relationId)
if (relationId == AuthIdRelationId ||
relationId == AuthMemRelationId ||
relationId == DatabaseRelationId ||
relationId == HotDataRelationId ||
relationId == SharedDescriptionRelationId ||
relationId == SharedDependRelationId ||
relationId == SharedSecLabelRelationId ||
@ -265,7 +263,6 @@ IsSharedRelation(Oid relationId)
relationId == AuthMemMemRoleIndexId ||
relationId == DatabaseNameIndexId ||
relationId == DatabaseOidIndexId ||
relationId == HotDataDatnameRelnameIndexId ||
relationId == SharedDescriptionObjIndexId ||
relationId == SharedDependDependerIndexId ||
relationId == SharedDependReferenceIndexId ||

View File

@ -1,276 +0,0 @@
/*-------------------------------------------------------------------------
*
* pg_hot_data.c
* for hot data precache
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/pg_hot_data.h"
#include "libpq-fe.h"
#include "lib/stringinfo.h"
#include "utils/timestamp.h"
#include "access/xlog.h"
#include "postmaster/postmaster.h"
#include <stdlib.h>
void PrecacheHotData()
{
char instanceName[NAMEDATALEN]; //default:master
char primaryHost[16]; //default:127.0.0.1
char primaryUser[NAMEDATALEN]; //default:postgres
char primaryPw[NAMEDATALEN]; //default:123456
char primaryPort[8]; //default:PostPortNumber
char localPort[8]; //default:master
StringInfoData cmd, primaryConnStr, localConnStr;
initStringInfo(&cmd);
initStringInfo(&primaryConnStr);
initStringInfo(&localConnStr);
memset(instanceName, 0, NAMEDATALEN);
memset(primaryHost, 0, 16);
memset(primaryUser, 0, NAMEDATALEN);
memset(primaryPw, 0, NAMEDATALEN);
memset(primaryPort, 0, 8);
memset(localPort, 0, 8);
//parse
if (strlen(PrimaryConnInfo) > 0)
{
char *temStr;
char *temChr;
int temStrLen;
//instanceName
temStr = strstr(PrimaryConnInfo, "application_name=");
temStrLen = strlen("application_name=");
if (temStr != NULL)
{
temChr = strchr(temStr, ' ');
if (temChr != NULL)
{
memcpy(instanceName, temStr + temStrLen, temChr - temStr - temStrLen);
}
else
{
strcpy(instanceName, temStr + temStrLen);
}
}
else
{
strcpy(instanceName, "master");
}
//primaryHost
temStr = strstr(PrimaryConnInfo, "host=");
temStrLen = strlen("host=");
if (temStr != NULL)
{
temChr = strchr(temStr, ' ');
if (temChr != NULL)
{
memcpy(primaryHost, temStr + temStrLen, temChr - temStr - temStrLen);
}
else
{
strcpy(primaryHost, temStr + temStrLen);
}
}
else
{
strcpy(primaryHost, "127.0.0.1");
}
//primaryUser
temStr = strstr(PrimaryConnInfo, "user=");
temStrLen = strlen("user=");
if (temStr != NULL)
{
temChr = strchr(temStr, ' ');
if (temChr != NULL)
{
memcpy(primaryUser, temStr + temStrLen, temChr - temStr - temStrLen);
}
else
{
strcpy(primaryUser, temStr + temStrLen);
}
}
else
{
strcpy(primaryUser, "postgres");
}
//primaryPw
temStr = strstr(PrimaryConnInfo, "password=");
temStrLen = strlen("password=");
if (temStr != NULL)
{
temChr = strchr(temStr, ' ');
if (temChr != NULL)
{
memcpy(primaryPw, temStr + temStrLen, temChr - temStr - temStrLen);
}
else
{
strcpy(primaryPw, temStr + temStrLen);
}
}
else
{
strcpy(primaryPw, "123456");
}
//primaryPort
temStr = strstr(PrimaryConnInfo, "port=");
temStrLen = strlen("port=");
if (temStr != NULL)
{
temChr = strchr(temStr, ' ');
if (temChr != NULL)
{
memcpy(primaryPort, temStr + temStrLen, temChr - temStr - temStrLen);
}
else
{
strcpy(primaryPort, temStr + temStrLen);
}
}
else
{
sprintf(primaryPort, "%d", PostPortNumber);
}
}
else
{
strcpy(instanceName, "master");
strcpy(primaryHost, "127.0.0.1");
strcpy(primaryUser, "postgres");
strcpy(primaryPw, "123456");
sprintf(primaryPort, "%d", PostPortNumber);
}
//assemble primaryConnStr
appendStringInfoString(&primaryConnStr, "host=");
appendStringInfoString(&primaryConnStr, primaryHost);
appendStringInfoString(&primaryConnStr, " user=");
appendStringInfoString(&primaryConnStr, primaryUser);
appendStringInfoString(&primaryConnStr, " password=");
appendStringInfoString(&primaryConnStr, primaryPw);
appendStringInfoString(&primaryConnStr, " port=");
appendStringInfoString(&primaryConnStr, primaryPort);
appendStringInfoString(&primaryConnStr, " dbname=postgres");
//conn local
sprintf(localPort, "%d", PostPortNumber);
appendStringInfoString(&localConnStr, "host=127.0.0.1 port=");
appendStringInfoString(&localConnStr, localPort);
appendStringInfoString(&localConnStr, " user=postgres dbname=postgres");
PGconn *localConn = PQconnectdb(localConnStr.data);
if (PQstatus(localConn) != CONNECTION_OK)
{
PQfinish(localConn);
//log
return;
}
appendStringInfoString(&cmd, "SELECT datname, relname, crules FROM pg_hot_data WHERE crulessettime>cachetime AND clientname='");
appendStringInfoString(&cmd, instanceName);
appendStringInfoString(&cmd, "'");
//Query the corresponding precache policy
PGresult *ruleRes = PQexec(localConn, cmd.data);
if (PQresultStatus(ruleRes) != PGRES_TUPLES_OK)
{
PQclear(ruleRes);
PQfinish(localConn);
//log
return;
}
int rows = PQntuples(ruleRes);
for(int i=0; i<rows; i++)
{
char *datname;
char *relname;
char *crules;
datname = PQgetvalue(ruleRes, i, 0);
relname = PQgetvalue(ruleRes, i, 1);
crules = PQgetvalue(ruleRes, i, 2);
//precache hot data(table level)
if (strcmp(crules, "t") == 0)
{
//precache
resetStringInfo(&localConnStr);
appendStringInfoString(&localConnStr, "host=127.0.0.1 port=");
appendStringInfoString(&localConnStr, localPort);
appendStringInfoString(&localConnStr, " user=postgres dbname=");
appendStringInfoString(&localConnStr, datname);
PGconn *precacheConn = PQconnectdb(localConnStr.data);
if (PQstatus(precacheConn) != CONNECTION_OK)
{
PQfinish(precacheConn);
//log
continue;
}
resetStringInfo(&cmd);
appendStringInfoString(&cmd, "precache select * from ");
appendStringInfoString(&cmd, relname);
PGresult *precacheRes = PQexec(precacheConn, cmd.data);
if (PQresultStatus(precacheRes) != PGRES_TUPLES_OK)
{
PQclear(precacheRes);
PQfinish(precacheConn);
//log
continue;
}
PQclear(precacheRes);
PQfinish(precacheConn);
//update primary pg_hot_data
const char* currentTime = NULL;
currentTime = timestamptz_to_str(GetCurrentTimestamp());
resetStringInfo(&cmd);
appendStringInfoString(&cmd, "UPDATE pg_hot_data SET cachetime='");
appendStringInfoString(&cmd, currentTime);
appendStringInfoString(&cmd, "' WHERE datname='");
appendStringInfoString(&cmd, datname);
appendStringInfoString(&cmd, "' AND relname='");
appendStringInfoString(&cmd, relname);
appendStringInfoString(&cmd, "' AND crules='");
appendStringInfoString(&cmd, crules);
appendStringInfoString(&cmd, "' AND clientname='");
appendStringInfoString(&cmd, instanceName);
appendStringInfoString(&cmd, "'");
PGconn *primaryConn = PQconnectdb(primaryConnStr.data);
if (PQstatus(primaryConn) != CONNECTION_OK)
{
PQfinish(primaryConn);
//log
continue;
}
PGresult *updateRes=PQexec(primaryConn, cmd.data);
if (PQresultStatus(updateRes) != PGRES_TUPLES_OK)
{
PQclear(updateRes);
PQfinish(primaryConn);
//log
continue;
}
PQclear(updateRes);
PQfinish(primaryConn);
}
}
PQclear(ruleRes);
PQfinish(localConn);
}

View File

@ -33,6 +33,7 @@
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/guc.h"
/* GUC variables */
int wal_skip_threshold = 2048; /* in kilobytes */
@ -924,7 +925,7 @@ smgr_redo(XLogReaderState *record)
reln = smgropen(xlrec->rnode, InvalidBackendId);
/* He3DB: propeller instance and He3DB slave instance not create rel file*/
if (!EnableHotStandby)
if (!EnableHotStandby || he3mirror)
{
smgrcreate(reln, xlrec->forkNum, true);
}
@ -948,7 +949,7 @@ smgr_redo(XLogReaderState *record)
* log as best we can until the drop is seen.
*/
/* He3DB: propeller instance and He3DB slave instance not create rel file*/
if (!EnableHotStandby)
if (!EnableHotStandby || he3mirror)
{
smgrcreate(reln, MAIN_FORKNUM, true);
}
@ -1007,7 +1008,7 @@ smgr_redo(XLogReaderState *record)
}
/* Do the real work to truncate relation forks */
if (nforks > 0)
if (nforks > 0 && !EnableHotStandby)
smgrtruncate(reln, forks, nforks, blocks);
/*

View File

@ -1557,7 +1557,7 @@ ExecutePlan(EState *estate,
if (TupIsNull(slot))
break;
if (!isPreCache)
if (!isPreCacheTable && !isPreCacheIndex)
{
/*
* If we have a junk filter, then project a new tuple with the junk

View File

@ -51,6 +51,7 @@
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/spccache.h"
#include "storage/bufmgr.h"
static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
@ -81,6 +82,28 @@ BitmapHeapNext(BitmapHeapScanState *node)
ParallelBitmapHeapState *pstate = node->pstate;
dsa_area *dsa = node->ss.ps.state->es_query_dsa;
/* set preCacheNodeOid */
if (isPreCacheIndex && preCacheNodeOid == 0)
{
preCacheNodeOid = ((BitmapIndexScanState *)((PlanState *)(node))->lefttree)->biss_ScanDesc->indexRelation->rd_node.relNode;
if (isPreCacheAction)
{
preCacheNodesPtr[(*preCacheNodesCountPtr)++] = preCacheNodeOid;
}
else
{
for(int i = 0; i < *preCacheNodesCountPtr; i++)
{
if (preCacheNodesPtr[i] == preCacheNodeOid)
{
preCacheNodesPtr[i] = preCacheNodesPtr[*preCacheNodesCountPtr - 1];
(*preCacheNodesCountPtr)--;
break;
}
}
}
}
/*
* extract necessary information from index scan node
*/

View File

@ -66,6 +66,28 @@ IndexOnlyNext(IndexOnlyScanState *node)
TupleTableSlot *slot;
ItemPointer tid;
/* set preCacheNodeOid */
if (isPreCacheIndex && preCacheNodeOid == 0)
{
preCacheNodeOid = node->ioss_RelationDesc->rd_node.relNode;
if (isPreCacheAction)
{
preCacheNodesPtr[(*preCacheNodesCountPtr)++] = preCacheNodeOid;
}
else
{
for(int i = 0; i < *preCacheNodesCountPtr; i++)
{
if (preCacheNodesPtr[i] == preCacheNodeOid)
{
preCacheNodesPtr[i] = preCacheNodesPtr[*preCacheNodesCountPtr - 1];
(*preCacheNodesCountPtr)--;
break;
}
}
}
}
/*
* extract necessary information from index scan node
*/

View File

@ -43,6 +43,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
#include "storage/bufmgr.h"
/*
* When an ordering operator is used, tuples fetched from the index that
@ -86,6 +87,28 @@ IndexNext(IndexScanState *node)
IndexScanDesc scandesc;
TupleTableSlot *slot;
/* set preCacheNodeOid */
if (isPreCacheIndex && preCacheNodeOid == 0)
{
preCacheNodeOid = node->iss_RelationDesc->rd_node.relNode;
if (isPreCacheAction)
{
preCacheNodesPtr[(*preCacheNodesCountPtr)++] = preCacheNodeOid;
}
else
{
for(int i = 0; i < *preCacheNodesCountPtr; i++)
{
if (preCacheNodesPtr[i] == preCacheNodeOid)
{
preCacheNodesPtr[i] = preCacheNodesPtr[*preCacheNodesCountPtr - 1];
(*preCacheNodesCountPtr)--;
break;
}
}
}
}
/*
* extract necessary information from index scan node
*/

View File

@ -32,6 +32,7 @@
#include "executor/execdebug.h"
#include "executor/nodeSeqscan.h"
#include "utils/rel.h"
#include "storage/bufmgr.h"
static TupleTableSlot *SeqNext(SeqScanState *node);
@ -54,6 +55,28 @@ SeqNext(SeqScanState *node)
ScanDirection direction;
TupleTableSlot *slot;
/* set preCacheTableNode */
if (isPreCacheTable && preCacheNodeOid == 0)
{
preCacheNodeOid = node->ss.ss_currentRelation->rd_node.relNode;
if (isPreCacheAction)
{
preCacheNodesPtr[(*preCacheNodesCountPtr)++] = preCacheNodeOid;
}
else
{
for(int i = 0; i < *preCacheNodesCountPtr; i++)
{
if (preCacheNodesPtr[i] == preCacheNodeOid)
{
preCacheNodesPtr[i] = preCacheNodesPtr[*preCacheNodesCountPtr - 1];
(*preCacheNodesCountPtr)--;
break;
}
}
}
}
/*
* get information from the estate and scan state
*/

View File

@ -21,6 +21,7 @@ OBJS = \
interrupt.o \
pgarch.o \
pgstat.o \
secondbuffer.o \
postmaster.o \
startup.o \
syslogger.o \

View File

@ -114,6 +114,7 @@
#include "postmaster/interrupt.h"
#include "postmaster/pgarch.h"
#include "postmaster/postmaster.h"
#include "postmaster/secondbuffer.h"
#include "postmaster/syslogger.h"
#include "replication/logicallauncher.h"
#include "replication/walsender.h"
@ -257,8 +258,9 @@ static pid_t StartupPID = 0,
AutoVacPID = 0,
PgArchPID = 0,
PgStatPID = 0,
SysLoggerPID = 0,
SecondBufferPID = 0;
SecondBufferPID = 0,
CleanLogIndexPID = 0,
SysLoggerPID = 0;
/* Startup process's status */
typedef enum
@ -566,7 +568,9 @@ static void ShmemBackendArrayRemove(Backend *bn);
#define StartCheckpointer() StartChildProcess(CheckpointerProcess)
#define StartWalWriter() StartChildProcess(WalWriterProcess)
#define StartWalReceiver() StartChildProcess(WalReceiverProcess)
#define StartSecondBuffer() StartChildProcess(SecondBufferProcess)
#define StartSecondBuffer() StartChildProcess(SecondBufferProcess)
#define StartCleanLogIndex() StartChildProcess(CleanLogIndexProcess)
/* Macros to check exit status of a child process */
#define EXIT_STATUS_0(st) ((st) == 0)
#define EXIT_STATUS_1(st) (WIFEXITED(st) && WEXITSTATUS(st) == 1)
@ -1779,6 +1783,10 @@ ServerLoop(void)
CheckpointerPID = StartCheckpointer();
if (BgWriterPID == 0)
BgWriterPID = StartBackgroundWriter();
if (CleanLogIndexPID == 0)
CleanLogIndexPID = StartCleanLogIndex();
if (SecondBufferPID == 0)
SecondBufferPID = StartSecondBuffer();
}
/*
@ -1789,8 +1797,8 @@ ServerLoop(void)
if (WalWriterPID == 0 && pmState == PM_RUN)
WalWriterPID = StartWalWriter();
if(SecondBufferPID == 0 && pmState == PM_RUN)
SecondBufferPID = StartSecondBuffer();
// if(SecondBufferPID == 0 && pmState == PM_RUN)
// SecondBufferPID = StartSecondBuffer();
/*
* If we have lost the autovacuum launcher, try to start a new one. We
@ -2744,7 +2752,10 @@ SIGHUP_handler(SIGNAL_ARGS)
if (PgStatPID != 0)
signal_child(PgStatPID, SIGHUP);
if (SecondBufferPID != 0)
signal_child(SecondBufferPID, SIGHUP); //重新加载配置后重启进程?
signal_child(SecondBufferPID, SIGHUP);
if (CleanLogIndexPID != 0 )
signal_child(CleanLogIndexPID, SIGHUP);
/* Reload authentication config files too */
if (!load_hba())
@ -3066,6 +3077,8 @@ reaper(SIGNAL_ARGS)
WalWriterPID = StartWalWriter();
if (SecondBufferPID == 0)
SecondBufferPID = StartSecondBuffer(); //作用?
if (CleanLogIndexPID == 0)
CleanLogIndexPID = StartCleanLogIndex();
/*
* Likewise, start other special children as needed. In a restart
@ -3179,13 +3192,13 @@ reaper(SIGNAL_ARGS)
continue;
}
if (pid == SecondBufferPID)
{
SecondBufferPID = 0;
if (!EXIT_STATUS_0(exitstatus))
HandleChildCrash(pid, exitstatus,
_("second buffer process"));
}
// if (pid == SecondBufferPID)
// {
// SecondBufferPID = 0;
// if (!EXIT_STATUS_0(exitstatus))
// HandleChildCrash(pid, exitstatus,
// _("second buffer process"));
// }
/*
* Was it the wal receiver? If exit status is zero (normal) or one
@ -3663,18 +3676,6 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
signal_child(WalWriterPID, (SendStop ? SIGSTOP : SIGQUIT));
}
/* Take care of the walwriter too*/
if (pid == SecondBufferPID)
SecondBufferPID = 0;
else if (SecondBufferPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
(SendStop ? "SIGSTOP" : "SIGQUIT"),
(int) SecondBufferPID)));
signal_child(SecondBufferPID, (SendStop ? SIGSTOP : SIGQUIT));
}
/* Take care of the walreceiver too */
if (pid == WalReceiverPID)
WalReceiverPID = 0;
@ -3726,7 +3727,29 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
signal_child(PgStatPID, SIGQUIT);
allow_immediate_pgstat_restart();
}
/* Take care of the clean logindex too */
if (pid == CleanLogIndexPID)
CleanLogIndexPID = 0;
else if (CleanLogIndexPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
(SendStop ? "SIGSTOP" : "SIGQUIT"),
(int) CleanLogIndexPID)));
signal_child(CleanLogIndexPID, (SendStop ? SIGSTOP : SIGQUIT));
}
/* Take care of the second buffer process too */
if (pid == SecondBufferPID)
SecondBufferPID = 0;
else if (SecondBufferPID != 0 && take_action)
{
ereport(DEBUG2,
(errmsg_internal("sending %s to process %d",
(SendStop ? "SIGSTOP" : "SIGQUIT"),
(int) SecondBufferPID)));
signal_child(SecondBufferPID, (SendStop ? SIGSTOP : SIGQUIT));
}
/* We do NOT restart the syslogger */
if (Shutdown != ImmediateShutdown)
@ -3869,14 +3892,16 @@ PostmasterStateMachine(void)
/* and the walwriter too */
if (WalWriterPID != 0)
signal_child(WalWriterPID, SIGTERM);
/*and the secondbuffer too*/
if (SecondBufferPID != 0)
signal_child(SecondBufferPID,SIGTERM);
/* If we're in recovery, also stop startup and walreceiver procs */
if (StartupPID != 0)
signal_child(StartupPID, SIGTERM);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, SIGTERM);
if (CleanLogIndexPID != 0)
signal_child(CleanLogIndexPID, SIGTERM);
/*and the secondbuffer too*/
if (SecondBufferPID != 0)
signal_child(SecondBufferPID,SIGTERM);
/* checkpointer, archiver, stats, and syslogger may continue for now */
@ -3905,7 +3930,6 @@ PostmasterStateMachine(void)
StartupPID == 0 &&
WalReceiverPID == 0 &&
BgWriterPID == 0 &&
SecondBufferPID == 0 &&
(CheckpointerPID == 0 ||
(!FatalError && Shutdown < ImmediateShutdown)) &&
WalWriterPID == 0 &&
@ -4002,7 +4026,6 @@ PostmasterStateMachine(void)
Assert(BgWriterPID == 0);
Assert(CheckpointerPID == 0);
Assert(WalWriterPID == 0);
Assert(SecondBufferPID == 0);
Assert(AutoVacPID == 0);
/* syslogger is not considered here */
pmState = PM_NO_CHILDREN;
@ -4209,8 +4232,6 @@ TerminateChildren(int signal)
signal_child(CheckpointerPID, signal);
if (WalWriterPID != 0)
signal_child(WalWriterPID, signal);
if (SecondBufferPID != 0)
signal_child(SecondBufferPID,signal);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, signal);
if (AutoVacPID != 0)
@ -4219,6 +4240,10 @@ TerminateChildren(int signal)
signal_child(PgArchPID, signal);
if (PgStatPID != 0)
signal_child(PgStatPID, signal);
if (CleanLogIndexPID !=0)
signal_child(CleanLogIndexPID, signal);
if (SecondBufferPID != 0)
signal_child(SecondBufferPID, signal);
}
/*
@ -4572,7 +4597,7 @@ BackendRun(Port *port)
if (port->privateConn == true) {
privateConn = true;
}
client_application_name = port->application_name;
/*
* Make sure we aren't in PostmasterContext anymore. (We can't delete it
* just yet, though, because InitPostgres will need the HBA data.)
@ -5347,6 +5372,17 @@ sigusr1_handler(SIGNAL_ARGS)
StartALLPageFlushWorker();
}
if (CheckPostmasterSignal(PMSIGNAL_CLEAN_LOGINDEX_WORKER)) {
if ( CleanLogIndexPID == 0) {
CleanLogIndexPID = StartCleanLogIndex();
}
}
// if (CheckPostmasterSignal(PMSIGNAL_SECONDBUFFER_WORKER)) {
// if (SecondBufferPID == 0) {
// SecondBufferPID = StartSecondBuffer();
// }
// }
if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER))
{
/* Startup Process wants us to start the walreceiver process. */
@ -5512,6 +5548,14 @@ StartChildProcess(AuxProcType type)
av[ac++] = NULL; /* filled in by postmaster_forkexec */
#endif
if (pageEnv == NULL)
{
InitPageDBEnv();
}
if (walEnv == NULL)
{
InitWalDBEnv();
}
snprintf(typebuf, sizeof(typebuf), "-x%d", type);
av[ac++] = typebuf;

File diff suppressed because it is too large Load Diff

View File

@ -244,9 +244,11 @@ StartupProcessMain(void)
//start flushWork
#ifndef PG_NOREPLAY
if (IsBootstrapProcessingMode() != true && InitdbSingle!=true) {
if (push_standby == true) {
//if (push_standby == true) {
SignalStartFlushWork();
}
//}
pg_usleep(1000);
SignalStartCleanLogIndexWork();
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
}
#endif

View File

@ -95,7 +95,8 @@ bool hot_standby_feedback;
static WalReceiverConn *wrconn = NULL;
WalReceiverFunctionsType *WalReceiverFunctions = NULL;
#define NAPTIME_PER_CYCLE 100 /* max sleep time between cycles (100ms) */
//#define NAPTIME_PER_CYCLE 100 /* max sleep time between cycles (100ms) */
#define NAPTIME_PER_CYCLE 10 /* max sleep time between cycles (10ms) */
/*
* These variables are used similarly to openLogFile/SegNo,
@ -824,11 +825,11 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len)
case 'w': /* WAL records */
{
/* copy message to StringInfo */
#ifdef PG_NOREPLAY
if (he3mirror) {
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64);
#else
} else {
hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64) + sizeof(int64);
#endif
}
if (len < hdrlen)
ereport(ERROR,
(errcode(ERRCODE_PROTOCOL_VIOLATION),
@ -838,21 +839,21 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len)
/* read the fields */
dataStart = pq_getmsgint64(&incoming_message);
walEnd = pq_getmsgint64(&incoming_message);
#ifdef PG_NOREPLAY
if (he3mirror){
len -= hdrlen;
#else
} else{
len = pq_getmsgint64(&incoming_message);
#endif
}
sendTime = pq_getmsgint64(&incoming_message);
ProcessWalSndrMessage(walEnd, sendTime);
buf += hdrlen;
#ifdef PG_NOREPLAY
if (he3mirror) {
XLogWalRcvWrite(buf, len, dataStart);
#else
} else {
LogstreamResult.Write = dataStart+len;
/* Update shared-memory status */
pg_atomic_write_u64(&WalRcv->writtenUpto, LogstreamResult.Write);
#endif
}
break;
}
case 'k': /* Keepalive */
@ -1124,6 +1125,11 @@ XLogWalRcvSendReply(bool force, bool requestReply)
writePtr = LogstreamResult.Write;
flushPtr = LogstreamResult.Flush;
applyPtr = GetXLogReplayRecPtr(NULL);
#ifndef PG_NOREPLAY
if (!he3mirror && push_standby == true) {
applyPtr = GetXLogPushToDisk();
}
#endif
resetStringInfo(&reply_message);
pq_sendbyte(&reply_message, 'r');

View File

@ -48,6 +48,8 @@
#include <signal.h>
#include <unistd.h>
#include <assert.h>
#include <string.h>
#include "access/printtup.h"
#include "access/timeline.h"
@ -93,6 +95,13 @@
#include "utils/ps_status.h"
#include "utils/timeout.h"
#include "utils/timestamp.h"
#include "access/heapam_xlog.h"
#include "catalog/pg_control.h"
#include "access/nbtxlog.h"
#include "access/gistxlog.h"
#include "access/spgxlog.h"
#include "access/brin_xlog.h"
#include "access/xlog.h"
/*
* Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ.
@ -105,6 +114,10 @@
*/
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)
#define ONCE_READ_TIKV_WAL (XLOG_BLCKSZ * 2)
//a batch read from tikv is 16k, but the last record may be longer than 8k, so DEFAULT_SEND_WAL_CAPCITY = 2 * ONCE_READ_TIKV_WAL
#define DEFAULT_SEND_WAL_CAPCITY (ONCE_READ_TIKV_WAL*2)
/* Array of WalSnds in shared memory */
WalSndCtlData *WalSndCtl = NULL;
@ -255,7 +268,7 @@ static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch);
static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
TimeLineID *tli_p);
static void XLogSendTiKVPhysical(void);
/* Initialize walsender process before entering the main command loop */
void
@ -572,6 +585,10 @@ StartReplication(StartReplicationCmd *cmd)
{
StringInfoData buf;
XLogRecPtr FlushPtr;
bool pgmirrorFlag = false;
if (client_application_name!=NULL && strncmp(client_application_name,"pgmirror",strlen("pgmirror")) == 0) {
pgmirrorFlag = true;
}
if (ThisTimeLineID == 0)
ereport(ERROR,
@ -717,7 +734,7 @@ StartReplication(StartReplicationCmd *cmd)
* Don't allow a request to stream from a future point in WAL that
* hasn't been flushed to disk in this server yet.
*/
if (FlushPtr < cmd->startpoint)
if (pgmirrorFlag == false && FlushPtr < cmd->startpoint)
{
ereport(ERROR,
(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
@ -737,8 +754,21 @@ StartReplication(StartReplicationCmd *cmd)
/* Main loop of walsender */
replication_active = true;
if (pgmirrorFlag == false) {
WalSndLoop(XLogSendPhysical);
} else {
readControlFile(DataDir);
SpinLockAcquire(&MyWalSnd->mutex);
if (walsenderLsn != 0) {
MyWalSnd->sentPtr = walsenderLsn;
sentPtr = walsenderLsn;
elog(LOG,"wal sender LSN %X/%X",LSN_FORMAT_ARGS(walsenderLsn));
} else {
elog(ERROR,"WAL sender LSN 0/0");
}
SpinLockRelease(&MyWalSnd->mutex);
WalSndLoop(XLogSendTiKVPhysical);
}
replication_active = false;
if (got_STOPPING)
@ -1300,7 +1330,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
break;
sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
// sleeptime = 10; //10ms
/* Sleep until something happens or we time out */
WalSndWait(WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, sleeptime,
WAIT_EVENT_WAL_SENDER_WRITE_DATA);
@ -2374,7 +2404,7 @@ WalSndLoop(WalSndSendDataCallback send_data)
* of reaching wal_sender_timeout before sending a keepalive.
*/
sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
// sleeptime = 10; //10ms
if (pq_is_send_pending())
wakeEvents |= WL_SOCKET_WRITEABLE;
@ -2462,74 +2492,671 @@ WalSndKill(int code, Datum arg)
SpinLockRelease(&walsnd->mutex);
}
/* XLogReaderRoutine->segment_open callback */
// static void
// WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
// TimeLineID *tli_p)
// {
// char path[MAXPGPATH];
/*
 * reConvertMainData
 *
 * Rewrite the main-data payload of a WAL record from the current struct
 * layout into the matching "old" struct layout (xl_old_* / *old*), for the
 * record types listed in the switch below.
 *
 * sRecord   - record header; only xl_rmid and xl_info are read.
 * sMainData - main data in the current layout.
 * sLen      - length of sMainData; only read for the SP-GiST records that
 *             carry a trailing offsets array.
 * dMainData - output buffer that receives the converted payload.
 * dLen      - set to the converted length when a conversion is performed.
 *             It is left untouched for every other record type, so a caller
 *             that tests "*dLen == 0" to detect "not converted" must clear
 *             it before calling.
 *
 * Every old-layout struct is zeroed before it is filled, so that padding
 * bytes copied out with sizeof() are deterministic rather than stack garbage.
 *
 * NOTE(review): dMainData's capacity is not known here; the SP-GiST branches
 * append (*sLen - SizeOf...) bytes without a bound check, and would wrap if
 * *sLen were smaller than the fixed part - confirm callers guarantee both.
 */
static void reConvertMainData(XLogRecord* sRecord, char*sMainData, uint32_t*sLen, char* dMainData, uint32_t* dLen) {
	RmgrId		rmid = sRecord->xl_rmid;
	uint8		info = (sRecord->xl_info & ~XLR_INFO_MASK);

	switch(rmid) {
		case RM_HEAP2_ID:
		{
			if ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP2_VISIBLE) {
				xl_heap_visible *xlrec = (xl_heap_visible *)sMainData;
				xl_old_heap_visible xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.cutoff_xid = xlrec->cutoff_xid;
				xlrecOld.flags = xlrec->flags;
				*dLen = sizeof(xl_old_heap_visible);
				memcpy(dMainData,&xlrecOld,*dLen);
			}
			break;
		}
		case RM_HEAP_ID:
		{
			if (((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_UPDATE) ||
				((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_HOT_UPDATE)) {
				xl_heap_update *xlrec = (xl_heap_update *)sMainData;
				xl_old_heap_update xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.old_xmax = xlrec->old_xmax;
				xlrecOld.old_offnum = xlrec->old_offnum;
				xlrecOld.old_infobits_set = xlrec->old_infobits_set;
				xlrecOld.flags = xlrec->flags;
				xlrecOld.new_xmax = xlrec->new_xmax;
				xlrecOld.new_offnum = xlrec->new_offnum;
				*dLen = sizeof(xl_old_heap_update);
				memcpy(dMainData,&xlrecOld,*dLen);
			}
			break;
		}
		case RM_BTREE_ID:
		{
			if (info == XLOG_BTREE_SPLIT_L || info == XLOG_BTREE_SPLIT_R) {
				xl_btree_split *xlrec = (xl_btree_split *)sMainData;
				xl_old_btree_split xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.level = xlrec->level;
				xlrecOld.firstrightoff = xlrec->firstrightoff;
				xlrecOld.newitemoff = xlrec->newitemoff;
				xlrecOld.postingoff = xlrec->postingoff;
				*dLen = sizeof(xl_old_btree_split);
				memcpy(dMainData,&xlrecOld,*dLen);
			}
			break;
		}
		case RM_GIST_ID:
		{
			if (info == XLOG_GIST_PAGE_SPLIT) {
				gistxlogPageSplit *xlrec = (gistxlogPageSplit *)sMainData;
				gistoldxlogPageSplit xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.origrlink = xlrec->origrlink;
				xlrecOld.orignsn = xlrec->orignsn;
				xlrecOld.origleaf = xlrec->origleaf;
				xlrecOld.npage = xlrec->npage;
				xlrecOld.markfollowright = xlrec->markfollowright;
				*dLen = sizeof(gistoldxlogPageSplit);
				memcpy(dMainData,&xlrecOld,*dLen);
			}
			break;
		}
		case RM_SPGIST_ID:
		{
			if (info == XLOG_SPGIST_ADD_LEAF) {
				spgxlogAddLeaf *xlrec = (spgxlogAddLeaf *)sMainData;
				spgoldxlogAddLeaf xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.newPage = xlrec->newPage;
				xlrecOld.storesNulls = xlrec->storesNulls;
				xlrecOld.offnumLeaf = xlrec->offnumLeaf;
				xlrecOld.offnumHeadLeaf = xlrec->offnumHeadLeaf;
				xlrecOld.offnumParent = xlrec->offnumParent;
				xlrecOld.nodeI = xlrec->nodeI;
				*dLen = sizeof(spgoldxlogAddLeaf);
				memcpy(dMainData,&xlrecOld,*dLen);
			} else if (info == XLOG_SPGIST_MOVE_LEAFS) {
				spgxlogMoveLeafs *xlrec = (spgxlogMoveLeafs *)sMainData;
				spgoldxlogMoveLeafs xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.nMoves = xlrec->nMoves;
				xlrecOld.newPage = xlrec->newPage;
				xlrecOld.replaceDead = xlrec->replaceDead;
				xlrecOld.storesNulls = xlrec->storesNulls;
				xlrecOld.offnumParent = xlrec->offnumParent;
				xlrecOld.nodeI = xlrec->nodeI;
				xlrecOld.stateSrc = xlrec->stateSrc;
				/* fixed part first, then the trailing offsets array verbatim */
				*dLen = SizeOfOldSpgxlogMoveLeafs;
				memcpy(dMainData,&xlrecOld,*dLen);
				memcpy(dMainData+*dLen,xlrec->offsets,*sLen-SizeOfSpgxlogMoveLeafs);
				*dLen += *sLen-SizeOfSpgxlogMoveLeafs;
			} else if (info == XLOG_SPGIST_ADD_NODE) {
				spgxlogAddNode *xlrec = (spgxlogAddNode *)sMainData;
				spgoldxlogAddNode xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.offnum = xlrec->offnum;
				xlrecOld.offnumNew = xlrec->offnumNew;
				xlrecOld.newPage = xlrec->newPage;
				xlrecOld.parentBlk = xlrec->parentBlk;
				xlrecOld.offnumParent = xlrec->offnumParent;
				xlrecOld.nodeI = xlrec->nodeI;
				xlrecOld.stateSrc = xlrec->stateSrc;
				*dLen = sizeof(spgoldxlogAddNode);
				memcpy(dMainData,&xlrecOld,*dLen);
			} else if (info == XLOG_SPGIST_PICKSPLIT) {
				spgxlogPickSplit *xlrec = (spgxlogPickSplit *)sMainData;
				spgoldxlogPickSplit xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.isRootSplit = xlrec->isRootSplit;
				xlrecOld.nDelete = xlrec->nDelete;
				xlrecOld.nInsert = xlrec->nInsert;
				xlrecOld.initSrc = xlrec->initSrc;
				xlrecOld.initDest = xlrec->initDest;
				xlrecOld.offnumInner = xlrec->offnumInner;
				xlrecOld.initInner = xlrec->initInner;
				xlrecOld.storesNulls = xlrec->storesNulls;
				xlrecOld.innerIsParent = xlrec->innerIsParent;
				xlrecOld.offnumParent = xlrec->offnumParent;
				xlrecOld.nodeI = xlrec->nodeI;
				xlrecOld.stateSrc = xlrec->stateSrc;
				/* fixed part first, then the trailing offsets array verbatim */
				*dLen = SizeOfOldSpgxlogPickSplit;
				memcpy(dMainData,&xlrecOld,*dLen);
				memcpy(dMainData+*dLen,xlrec->offsets,*sLen-SizeOfSpgxlogPickSplit);
				*dLen += *sLen-SizeOfSpgxlogPickSplit;
			}
			break;
		}
		case RM_BRIN_ID:
		{
			/*
			 * BRIN insert/update records may carry XLOG_BRIN_INIT_PAGE in the
			 * info bits, so the opcode has to be masked before comparing.
			 */
			uint8		brinOp = info & XLOG_BRIN_OPMASK;

			if (brinOp == XLOG_BRIN_INSERT) {
				xl_brin_insert *xlrec = (xl_brin_insert *)sMainData;
				xl_old_brin_insert xlrecOld;

				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.heapBlk = xlrec->heapBlk;
				/* extra information needed to update the revmap */
				xlrecOld.pagesPerRange = xlrec->pagesPerRange;
				xlrecOld.offnum = xlrec->offnum;
				*dLen = sizeof(xl_old_brin_insert);
				memcpy(dMainData,&xlrecOld,*dLen);
			} else if (brinOp == XLOG_BRIN_UPDATE) {
				xl_brin_update *xlrec = (xl_brin_update *) sMainData;
				xl_old_brin_update xlrecUpdate;
				xl_brin_insert *xlrecInsert = &xlrec->insert;
				xl_old_brin_insert xlrecOld;

				memset(&xlrecUpdate, 0, sizeof(xlrecUpdate));
				memset(&xlrecOld, 0, sizeof(xlrecOld));
				xlrecOld.heapBlk = xlrecInsert->heapBlk;
				/* extra information needed to update the revmap */
				xlrecOld.pagesPerRange = xlrecInsert->pagesPerRange;
				xlrecOld.offnum = xlrecInsert->offnum;
				/* offset number of old tuple on old page */
				xlrecUpdate.oldOffnum = xlrec->oldOffnum;
				xlrecUpdate.insert = xlrecOld;
				*dLen = sizeof(xl_old_brin_update);
				memcpy(dMainData,&xlrecUpdate,*dLen);
			}
			break;
		}
		default:
		{
			/* every other resource manager: payload layout is unchanged */
			break;
		}
	}
}
// /*-------
// * When reading from a historic timeline, and there is a timeline switch
// * within this segment, read from the WAL segment belonging to the new
// * timeline.
// *
// * For example, imagine that this server is currently on timeline 5, and
// * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
// * 0/13002088. In pg_wal, we have these files:
// *
// * ...
// * 000000040000000000000012
// * 000000040000000000000013
// * 000000050000000000000013
// * 000000050000000000000014
// * ...
// *
// * In this situation, when requested to send the WAL from segment 0x13, on
// * timeline 4, we read the WAL from file 000000050000000000000013. Archive
// * recovery prefers files from newer timelines, so if the segment was
// * restored from the archive on this server, the file belonging to the old
// * timeline, 000000040000000000000013, might not exist. Their contents are
// * equal up to the switchpoint, because at a timeline switch, the used
// * portion of the old segment is copied to the new file. -------
// */
// *tli_p = sendTimeLine;
// if (sendTimeLineIsHistoric)
// {
// XLogSegNo endSegNo;
//1.recomplete CRC 2.MTR as endpoint Merge 3.some struct convert 4.checkpoint redo reset 5.use file segment manage
static int MergeWalForPgMirror(char*source,char*destion,int limit,int*he3_pos) {
int pos1 = 0,pos2 = 0,prev_pos2 = 0;
bool isMtr = false;
*he3_pos = 0;
while(pos1<limit) {
uint8_t blkNum = 0;
char*img_ptr[XLR_MAX_BLOCK_ID + 1] = {0};
char*data_ptr[XLR_MAX_BLOCK_ID + 1] = {0};
uint16_t bimg_len[XLR_MAX_BLOCK_ID + 1] = {0};
uint16_t data_len[XLR_MAX_BLOCK_ID + 1] = {0};
uint16 RepOriginId = 0;
uint32 TransactionId = 0;
uint32_t d_main_data_len = 0;
char d_main_data[8192];
OldXLogRecord*old = NULL;
prev_pos2 = pos2;
while(isMtr == false) {
if (pos1 >= limit) {
return pos2;
}
XLogRecord*one = (XLogRecord*)(source + pos1);
old = (OldXLogRecord*)(destion + pos2);
old->xl_xid = one->xl_xid;
old->xl_info = one->xl_info;
old->xl_rmid = one->xl_rmid;
pos1 += sizeof(XLogRecord);
pos2 += sizeof(OldXLogRecord);
uint32 remaining = one->xl_tot_len - sizeof(XLogRecord);
uint32 datatotal = 0;
isMtr = one->mtr;
while(remaining > datatotal) {
uint8_t block_id = *(source + pos1);
if (block_id == XLR_BLOCK_ID_DATA_SHORT) {
/* XLogRecordDataHeaderShort */
pos1 += sizeof(block_id);
if (isMtr == true) {
memcpy((destion + pos2),&block_id,sizeof(block_id));
pos2 += sizeof(block_id);
}
uint32_t main_data_len = 0;
main_data_len = *((uint8_t*)(source + pos1));
if (isMtr == true) {
reConvertMainData(one,source + pos1,&main_data_len,d_main_data,&d_main_data_len);
if (d_main_data_len == 0) {
memcpy(destion + pos2,source + pos1,sizeof(uint8_t));
}
pos2 += sizeof(uint8_t);
}
pos1 += sizeof(uint8_t);
remaining -= sizeof(uint8_t);
datatotal += main_data_len;
break;
} else if (block_id == XLR_BLOCK_ID_DATA_LONG) {
/* XLogRecordDataHeaderLong */
pos1 += sizeof(block_id);
if (isMtr == true) {
memcpy((destion + pos2),&block_id,sizeof(block_id));
pos2 += sizeof(block_id);
}
uint32 main_data_len = 0,d_main_data_len = 0;
memcpy(&main_data_len,source + pos1,sizeof(uint32));
if (isMtr == true) {
reConvertMainData(one,source + pos1,&main_data_len,d_main_data,&d_main_data_len);
if (d_main_data_len == 0) {
memcpy(destion + pos2,&main_data_len,sizeof(main_data_len));
pos2 += sizeof(main_data_len);
} else {
if (d_main_data_len > 255) {
memcpy(destion + pos2,&d_main_data_len,sizeof(d_main_data_len));
pos2 += sizeof(d_main_data_len);
} else {
*(destion + pos2 - 1) = XLR_BLOCK_ID_DATA_SHORT;
uint8_t tlen = d_main_data_len;
memcpy(destion + pos2,&tlen,sizeof(tlen));
pos2 += sizeof(uint8_t);
}
}
}
pos1 += sizeof(main_data_len);
remaining -= sizeof(main_data_len);
datatotal += main_data_len;
break; /* by convention, the main data fragment is
* always last */
} else if (block_id == XLR_BLOCK_ID_ORIGIN) {
pos1 += sizeof(block_id);
if (isMtr == true) {
memcpy((destion + pos2),&block_id,sizeof(block_id));
pos2 += sizeof(block_id);
}
memcpy(&RepOriginId, source + pos1,sizeof(RepOriginId));
if (isMtr == true) {
memcpy(destion + pos2,&RepOriginId,sizeof(RepOriginId));
pos2 += sizeof(RepOriginId);
}
pos1 += sizeof(RepOriginId);
remaining -= sizeof(RepOriginId);
} else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) {
pos1 += sizeof(block_id);
if (isMtr == true) {
memcpy((destion + pos2),&block_id,sizeof(block_id));
pos2 += sizeof(block_id);
}
memcpy(&TransactionId,source + pos1,sizeof(TransactionId));
if (isMtr == true) {
memcpy(destion + pos2,&TransactionId,sizeof(TransactionId));
pos2 += sizeof(TransactionId);
}
pos1 += sizeof(TransactionId);
remaining -= sizeof(TransactionId);
} else if (block_id <= XLR_MAX_BLOCK_ID) {
/* Ok, copy the header to the scratch buffer */
memcpy(destion + pos2, source + pos1, SizeOfXLogRecordBlockHeader);
uint8_t fork_flags = *(source + pos1 + sizeof(block_id));
*(destion + pos2) = blkNum;
data_len[blkNum] = *((uint16_t*)(source + pos1 + sizeof(block_id) + sizeof(fork_flags)));
datatotal += data_len;
pos1 += SizeOfXLogRecordBlockHeader;
pos2 += SizeOfXLogRecordBlockHeader;
remaining -= SizeOfXLogRecordBlockHeader;
if ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0) {
bimg_len[blkNum] = *((uint16_t*)(source + pos1));
datatotal += bimg_len;
uint16_t hole_offset = *((uint16_t*)(source + pos1 + sizeof(bimg_len)));
uint8_t bimg_info = *((uint16_t*)(source + pos1 + sizeof(bimg_len) + sizeof(hole_offset)));
memcpy(destion + pos2, source + pos1, SizeOfXLogRecordBlockImageHeader);
pos1 += SizeOfXLogRecordBlockImageHeader;
pos2 += SizeOfXLogRecordBlockImageHeader;
remaining -= SizeOfXLogRecordBlockImageHeader;
if ((bimg_info & BKPIMAGE_IS_COMPRESSED) != 0) {
if ((bimg_info & BKPIMAGE_HAS_HOLE) != 0) {
memcpy(destion + pos2, source + pos1, SizeOfXLogRecordBlockCompressHeader);
pos1 += SizeOfXLogRecordBlockCompressHeader;
pos2 += SizeOfXLogRecordBlockCompressHeader;
remaining -= SizeOfXLogRecordBlockCompressHeader;
}
}
if (!(fork_flags & BKPBLOCK_SAME_REL)) {
memcpy(destion + pos2, source + pos1, sizeof(RelFileNode));
pos1 += sizeof(RelFileNode);
pos2 += sizeof(RelFileNode);
remaining -= sizeof(RelFileNode);
}
memcpy(destion + pos2, source + pos1, sizeof(BlockNumber));
pos1 += sizeof(BlockNumber);
pos2 += sizeof(BlockNumber);
remaining -= sizeof(BlockNumber);
}
} else {
elog(FATAL,"invalid block_id %u",block_id);
}
}
assert(remaining == datatotal);
if (bimg_len[blkNum] != 0 ) {
img_ptr[blkNum] = source + pos1;
pos1 += bimg_len[blkNum];
}
if (data_len[blkNum] != 0) {
data_ptr[blkNum] = source + pos1;
pos1 += data_len[blkNum];
}
blkNum++;
}
*he3_pos = pos1;
int idx = 0;
while(idx < blkNum) {
if (bimg_len[idx] != 0) {
memcpy(destion + pos2, img_ptr[idx], bimg_len[idx]);
pos2 += bimg_len[idx];
}
if (data_len[blkNum] != 0){
memcpy(destion + pos2, data_ptr[idx], data_len[idx]);
pos2 += data_len[idx];
}
}
memcpy(destion + pos2, d_main_data, d_main_data_len);
pos2 += d_main_data_len;
old->xl_tot_len = pos2-prev_pos2;
isMtr = false;
}
return pos2;
}
// XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
// if (nextSegNo == endSegNo)
// *tli_p = sendTimeLineNextTLI;
// }
static int findFirstCheckPoint(char* source,int limit) {
XLogRecord* head = (XLogRecord*)source;
bool find = false;
int datalen = 0;
while(!(head->xl_rmid == RM_XLOG_ID &&
((head->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN || (head->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_ONLINE)) &&
datalen < limit) {
datalen += head->xl_tot_len;
}
if (datalen == limit) {
return -1;
}
return datalen;
}
// XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
// state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
// if (state->seg.ws_file >= 0)
// return;
/*
 * Send out the WAL in its normal physical/stored form.
 *
 * Request up to ONCE_READ_TIKV_WAL bytes of not-yet-sent WAL through
 * He3DBWALRead(), convert what was read with ArrayXlogHe3ToPg(), and buffer
 * the converted bytes in the libpq output buffer as one 'w' message.
 *
 * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
 * otherwise WalSndCaughtUp is set to false.
 */
/*
 * End LSN written into the most recent 'w' message (StartLsn + converted
 * length); stays 0 until the first message is built. WalSndKeepalive() also
 * reads this value.
 */
static uint64_t EndLsn = 0;
static void
XLogSendTiKVPhysical(void)
{
XLogRecPtr SendRqstPtr;
XLogRecPtr startptr;
XLogRecPtr endptr;
Size nbytes;
/* NOTE(review): segno is never used in this function. */
XLogSegNo segno;
/*
 * NOTE(review): errinfo is never written in this function, yet it is passed
 * to WALReadRaiseError() when He3DBWALRead() fails -- confirm that the error
 * path does not read uninitialized memory.
 */
WALReadError errinfo;
/*
 * NOTE(review): the "//" fragments interleaved below refer to names that do
 * not exist in this function (tli_p, nextSegNo, path); they look like
 * leftovers of a disabled segment-open routine.
 */
// /*
// * If the file is not found, assume it's because the standby asked for a
// * too old WAL segment that has already been removed or recycled.
// */
// if (errno == ENOENT)
// {
// char xlogfname[MAXFNAMELEN];
// int save_errno = errno;
/* If requested switch the WAL sender to the stopping state. */
if (got_STOPPING)
WalSndSetState(WALSNDSTATE_STOPPING);
// XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
// errno = save_errno;
// ereport(ERROR,
// (errcode_for_file_access(),
// errmsg("requested WAL segment %s has already been removed",
// xlogfname)));
// }
// else
// ereport(ERROR,
// (errcode_for_file_access(),
// errmsg("could not open file \"%s\": %m",
// path)));
// }
/* CopyDone was already queued for this stream; nothing more to send. */
if (streamingDoneSending)
{
WalSndCaughtUp = true;
return;
}
/* Figure out how far we can safely send the WAL. */
if (sendTimeLineIsHistoric)
{
/*
 * Streaming an old timeline that's in this server's history, but is
 * not the one we're currently inserting or replaying. It can be
 * streamed up to the point where we switched off that timeline.
 */
SendRqstPtr = sendTimeLineValidUpto;
}
else if (am_cascading_walsender)
{
/*
 * Streaming the latest timeline on a standby.
 *
 * Attempt to send all WAL that has already been replayed, so that we
 * know it's valid. If we're receiving WAL through streaming
 * replication, it's also OK to send any WAL that has been received
 * but not replayed.
 *
 * The timeline we're recovering from can change, or we can be
 * promoted. In either case, the current timeline becomes historic. We
 * need to detect that so that we don't try to stream past the point
 * where we switched to another timeline. We check for promotion or
 * timeline switch after calculating FlushPtr, to avoid a race
 * condition: if the timeline becomes historic just after we checked
 * that it was still current, it's still be OK to stream it up to the
 * FlushPtr that was calculated before it became historic.
 */
bool becameHistoric = false;
SendRqstPtr = GetStandbyFlushRecPtr();
if (!RecoveryInProgress())
{
/*
 * We have been promoted. RecoveryInProgress() updated
 * ThisTimeLineID to the new current timeline.
 */
am_cascading_walsender = false;
becameHistoric = true;
}
else
{
/*
 * Still a cascading standby. But is the timeline we're sending
 * still the one recovery is recovering from? ThisTimeLineID was
 * updated by the GetStandbyFlushRecPtr() call above.
 */
if (sendTimeLine != ThisTimeLineID)
becameHistoric = true;
}
if (becameHistoric)
{
/*
 * The timeline we were sending has become historic. Read the
 * timeline history file of the new timeline to see where exactly
 * we forked off from the timeline we were sending.
 */
List *history;
history = readTimeLineHistory(ThisTimeLineID);
sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
Assert(sendTimeLine < sendTimeLineNextTLI);
list_free_deep(history);
sendTimeLineIsHistoric = true;
SendRqstPtr = sendTimeLineValidUpto;
}
}
else
{
/*
 * Streaming the current timeline on a primary.
 *
 * Attempt to send all data that's already been written out and
 * fsync'd to disk. We cannot go further than what's been written out
 * given the current implementation of WALRead(). And in any case
 * it's unsafe to send WAL that is not securely down to disk on the
 * primary: if the primary subsequently crashes and restarts, standbys
 * must not have applied any WAL that got lost on the primary.
 */
SendRqstPtr = GetFlushRecPtr();
}
/*
 * Record the current system time as an approximation of the time at which
 * this WAL location was written for the purposes of lag tracking.
 *
 * In theory we could make XLogFlush() record a time in shmem whenever WAL
 * is flushed and we could get that time as well as the LSN when we call
 * GetFlushRecPtr() above (and likewise for the cascading standby
 * equivalent), but rather than putting any new code into the hot WAL path
 * it seems good enough to capture the time here. We should reach this
 * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that
 * may take some time, we read the WAL flush pointer and take the time
 * very close to together here so that we'll get a later position if it is
 * still moving.
 *
 * Because LagTrackerWrite ignores samples when the LSN hasn't advanced,
 * this gives us a cheap approximation for the WAL flush time for this
 * LSN.
 *
 * Note that the LSN is not necessarily the LSN for the data contained in
 * the present message; it's the end of the WAL, which might be further
 * ahead. All the lag tracking machinery cares about is finding out when
 * that arbitrary LSN is eventually reported as written, flushed and
 * applied, so that it can measure the elapsed time.
 */
LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
/*
 * If this is a historic timeline and we've reached the point where we
 * forked to the next timeline, stop streaming.
 *
 * Note: We might already have sent WAL > sendTimeLineValidUpto. The
 * startup process will normally replay all WAL that has been received
 * from the primary, before promoting, but if the WAL streaming is
 * terminated at a WAL page boundary, the valid portion of the timeline
 * might end in the middle of a WAL record. We might've already sent the
 * first half of that partial WAL record to the cascading standby, so that
 * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
 * replay the partial WAL record either, so it can still follow our
 * timeline switch.
 */
if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
{
/* Closing the segment file is disabled here (see the two lines below). */
// if (xlogreader->seg.ws_file >= 0)
// wal_segment_close(xlogreader);
/* Send CopyDone */
pq_putmessage_noblock('c', NULL, 0);
streamingDoneSending = true;
WalSndCaughtUp = true;
elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
LSN_FORMAT_ARGS(sendTimeLineValidUpto),
LSN_FORMAT_ARGS(sentPtr));
return;
}
/* Do we have any work to do? */
Assert(sentPtr <= SendRqstPtr);
if (SendRqstPtr <= sentPtr)
{
WalSndCaughtUp = true;
return;
}
/*
 * Figure out how much to request in one read. If there's no more than
 * ONCE_READ_TIKV_WAL bytes to send, ask for everything. Otherwise ask for
 * ONCE_READ_TIKV_WAL bytes, rounded back to a page boundary.
 *
 * This only bounds the read request: further down nbytes is replaced by
 * what He3DBWALRead() returned, and endptr is recomputed from the length
 * reported by ArrayXlogHe3ToPg().
 */
startptr = sentPtr;
endptr = startptr;
endptr += ONCE_READ_TIKV_WAL;
/* if we went beyond SendRqstPtr, back off */
if (SendRqstPtr <= endptr)
{
endptr = SendRqstPtr;
if (sendTimeLineIsHistoric)
WalSndCaughtUp = false;
else
WalSndCaughtUp = true;
}
else
{
/* round down to page boundary. */
endptr -= (endptr % XLOG_BLCKSZ);
WalSndCaughtUp = false;
}
nbytes = endptr - startptr;
Assert(nbytes <= ONCE_READ_TIKV_WAL);
/*
 * OK to read and send the slice.
 *
 * The three int64 header fields are written as zero placeholders now and
 * patched in place once the converted data and its LSN range are known.
 */
resetStringInfo(&output_message);
pq_sendbyte(&output_message, 'w');
pq_sendint64(&output_message, 0); /* dataStart */
pq_sendint64(&output_message, 0); /* walEnd */
pq_sendint64(&output_message, 0); /* sendtime, filled in last */
/*
 * Make room in the output buffer for the converted records. The raw read
 * goes into the separate he3_wal_cache buffer; ArrayXlogHe3ToPg() then
 * writes its output straight behind the header in output_message.
 */
enlargeStringInfo(&output_message, DEFAULT_SEND_WAL_CAPCITY);
/*
 * Scratch buffer for the raw read; allocated on first use and kept for
 * later calls (it is never freed in this function).
 */
static char* he3_wal_cache = NULL;
uint64_t StartLsn = 0;
/* NOTE(review): PrevLsn is never used in this function. */
static uint64_t PrevLsn = 0;
if (he3_wal_cache == NULL) {
/* NOTE(review): the malloc result is not checked before it is used. */
he3_wal_cache = malloc(DEFAULT_SEND_WAL_CAPCITY);
}
/* NOTE(review): no goto in this function targets the retry label. */
retry:
xlogreader->currTLI = ThisTimeLineID;
int ret = -1;
ret = He3DBWALRead(xlogreader,
startptr,
nbytes,
he3_wal_cache);
if (ret < 0) {
WALReadRaiseError(&errinfo);
return;
} else {
/* From here on nbytes is the number of bytes actually read. */
nbytes = ret;
}
/*
 * Convert the raw bytes into the output buffer. dLen receives the number
 * of bytes appended to the message; the return value (mtrLen) is what
 * sentPtr is advanced by below -- presumably the amount of source WAL
 * consumed, confirm against ArrayXlogHe3ToPg().
 */
int dLen = 0;
int mtrLen = ArrayXlogHe3ToPg(he3_wal_cache,nbytes,&output_message.data[output_message.len],&dLen,&StartLsn,&EndLsn);
output_message.len += dLen;
output_message.data[output_message.len] = '\0';
/*
 * If StartLsn sits exactly one page header past a page start, move it
 * back onto the page start: a short header on pages beyond the first page
 * of a segment, a long header on the first page of a segment.
 */
if (StartLsn % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
XLogSegmentOffset(StartLsn, DEFAULT_XLOG_SEG_SIZE) > XLOG_BLCKSZ) {
StartLsn -= SizeOfXLogShortPHD;
}
else if (StartLsn % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
XLogSegmentOffset(StartLsn, DEFAULT_XLOG_SEG_SIZE) < XLOG_BLCKSZ) {
StartLsn -= SizeOfXLogLongPHD;
}
endptr = startptr + mtrLen ;
resetStringInfo(&tmpbuf);
pq_sendint64(&tmpbuf, StartLsn);
/* walStart */
memcpy(&output_message.data[1],
tmpbuf.data, sizeof(int64));
/*
 * Overwrites whatever ArrayXlogHe3ToPg() stored in EndLsn: the advertised
 * end is the (possibly adjusted) start plus the converted length.
 */
EndLsn = StartLsn+dLen;
resetStringInfo(&tmpbuf);
pq_sendint64(&tmpbuf, EndLsn);
/* walEnd */
memcpy(&output_message.data[1 + sizeof(int64)],
tmpbuf.data, sizeof(int64));
resetStringInfo(&tmpbuf);
/* sendtime, filled in last so that it is taken as late as possible */
pq_sendint64(&tmpbuf, GetCurrentTimestamp());
memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)],
tmpbuf.data, sizeof(int64));
pq_putmessage_noblock('d', output_message.data, output_message.len);
sentPtr = endptr;
/* Update shared memory status */
{
WalSnd *walsnd = MyWalSnd;
SpinLockAcquire(&walsnd->mutex);
walsnd->sentPtr = sentPtr;
SpinLockRelease(&walsnd->mutex);
}
/* Report progress of XLOG streaming in PS display (shows EndLsn, not sentPtr) */
if (update_process_title)
{
char activitymsg[50];
snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X",
LSN_FORMAT_ARGS(EndLsn));
set_ps_display(activitymsg);
}
}
/*
* Send out the WAL in its normal physical/stored form.
@ -3489,7 +4116,11 @@ WalSndKeepalive(bool requestReply)
/* construct the message... */
resetStringInfo(&output_message);
pq_sendbyte(&output_message, 'k');
if (EndLsn != 0) {
pq_sendint64(&output_message, sentPtr);
} else {
pq_sendint64(&output_message, EndLsn);
}
pq_sendint64(&output_message, GetCurrentTimestamp());
pq_sendbyte(&output_message, requestReply ? 1 : 0);
@ -3533,6 +4164,9 @@ WalSndKeepaliveIfNecessary(void)
/* Try to flush pending output to the client */
if (pq_flush_if_writable() != 0)
WalSndShutdown();
// } else {
// WalSndKeepalive(true);
// pg_usleep(10000);
}
}

View File

@ -141,6 +141,17 @@ InitBufferPool(void)
/* Init other shared buffer-management stuff */
StrategyInitialize(!foundDescs);
/* Init preCacheNodes arrays */
preCacheNodesPtr = (Oid *)
ShmemInitStruct("preCacheNodesPtr",
NPreCacheNodes * sizeof(Oid), &foundBufCkpt);
memset(preCacheNodesPtr, 0, NPreCacheNodes * sizeof(Oid));
preCacheNodesCountPtr = (uint16 *)
ShmemInitStruct("preCacheNodesCountPtr",
sizeof(uint16), &foundBufCkpt);
memset(preCacheNodesCountPtr, 0, sizeof(uint16));
/* Initialize per-backend file flush context */
WritebackContextInit(&BackendWritebackContext,
&backend_flush_after);
@ -177,5 +188,8 @@ BufferShmemSize(void)
/* size of checkpoint sort array in bufmgr.c */
size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
/* size of preCacheNodes */
size = add_size(size, mul_size(NPreCacheNodes, sizeof(Oid)) + sizeof(uint16));
return size;
}

View File

@ -793,6 +793,18 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
* miss.
*/
pgstat_count_buffer_read(reln);
/* precache or unprecache index */
if (isPreCacheIndex && !isPreCacheIndexDone && preCacheNodeOid == reln->rd_node.relNode)
{
BlockNumber precacheblocks;
precacheblocks = smgrnblocks(reln->rd_smgr, forkNum);
for(BlockNumber i=0; i < precacheblocks; i++)
{
ReleaseBuffer(ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence, forkNum, i, mode, strategy, &hit));
}
isPreCacheIndexDone = true;
}
buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
forkNum, blockNum, mode, strategy, &hit);
if (hit)
@ -919,6 +931,16 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
mode == RBM_ZERO_ON_ERROR)
pgBufferUsage.shared_blks_read++;
// for precache: buf not be eliminated by clock algorithm
if (needPreCacheEscape && preCacheNodeOid == bufHdr->tag.rnode.relNode)
{
bufHdr->isPreCacheEscape=true;
}
// for unprecache: buf be eliminated by clock algorithm
if (needUnpreCacheEscape && preCacheNodeOid == bufHdr->tag.rnode.relNode)
{
bufHdr->isPreCacheEscape=false;
}
}
/* At this point we do NOT hold any locks. */
@ -1032,6 +1054,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
int lsnLen = 0;
bool outdata = true;
Bufrd tWalRecord;
tWalRecord.count = 0;
tWalRecord.buf = NULL;
LsnNode* head = NULL;
char* pageXlogPtr = NULL;
@ -1047,6 +1070,21 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
MemSet((char *) bufBlock, 0, BLCKSZ);
/* don't set checksum for all-zero page */
smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
/* for new page precache */
if (*preCacheNodesCountPtr > 0)
{
uint16 preCacheNodei = 0;
while (preCacheNodei < *preCacheNodesCountPtr)
{
if (preCacheNodesPtr[preCacheNodei] == bufHdr->tag.rnode.relNode)
{
bufHdr->isPreCacheEscape=true;
break;
}
preCacheNodei++;
}
}
//parallel replay PageFlushWorkerMain=>ProcFlushBufferToDisk=>XLogReadBufferExtended=>default status RM_NORMAL,
//where init page,status is RBM_ZERO_AND_LOCK will lead to page invaild,so need smgrextend page then to smgrread
//push standby can use ReadWalsByPage to replay base RBM_ZERO page,but slave must be ensure flush page min LSN point
@ -1056,7 +1094,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
!isLocalBuf) && IsBootstrapProcessingMode() != true && InitdbSingle != true)
{
if (EnableHotStandby == true || InRecovery) {
if (EnableHotStandby == true && push_standby == false) {
BufferTag pageTag;
pageTag.rnode = smgr->smgr_rnode.node;
pageTag.forkNum = forkNum;
@ -1064,18 +1101,20 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
replayLsn = GetXLogReplayRecPtr(&tli);
XLogRecPtr pageLsn = BufferGetLSN(bufHdr);
head = GetLogIndexByPage(&pageTag,pageLsn,replayLsn);
GetXLogReplayRecPtr(&tli);
if ((EnableHotStandby == true && push_standby == false) || he3mirror) {
if (head->next!=NULL) {
tWalRecord = ReadWalsByPage(pageTag.rnode.dbNode,pageTag.rnode.relNode,forkNum,blockNum,tli,head);
}
} else {
LsnNode* next = head->next;
if (next!=NULL) {
walRecord.cap = 8192;
walRecord.buf = malloc(walRecord.cap);
LsnNode* next = head->next;
}
while(next!=NULL) {
int count = walRecordQuery(&walRecord.buf,&walRecord.count,&walRecord.cap,next->lsn);
if (count == -1) {
elog(FATAL,"======walRecordQuery query wal Faild %X/%X======",LSN_FORMAT_ARGS(next->lsn));
elog(FATAL,"======walRecordQuery query wal Faild %X/%X===1===",LSN_FORMAT_ARGS(next->lsn));
}
next = next->next;
}
@ -1118,18 +1157,20 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
pageTag.blockNum = blockNum;
XLogRecPtr pageLsn = BufferGetLSN(bufHdr);
head = GetLogIndexByPage(&pageTag,pageLsn,replayLsn);
if (EnableHotStandby == true && push_standby == false) {
if ((EnableHotStandby == true && push_standby == false) || he3mirror) {
if (head->next != NULL) {
tWalRecord = ReadWalsByPage(pageTag.rnode.dbNode,pageTag.rnode.relNode,forkNum,blockNum,tli,head);
}
} else {
LsnNode* next = head->next;
if (next != NULL) {
walRecord.cap = 8192;
walRecord.buf = malloc(walRecord.cap);
LsnNode* next = head->next;
}
while(next!=NULL) {
int count = walRecordQuery(&walRecord.buf,&walRecord.count,&walRecord.cap,next->lsn);
if (count == -1) {
elog(FATAL,"======walRecordQuery query wal Faild %X/%X======",LSN_FORMAT_ARGS(next->lsn));
elog(FATAL,"======walRecordQuery query wal Faild %X/%X===2===",LSN_FORMAT_ARGS(next->lsn));
}
next = next->next;
}
@ -1140,6 +1181,30 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
elog(FATAL,"smgrextend=>he3dbsmgrread rel %d flk %d blk %d nbytes %d",smgr->smgr_rnode.node.relNode,forkNum, blockNum,nbytes);
} else {
memcpy(bufBlock,pageXlogPtr,BLCKSZ);
if (push_standby == true || EnableHotStandby == false) {
BufferTag pageTag;
pageTag.rnode = smgr->smgr_rnode.node;
pageTag.forkNum = forkNum;
pageTag.blockNum = blockNum;
XLogRecPtr pageLsn = BufferGetLSN(bufHdr);
head = GetLogIndexByPage(&pageTag,pageLsn,replayLsn);
if (head->next!=NULL && he3mirror) {
tWalRecord = ReadWalsByPage(pageTag.rnode.dbNode,pageTag.rnode.relNode,forkNum,blockNum,tli,head);
}else{
LsnNode* next = head->next;
if (next != NULL) {
walRecord.cap = 8192;
walRecord.buf = malloc(walRecord.cap);
}
while(next!=NULL) {
int count = walRecordQuery(&walRecord.buf,&walRecord.count,&walRecord.cap,next->lsn);
if (count == -1) {
elog(FATAL,"======walRecordQuery query wal Faild %X/%X===3===",LSN_FORMAT_ARGS(next->lsn));
}
next = next->next;
}
}
}
}
}
}
@ -1212,14 +1277,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
* apply logs to this old page when read from disk.
*
*/
if (pageXlogPtr != NULL || tWalRecord.buf != NULL || walRecord.buf != NULL)
if (pageXlogPtr != NULL || tWalRecord.count != 0 || walRecord.count != 0)
{
XLogRecPtr pageLsn = BufferGetLSN(bufHdr);
char *xlogStart = NULL;
if (pageXlogPtr != NULL) {
xlogStart = pageXlogPtr + BLCKSZ;
nbytes = nbytes - BLCKSZ;
} else if (tWalRecord.buf != NULL) {
if (walRecord.count != 0) {
xlogStart = walRecord.buf;
nbytes = walRecord.count;
}
} else if (tWalRecord.count != 0) {
xlogStart = tWalRecord.buf;
nbytes = tWalRecord.count;
} else {
@ -1230,13 +1299,15 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
if (pageXlogPtr != NULL) {
free(pageXlogPtr);
pageXlogPtr = NULL;
if (walRecord.count != 0) {
free(walRecord.buf);
FreeLsnNode(head);
}
} else if (tWalRecord.count != 0) {
free_dataRead(tWalRecord.buf,tWalRecord.count,tWalRecord.cap);
FreeLsnNode(head);
} else {
if (walRecord.buf != NULL) {
free(walRecord.buf);
}
FreeLsnNode(head);
}
}
@ -2134,40 +2205,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
ref->refcount++;
Assert(ref->refcount > 0);
// for precache: buf not be eliminated by clock algorithm
if (needPreCacheEscape)
{
uint32 buf_state;
uint32 old_buf_state;
old_buf_state = pg_atomic_read_u32(&buf->state);
for (;;)
{
if (old_buf_state & BM_LOCKED)
old_buf_state = WaitBufHdrUnlocked(buf);
buf_state = old_buf_state;
/* increase refcount */
buf_state += BUF_REFCOUNT_ONE;
if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
buf_state))
{
result = (buf_state & BM_VALID) != 0;
/*
* Assume that we acquired a buffer pin for the purposes of
* Valgrind buffer client checks (even in !result case) to
* keep things simple. Buffers that are unsafe to access are
* not generally guaranteed to be marked undefined or
* non-accessible in any case.
*/
VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
break;
}
}
}
ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
return result;
@ -2221,11 +2258,6 @@ PinBuffer_Locked(BufferDesc *buf)
buf_state = pg_atomic_read_u32(&buf->state);
Assert(buf_state & BM_LOCKED);
buf_state += BUF_REFCOUNT_ONE;
// for precache: buf not be eliminated by clock algorithm
if (needPreCacheEscape)
{
buf_state += BUF_REFCOUNT_ONE;
}
UnlockBufHdr(buf, buf_state);
b = BufferDescriptorGetBuffer(buf);

View File

@ -324,7 +324,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
*/
local_buf_state = LockBufHdr(buf);
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
if (buf->isPreCacheEscape == false && BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
{
if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
{

View File

@ -97,12 +97,13 @@
#include "pgstat.h"
#include "port/pg_iovec.h"
#include "portability/mem.h"
#include "postmaster/secondbuffer.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/spin.h"
#include "utils/guc.h"
#include "utils/resowner_private.h"
#include "utils/hfs.h"
//#include "utils/hfs.h"
/* He3DB: He3FS */
//#include "storage/iport.h"
@ -2422,9 +2423,15 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
int
MasterFileRead(char *buffer,uint32_t dbid, uint32_t relid, uint32_t forkno, uint32_t blockno){
OriginDPageKey odpk;
PageKey pageKey;
Bufrd bufrd;
bufrd.count = 0;
Bufrd *bufrd = NULL;
bufrd = (Bufrd *)malloc(sizeof(Bufrd));
bufrd->count = 0;
bufrd->cap = 0;
bufrd->buf = buffer;
int count = 0;
pageKey.relfileNode.dbNode = dbid;
pageKey.relfileNode.relNode = relid;
@ -2433,14 +2440,17 @@ MasterFileRead(char *buffer,uint32_t dbid, uint32_t relid, uint32_t forkno, uint
pageKey.pageLsn = 0;
pageKey.replyLsn = GetXLogWriteRecPtr();
bufrd = MoveOnePageToMemory(pageKey);
if (bufrd.count > 0)
odpk.pk = pageKey;
odpk.opration = (int)EVICT;
GetPageFromCurrentNode(pageKey,bufrd);
count = bufrd->count;
if (count > 0)
{
memcpy(buffer,bufrd.buf,bufrd.count);
free_dataRead(bufrd.buf,bufrd.count, bufrd.cap);
AddOneItemToDPArray(odpk);
bufrd->buf = NULL;
}
return bufrd.count;
free(bufrd);
return count;
}
int

View File

@ -29,6 +29,7 @@
#include "postmaster/bgworker_internals.h"
#include "postmaster/bgwriter.h"
#include "postmaster/postmaster.h"
#include "postmaster/secondbuffer.h"
#include "replication/logicallauncher.h"
#include "replication/origin.h"
#include "replication/slot.h"
@ -168,7 +169,8 @@ CreateSharedMemoryAndSemaphores(void)
/* secondbufferhash code. */
//TODO the size should be calculated base on data buffer size.
size = add_size(size, 1<<30);
size = add_size(size, SecondBufferShmemSize());
size = add_size(size, SecondBufferLWLockShmemSize());
size = add_size(size, He3dbLogIndexShmemSize());
/* cache file size */
@ -222,6 +224,8 @@ CreateSharedMemoryAndSemaphores(void)
*/
CreateLWLocks();
CreateSecondBufferLWLocks();
/*
* Set up shmem.c index hashtable
*/
@ -254,6 +258,8 @@ CreateSharedMemoryAndSemaphores(void)
* set up second buffer hash
*/
InitSecondBufferHash();
InitSecondBufferMeta();
InitDPageKeyArray();
/*
* set up fs meta

View File

@ -140,6 +140,7 @@ static void SetActiveTblWithFirstPage(LogIndexMemTBL *mem_tbl, const BufferTag *
pg_atomic_write_u32(&(mem_tbl->meta.state), LOG_INDEX_MEM_TBL_STATE_ACTIVE);
// index start with 1, 0 means INVALID. hash[] all values will be 0 after init, so set to 1 when first use.
mem_tbl->meta.id = log_index_mem_list->active_table_index;
mem_tbl->meta.lsn_free_head = 1;
mem_tbl->meta.page_free_head = 1;
// calculate hashcode by buffer tag
@ -235,6 +236,17 @@ static void InsertLsnNodeByHead(LsnNode *head, XLogRecPtr lsn)
head->next = new_node;
}
// Append a freshly allocated node carrying `lsn` directly behind `head` and
// return it, so a caller that passes the current tail can keep appending
// without re-walking the list.
// eg: before: head-->node1-->NULL, called with node1
//     after:  head-->node1-->newNode-->NULL, returns newNode
// Whatever `head->next` pointed to before is overwritten, so `head` is
// expected to be the last node of its list.
static LsnNode *InsertLsnNodeByTail(LsnNode *head, XLogRecPtr lsn)
{
	LsnNode *new_node;

	new_node = (LsnNode *)malloc(sizeof(LsnNode));
	// malloc may fail; report it instead of dereferencing NULL below.
	if (new_node == NULL)
		elog(ERROR, "out of memory while appending LSN node");

	// Fully initialise the node before it becomes reachable from the list.
	new_node->lsn = lsn;
	new_node->next = NULL;
	head->next = new_node;
	return new_node;
}
// print nodelist
static void PrintLsnNode(LsnNode *head)
{
@ -365,6 +377,7 @@ void InsertLogIndexByPage(const BufferTag *page, XLogRecPtr lsn)
if(mem_tbl->hash[hash_key] == 0)
{
// set hash value to next free head
if (!(mem_tbl->meta.page_free_head > LOG_INDEX_MEM_TBL_PAGE_NUM || mem_tbl->meta.lsn_free_head > LOG_INDEX_MEM_TBL_SEG_NUM))
mem_tbl->hash[hash_key] = mem_tbl->meta.page_free_head;
SetNextPageItem(mem_tbl, page, lsn);
}
@ -437,11 +450,13 @@ void InsertLogIndexByPage(const BufferTag *page, XLogRecPtr lsn)
LsnNode *GetLogIndexByPage(const BufferTag *page, XLogRecPtr start_lsn, XLogRecPtr end_lsn)
{
LsnNode *head_node;
LsnNode *tail;
uint64 tbl_index;
// Prevent metadata changes during discovery.
// TODO change to Lightweight Lock
head_node = InitLsnNode();
tail = head_node;
LWLockAcquire(LogIndexMemListLock,LW_SHARED);
tbl_index = log_index_mem_list->table_start_index;
while(tbl_index != log_index_mem_list->active_table_index)
@ -470,9 +485,8 @@ LsnNode *GetLogIndexByPage(const BufferTag *page, XLogRecPtr start_lsn, XLogRecP
{
if(lsn < end_lsn)
{
InsertLsnNodeByHead(head_node, lsn);
tail = InsertLsnNodeByTail(tail, lsn);
}else{
ReverseLsnNode(head_node);
LWLockRelease(LogIndexMemListLock);
return head_node;
}
@ -501,9 +515,8 @@ LsnNode *GetLogIndexByPage(const BufferTag *page, XLogRecPtr start_lsn, XLogRecP
{
if(lsn < end_lsn)
{
InsertLsnNodeByHead(head_node, lsn);
tail = InsertLsnNodeByTail(tail, lsn);
}else{
ReverseLsnNode(head_node);
LWLockRelease(LogIndexMemListLock);
return head_node;
}
@ -514,11 +527,9 @@ LsnNode *GetLogIndexByPage(const BufferTag *page, XLogRecPtr start_lsn, XLogRecP
}
seg_index = item_seg->next_seg;
}
ReverseLsnNode(head_node);
LWLockRelease(LogIndexMemListLock);
return head_node;
}
ReverseLsnNode(head_node);
LWLockRelease(LogIndexMemListLock);
return head_node;
}
@ -584,7 +595,8 @@ TagNode *GetBufTagByLsnRange(XLogRecPtr start_lsn, XLogRecPtr end_lsn)
head_node = InitTagNode();
LWLockAcquire(LogIndexMemListLock,LW_SHARED);
tbl_index = log_index_mem_list->table_start_index;
while(tbl_index != log_index_mem_list->active_table_index)
uint64 active_index_next = (log_index_mem_list->active_table_index+1)%(log_index_mem_list->table_cap);
while(tbl_index != active_index_next)
{
LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[tbl_index]);
tbl_index = (tbl_index + 1)%(log_index_mem_list->table_cap);

View File

@ -286,10 +286,7 @@ static HTAB *LockMethodLockHash;
static HTAB *LockMethodProcLockHash;
static HTAB *LockMethodLocalHash;
/*
secondbufferhash code
*/
static HTAB *SecondBufferHash;
/*
fs meta code
@ -484,47 +481,6 @@ void InitLocks(void)
HASH_ELEM | HASH_BLOBS);
}
/*
 * InitSecondBufferHash
 *		Create (or attach to) the shared "SecondBuffer hash" table and the
 *		shared global-offset structure used alongside it.
 *
 * The hash maps PageKey to PageValue and is partitioned into
 * NUM_LOCK_PARTITIONS partitions. The global-offset structure's spinlock is
 * initialised only by the process that creates the structure.
 */
void
InitSecondBufferHash(void)
{
	HASHCTL		info;
	long		init_table_size,
				max_table_size;
	bool		found;

	/*
	 * Compute init/max size to request for the hash table. Note these
	 * calculations must agree with SecondBufferhashShmemSize!
	 */
	max_table_size = 200;
	init_table_size = max_table_size / 2;

	info.keysize = sizeof(PageKey);
	info.entrysize = sizeof(PageValue);
	info.num_partitions = NUM_LOCK_PARTITIONS;

	SecondBufferHash = ShmemInitHash("SecondBuffer hash",
									 init_table_size,
									 max_table_size,
									 &info,
									 HASH_ELEM | HASH_BLOBS | HASH_PARTITION);

	/*
	 * Allocate the global offset structure. The result must land in
	 * secondBufferGlobalOffset, the pointer dereferenced right below and
	 * by setglobaloffset(); the previous spelling of the assignment target
	 * named a different identifier.
	 */
	secondBufferGlobalOffset =
		ShmemInitStruct("secondbuffer global set",
						sizeof(globalOffset), &found);
	if (!found)
	{
		/* First process to get here: set up the protecting spinlock. */
		SpinLockInit(&secondBufferGlobalOffset->mutex);
		//initglobaloffset();
	}
}
void setglobaloffset(uint64 offset,uint64 ino){
SpinLockAcquire(&secondBufferGlobalOffset->mutex);
@ -561,21 +517,6 @@ void InitFSMetaHash(void)
HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
}
/*
 * secondbuffer_match: key comparison function for page keys.
 *
 * Returns 0 when both keys agree on dbid, blkno, forkno and relid, and 1
 * otherwise. keysize is only sanity-checked against sizeof(pageKey).
 */
static int
secondbuffer_match(const void *key1, const void *key2, Size keysize)
{
	const pageKey *lhs = (const pageKey *) key1;
	const pageKey *rhs = (const pageKey *) key2;

	Assert(keysize == sizeof(pageKey));

	/* Bail out on the first differing field. */
	if (lhs->dbid != rhs->dbid)
		return 1;				/* not equal */
	if (lhs->blkno != rhs->blkno)
		return 1;				/* not equal */
	if (lhs->forkno != rhs->forkno)
		return 1;				/* not equal */
	if (lhs->relid != rhs->relid)
		return 1;				/* not equal */

	return 0;					/* equal */
}
/*
* Fetch the lock method table associated with a given lock
@ -4801,71 +4742,6 @@ int LockWaiterCount(const LOCKTAG *locktag)
return waiters;
}
//**************************
//* ops for the second-buffer hash table
//*************************
/*
 * SecondBufferHashCode
 *		Return the hash value that SecondBufferHash computes for the key *pk.
 *
 * NOTE(review): the key type here is pageKey, while the table is created
 * with keysize sizeof(PageKey) -- confirm both describe the same layout.
 */
uint32
SecondBufferHashCode(const pageKey *pk)
{
return get_hash_value(SecondBufferHash, (const void *)pk);
}
/*
 * SetupSecondBufferInTable
 *		Find the SecondBufferHash entry for *pk, creating it if it does not
 *		exist yet (HASH_ENTER).
 *
 * Returns the entry. When the entry was just created, "not found" is
 * printed and the entry is returned as hash_search() produced it; filling
 * in its payload is left to the caller.
 */
PageValue *
SetupSecondBufferInTable(const pageKey *pk)
{
	PageValue  *pv;
	bool		found;

	/*
	 * Store the lookup result in pv: it used to be assigned to an
	 * undeclared name, leaving the returned pointer uninitialised.
	 */
	pv = (PageValue *)
		hash_search(SecondBufferHash, pk, HASH_ENTER, &found);
	if (!found)
	{
		printf("not found \n");
	}
	return pv;
}
/*
 * CleanUpSecondBuffer
 *		Remove the entry for *pk from SecondBufferHash.
 *
 * hashcode is passed through to hash_search_with_hash_value() as the key's
 * precomputed hash value. The result of the removal is ignored, and no
 * "found" flag is requested (NULL).
 */
void CleanUpSecondBuffer(const PageKey *pk, uint32 hashcode)
{
hash_search_with_hash_value(SecondBufferHash,
(void *)pk,
hashcode,
HASH_REMOVE,
NULL);
}
/*
 * FindSecondBufferInTable
 *		Look up *pk in SecondBufferHash without creating anything
 *		(HASH_FIND) and hand back whatever hash_search() returns.
 */
PageValue *
FindSecondBufferInTable(const PageKey *pk)
{
	bool		present;

	/* The "present" flag is required by the call but not needed here. */
	return (PageValue *) hash_search(SecondBufferHash, pk, HASH_FIND, &present);
}
/*
for fs meta

View File

@ -33,6 +33,7 @@
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/secondbuffer.h"
#include "storage/bufmgr.h"
#include "storage/buf_internals.h"
#include "storage/fd.h"
@ -44,7 +45,7 @@
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/guc.h"
#include "utils/hfs.h"
// #include "utils/hfs.h"
#include "storage/he3db_logindex.h"
/*
@ -93,17 +94,14 @@ typedef struct _MdfdVec
static MemoryContext MdCxt; /* context for all MdfdVec objects */
/* Populate a file tag describing an md.c segment file. */
#define INIT_MD_FILETAG(a,xx_rnode,xx_forknum,xx_segno) \
( \
#define INIT_MD_FILETAG(a, xx_rnode, xx_forknum, xx_segno) \
( \
memset(&(a), 0, sizeof(FileTag)), \
(a).handler = SYNC_HANDLER_MD, \
(a).rnode = (xx_rnode), \
(a).forknum = (xx_forknum), \
(a).segno = (xx_segno) \
)
(a).segno = (xx_segno))
/*** behavior for mdopen & _mdfd_getseg ***/
/* ereport if segment not present */
@ -123,7 +121,6 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */
*/
#define EXTENSION_DONT_CHECK_SIZE (1 << 4)
/* local routines */
static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
bool isRedo);
@ -146,12 +143,10 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
/*
* mdinit() -- Initialize private state for magnetic disk storage manager.
*/
void
mdinit(void)
void mdinit(void)
{
MdCxt = AllocSetContextCreate(TopMemoryContext,
"MdSmgr",
@ -163,8 +158,7 @@ mdinit(void)
*
* Note: this will return true for lingering files, with pending deletions
*/
bool
mdexists(SMgrRelation reln, ForkNumber forkNum)
bool mdexists(SMgrRelation reln, ForkNumber forkNum)
{
/*
* Close it first, to ensure that we notice if the fork has been unlinked
@ -180,8 +174,7 @@ mdexists(SMgrRelation reln, ForkNumber forkNum)
*
* If isRedo is true, it's okay for the relation to exist already.
*/
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
void mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
MdfdVec *mdfd;
char *path;
@ -280,8 +273,7 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
* Note: any failure should be reported as WARNING not ERROR, because
* we are usually not in a transaction anymore when this is called.
*/
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
void mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
/* Now do the per-fork work */
if (forkNum == InvalidForkNumber)
@ -306,7 +298,7 @@ do_truncate(const char *path)
* He3DB: He3FS replace OS FS
* only propeller instance can release disk space
*/
//ret = pg_truncate(path, 0);
// ret = pg_truncate(path, 0);
if (push_standby)
{
ret = pg_truncate(path, 0);
@ -348,7 +340,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
ret = do_truncate(path);
/* Forget any pending sync requests for the first segment */
register_forget_request(rnode, forkNum, 0 /* first seg */ );
register_forget_request(rnode, forkNum, 0 /* first seg */);
}
else
ret = 0;
@ -356,7 +348,8 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
/* Next unlink the file, unless it was already found to be missing */
if (ret == 0 || errno != ENOENT)
{
if (push_standby == true || RelFileNodeBackendIsTemp(rnode)) {
if (push_standby == true || RelFileNodeBackendIsTemp(rnode))
{
ret = unlink(path);
}
if (ret < 0 && errno != ENOENT)
@ -371,7 +364,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
ret = do_truncate(path);
/* Register request to unlink first segment later */
register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
register_unlink_segment(rnode, forkNum, 0 /* first seg */);
}
/*
@ -379,7 +372,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
*/
if (ret >= 0)
{
char *segpath = (char *) palloc(strlen(path) + 12);
char *segpath = (char *)palloc(strlen(path) + 12);
BlockNumber segno;
/*
@ -405,7 +398,8 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
*/
register_forget_request(rnode, forkNum, segno);
}
if (push_standby == true || RelFileNodeBackendIsTemp(rnode)) {
if (push_standby == true || RelFileNodeBackendIsTemp(rnode))
{
if (unlink(segpath) < 0)
{
/* ENOENT is expected after the last segment... */
@ -415,7 +409,9 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
errmsg("could not remove file \"%s\": %m", segpath)));
break;
}
} else {
}
else
{
break;
}
}
@ -434,8 +430,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
* EOF). Note that we assume writing a block beyond current EOF
* causes intervening file space to become filled with zeroes.
*/
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
off_t seekpos;
@ -462,9 +457,9 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE);
if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
{
@ -486,7 +481,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
if (!skipFsync && !SmgrIsTemp(reln))
register_dirty_segment(reln, forknum, v);
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber)RELSEG_SIZE));
}
/*
@ -535,7 +530,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
mdfd->mdfd_vfd = fd;
mdfd->mdfd_segno = 0;
Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber)RELSEG_SIZE));
return mdfd;
}
@ -543,8 +538,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
/*
* mdopen() -- Initialize newly-opened relation.
*/
void
mdopen(SMgrRelation reln)
void mdopen(SMgrRelation reln)
{
/* mark it not open */
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
@ -554,8 +548,7 @@ mdopen(SMgrRelation reln)
/*
* mdclose() -- Close the specified relation, if it isn't closed already.
*/
void
mdclose(SMgrRelation reln, ForkNumber forknum)
void mdclose(SMgrRelation reln, ForkNumber forknum)
{
int nopensegs = reln->md_num_open_segs[forknum];
@ -577,8 +570,7 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
/*
* mdprefetch() -- Initiate asynchronous read of the specified block of a relation
*/
bool
mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
#ifdef USE_PREFETCH
off_t seekpos;
@ -589,11 +581,11 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
if (v == NULL)
return false;
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE);
(void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
(void)FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
#endif /* USE_PREFETCH */
return true;
@ -605,8 +597,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
* This accepts a range of blocks because flushing several pages at once is
* considerably more efficient than doing so individually.
*/
void
mdwriteback(SMgrRelation reln, ForkNumber forknum,
void mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks)
{
/*
@ -621,7 +612,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum,
int segnum_start,
segnum_end;
v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */,
EXTENSION_RETURN_NULL);
/*
@ -637,14 +628,14 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum,
/* compute number of desired writes within the current segment */
segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
if (segnum_start != segnum_end)
nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
nflush = RELSEG_SIZE - (blocknum % ((BlockNumber)RELSEG_SIZE));
Assert(nflush >= 1);
Assert(nflush <= nblocks);
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE));
FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
FileWriteback(v->mdfd_vfd, seekpos, (off_t)BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
nblocks -= nflush;
blocknum += nflush;
@ -654,8 +645,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum,
/*
* mdread() -- Read the specified block from a relation.
*/
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer)
{
off_t seekpos;
@ -672,18 +662,29 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
v = _mdfd_getseg(reln, forknum, blocknum, false,
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
segno = (uint32_t) blocknum /((BlockNumber) RELSEG_SIZE);
seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE));
segno = (uint32_t)blocknum / ((BlockNumber)RELSEG_SIZE);
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE);
//TODO read page from disk
// TODO read page from disk
if (!(InitdbSingle || IsBootstrapProcessingMode() == true))
nbytes = MasterFileRead(buffer,reln->smgr_rnode.node.dbNode,reln->smgr_rnode.node.relNode,forknum,blocknum);
nbytes = MasterFileRead(buffer, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, forknum, blocknum);
if (nbytes == 0)
{
nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
if (!(InitdbSingle || IsBootstrapProcessingMode() == true))
{
PageKey pageKey;
pageKey.relfileNode.dbNode = reln->smgr_rnode.node.dbNode;
pageKey.relfileNode.relNode = reln->smgr_rnode.node.relNode;
pageKey.forkNo = forknum;
pageKey.blkNo = blocknum;
pageKey.pageLsn = PageGetLSN(buffer);
pageKey.replyLsn = 0;
ReceivePageFromDataBuffer(&pageKey, (uint8_t *)buffer);
}
}
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
@ -735,8 +736,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
* 1)return read bytes
* 2)add parameter to control pageXlog read or only page read
*/
int
he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
int he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char **buffer, XLogRecPtr lsn)
{
off_t seekpos;
@ -755,62 +755,54 @@ he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknu
// reln->smgr_rnode.node.relNode,
// reln->smgr_rnode.backend);
OriginDPageKey odpk;
PageKey pageKey;
Bufrd bufrd;
bufrd.count = 0;
Bufrd *bufrd = NULL;
int count = 0;
if (!push_standby)
{
bufrd = (Bufrd *)malloc(sizeof(Bufrd));
bufrd->count = 0;
bufrd->cap = 0;
bufrd->buf = NULL;
pageKey.relfileNode.dbNode = pageTag.rnode.dbNode;
pageKey.relfileNode.relNode = pageTag.rnode.relNode;
pageKey.forkNo = pageTag.forkNum;
pageKey.forkNo = (uint32)pageTag.forkNum;
pageKey.blkNo = pageTag.blockNum;
pageKey.pageLsn = 0;
pageKey.replyLsn = lsn;
bufrd = MoveOnePageToMemory(pageKey);
if (bufrd.count > 0)
{
nbytes = bufrd.count;
*buffer = (uint8_t *)malloc(bufrd.count);
memcpy(*buffer, bufrd.buf,bufrd.count);
free_dataRead(bufrd.buf, bufrd.count, bufrd.cap);
if (push_standby)
{
Assert(bufrd.count == BLCKSZ);
pageKey.pageLsn = PageGetLSN(*buffer);
LsnNode *head = GetLogIndexByPage(&pageTag, pageKey.pageLsn, pageKey.replyLsn);
if (head->next != NULL)
{
TimeLineID tli;
GetXLogReplayRecPtr(&tli);
Bufrd result;
result = ReadWalsByPage(pageKey.relfileNode.dbNode, pageKey.relfileNode.relNode,
pageKey.forkNo, pageKey.blkNo, tli, head);
Assert(result.count != 0);
nbytes += result.count;
*buffer = (uint8_t *)realloc(*buffer, BLCKSZ + result.count);
strcat(*buffer,result.buf);
free_dataRead(result.buf, result.count, result.cap);
odpk.pk = pageKey;
odpk.opration = (int)EVICT;
GetPageFromCurrentNode(pageKey, bufrd);
count = bufrd->count;
}
//TODO free result
FreeLsnNode(head);
}
// *buffer = bufrd.buf;
return nbytes;
if (count > 0)
{
*buffer = bufrd->buf;
free(bufrd);
AddOneItemToDPArray(odpk);
return count;
}
else
{
//TODO 如果本地盘不存在则调用标准接口读取page再调用tikv的借口获取范围的wal
uint8_t *buf = (uint8_t *)malloc(BLCKSZ);
*buffer = (uint8_t *)malloc(BLCKSZ);
// TODO 如果本地盘不存在则调用标准接口读取page再调用tikv的借口获取范围的wal
v = _mdfd_getseg(reln, forknum, blocknum, false,
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
segno = (uint32_t)blocknum /((BlockNumber) RELSEG_SIZE);
Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE);
segno = (uint32_t)blocknum / ((BlockNumber)RELSEG_SIZE);
nbytes = FileRead(v->mdfd_vfd, buf, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
nbytes = FileRead(v->mdfd_vfd, *buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
if (nbytes < BLCKSZ)
{
if (nbytes < 0)
@ -818,6 +810,9 @@ he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknu
(errcode_for_file_access(),
errmsg("could not read block %u in file \"%s\": %m",
blocknum, FilePathName(v->mdfd_vfd))));
if (he3mirror && nbytes == 0)
MemSet(*buffer, 0, BLCKSZ);
ereport(PANIC,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
@ -825,8 +820,13 @@ he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknu
nbytes, BLCKSZ)));
}
if (push_standby)
{
return nbytes;
}
pageKey.pageLsn = PageGetLSN(*buffer);
pageKey.pageLsn = PageGetLSN(buf);;
pageKey.replyLsn = lsn;
LsnNode *head = GetLogIndexByPage(&pageTag, pageKey.pageLsn, pageKey.replyLsn);
@ -835,19 +835,31 @@ he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknu
TimeLineID tli;
GetXLogReplayRecPtr(&tli);
Bufrd result;
result = GetWalsFromDisk(pageKey);
if (result.count == 0) {
result = ReadWalsByPage(pageKey.relfileNode.dbNode,pageKey.relfileNode.relNode,
pageKey.forkNo,pageKey.blkNo, tli, head);
WalLdPageKey wlpk;
wlpk.sk.dbid = pageKey.relfileNode.dbNode;
wlpk.sk.relid = pageKey.relfileNode.relNode;
wlpk.pageLsn = pageKey.pageLsn;
wlpk.partition = 0;
result.count = 0;
result = GetWalFromLocalBuffer(&wlpk);
if (result.count == 0)
{
free(result.buf);
result = ReadWalsByPage(pageKey.relfileNode.dbNode, pageKey.relfileNode.relNode,
pageKey.forkNo, pageKey.blkNo, tli, head);
}
Assert(result.count != 0);
nbytes += result.count;
buf = (uint8_t *)realloc(buf, BLCKSZ + result.count);
strcat(buf,result.buf);
//TODO free result
*buffer = (uint8_t *)realloc(*buffer, BLCKSZ + result.count);
memcpy((*buffer) + BLCKSZ, result.buf, result.count);
// TODO free result
free_dataRead(result.buf, result.count, result.cap);
}
*buffer = buf;
else
{
ReceivePageFromDataBuffer(&pageKey, *buffer);
}
FreeLsnNode(head);
return nbytes;
}
@ -913,17 +925,16 @@ he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknu
* 1)return read bytes
* 2)add parameter to control pageXlog read or only page read
*/
int
he3db_mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
int he3db_mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char **buffer, bool pagexlog, XLogRecPtr lsn)
{
// if (likely(pagexlog))
// {
// if (likely(pagexlog))
// {
return he3db_mdread_pagexlog(reln, forknum, blocknum, buffer, lsn);
// }
// }
// mdread(reln, forknum, blocknum, *buffer, lsn);
// return 0;
// mdread(reln, forknum, blocknum, *buffer, lsn);
// return 0;
}
/*
@ -933,8 +944,7 @@ he3db_mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
* relation (ie, those before the current EOF). To extend a relation,
* use mdextend().
*/
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
off_t seekpos;
@ -955,9 +965,9 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE);
nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
@ -1029,10 +1039,10 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
for (;;)
{
nblocks = _mdnblocks(reln, forknum, v);
if (nblocks > ((BlockNumber) RELSEG_SIZE))
if (nblocks > ((BlockNumber)RELSEG_SIZE))
elog(FATAL, "segment too big");
if (nblocks < ((BlockNumber) RELSEG_SIZE))
return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
if (nblocks < ((BlockNumber)RELSEG_SIZE))
return (segno * ((BlockNumber)RELSEG_SIZE)) + nblocks;
/*
* If segment is exactly RELSEG_SIZE, advance to next one.
@ -1048,15 +1058,14 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
*/
v = _mdfd_openseg(reln, forknum, segno, 0);
if (v == NULL)
return segno * ((BlockNumber) RELSEG_SIZE);
return segno * ((BlockNumber)RELSEG_SIZE);
}
}
/*
* mdtruncate() -- Truncate relation to specified number of blocks.
*/
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
BlockNumber curnblk;
BlockNumber priorblocks;
@ -1114,7 +1123,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
FileClose(v->mdfd_vfd);
_fdvec_resize(reln, forknum, curopensegs - 1);
}
else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
else if (priorblocks + ((BlockNumber)RELSEG_SIZE) > nblocks)
{
/*
* This is the last segment we want to keep. Truncate the file to
@ -1125,7 +1134,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
*/
BlockNumber lastsegblocks = nblocks - priorblocks;
if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
if (FileTruncate(v->mdfd_vfd, (off_t)lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not truncate file \"%s\" to %u blocks: %m",
@ -1161,8 +1170,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
* crash before the next checkpoint syncs the newly-inactive segment, that
* segment may survive recovery, reintroducing unwanted data into the table.
*/
void
mdimmedsync(SMgrRelation reln, ForkNumber forknum)
void mdimmedsync(SMgrRelation reln, ForkNumber forknum)
{
int segno;
int min_inactive_seg;
@ -1224,7 +1232,7 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
/* Temp relations should never be fsync'd */
Assert(!SmgrIsTemp(reln));
if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */))
{
ereport(DEBUG1,
(errmsg_internal("could not forward fsync request because request queue is full")));
@ -1251,7 +1259,7 @@ register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
/* Should never be used with temp relations */
Assert(!RelFileNodeBackendIsTemp(rnode));
RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */);
}
/*
@ -1265,14 +1273,13 @@ register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */);
}
/*
* ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
*/
void
ForgetDatabaseSyncRequests(Oid dbid)
void ForgetDatabaseSyncRequests(Oid dbid)
{
FileTag tag;
RelFileNode rnode;
@ -1283,14 +1290,13 @@ ForgetDatabaseSyncRequests(Oid dbid)
INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber);
RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */);
}
/*
* DropRelationFiles -- drop files of all given relations
*/
void
DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
{
SMgrRelation *srels;
int i;
@ -1317,7 +1323,6 @@ DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
pfree(srels);
}
/*
* _fdvec_resize() -- Resize the fork's open segments array
*/
@ -1413,7 +1418,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
v->mdfd_vfd = fd;
v->mdfd_segno = segno;
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber)RELSEG_SIZE));
/* all done */
return v;
@ -1439,7 +1444,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
Assert(behavior &
(EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));
targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
targetseg = blkno / ((BlockNumber)RELSEG_SIZE);
/* if an existing and opened segment, we're done */
if (targetseg < reln->md_num_open_segs[forknum])
@ -1472,7 +1477,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
Assert(nextsegno == v->mdfd_segno + 1);
if (nblocks > ((BlockNumber) RELSEG_SIZE))
if (nblocks > ((BlockNumber)RELSEG_SIZE))
elog(FATAL, "segment too big");
if ((behavior & EXTENSION_CREATE) ||
@ -1492,19 +1497,19 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
* matters if in recovery, or if the caller is extending the
* relation discontiguously, but that can happen in hash indexes.)
*/
if (nblocks < ((BlockNumber) RELSEG_SIZE))
if (nblocks < ((BlockNumber)RELSEG_SIZE))
{
char *zerobuf = palloc0(BLCKSZ);
mdextend(reln, forknum,
nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
nextsegno * ((BlockNumber)RELSEG_SIZE) - 1,
zerobuf, skipFsync);
pfree(zerobuf);
}
flags = O_CREAT;
}
else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
nblocks < ((BlockNumber) RELSEG_SIZE))
nblocks < ((BlockNumber)RELSEG_SIZE))
{
/*
* When not extending (or explicitly including truncated
@ -1564,7 +1569,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
errmsg("could not seek to end of file \"%s\": %m",
FilePathName(seg->mdfd_vfd))));
/* note that this calculation will ignore any partial block at EOF */
return (BlockNumber) (len / BLCKSZ);
return (BlockNumber)(len / BLCKSZ);
}
/*
@ -1573,8 +1578,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
*
* Return 0 on success, -1 on failure, with errno set.
*/
int
mdsyncfiletag(const FileTag *ftag, char *path)
int mdsyncfiletag(const FileTag *ftag, char *path)
{
SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
File file;
@ -1622,8 +1626,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
*
* Return 0 on success, -1 on failure, with errno set.
*/
int
mdunlinkfiletag(const FileTag *ftag, char *path)
int mdunlinkfiletag(const FileTag *ftag, char *path)
{
char *p;
@ -1641,8 +1644,7 @@ mdunlinkfiletag(const FileTag *ftag, char *path)
* a SYNC_FILTER_REQUEST request. This will be called for all pending
* requests to find out whether to forget them.
*/
bool
mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
{
/*
* For now we only use filter requests as a way to drop all scheduled

View File

@ -24,7 +24,9 @@
#include "storage/md.h"
#include "storage/smgr.h"
#include "storage/filecache.h"
#include "utils/hfs.h"
#include "postmaster/secondbuffer.h"
//#include "utils/hfs.h"
#include "utils/backend_status.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
#include "utils/guc.h"
@ -55,7 +57,7 @@ typedef struct f_smgr
BlockNumber blocknum, char *buffer, bool skipFsync);
bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
int (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer);
void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
@ -390,10 +392,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
int i = 0;
RelFileNodeBackend *rnodes;
ForkNumber forknum;
OriginDPageKey odpk;
PageKey pk;
if (nrels == 0)
return;
/*
* Get rid of any remaining buffers for the relations. bufmgr will just
* drop them without bothering to write the contents.
@ -449,7 +453,14 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo);
//remove unused pages and related wals in localdisk cache.
RemoveBufferFromLocal(rnodes[i].node.dbNode, rnodes[i].node.relNode, MAX_FORKNUM, 0);
// RemoveBufferFromLocal(rnodes[i].node.dbNode, rnodes[i].node.relNode, MAX_FORKNUM, 0);
pk.relfileNode.dbNode = rnodes[i].node.dbNode;
pk.relfileNode.relNode = rnodes[i].node.relNode;
pk.forkNo = MAX_FORKNUM;
odpk.pk = pk;
odpk.opration = DROP;
AddOneItemToDPArray(odpk);
}
pfree(rnodes);
@ -473,11 +484,11 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
// return;
// }
//if ((push_standby != true && EnableHotStandby != true) || IsBootstrapProcessingMode() || InitdbSingle) {
if ((push_standby != true && EnableHotStandby != true) || IsBootstrapProcessingMode() || InitdbSingle || he3mirror) {
smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
buffer, skipFsync);
// elog(LOG,"smgrextend reln %d,flk %d,blk %d",reln->smgr_rnode.node.relNode,forknum,blocknum);
//}
}
/*
* Normally we expect this to increase nblocks by one, but if the cached
@ -562,15 +573,15 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
PageKey pageKey;
pageKey.relfileNode.dbNode = reln->smgr_rnode.node.dbNode;
pageKey.relfileNode.relNode = reln->smgr_rnode.node.relNode;
pageKey.relfileNode.spcNode = reln->smgr_rnode.node.spcNode;
pageKey.blkNo = blocknum;
pageKey.forkNo = forknum;
pageKey.pageLsn = lsn;
EvictOnePageOutOfMemory(pageKey, buffer);
if (push_standby) {
if (push_standby || he3mirror) {
smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, buffer, skipFsync);
} else {
ReceivePageFromDataBuffer(&pageKey, (uint8_t *) buffer);
}
}
else
@ -665,6 +676,8 @@ void
smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks)
{
int i;
PageKey pk;
OriginDPageKey odpk;
/*
* Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
@ -693,7 +706,17 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
//remove unused pages and related wals in localdisk cache.
RemoveBufferFromLocal(reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, forknum[i], nblocks[i]);
// RemoveBufferFromLocal(reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, forknum[i], nblocks[i]);
if (IsBootstrapProcessingMode() != true && InitdbSingle != true)
{
pk.relfileNode.dbNode = reln->smgr_rnode.node.dbNode;
pk.relfileNode.relNode = reln->smgr_rnode.node.relNode;
pk.forkNo = forknum[i];
pk.blkNo = nblocks[i];
odpk.pk = pk;
odpk.opration = (int)TRUNCATE;
AddOneItemToDPArray(odpk);
}
/*
* We might as well update the local smgr_cached_nblocks values. The
* smgr cache inval message that this function sent will cause other
@ -719,7 +742,8 @@ void
smgrtruncatelsn(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks,XLogRecPtr lsn)
{
int i;
PageKey pk;
OriginDPageKey odpk;
/*
* Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
* just drop them without bothering to write the contents.
@ -746,20 +770,26 @@ smgrtruncatelsn(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber
reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
if(!SmgrIsTemp(reln)) {
if (false == flag) {
XLogRecPtr pushLsn;
XLogRecPtr minApplyLsn;
do {
sleep(1);
pushLsn = QueryPushLsn();
printf("====pushlsn=%lx==lsn==%lx==\n",pushLsn,lsn);
} while(pushLsn!=InvalidXLogRecPtr && pushLsn<lsn);
minApplyLsn = He3DBQueryMinLsnFromAllStanby();
printf("====pushlsn=%lx==lsn==%lx==\n",minApplyLsn,lsn);
} while(minApplyLsn!=InvalidXLogRecPtr && minApplyLsn<lsn);
flag = true;
}
}
smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
//remove unused pages and related wals in localdisk cache.
RemoveBufferFromLocal(reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, forknum[i], nblocks[i]);
// RemoveBufferFromLocal(reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, forknum[i], nblocks[i]);
pk.relfileNode.dbNode = reln->smgr_rnode.node.dbNode;
pk.relfileNode.relNode = reln->smgr_rnode.node.relNode;
pk.forkNo = forknum[i];
pk.blkNo = nblocks[i];
odpk.pk = pk;
odpk.opration = TRUNCATE;
AddOneItemToDPArray(odpk);
/*
* We might as well update the local smgr_cached_nblocks values. The
* smgr cache inval message that this function sent will cause other

View File

@ -86,8 +86,15 @@
* global variables
* ----------------
*/
bool isPreCache = false;
bool isPreCacheTable = false;
bool isPreCacheIndex = false;
bool isPreCacheIndexDone = false;
bool needPreCacheEscape = false;
bool needUnpreCacheEscape = false;
bool isPreCacheAction = true;
Oid preCacheNodeOid = 0;
uint16 *preCacheNodesCountPtr = NULL;
Oid *preCacheNodesPtr = NULL;
const char *debug_query_string; /* client-supplied query string */
/* Note: whereToSendOutput is initialized for the bootstrap/standalone case */
@ -1213,9 +1220,23 @@ exec_simple_query(const char *query_string)
*/
MemoryContextSwitchTo(oldcontext);
if (isPreCache)
if (isPreCacheTable || isPreCacheIndex)
{
if (isPreCacheAction)
{
needPreCacheEscape = true;
needUnpreCacheEscape = false;
}
else
{
needPreCacheEscape = false;
needUnpreCacheEscape = true;
}
}
else
{
needPreCacheEscape = false;
needUnpreCacheEscape = false;
}
/*
* Run the portal to completion, and then drop it (and the receiver).
@ -1228,9 +1249,10 @@ exec_simple_query(const char *query_string)
receiver,
&qc);
if (isPreCache)
if (isPreCacheTable || isPreCacheIndex)
{
needPreCacheEscape = false;
needUnpreCacheEscape = false;
}
receiver->rDestroy(receiver);
@ -1329,6 +1351,55 @@ exec_simple_query(const char *query_string)
debug_query_string = NULL;
}
/*
 * he3_exec_simple_query
 *
 * Front end for exec_simple_query() that recognizes the He3DB pre-cache
 * command prefixes:
 *
 *		precache table <query>		precache index <query>
 *		unprecache table <query>	unprecache index <query>
 *
 * When query_string starts with one of these prefixes, the matching
 * pre-cache mode globals are set, the remainder of the string is executed
 * as an ordinary simple query, and the globals are cleared again.  Any other
 * string is passed to exec_simple_query() unchanged.
 *
 * query_string must be a valid NUL-terminated string.
 */
static void
he3_exec_simple_query(const char *query_string)
{
	/* Recognized prefixes, tested in the same order as before. */
	static const struct
	{
		const char *prefix;		/* literal text the query must start with */
		bool		isIndex;	/* true: index mode, false: table mode */
		bool		isAction;	/* true: precache, false: unprecache */
	}			commands[] = {
		{"precache table ", false, true},
		{"precache index ", true, true},
		{"unprecache table ", false, false},
		{"unprecache index ", true, false},
	};
	size_t		ncommands = sizeof(commands) / sizeof(commands[0]);
	size_t		i;

	/*
	 * Start from a clean state.  The mode flags are only cleared after
	 * exec_simple_query() returns normally; if a previous pre-cache command
	 * left through an error exit instead, the flags would still be set and
	 * the statement below would wrongly run in pre-cache mode.
	 */
	isPreCacheTable = false;
	isPreCacheIndex = false;
	isPreCacheIndexDone = false;
	preCacheNodeOid = 0;

	for (i = 0; i < ncommands; i++)
	{
		size_t		prefixLen = strlen(commands[i].prefix);

		/*
		 * Anchored prefix comparison; this matches exactly when the prefix
		 * occurs at offset 0, without scanning the rest of the query.
		 */
		if (strncmp(query_string, commands[i].prefix, prefixLen) != 0)
			continue;

		if (commands[i].isIndex)
			isPreCacheIndex = true;
		else
			isPreCacheTable = true;
		isPreCacheAction = commands[i].isAction;

		/* Execute only the text that follows the prefix. */
		exec_simple_query(query_string + prefixLen);

		/* Leave pre-cache mode again. */
		preCacheNodeOid = 0;
		isPreCacheIndexDone = false;
		isPreCacheIndex = false;
		isPreCacheTable = false;
		return;
	}

	/* Not a pre-cache command: run it as-is. */
	exec_simple_query(query_string);
}
/*
* exec_parse_message
*
@ -4504,16 +4575,7 @@ PostgresMain(int argc, char *argv[], bool PrivateConn,
}
else
{
if (strstr(query_string, "precache ") != NULL && query_string - strstr(query_string, "precache ") == 0)
{
isPreCache = true;
exec_simple_query(query_string + strlen("precache "));
isPreCache = false;
}
else
{
exec_simple_query(query_string);
}
he3_exec_simple_query(query_string);
}
send_ready_for_query = true;

View File

@ -17,6 +17,8 @@
#include "pg_trace.h"
#include "pgstat.h"
#include "port/atomics.h" /* for memory barriers */
#include "replication/walsender.h"
#include "replication/walsender_private.h"
#include "storage/ipc.h"
#include "storage/proc.h" /* for MyProc */
#include "storage/sinvaladt.h"
@ -1148,3 +1150,42 @@ pgstat_clip_activity(const char *raw_activity)
return activity;
}
/*
 * He3DBQueryMinLsnFromAllStanby
 *
 * Return the smallest apply LSN among the active WAL sender slots, ignoring
 * the WAL sender whose pid belongs to the backend that reports the
 * application name "pgmirror".
 *
 * Returns 0 (the invalid LSN value) when no qualifying WAL sender is active.
 * The result used to be an uninitialized variable in that case, and the
 * minimum itself was computed against that indeterminate starting value.
 */
XLogRecPtr
He3DBQueryMinLsnFromAllStanby(void)
{
	int			i;
	int			procpid = -1;	/* pid of the "pgmirror" backend, if any */
	XLogRecPtr	minApplyLsn = 0;	/* result when nothing qualifies */
	bool		found = false;	/* has any sender contributed yet? */

	/* Locate the backend that identifies itself as "pgmirror". */
	for (i = 0; i < NumBackendStatSlots; i++)
	{
		if (strcmp(BackendStatusArray[i].st_appname, "pgmirror") == 0)
		{
			procpid = BackendStatusArray[i].st_procpid;
			break;
		}
	}

	Assert(WalSndCtl != NULL);

	for (i = 0; i < max_wal_senders; i++)
	{
		WalSnd	   *walsnd = &WalSndCtl->walsnds[i];
		int			pid;
		XLogRecPtr	apply;

		/* Copy the fields we need while holding the slot's spinlock. */
		SpinLockAcquire(&walsnd->mutex);
		pid = walsnd->pid;
		apply = walsnd->apply;
		SpinLockRelease(&walsnd->mutex);

		/* pid == 0 marks an unused slot; the pgmirror sender is excluded. */
		if (pid == 0 || pid == procpid)
			continue;

		/* The first qualifying sender seeds the minimum. */
		if (!found || apply < minApplyLsn)
		{
			minApplyLsn = apply;
			found = true;
		}
	}

	return minApplyLsn;
}

View File

@ -251,6 +251,12 @@ pgstat_get_wait_activity(WaitEventActivity w)
case WAIT_EVENT_PAGEFLUSH_MAIN:
event_name = "PageFlushMain";
break;
case WAIT_EVENT_CLEAN_LOGINDEX_MAIN:
event_name = "CleanLogindexMain";
break;
case WAIT_EVENT_SECONDBUFFER_MAIN:
event_name = "SecondBufferMain";
break;
/* no default case, so that compiler will warn */
}

View File

@ -74,7 +74,6 @@
#include "catalog/pg_ts_template.h"
#include "catalog/pg_type.h"
#include "catalog/pg_user_mapping.h"
#include "catalog/pg_hot_data.h"
#include "lib/qunique.h"
#include "utils/catcache.h"
#include "utils/rel.h"
@ -476,17 +475,6 @@ static const struct cachedesc cacheinfo[] = {
},
4
},
{HotDataRelationId, /* HOTDATADATNAMERELNAME */
HotDataDatnameRelnameIndexId,
2,
{
Anum_pg_hot_data_datname,
Anum_pg_hot_data_relname,
0,
0
},
4
},
{IndexRelationId, /* INDEXRELID */
IndexRelidIndexId,
1,

View File

@ -74,6 +74,7 @@
#include "postmaster/postmaster.h"
#include "postmaster/syslogger.h"
#include "postmaster/walwriter.h"
#include "postmaster/secondbuffer.h"
#include "replication/logicallauncher.h"
#include "replication/reorderbuffer.h"
#include "replication/slot.h"
@ -233,6 +234,7 @@ static bool check_recovery_target_name(char **newval, void **extra, GucSource so
static void assign_recovery_target_name(const char *newval, void *extra);
static bool check_recovery_target_lsn(char **newval, void **extra, GucSource source);
static void assign_recovery_target_lsn(const char *newval, void *extra);
static void assign_walsender_target_lsn(const char *newval, void *extra);
static bool check_primary_slot_name(char **newval, void **extra, GucSource source);
static bool check_default_with_oids(bool *newval, void **extra, GucSource source);
@ -606,6 +608,10 @@ char *pgstat_temp_directory;
char *application_name;
bool push_standby = false;
bool he3_point_in_time_recovery;
bool he3mirror = false;
bool pgmirror = false;
char *client_application_name = NULL;
int tcp_keepalives_idle;
int tcp_keepalives_interval;
@ -650,6 +656,7 @@ static char *timezone_string;
static char *log_timezone_string;
static char *timezone_abbreviations_string;
static char *data_directory;
//static char *lmdb_directory;
static char *session_authorization_string;
static int max_function_args;
static int max_index_keys;
@ -666,6 +673,7 @@ static char *recovery_target_string;
static char *recovery_target_xid_string;
static char *recovery_target_name_string;
static char *recovery_target_lsn_string;
static char *walSendLsnStr;
/* should be static, but commands/variable.c needs to get at this */
@ -754,6 +762,8 @@ const char *const config_group_names[] =
gettext_noop("Write-Ahead Log / Archive Recovery"),
/* WAL_RECOVERY_TARGET */
gettext_noop("Write-Ahead Log / Recovery Target"),
/* WAL_SEND_LSN */
gettext_noop("Write-Ahead Log / Wal Send Lsn"),
/* REPLICATION_SENDING */
gettext_noop("Replication / Sending Servers"),
/* REPLICATION_PRIMARY */
@ -2135,6 +2145,14 @@ static struct config_bool ConfigureNamesBool[] =
false,
NULL, NULL, NULL
},
{
{"he3mirror", PGC_SIGHUP, WAL_ARCHIVE_RECOVERY,
gettext_noop("Sets he3db as replica if he3mirror is configured true."),
},
&he3mirror,
false,
NULL, NULL, NULL
},
/* End-of-list marker */
{
@ -2357,6 +2375,17 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
{
{"second_buffers", PGC_POSTMASTER, RESOURCES_MEM,
gettext_noop("Sets the number of second buffers used by the server."),
NULL,
GUC_UNIT_BLOCKS
},
&SNBuffers,
1024, 16, INT_MAX / 2,
NULL, NULL, NULL
},
{
{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
gettext_noop("Sets the maximum number of temporary buffers used by each session."),
@ -3950,6 +3979,15 @@ static struct config_string ConfigureNamesString[] =
"",
check_recovery_target_lsn, assign_recovery_target_lsn, NULL
},
{
{"wal_send_lsn", PGC_SIGHUP, WAL_SEND_LSN,
gettext_noop("Sets the LSN of the wal send log location up to which mirror start"),
NULL
},
&walSendLsnStr,
"",
check_recovery_target_lsn, assign_walsender_target_lsn, NULL
},
{
{"promote_trigger_file", PGC_SIGHUP, REPLICATION_STANDBY,
@ -3972,6 +4010,17 @@ static struct config_string ConfigureNamesString[] =
NULL, NULL, NULL
},
{
{"he3_meta_conninfo", PGC_SIGHUP, CONN_AUTH_AUTH,
gettext_noop("Sets the connection string to be used to connect to the meta server."),
NULL,
GUC_SUPERUSER_ONLY
},
&he3_meta_conninfo,
"",
NULL, NULL, NULL
},
{
{"primary_slot_name", PGC_SIGHUP, REPLICATION_STANDBY,
gettext_noop("Sets the name of the replication slot to use on the sending server."),
@ -4377,6 +4426,30 @@ static struct config_string ConfigureNamesString[] =
NULL, NULL, NULL
},
{
//TODO lmdb
{"lmdb_page_directory", PGC_POSTMASTER, FILE_LOCATIONS,
gettext_noop("Sets the lmdb page directory."),
NULL,
GUC_SUPERUSER_ONLY | GUC_DISALLOW_IN_AUTO_FILE
},
&lmdb_page_directory,
"/tmp/pagedb",
NULL, NULL, NULL
},
{
//TODO lmdb
{"lmdb_wal_directory", PGC_POSTMASTER, FILE_LOCATIONS,
gettext_noop("Sets the lmdb wal directory."),
NULL,
GUC_SUPERUSER_ONLY | GUC_DISALLOW_IN_AUTO_FILE
},
&lmdb_wal_directory,
"/tmp/waldb",
NULL, NULL, NULL
},
{
{"config_file", PGC_POSTMASTER, FILE_LOCATIONS,
gettext_noop("Sets the server's main configuration file."),
@ -12516,6 +12589,16 @@ check_recovery_target_lsn(char **newval, void **extra, GucSource source)
return true;
}
/*
 * GUC assign hook for wal_send_lsn.
 *
 * An unset or empty setting resets walsenderLsn to 0.  Otherwise the value
 * is taken from "extra", which is read as an XLogRecPtr; it is presumably
 * filled in by the check hook registered next to this one in the GUC table
 * (check_recovery_target_lsn) -- confirm against that hook.
 */
static void
assign_walsender_target_lsn(const char *newval, void *extra)
{
	/* Nothing configured: clear the target. */
	if (newval == NULL || newval[0] == '\0')
	{
		walsenderLsn = 0;
		return;
	}

	walsenderLsn = *((XLogRecPtr *) extra);
}
static void
assign_recovery_target_lsn(const char *newval, void *extra)
{

View File

@ -23,12 +23,12 @@ SUBDIRS = \
pg_controldata \
pg_ctl \
pg_dump \
pg_waldump \
pg_resetwal \
pg_test_fsync \
pg_test_timing \
pg_upgrade \
pg_verifybackup \
pg_waldump \
pgbench \
psql \
scripts

View File

@ -0,0 +1,54 @@
# src/bin/pg_produce_wal/Makefile
PGFILEDESC = "pg_produce_wal - decode and display WAL"
PGAPPICON=win32
subdir = src/bin/pg_produce_wal
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
OBJS = \
$(RMGRDESCOBJS) \
$(WIN32RES) \
xlogreader.o \
pg_mirror.o
override CPPFLAGS := -DFRONTEND -DPG_NOREPLAY -I$(libpq_srcdir) $(CPPFLAGS)
librust_log = -DFRONTEND -L$(top_builddir)/src/backend/storage/file -lrust_log -lstdc++ -lm -ldl -lpthread -lfuse3 -Wl,-gc-section
LIBS += $(librust_log)
all: pg_produce_wal
pg_produce_wal: pg_produce_wal.o $(OBJS) | submake-libpgport submake-libpq
$(CC) $(CFLAGS) pg_produce_wal.o $(OBJS) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X)
xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/%
rm -f $@ && $(LN_S) $< .
pg_mirror.c: % : $(top_srcdir)/src/backend/access/transam/%
rm -f $@ && $(LN_S) $< .
#xlog.c: % : $(top_srcdir)/src/backend/access/transam/%
# rm -f $@ && $(LN_S) $< .
#$(RMGRDESCSOURCES): % : $(top_srcdir)/src/backend/access/rmgrdesc/%
# rm -f $@ && $(LN_S) $< .
install: all installdirs
$(INSTALL_PROGRAM) pg_produce_wal$(X) '$(DESTDIR)$(bindir)/pg_produce_wal$(X)'
installdirs:
$(MKDIR_P) '$(DESTDIR)$(bindir)'
uninstall:
rm -f '$(DESTDIR)$(bindir)/pg_produce_wal$(X)'
clean distclean maintainer-clean:
rm -f pg_produce_wal$(X) $(OBJS) xlogreader.c pg_mirror.c
rm -rf tmp_check
check:
$(prove_check)
installcheck:
$(prove_installcheck)

View File

@ -0,0 +1,458 @@
#define FRONTEND 1
#include "postgres.h"
#include <time.h>
#include "access/transam.h"
#include "access/xlog.h"
#include "access/pg_mirror.h"
#include "access/xlog_internal.h"
#include "catalog/pg_control.h"
#include "common/controldata_utils.h"
#include "common/logging.h"
#include "getopt_long.h"
#include "pg_getopt.h"
#include "access/heapam_xlog.h"
#include "catalog/pg_control.h"
#include "access/nbtxlog.h"
#include "access/gistxlog.h"
#include "access/spgxlog.h"
#include "access/brin_xlog.h"
#include "common/file_perm.h"
/*
 * Private state handed to the XLogReader callback (WALDumpBatchRead) through
 * XLogReaderState->private_data.
 */
typedef struct XLogDumpPrivate
{
/* main() sets this to 1 */
TimeLineID timeline;
/* main() sets this to the checkpoint location read from the control file */
XLogRecPtr startptr;
/* WALDumpBatchRead refuses reads at or past this LSN; InvalidXLogRecPtr means no limit */
XLogRecPtr endptr;
/* set to true by WALDumpBatchRead once a read at or past endptr was requested */
bool endptr_reached;
} XLogDumpPrivate;
/*
 * usage
 *		Print the command-line help to stdout.
 *
 * progname is the name shown in the help text (main() passes the result of
 * get_progname()).
 *
 * The summary line used to be the one from pg_controldata ("displays control
 * information of a PostgreSQL database cluster"), which does not describe
 * this program: main() converts the WAL found at the control file's
 * checkpoint location and rewrites the control file and a WAL segment.
 */
static void
usage(const char *progname)
{
	printf(_("%s converts the WAL record at the last checkpoint of a He3DB data directory into PostgreSQL WAL format.\n\n"), progname);
	printf(_("Usage:\n"));
	printf(_("  %s [OPTION] [DATADIR]\n"), progname);
	printf(_("\nOptions:\n"));
	printf(_(" [-D, --pgdata=]DATADIR  data directory\n"));
	printf(_("  -V, --version          output version information, then exit\n"));
	printf(_("  -?, --help             show this help, then exit\n"));
	printf(_("\nIf no data directory (DATADIR) is specified, "
			 "the environment variable PGDATA\nis used.\n\n"));
	printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT);
	printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
}
/*
 * dbState
 *		Translate a DBState value into a human-readable, localized label.
 *
 * Values not covered by the switch yield "unrecognized status code".
 */
static const char *
dbState(DBState state)
{
	const char *label = NULL;

	switch (state)
	{
		case DB_STARTUP:
			label = _("starting up");
			break;
		case DB_SHUTDOWNED:
			label = _("shut down");
			break;
		case DB_SHUTDOWNED_IN_RECOVERY:
			label = _("shut down in recovery");
			break;
		case DB_SHUTDOWNING:
			label = _("shutting down");
			break;
		case DB_IN_CRASH_RECOVERY:
			label = _("in crash recovery");
			break;
		case DB_IN_ARCHIVE_RECOVERY:
			label = _("in archive recovery");
			break;
		case DB_IN_PRODUCTION:
			label = _("in production");
			break;
	}

	/* No case matched: fall back to the generic label. */
	if (label == NULL)
		label = _("unrecognized status code");

	return label;
}
/*
 * wal_level_str
 *		Return the textual name of a WalLevel value.
 *
 * The three known levels map to fixed, untranslated names; anything else
 * yields the localized string "unrecognized wal_level".
 */
static const char *
wal_level_str(WalLevel wal_level)
{
	if (wal_level == WAL_LEVEL_MINIMAL)
		return "minimal";
	if (wal_level == WAL_LEVEL_REPLICA)
		return "replica";
	if (wal_level == WAL_LEVEL_LOGICAL)
		return "logical";

	return _("unrecognized wal_level");
}
/*
 * WALDumpBatchRead
 *		batch_read callback installed in the XLogReaderRoutine by main().
 *
 * If an end pointer is configured in the private state and targetPtr has
 * reached it, the callback records that fact and returns -1.  Otherwise it
 * forwards to He3DBWALRead() and returns that function's result.
 *
 * Note that reqLen is not consulted: the read is always requested with a
 * length of SizeOfXLogRecord.
 */
static int
WALDumpBatchRead(XLogReaderState *state, XLogRecPtr targetPtr,
				 int reqLen, char *readBuff)
{
	XLogDumpPrivate *priv = state->private_data;
	int			nread;

	if (priv->endptr != InvalidXLogRecPtr && targetPtr >= priv->endptr)
	{
		priv->endptr_reached = true;
		return -1;
	}

	nread = He3DBWALRead(state, targetPtr, SizeOfXLogRecord, readBuff);
	return nread;
}
/* Bytes of a WAL page left after subtracting the short page header size. */
#define UsableBytesInPage_tmp (XLOG_BLCKSZ - SizeOfXLogShortPHD)
/* WAL segment size assumed by the helpers in this file: 16 MB. */
#define DEFAULT_XLOG_SEG_SIZE (16*1024*1024)
/*
 * Usable bytes in one DEFAULT_XLOG_SEG_SIZE segment: the per-page usable
 * bytes times the number of pages, less the difference between the long and
 * the short page header sizes.
 */
static uint64 UsableBytesInSegment_tmp =
(DEFAULT_XLOG_SEG_SIZE / XLOG_BLCKSZ * UsableBytesInPage_tmp) -
(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
/*
 * XLogBytePosToRecPtr_tmp
 *		Convert a byte position into an XLogRecPtr.
 *
 * The page-header-aware conversion that used to live here had been switched
 * off with "if (0)" ("original logic, we abandon it"), so it could never
 * run.  The unreachable code has been removed; the function keeps its
 * effective behavior, which is to return the byte position unchanged.
 */
static XLogRecPtr
XLogBytePosToRecPtr_tmp(uint64 bytepos)
{
	return bytepos;
}
/*
 * XLogBytePosToEndRecPtr_tmp
 *		Convert a byte position at the end of a record into an XLogRecPtr.
 *
 * The earlier comment described page-boundary handling, but that logic had
 * been switched off with "if (0)" ("original logic, we abandon it") and
 * could never run.  The unreachable code has been removed; the function
 * keeps its effective behavior, which is to return the byte position
 * unchanged.
 */
static XLogRecPtr
XLogBytePosToEndRecPtr_tmp(uint64 bytepos)
{
	return bytepos;
}
/*
 * BasicOpenFilePerm_tmp
 *		Thin wrapper around open(2).
 *
 * fileName, fileFlags and fileMode are passed to open() unchanged.
 *
 * Returns the new file descriptor on success.  On failure returns -1 with
 * errno holding the error reported by open(); when that error is EMFILE or
 * ENFILE a diagnostic line is printed to stdout first.
 *
 * errno is captured right after open() and restored before returning, so
 * the printf() call cannot disturb the value callers inspect.  The unused
 * "tryAgain" label of the previous version is gone: nothing ever jumped to
 * it.
 */
static int
BasicOpenFilePerm_tmp(const char *fileName, int fileFlags, mode_t fileMode)
{
	int			fd;
	int			save_errno;

	fd = open(fileName, fileFlags, fileMode);
	if (fd >= 0)
		return fd;				/* success! */

	save_errno = errno;
	if (save_errno == EMFILE || save_errno == ENFILE)
		printf("out of file descriptors %d\n", save_errno);

	/* hand the open() error back to the caller untouched */
	errno = save_errno;
	return -1;					/* failure */
}
/*
 * XLogFileInit_tmp
 *		Open or create the WAL segment file for logsegno below "prefix".
 *
 * prefix		directory the path built by XLogFilePath() is appended to
 * logsegno		segment number; timeline 1 and DEFAULT_XLOG_SEG_SIZE are used
 *				to build the file name
 * use_existent	if *use_existent is true the file is only opened and must
 *				already exist; otherwise it is created exclusively
 *				(O_CREAT | O_EXCL) and filled with zeroes up to
 *				DEFAULT_XLOG_SEG_SIZE
 * use_lock		not used
 *
 * Returns the open file descriptor, or -1 on failure.
 *
 * Compared with the previous version this one stops as soon as something
 * fails: a truncated path, a failed create, or a short/failed write of the
 * zero fill all return -1 (the half-written file is closed and removed)
 * instead of writing through an invalid descriptor or silently returning a
 * partially initialized segment.
 *
 * NOTE(review): XLogFilePath() is handed &path[n]; whether the macro bounds
 * its output to the space remaining after the prefix cannot be seen from
 * here -- confirm for long data directory paths.
 */
static int64_t
XLogFileInit_tmp(char *prefix, XLogSegNo logsegno, bool *use_existent, bool use_lock)
{
	char		path[MAXPGPATH];
	char		zero_page[XLOG_BLCKSZ] = {0};
	int			fd;
	int			n;
	off_t		offset;

	n = snprintf(path, sizeof(path), "%s/", prefix);
	if (n < 0 || (size_t) n >= sizeof(path))
	{
		printf("path too long %s\n", prefix);
		return -1;
	}
	XLogFilePath(&path[n], 1, logsegno, DEFAULT_XLOG_SEG_SIZE);

	/*
	 * Try to use existent file (checkpoint maker may have created it already)
	 */
	if (*use_existent)
	{
		fd = BasicOpenFilePerm_tmp(path, O_RDWR | PG_BINARY | SYNC_METHOD_FSYNC, PG_FILE_MODE_OWNER);
		if (fd >= 0)
			return fd;

		/* a missing file is reported only through the return value */
		if (errno != ENOENT)
			printf("open file failed %s\n", path);
		return -1;
	}

	fd = BasicOpenFilePerm_tmp(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, PG_FILE_MODE_OWNER);
	if (fd < 0)
	{
		printf("create file failed %s\n", path);
		return -1;
	}

	/* zero-fill the whole segment, one WAL block at a time */
	for (offset = 0; offset < DEFAULT_XLOG_SEG_SIZE; offset += XLOG_BLCKSZ)
	{
		if (pg_pwrite(fd, zero_page, XLOG_BLCKSZ, offset) != XLOG_BLCKSZ)
		{
			printf("write file failed %s\n", path);
			close(fd);
			/* O_EXCL would make a retry fail on the leftover file */
			unlink(path);
			return -1;
		}
	}

	return fd;
}
int
main(int argc, char *argv[])
{
static struct option long_options[] = {
{"pgdata", required_argument, NULL, 'D'},
{"walfilename", required_argument, NULL, 'F'},
{NULL, 0, NULL, 0}
};
ControlFileData *ControlFile;
bool crc_ok;
char *DataDir = NULL;
time_t time_tmp;
char pgctime_str[128];
char ckpttime_str[128];
char mock_auth_nonce_str[MOCK_AUTH_NONCE_LEN * 2 + 1];
const char *strftime_fmt = "%c";
const char *progname;
char xlogfilename[MAXFNAMELEN];
int c;
int i;
int WalSegSz;
XLogDumpPrivate private;
pg_logging_init(argv[0]);
set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_produce_wal"));
progname = get_progname(argv[0]);
if (argc > 1)
{
if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
{
usage(progname);
exit(0);
}
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
{
puts("pg_controldata (PostgreSQL) " PG_VERSION);
exit(0);
}
}
while ((c = getopt_long(argc, argv, "D:", long_options, NULL)) != -1)
{
switch (c)
{
case 'D':
DataDir = optarg;
break;
default:
fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
exit(1);
}
}
if (DataDir == NULL)
{
if (optind < argc)
DataDir = argv[optind++];
else
DataDir = getenv("PGDATA");
}
/* Complain if any arguments remain */
if (optind < argc)
{
pg_log_error("too many command-line arguments (first is \"%s\")",
argv[optind]);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
progname);
exit(1);
}
if (DataDir == NULL)
{
pg_log_error("no data directory specified");
fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
exit(1);
}
/* get a copy of the control file */
ControlFile = get_controlfile(DataDir, &crc_ok);
if (!crc_ok)
printf(_("WARNING: Calculated CRC checksum does not match value stored in file.\n"
"Either the file is corrupt, or it has a different layout than this program\n"
"is expecting. The results below are untrustworthy.\n\n"));
setControlFile(ControlFile);
/* set wal segment size */
WalSegSz = ControlFile->xlog_seg_size;
if (!IsValidWalSegSize(WalSegSz))
{
printf(_("WARNING: invalid WAL segment size\n"));
printf(ngettext("The WAL segment size stored in the file, %d byte, is not a power of two\n"
"between 1 MB and 1 GB. The file is corrupt and the results below are\n"
"untrustworthy.\n\n",
"The WAL segment size stored in the file, %d bytes, is not a power of two\n"
"between 1 MB and 1 GB. The file is corrupt and the results below are\n"
"untrustworthy.\n\n",
WalSegSz),
WalSegSz);
}
/*
* This slightly-chintzy coding will work as long as the control file
* timestamps are within the range of time_t; that should be the case in
* all foreseeable circumstances, so we don't bother importing the
* backend's timezone library into pg_controldata.
*
* Use variable for format to suppress overly-anal-retentive gcc warning
* about %c
*/
time_tmp = (time_t) ControlFile->time;
strftime(pgctime_str, sizeof(pgctime_str), strftime_fmt,
localtime(&time_tmp));
time_tmp = (time_t) ControlFile->checkPointCopy.time;
strftime(ckpttime_str, sizeof(ckpttime_str), strftime_fmt,
localtime(&time_tmp));
memset(&private, 0, sizeof(XLogDumpPrivate));
private.timeline = 1;
private.startptr = ControlFile->checkPoint;
private.endptr = InvalidXLogRecPtr;
private.endptr_reached = false;
/* we have everything we need, start reading */
XLogReaderState *xlogreader_state;
xlogreader_state =
XLogReaderAllocate(WalSegSz, NULL,
XL_ROUTINE(.batch_read = WALDumpBatchRead),
&private);
if (!xlogreader_state)
printf("out of memory");
xlogreader_state->currTLI = ControlFile->checkPointCopy.ThisTimeLineID;
/* first find a valid recptr to start from */
XLogRecPtr first_record;
int ret = -1;
ret = He3DBWALRead(xlogreader_state,
private.startptr,
SizeOfXLogRecord,
xlogreader_state->readBuf);
if (ret < SizeOfXLogRecord) {
printf("He3DBReadWalInternal Failed\n");
return -1;
}
XLogRecord* record = (XLogRecord*)xlogreader_state->readBuf;
char DStr[1024]={0};
int dLen = 0;
uint64 startLsn = 0,endLsn = 0;
int mtrLen = ArrayXlogHe3ToPg(record,record->xl_tot_len,DStr,&dLen,&startLsn,&endLsn);
ControlFile->checkPoint = startLsn;
ControlFile->checkPointCopy.redo = startLsn;
update_controlfile(DataDir,ControlFile,true);
XLogSegNo segno;
XLByteToSeg(ControlFile->checkPointCopy.redo, segno, WalSegSz);
int64_t recvFile = -1;
XLogSegNo recvSegNo = 0;
TimeLineID recvFileTLI = 1;
//ThisTimeLineID = 1;
/* Close the current segment if it's completed */
if (recvFile < 0)
{
bool use_existent = false;
/* Create/use new log file */
XLByteToSeg(ControlFile->checkPoint, recvSegNo, DEFAULT_XLOG_SEG_SIZE);
recvFile = XLogFileInit_tmp(DataDir,recvSegNo, &use_existent, true);
recvFileTLI = 1;
}
int startoff = 0;
int byteswritten;
/* Calculate the start offset of the received logs */
//startoff = XLogSegmentOffset(ControlFile->checkPoint, DEFAULT_XLOG_SEG_SIZE);
//int segbytes;
//if (startoff + endLsn - ControlFile->checkPoint > DEFAULT_XLOG_SEG_SIZE)
// segbytes = DEFAULT_XLOG_SEG_SIZE - startoff;
//else
// segbytes = endLsn - ControlFile->checkPoint;
/* OK to write the logs */
//errno = 0;
byteswritten = pg_pwrite(recvFile, DStr, dLen, (off_t) startoff);
fsync(recvFile);
close(recvFile);
return 0;
}

View File

@ -54,6 +54,18 @@ typedef struct xl_brin_createidx
} xl_brin_createidx;
#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16))
/*
 * "Old" layout of the main data of a BRIN insert record.
 *
 * NOTE(review): presumably the layout used by stock PostgreSQL, kept next to
 * xl_brin_insert so records can be converted between the two WAL formats --
 * confirm against the code that uses it.
 */
typedef struct xl_old_brin_insert
{
BlockNumber heapBlk;
/* extra information needed to update the revmap */
BlockNumber pagesPerRange;
OffsetNumber offnum;
} xl_old_brin_insert;
/* Size of xl_old_brin_insert up to and including offnum (no trailing padding). */
#define SizeOfOldBrinInsert (offsetof(xl_old_brin_insert, offnum) + sizeof(OffsetNumber))
/*
* This is what we need to know about a BRIN tuple insert
*
@ -95,6 +107,16 @@ typedef struct xl_brin_update
#define SizeOfBrinUpdate (offsetof(xl_brin_update, insert) + SizeOfBrinInsert)
/*
 * "Old" layout of the main data of a BRIN update record: the offset of the
 * old tuple followed by an embedded xl_old_brin_insert.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * xl_brin_update -- confirm against the code that uses it.
 */
typedef struct xl_old_brin_update
{
/* offset number of old tuple on old page */
OffsetNumber oldOffnum;
xl_old_brin_insert insert;
} xl_old_brin_update;
/* Size up to the end of the embedded insert part, measured with SizeOfOldBrinInsert. */
#define SizeOfOldBrinUpdate (offsetof(xl_old_brin_update, insert) + SizeOfOldBrinInsert)
/*
* This is what we need to know about a BRIN tuple samepage update
*

View File

@ -59,6 +59,20 @@ typedef struct gistxlogDelete
#define SizeOfGistxlogDelete (offsetof(gistxlogDelete, ntodelete) + sizeof(uint16))
/*
 * "Old" layout of the main data of a GiST page split record.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * gistxlogPageSplit, kept for converting between WAL formats -- confirm
 * against the code that uses it.
 */
typedef struct gistoldxlogPageSplit
{
BlockNumber origrlink; /* rightlink of the page before split */
GistNSN orignsn; /* NSN of the page before split */
bool origleaf; /* was splitted page a leaf page? */
uint16 npage; /* # of pages in the split */
bool markfollowright; /* set F_FOLLOW_RIGHT flags */
/*
* follow: 1. gistxlogPage and array of IndexTupleData per page
*/
} gistoldxlogPageSplit;
/*
* Backup Blk 0: If this operation completes a page split, by inserting a
* downlink for the split page, the left half of the split

View File

@ -192,6 +192,22 @@ typedef struct xl_multi_insert_tuple
#define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8))
/*
 * "Old" layout of the main data of a heap update record.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * xl_heap_update, kept for converting between WAL formats -- confirm
 * against the code that uses it.
 */
typedef struct xl_old_heap_update
{
TransactionId old_xmax; /* xmax of the old tuple */
OffsetNumber old_offnum; /* old tuple's offset */
uint8 old_infobits_set; /* infomask bits to set on old tuple */
uint8 flags;
TransactionId new_xmax; /* xmax of the new tuple */
OffsetNumber new_offnum; /* new tuple's offset */
/*
* If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags
* are set, xl_heap_header and tuple data for the old tuple follow.
*/
} xl_old_heap_update;
/* Size of xl_old_heap_update up to and including new_offnum (no trailing padding). */
#define SizeOfOldHeapUpdate (offsetof(xl_old_heap_update, new_offnum) + sizeof(OffsetNumber))
/*
* This is what we need to know about update|hot_update
*
@ -346,6 +362,14 @@ typedef struct xl_heap_freeze_page
#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, ntuples) + sizeof(uint16))
/*
 * "Old" layout of the main data of a heap "visible" record.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * xl_heap_visible, kept for converting between WAL formats -- confirm
 * against the code that uses it.
 */
typedef struct xl_old_heap_visible
{
TransactionId cutoff_xid;
uint8 flags;
} xl_old_heap_visible;
/* Size of xl_old_heap_visible up to and including flags (no trailing padding). */
#define SizeOfOldHeapVisible (offsetof(xl_old_heap_visible, flags) + sizeof(uint8))
/*
* This is what we need to know about setting a visibility map bit
*

View File

@ -86,6 +86,16 @@ typedef struct xl_btree_insert
#define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber))
/*
 * "Old" layout of the main data of a btree split record.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * xl_btree_split, kept for converting between WAL formats -- confirm
 * against the code that uses it.
 */
typedef struct xl_old_btree_split
{
uint32 level; /* tree level of page being split */
OffsetNumber firstrightoff; /* first origpage item on rightpage */
OffsetNumber newitemoff; /* new item's offset */
uint16 postingoff; /* offset inside orig posting tuple */
} xl_old_btree_split;
/* Size of xl_old_btree_split up to and including postingoff (no trailing padding). */
#define SizeOfOldBtreeSplit (offsetof(xl_old_btree_split, postingoff) + sizeof(uint16))
/*
* On insert with split, we save all the items going into the right sibling
* so that we can restore it completely from the log record. This way takes

View File

@ -6,7 +6,7 @@
#include "storage/buf_internals.h"
//max Page Num
#define G_QUEUE_LEN 2048
#define PARALLEL_NUM 1
#define PARALLEL_NUM 8
typedef struct lsn_list_t {
XLogRecPtr lsn;
XLogRecPtr endlsn;

View File

@ -0,0 +1,8 @@
/*
 * pg_mirror.h
 *		Declarations of the helpers used to turn He3DB WAL into PostgreSQL
 *		WAL (used by pg_produce_wal).
 */
#ifndef PG_MIRROR_H
#define PG_MIRROR_H
#include "c.h"
#include "catalog/pg_control.h"
/*
 * Convert sLen bytes of He3DB WAL at sBuf into dBuf.
 * NOTE(review): as used by pg_produce_wal, *dLen receives the number of
 * bytes placed in dBuf and *startLsn / *endLsn receive the LSN range of the
 * converted data; the meaning of the return value and the required size of
 * dBuf are not visible here -- confirm in pg_mirror.c.
 */
extern int ArrayXlogHe3ToPg(char*sBuf,int sLen, char*dBuf,int* dLen,uint64 *startLsn,uint64 *endLsn);
/* NOTE(review): presumably loads the control file found under pathstr -- confirm in pg_mirror.c. */
extern void readControlFile(char*pathstr);
/* Hand an already loaded control file to the conversion code (pg_produce_wal calls this right after get_controlfile()). */
extern void setControlFile(ControlFileData *cfile);
#endif

View File

@ -25,10 +25,10 @@ extern XLogRecPtr LastPushPoint;
extern XLogRecPtr QueryMinLsn(XLogRecPtr lsn);
extern XLogRecPtr QueryPushLsn();
// extern XLogRecPtr QueryPushLsn();
extern XLogRecPtr QueryPushChkpointLsn();
extern XLogRecPtr QueryReplyLsn(XLogRecPtr lsn);
// extern XLogRecPtr QueryReplyLsn(XLogRecPtr lsn);
typedef struct DirtyPage {
XLogRecPtr startlsn;

View File

@ -58,6 +58,7 @@ typedef enum BufferStatus{
typedef struct wal_batch_t {
XLogRecPtr startLsn;
XLogRecPtr endLsn;
int dataLen;
pg_atomic_uint32 status;
char* data;

View File

@ -39,6 +39,19 @@ typedef struct spgxlogState
bool isBuild;
} spgxlogState;
/*
 * "Old" layout of the main data of an SP-GiST add-leaf record.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * spgxlogAddLeaf, kept for converting between WAL formats -- confirm
 * against the code that uses it.
 */
typedef struct spgoldxlogAddLeaf
{
bool newPage; /* init dest page? */
bool storesNulls; /* page is in the nulls tree? */
OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */
OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */
OffsetNumber offnumParent; /* where the parent downlink is, if any */
uint16 nodeI;
/* new leaf tuple follows (unaligned!) */
} spgoldxlogAddLeaf;
/*
* Backup Blk 0: destination page for leaf tuple
* Backup Blk 1: parent page (if any)
@ -59,6 +72,35 @@ typedef struct spgxlogAddLeaf
/* new leaf tuple follows (unaligned!) */
} spgxlogAddLeaf;
/*
 * "Old" layout of the main data of an SP-GiST move-leafs record.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * spgxlogMoveLeafs, kept for converting between WAL formats -- confirm
 * against the code that uses it.
 */
typedef struct spgoldxlogMoveLeafs
{
uint16 nMoves; /* number of tuples moved from source page */
bool newPage; /* init dest page? */
bool replaceDead; /* are we replacing a DEAD source tuple? */
bool storesNulls; /* pages are in the nulls tree? */
/* where the parent downlink is */
OffsetNumber offnumParent;
uint16 nodeI;
spgxlogState stateSrc;
/*----------
* data follows:
* array of deleted tuple numbers, length nMoves
* array of inserted tuple numbers, length nMoves + 1 or 1
* list of leaf tuples, length nMoves + 1 or 1 (unaligned!)
*
* Note: if replaceDead is true then there is only one inserted tuple
* number and only one leaf tuple in the data, because we are not copying
* the dead tuple from the source
*----------
*/
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
} spgoldxlogMoveLeafs;
/* Size of the fixed part, i.e. everything before the offsets[] array. */
#define SizeOfOldSpgxlogMoveLeafs offsetof(spgoldxlogMoveLeafs, offsets)
/*
* Backup Blk 0: source leaf page
* Backup Blk 1: destination leaf page
@ -96,6 +138,44 @@ typedef struct spgxlogMoveLeafs
#define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets)
/*
 * "Old" layout of the main data of an SP-GiST add-node record.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * spgxlogAddNode, kept for converting between WAL formats -- confirm
 * against the code that uses it.
 */
typedef struct spgoldxlogAddNode
{
/*
* Offset of the original inner tuple, in the original page (on backup
* block 0).
*/
OffsetNumber offnum;
/*
* Offset of the new tuple, on the new page (on backup block 1). Invalid,
* if we overwrote the old tuple in the original page).
*/
OffsetNumber offnumNew;
bool newPage; /* init new page? */
/*----
* Where is the parent downlink? parentBlk indicates which page it's on,
* and offnumParent is the offset within the page. The possible values for
* parentBlk are:
*
* 0: parent == original page
* 1: parent == new page
* 2: parent == different page (blk ref 2)
* -1: parent not updated
*----
*/
int8 parentBlk;
OffsetNumber offnumParent; /* offset within the parent page */
uint16 nodeI;
spgxlogState stateSrc;
/*
* updated inner tuple follows (unaligned!)
*/
} spgoldxlogAddNode;
/*
* Backup Blk 0: original page
* Backup Blk 1: where new tuple goes, if not same place
@ -162,6 +242,42 @@ typedef struct spgxlogSplitTuple
*/
} spgxlogSplitTuple;
/*
 * "Old" layout of the main data of an SP-GiST pick-split record.
 *
 * NOTE(review): presumably the stock PostgreSQL counterpart of
 * spgxlogPickSplit, kept for converting between WAL formats -- confirm
 * against the code that uses it.
 */
typedef struct spgoldxlogPickSplit
{
bool isRootSplit;
uint16 nDelete; /* n to delete from Src */
uint16 nInsert; /* n to insert on Src and/or Dest */
bool initSrc; /* re-init the Src page? */
bool initDest; /* re-init the Dest page? */
/* where to put new inner tuple */
OffsetNumber offnumInner;
bool initInner; /* re-init the Inner page? */
bool storesNulls; /* pages are in the nulls tree? */
/* where the parent downlink is, if any */
bool innerIsParent; /* is parent the same as inner page? */
OffsetNumber offnumParent;
uint16 nodeI;
spgxlogState stateSrc;
/*----------
* data follows:
* array of deleted tuple numbers, length nDelete
* array of inserted tuple numbers, length nInsert
* array of page selector bytes for inserted tuples, length nInsert
* new inner tuple (unaligned!)
* list of leaf tuples, length nInsert (unaligned!)
*----------
*/
OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
} spgoldxlogPickSplit;
/* Size of the fixed part, i.e. everything before the offsets[] array. */
#define SizeOfOldSpgxlogPickSplit offsetof(spgoldxlogPickSplit, offsets)
/*
* Buffer references in the rdata array are:
* Backup Blk 0: Src page (only if not root)

View File

@ -108,6 +108,7 @@ extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd;
extern bool reachedConsistency;
extern int group_total_len;
extern int grouo_rec_count;
extern int grouo_rec_cur_count;
extern XLogRecord *grouphead[XLR_MAX_BLOCK_ID + 1];
extern int grouplens[XLR_MAX_BLOCK_ID + 1];
extern XLogRecData groupRecData[XLR_MAX_BLOCK_ID + 1];
@ -142,11 +143,14 @@ extern char *PrimarySlotName;
extern bool wal_receiver_create_temp_slot;
extern bool track_wal_io_timing;
extern char *he3_meta_conninfo;
/* indirectly set via GUC system */
extern TransactionId recoveryTargetXid;
extern char *recovery_target_time_string;
extern const char *recoveryTargetName;
extern XLogRecPtr recoveryTargetLSN;
extern XLogRecPtr walsenderLsn;
extern RecoveryTargetType recoveryTarget;
extern char *PromoteTriggerFile;
extern RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal;
@ -386,6 +390,9 @@ extern void XLogRequestWalReceiverReply(void);
extern void assign_max_wal_size(int newval, void *extra);
extern void assign_checkpoint_completion_target(double newval, void *extra);
extern void pushTikv(int onePageListLen,int pageNum,bool flag);
extern XLogRecData *DecodeXLogRecordAssemble(XLogReaderState *state, OldXLogRecord *record,
XLogRecPtr RedoRecPtr, bool doPageWrites,
XLogRecPtr *fpw_lsn, int *num_fpi);
/*
* Routines to start, stop, and get status of a base backup.

View File

@ -219,16 +219,16 @@ struct XLogReaderState
/* last read XLOG position for data currently in readBuf */
uint32 bufoff;
/* last read XLOG position for data currently in readBuf */
// WALSegmentContext segcxt;
// WALOpenSegment seg;
// uint32 segoff;
WALSegmentContext segcxt;
WALOpenSegment seg;
uint32 segoff;
/*
* beginning of prior page read, and its TLI. Doesn't necessarily
* correspond to what's in readBuf; used for timeline sanity checks.
*/
// XLogRecPtr latestPagePtr;
// TimeLineID latestPageTLI;
XLogRecPtr latestPagePtr;
TimeLineID latestPageTLI;
/* beginning of the WAL record being read. */
XLogRecPtr currRecPtr;
@ -275,6 +275,7 @@ struct XLogReaderState
Buffer buffer;
bool isreplay;
bool streamStart;
bool insertTikv;
};
/* Get a new XLogReader */
@ -303,6 +304,8 @@ extern struct XLogRecord *He3DBXLogReadRecord(XLogReaderState *state,
extern struct XLogRecord *He3DBXLogListReadRecord(XLogReaderState *state,
char **errormsg, char *pageXlogBuf);
extern struct XLogRecord *StartupXLogReadRecord(XLogReaderState *state, char **errormsg);
/* Validate a page */
extern bool XLogReaderValidatePageHeader(XLogReaderState *state,
XLogRecPtr recptr, char *phdr);

View File

@ -17,6 +17,21 @@
#include "storage/block.h"
#include "storage/relfilenode.h"
/*
 * "Old" fixed-size header of a WAL record.
 *
 * NOTE(review): presumably the record header as laid out by stock
 * PostgreSQL, kept next to XLogRecord so records can be converted between
 * the two formats (DecodeXLogRecordAssemble() takes an OldXLogRecord) --
 * confirm against the code that uses it.
 */
typedef struct OldXLogRecord
{
uint32 xl_tot_len; /* total len of entire record */
TransactionId xl_xid; /* xact id */
XLogRecPtr xl_prev; /* ptr to previous record in log */
uint8 xl_info; /* flag bits, see below */
RmgrId xl_rmid; /* resource manager for this record */
/* 2 bytes of padding here, initialize to zero */
pg_crc32c xl_crc; /* CRC for this record */
/* XLogRecordBlockHeaders and XLogRecordDataHeader follow, no padding */
} OldXLogRecord;
/* Size of OldXLogRecord up to and including xl_crc (no trailing padding). */
#define SizeOfOldXLogRecord (offsetof(OldXLogRecord, xl_crc) + sizeof(pg_crc32c))
/*
* The overall layout of an XLOG record is:
* Fixed-size header (XLogRecord struct)

View File

@ -13,6 +13,7 @@
#include "access/xlogreader.h"
#include "storage/bufmgr.h"
#include "access/xlogutils.h"
extern bool XLogHaveInvalidPages(void);
@ -60,8 +61,9 @@ extern int read_local_xlog_page(XLogReaderState *state,
XLogRecPtr targetPagePtr, int reqLen,
XLogRecPtr targetRecPtr, char *cur_page);
extern int read_local_xlog_batch(XLogReaderState *state,
XLogRecPtr startRecPtr,
int reqLen,
XLogRecPtr startRecPtr, char *cur_page);
char *cur_page);
extern void wal_segment_open(XLogReaderState *state,
XLogSegNo nextSegNo,
TimeLineID *tli_p);

View File

@ -1,66 +0,0 @@
/*-------------------------------------------------------------------------
*
* pg_hot_data.h
* definition of the "hot_data" system catalog (pg_hot_data)
*
*
* Portions Copyright (c) 2022, He3DB Global Development Group
*
* src/include/catalog/pg_hot_data.h
*
* NOTES
* The Catalog.pm module reads this file and derives schema
* information.
*
*-------------------------------------------------------------------------
*/
#ifndef PG_HOT_DATA_H
#define PG_HOT_DATA_H
#include "catalog/genbki.h"
#include "catalog/pg_hot_data_d.h"
/* ----------------
* pg_hot_data definition. cpp turns this into
* typedef struct FormData_pg_hot_data
* ----------------
*/
CATALOG(pg_hot_data,4790,HotDataRelationId) BKI_SHARED_RELATION BKI_ROWTYPE_OID(4793,HotDataRelation_Rowtype_Id) BKI_SCHEMA_MACRO
{
/* database name */
NameData datname;
/* relation name */
NameData relname;
/* caching rules */
char crules;
/* client name */
NameData clientname;
/* client addr */
NameData clientaddr;
#ifdef CATALOG_VARLEN /* variable-length fields start here */
/* cache rules schedule time */
timestamptz crulessettime;
/* hot data cache time */
timestamptz cachetime;
#endif
}FormData_pg_hot_data;
/* ----------------
* Form_pg_hot_data corresponds to a pointer to a tuple with
* the format of pg_hot_data relation.
* ----------------
*/
typedef FormData_pg_hot_data *Form_pg_hot_data;
DECLARE_UNIQUE_INDEX(pg_hot_data_datname_relname_index, 4791, on pg_hot_data using btree(datname name_ops, relname name_ops));
#define HotDataDatnameRelnameIndexId 4791
extern void PrecacheHotData();
#endif

View File

@ -0,0 +1,130 @@
#include "postgres.h"
#include "utils/hfs.h"
#include <lmdb.h>
#define MAXREADERS 512
#define MAPSIE (uint64)1<<40
#define DEFAULTPAGEPATH "/tmp/pagedb"
#define DEFAULTWALPATH "/tmp/waldb"
#define PAGE 1
#define WAL 2
#define BLKSZ 8192
#define DROP 1
#define TRUNCATE 2
#define EVICT 3
#define SDLEN 1024
#define SDNUM 128
#define SecondBufferTableHashPartition(hashcode) \
((hashcode) % NUM_LOCK_PARTITIONS)
#define SecondBufferMappingPartitionLock(hashcode) \
(&SecondBufferMainLWLockArray[SecondBufferTableHashPartition(hashcode)].lock)
extern char *lmdb_page_directory;
extern char *lmdb_wal_directory;
extern Size SNBuffers;
/*
for secondbufferhash code
*/
typedef struct SdPageKey
{
uint32 dbid;
uint32 relid;
uint32 forkno;
uint32 blkno;
} SdPageKey;
typedef struct SdPageKeyEntity
{
SdPageKey spk;
struct SdPageKeyEntity *next;
} SdPageKeyEntity;
typedef struct SdPageKeyList
{
SdPageKeyEntity *head;
SdPageKeyEntity *tail;
} SdPageKeyList;
typedef struct LdPageKey
{
SdPageKey sk;
} LdPageKey;
typedef struct WalLdPageKey
{
SdPageKey sk;
uint64 pageLsn;
uint8 partition;
} WalLdPageKey;
typedef struct OriginDPageKey
{
PageKey pk;
int opration;
} OriginDPageKey;
typedef struct SdPageValue
{
SdPageKey pk;
uint8 pagecontent[BLKSZ];
} SdPageValue;
typedef struct DPageKey
{
PageKey pk;
bool pagedeleted;
uint8_t operation;
} DPageKey;
typedef struct kvStruct {
LdPageKey lpk;
uint8_t *buf;
int32 length;
uint64_t lsn;
} kvStruct;
//extern SingleKeyArray *MultiKeyArrays;
/*
 * LMDB handles defined elsewhere: an environment, a database handle and a
 * transaction each for the page store and the WAL store, plus one cursor.
 * NOTE(review): which store the cursor belongs to is not visible here.
 */
extern MDB_env *pageEnv;
extern MDB_env *walEnv;
extern MDB_dbi pageDbi;
extern MDB_dbi walDbi;
extern MDB_txn *pageTxn;
extern MDB_txn *walTxn;
extern MDB_cursor *cursor;
// MDB_stat mst;
// MDB_cursor_op op;
/*
 * Second-buffer API; all functions are defined elsewhere.
 * NOTE(review): descriptions not taken from the original comments are
 * inferred from the names and signatures only - confirm.
 */
/* set-up of second-buffer metadata, hash table, DPageKey array and the two LMDB environments */
extern void InitSecondBufferMeta(void);
extern void InitSecondBufferHash(void);
extern void InitDPageKeyArray(void);
extern void InitPageDBEnv(void);
extern void InitWalDBEnv(void);
/* takes a kvStruct pointer and a length; presumably "length" is the number of items at "ks" */
extern void storeWalInLocalBuffer(kvStruct *ks,int32 length);
/* Call this when a page is evicted from the data buffer, to store the page. */
extern void ReceivePageFromDataBuffer(PageKey *pk, uint8_t *buffer);
/*
 * Original note: asynchronously deletes the old page version and WAL; call
 * this when moving a page from ld/sdb to db.
 * NOTE(review): presumably the result is returned through "bufrd" - confirm.
 */
extern void GetPageFromCurrentNode(PageKey pk,Bufrd *bufrd);
/* WAL look-ups returning a Bufrd, keyed by page key or by WAL key */
extern Bufrd GetWalFromLd(PageKey *pk);
extern Bufrd GetWalFromLocalBuffer(WalLdPageKey *pk);
/* queues one page key plus operation code */
extern void AddOneItemToDPArray(OriginDPageKey pk);
/* presumably the main loop of the second-buffer worker process */
extern void SecondBufferMain(void);
/* counterparts of InitPageDBEnv() / InitWalDBEnv() */
extern void ClosePageDBEnv(void);
extern void CloseWalEnv(void);
/* LWLock creation and shared-memory size reporting for the second buffer */
extern void CreateSecondBufferLWLocks(void);
extern Size SecondBufferLWLockShmemSize(void);
extern Size SecondBufferShmemSize(void);

View File

@ -189,6 +189,8 @@ typedef struct BufferDesc
BufferTag tag; /* ID of page contained in buffer */
int buf_id; /* buffer's index number (from 0) */
bool isPreCacheEscape; /* escape from clock algorithm */
/* state of the tag, containing flags, refcount and usagecount */
pg_atomic_uint32 state;

View File

@ -79,8 +79,15 @@ extern int bgwriter_flush_after;
extern bool bulk_io_is_in_progress;
extern int bulk_io_in_progress_count;
/*
 * Pre-cache state variables, defined elsewhere.
 * NOTE(review): presumably these flag whether a pre-cache action is running,
 * whether it targets a table or an index, and whether buffers should be
 * marked/unmarked as escaping the clock sweep - confirm against bufmgr.c.
 */
extern bool isPreCache;
extern bool isPreCacheTable;
extern bool isPreCacheIndex;
extern bool isPreCacheIndexDone;
extern bool needPreCacheEscape;
extern bool needUnpreCacheEscape;
extern bool isPreCacheAction;
extern Oid preCacheNodeOid;
/* NOTE(review): presumably a counter and an array of pre-cached node OIDs - confirm */
extern uint16 *preCacheNodesCountPtr;
extern Oid *preCacheNodesPtr;
/* in buf_init.c */
extern PGDLLIMPORT char *BufferBlocks;
@ -305,4 +312,7 @@ TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page)
*/
#define PAGEXLOG_BLCKSZ 49152
/* Maximum number of preCacheNodes entries. */
#define NPreCacheNodes 128
#endif /* BUFMGR_H */

View File

@ -214,23 +214,23 @@ typedef struct WalList
slock_t append_lck;
} WalList;
/*
for secondbufferhash code
*/
typedef struct PageKey
{
uint32 dbid;
uint32 relid;
uint32 forkno;
uint32 blkno;
} PageKey;
// /*
// for secondbufferhash code
// */
// typedef struct SdPageKey
// {
// uint32 dbid;
// uint32 relid;
// uint32 forkno;
// uint32 blkno;
// } SdPageKey;
typedef struct PageValue
{
PageKey pk;
uint8_t page[BLKSZ];
uint8_t pageLsn[LSNSZ];
} PageVlue;
// typedef struct SdPageValue
// {
// SdPageKey pk;
// uint8_t page[BLKSZ];
// uint8_t pageLsn[LSNSZ];
// } SdPageValue;
//**************for fs meta************
@ -678,9 +678,6 @@ extern void RememberSimpleDeadLock(PGPROC *proc1,
extern void InitDeadLockChecking(void);
extern int LockWaiterCount(const LOCKTAG *locktag);
extern void InitSecondBufferHash(void);
extern PageValue *SetupSecondBufferInTable(const PageKey *pageKey);
extern PageValue *FindSecondBufferInTable(const PageKey *pageKey);
#ifdef LOCK_DEBUG
extern void DumpLocks(PGPROC *proc);

View File

@ -42,6 +42,7 @@ typedef enum
PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */
PMSIGNAL_PARALLEL_FLUSH_WORKER,
PMSIGNAL_CLEAN_LOGINDEX_WORKER,
PMSIGNAL_SECONDBUFFER_WORKER,
NUM_PMSIGNALS /* Must be last value of enum! */
} PMSignalReason;

View File

@ -383,7 +383,7 @@ extern PGPROC *PreparedXactProcs;
* operation. Startup process and WAL receiver also consume 2 slots, but WAL
* writer is launched only after startup has exited, so we only need 5 slots.
*/
#define NUM_AUXILIARY_PROCS 5
#define NUM_AUXILIARY_PROCS 10
/* configurable options */
extern PGDLLIMPORT int DeadlockTimeout;

View File

@ -10,6 +10,7 @@
#ifndef BACKEND_STATUS_H
#define BACKEND_STATUS_H
#include "access/xlogdefs.h"
#include "datatype/timestamp.h"
#include "libpq/pqcomm.h"
#include "miscadmin.h" /* for BackendType */
@ -317,5 +318,7 @@ extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid);
extern LocalPgBackendStatus *pgstat_fetch_stat_local_beentry(int beid);
extern char *pgstat_clip_activity(const char *raw_activity);
/*
 * Takes no arguments and returns an XLogRecPtr.  Declared with (void) so
 * that this is a real prototype: an empty parameter list in C11 leaves the
 * arguments unchecked.
 * NOTE(review): judging by the name, presumably the minimum LSN across all
 * standbys - confirm against the definition.
 */
extern XLogRecPtr He3DBQueryMinLsnFromAllStanby(void);
#endif /* BACKEND_STATUS_H */

View File

@ -270,10 +270,14 @@ extern PGDLLIMPORT char *ConfigFileName;
extern char *HbaFileName;
extern char *IdentFileName;
extern char *external_pid_file;
extern char *client_application_name;
extern PGDLLIMPORT char *application_name;
/*
 * He3DB boolean settings, defined elsewhere.
 * NOTE(review): presumably each is backed by a configuration parameter of
 * the same name (push_standby, he3mirror, ...) - confirm in guc.c.
 */
extern PGDLLIMPORT bool push_standby;
extern PGDLLIMPORT bool he3_point_in_time_recovery;
extern PGDLLIMPORT bool he3mirror;
extern PGDLLIMPORT bool pgmirror;
extern int tcp_keepalives_idle;
extern int tcp_keepalives_interval;

View File

@ -69,6 +69,7 @@ enum config_group
WAL_ARCHIVING,
WAL_ARCHIVE_RECOVERY,
WAL_RECOVERY_TARGET,
WAL_SEND_LSN,
REPLICATION_SENDING,
REPLICATION_PRIMARY,
REPLICATION_STANDBY,

View File

@ -3,14 +3,17 @@
#include <stdint.h>
#include <stdlib.h>
#include "utils/pg_lsn.h"
#include "storage/relfilenode.h"
typedef struct{
typedef struct
{
uint8_t *buf;
size_t count;
size_t cap;
} Bufrd;
typedef struct{
typedef struct
{
int64_t fd;
int32_t error;
@ -63,7 +66,8 @@ extern Bufrd dataRead(int64_t fd,
extern void free_dataRead(uint8_t *buf, size_t count, size_t cap);
extern Bufrd readfs(int64_t fd, int64_t offset, uint32_t size);
extern int batchRead(uint8_t *buf, uint32_t timeline, uint64_t startPtr, bool needStore);
extern int batchRead(uint8_t *buf, uint32_t timeline, uint64_t startPtr,uint64_t endPtr, bool needStore);
extern int batchReadForTools(uint8_t *buf, uint32_t timeline, uint64_t startPtr,uint64_t endPtr, bool needStore);
extern uint8_t kvwrite(XLogItem *xlogItem);
extern uint8_t flushwals(XLogItem *xlogItem, uint32_t timeline);
extern uint8_t kvflush(XLogRecPtr lsn);
@ -76,7 +80,14 @@ extern Bufrd ReadWalsByPage(uint32_t dbid,
extern void InsertConsistToKV(uint64_t lsn);
extern uint64_t GetConsistLsn(uint64_t lsn);
extern void DelConsistLsns(uint64_t lsn);
extern void DelRangeWals(uint32_t timeline, uint64_t startPtr,uint64_t endPtr);
//extern void ReceivePageFromDataBuffer(PageKey *pk, uint8_t *buffer); //when evict one page out databuffer, we should call this to store the page.
extern uint8_t EvictOnePageOutOfMemory(PageKey pageKey, char *value);
//GetPageFromCurrentNode(PageKey *pk);
extern Bufrd MoveOnePageToMemory(PageKey pageKey);
extern Bufrd GetWalsFromDisk(PageKey pageKey);
//extern Bufrd GetWalsFromDisk(PageKey pageKey);
extern void RemoveBufferFromLocal(uint32_t dbid, uint32_t relid, uint32_t forkno, uint32_t blkno);

View File

@ -63,7 +63,6 @@ enum SysCacheIdentifier
FOREIGNSERVERNAME,
FOREIGNSERVEROID,
FOREIGNTABLEREL,
HOTDATADATNAMERELNAME,
INDEXRELID,
LANGNAME,
LANGOID,