diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 4f9fa25..48d0132 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -178,7 +178,7 @@ pg_prewarm(PG_FUNCTION_ARGS) for (block = first_block; block <= last_block; ++block) { CHECK_FOR_INTERRUPTS(); - smgrread(rel->rd_smgr, forkNumber, block, blockbuffer.data, GetXLogWriteRecPtr()); + smgrread(rel->rd_smgr, forkNumber, block, blockbuffer.data); ++blocks_done; } } diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index dd0c124..0b8a037 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -22,6 +22,7 @@ #include "storage/smgr.h" #include "utils/rel.h" #include "utils/snapmgr.h" +#include "postmaster/bgwriter.h" PG_MODULE_MAGIC; @@ -385,6 +386,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) Relation rel; ForkNumber fork; BlockNumber block; + XLogRecPtr lsn = InvalidXLogRecPtr; rel = relation_open(relid, AccessExclusiveLock); @@ -394,13 +396,6 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) RelationOpenSmgr(rel); rel->rd_smgr->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber; - block = visibilitymap_prepare_truncate(rel, 0); - if (BlockNumberIsValid(block)) - { - fork = VISIBILITYMAP_FORKNUM; - smgrtruncate(rel->rd_smgr, &fork, 1, &block); - } - if (RelationNeedsWAL(rel)) { xl_smgr_truncate xlrec; @@ -411,8 +406,36 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + XLogFlush(lsn); + if (IsBootstrapProcessingMode() != true && InitdbSingle != true) { + RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT + | CHECKPOINT_FLUSH_ALL); + } + } - XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + block = visibilitymap_prepare_truncate(rel, 0); + if (BlockNumberIsValid(block)) 
+ { + fork = VISIBILITYMAP_FORKNUM; + /* + * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will + * just drop them without bothering to write the contents. + */ + DropRelFileNodeBuffers(rel->rd_smgr, &fork, 1, &block); + + /* + * Send a shared-inval message to force other backends to close any smgr + * references they may have for this rel. This is useful because they + * might have open file pointers to segments that got removed, and/or + * smgr_targblock variables pointing past the new rel end. (The inval + * message will come back to our backend, too, causing a + * probably-unnecessary local smgr flush. But we don't expect that this + * is a performance-critical path.) As in the unlink code, we want to be + * sure the message is sent before we start changing things on-disk. + */ + CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode); + smgrtruncatelsn(rel->rd_smgr, &fork, 1, &block, lsn); } /* diff --git a/doc/he3db/deploy-for-pg-as-primary-he3db-as-replica.md b/doc/he3db/deploy-for-pg-as-primary-he3db-as-replica.md new file mode 100644 index 0000000..c4aa939 --- /dev/null +++ b/doc/he3db/deploy-for-pg-as-primary-he3db-as-replica.md @@ -0,0 +1,107 @@ +# 部署 +## 1 启动原生PG作为主 +### 1.1 PG14.2源码编译安装 +```shell +./configure --enable-depend --enable-cassert --enable-debug CFLAGS="-ggdb -O0" --prefix=/home/postgres/psql14_pg +make && make install +``` +其中,configure选项参考[CONFIGURE-OPTIONS](https://www.postgresql.org/docs/current/install-procedure.html#CONFIGURE-OPTIONS) + +### 1.2 初始化数据 +```shell +cd /home/postgres/psql14_pg +./bin/initdb -D /home/postgres/pgdata_14 +``` + +### 1.3 修改配置文件 +```shell +vim /home/postgres/pgdata_14/postgresql.conf + +port=15432 +wal_level = replica +wal_recycle=off +``` + +修改访问控制文件 +```shell +vim /home/postgres/pgdata_14/pg_hba.conf + +host repl all 0.0.0.0/0 trust +``` + +### 1.4 启动服务 +```shell +./bin/pg_ctl -D /home/postgres/pgdata_14 start -l logfile +``` +### 1.5 创建流复制用户 +```shell +./bin/psql -h127.0.0.1 -p15432 + +postgres=# 
CREATE ROLE repl login replication encrypted password 'repl'; +``` +## 2 启动He3DB作为备 +### 2.1 编译安装PG He3DB +```shell +//编译需要依赖静态库 he3pg/src/backend/storage/file/librust_log.a +./configure --enable-depend --enable-cassert --enable-debug CFLAGS="-ggdb -O0" --prefix=/home/postgres/psqlhe3_mirror +make && make install +``` +### 2.2 从主备份数据 +```shell +cd /home/postgres/psqlhe3_mirror +./bin/pg_basebackup -h 127.0.0.1 -p 15432 -U repl -R -Fp -Xs -Pv -D /home/postgres/pgdata_mirror +``` +### 2.3 修改postgres.conf配置 +```shell +vim /home/postgres/pgdata_mirror/postgresql.conf + +// 配置文件最后添加配置 +primary_conninfo = 'application_name=pushstandby user=repl host=127.0.0.1 port=15432 sslmode=disable sslcompression=0 gssencmode=disable target_session_attrs=any' +hot_standby=on +port = 5434 +push_standby=on +wal_recycle=off +fsync=off +wal_keep_size=10000 +full_page_writes=off +he3mirror=true +``` +### 2.4 启动服务 +```shell +./bin/pg_ctl -D /home/postgres/pgdata_mirror start -l logfile +``` +## 3 验证 +### 3.1 链接主插入新数据 +```shell +./bin/psql -h127.0.0.1 -p15432 +postgres=# create table "t1" (id int); +CREATE TABLE +postgres=# insert into t1 values(1); +INSERT 0 1 +``` +### 3.2 备机验证数据 +```shell +./bin/psql -h127.0.0.1 -p5434 +postgres=# select * from t1; +id +---- +1 + +(1 row) +``` +### 3.3 链接主插入新数据 +``` +./bin/psql -h127.0.0.1 -p15432 +postgres=# insert into t1 values(2); +INSERT 0 1 +``` +### 3.4 备机验证数据 +```shell +./bin/psql -h127.0.0.1 -p5434 +postgres=# select * from t1; +id +---- +1 +2 +(2 row) +``` \ No newline at end of file diff --git a/hbr-raw/cmd/archive-raw.go b/hbr-raw/cmd/archive-raw.go new file mode 100644 index 0000000..af9c780 --- /dev/null +++ b/hbr-raw/cmd/archive-raw.go @@ -0,0 +1,118 @@ +package cmd + +import ( + "bytes" + "fmt" + "strconv" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + "github.com/pingcap/tidb/config" + 
"github.com/pingcap/tidb/store/tikv" + "github.com/spf13/cobra" +) + +var archiveCmd = &cobra.Command{ + Use: "archive", + Short: "Archive He3DB Xlog KV", + Long: "Welcome to use hbr for He3DB xlog archive", + Run: runArchive, +} + +func init() { + rootCmd.AddCommand(archiveCmd) +} + +func runArchive(cmd *cobra.Command, args []string) { + var sem = make(chan bool, concurrency) + archiveStart := time.Now() + access_key, _ := cmd.Flags().GetString("access_key") + secret_key, _ := cmd.Flags().GetString("secret_key") + endpoint, _ := cmd.Flags().GetString("endpoint") + region, _ := cmd.Flags().GetString("region") + bucket, _ := cmd.Flags().GetString("bucket") + pd, _ := cmd.Flags().GetString("pd") + backup_name, _ := cmd.Flags().GetString("name") + archive_start_time_line, _ := cmd.Flags().GetString("archive_start_time_line") + archive_start_lsn, _ := cmd.Flags().GetString("archive_start_lsn") + + if access_key == "" || secret_key == "" || endpoint == "" || region == "" || bucket == "" || pd == "" || backup_name == "" || archive_start_time_line == "" || archive_start_lsn == "" { + fmt.Printf("PARAMETER ERROR!\n") + return + } + + client, err := tikv.NewRawKVClient([]string{pd}, config.Security{}) + if err != nil { + fmt.Printf("Connect Tikv Error!\n%v\n", err) + return + } + + sess, err := session.NewSession(&aws.Config{ + Region: aws.String(region), + Endpoint: aws.String(endpoint), + Credentials: credentials.NewStaticCredentials(access_key, secret_key, ""), + S3ForcePathStyle: aws.Bool(true), + }) + if err != nil { + fmt.Printf("Connect S3 Error!\n%v\n", err) + return + } + s3_client := s3.New(sess) + + var filename string = "" + wlCount := 0 + + // archive wal kv + fmt.Printf("archive wal kv!\n") + for id := 0; id < 8; id++ { + //06000000000000000100000000000000070000000000000000 + //因为加了个id字段,目前不能跨时间线备份 + retStartString := fmt.Sprintf("06%s000000000000000%d%s", archive_start_time_line, id, archive_start_lsn) + //retEndString := 
fmt.Sprintf("06ffffffffffffffff000000000000000%dffffffffffffffff", id) + retEndString := fmt.Sprintf("06%s000000000000000%dffffffffffffffff", archive_start_time_line, id) + + retStart := make([]byte, 25) + retEnd := make([]byte, 25) + index := 0 + for i := 0; i < len(retStartString); i += 2 { + value, _ := strconv.ParseUint(retStartString[i:i+2], 16, 8) + retStart[index] = byte(0xff & value) + value, _ = strconv.ParseUint(retEndString[i:i+2], 16, 8) + retEnd[index] = byte(0xff & value) + index++ + } + fmt.Printf("%x\n", retStart) + fmt.Printf("%x\n", retEnd) + + limit := 10240 + + for { + keys, values, _ := client.Scan(retStart, retEnd, limit) + for k, _ := range keys { + fmt.Printf("%x\n", keys[k]) + filename = fmt.Sprintf("%x", keys[k]) + wg.Add(1) + sem <- true + go s3PutKV(s3_client, bucket, backup_name, filename, values[k], sem) + + if bytes.Compare(retStart, keys[k]) < 0 { + retStart = keys[k] + } + wlCount++ + } + if len(keys) < limit { + break + } + wlCount-- + } + } + + wg.Wait() + client.Close() + + fmt.Printf("wal kv count:%v\n", wlCount) + fmt.Println("backup time:", time.Since(archiveStart)) +} diff --git a/hbr-raw/cmd/help.go b/hbr-raw/cmd/help.go new file mode 100644 index 0000000..208710e --- /dev/null +++ b/hbr-raw/cmd/help.go @@ -0,0 +1,110 @@ +package cmd + +import ( + "bytes" + //"context" + "fmt" + "io/ioutil" + "strconv" + "sync" + "os" + //"time" + + "github.com/aws/aws-sdk-go/aws" + //"github.com/aws/aws-sdk-go/aws/credentials" + //"github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + //"github.com/pingcap/tidb/config" + "github.com/pingcap/tidb/store/tikv" + "github.com/spf13/cobra" +) + +var rootCmd = &cobra.Command{ + Use: "hbr", + Short: "He3DB backup&restore", + Long: "Welcome to use hbr for He3DB backup&restore", + Run: runRoot, +} + +var wg sync.WaitGroup +var concurrency int + +func init() { + rootCmd.PersistentFlags().String("access_key", "", "S3 Access Key") + 
rootCmd.PersistentFlags().String("secret_key", "", "S3 Secret Key") + rootCmd.PersistentFlags().String("endpoint", "", "S3 endpoint") + rootCmd.PersistentFlags().String("region", "", "S3 region") + rootCmd.PersistentFlags().String("bucket", "", "S3 bucket") + rootCmd.PersistentFlags().String("pd", "http://127.0.0.1:2379", "Tikv placement driber") + rootCmd.PersistentFlags().String("name", "", "Backup name") + rootCmd.PersistentFlags().String("archive_start_file", "000000010000000000000001", "start key of archive[included]") + rootCmd.PersistentFlags().String("archive_start_time_line", "0000000000000001", "start time line of archive[included]") + rootCmd.PersistentFlags().String("archive_start_lsn", "0000000000000000", "start lsn of archive[included]") + rootCmd.PersistentFlags().IntVar(&concurrency, "concurrency", 100, "concurrency") +} + +func Execute() { + if err := rootCmd.Execute(); err != nil { + panic(err) + } +} + +func runRoot(cmd *cobra.Command, args []string) { + fmt.Printf("Welcome to use hbr for He3DB backup&restore\n") +} + +func s3PutKV(s3_client *s3.S3, bucket string, backup_name string, filename string, v []byte, sem chan bool) { + defer wg.Done() + defer func() { + <-sem + }() + _, err := s3_client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(backup_name + "/" + filename), + Body: bytes.NewReader(v), + }) + if err != nil { + fmt.Printf("S3 PutObject Error!\n%v\n", err) + os.Exit(1) + } + //fmt.Printf("S3 PutObject!\n") +} + +func s3RestoreKVRaw(s3_client *s3.S3, bucket string, backup_name string, keys *s3.Object, client *tikv.RawKVClient, sem chan bool) { + defer wg.Done() + defer func() { + <-sem + }() + + out, err := s3_client.GetObject(&s3.GetObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(*keys.Key), + }) + if err != nil { + fmt.Printf("S3 GetObject Error!\n%v\n", err) + os.Exit(1) + } + defer out.Body.Close() + + data, err := ioutil.ReadAll(out.Body) + if err != nil { + 
fmt.Printf("out.Body.Read!\n%v\n", err) + os.Exit(1) + } + + fmt.Printf("filename:%s\n", (*keys.Key)[len(backup_name)+1:]) + + ret := make([]byte, (len(*keys.Key)-len(backup_name)-1)/2) + index := 0 + for i := len(backup_name) + 1; i < len(*keys.Key); i += 2 { + value, _ := strconv.ParseUint((*keys.Key)[i:i+2], 16, 8) + ret[index] = byte(0xff & value) + index++ + + } + + if err := client.Put(ret, data); err != nil { + fmt.Printf("Tikv Set Error!\n%v\n", err) + os.Exit(1) + } +} diff --git a/hbr-raw/cmd/restore-raw.go b/hbr-raw/cmd/restore-raw.go new file mode 100644 index 0000000..617ab2f --- /dev/null +++ b/hbr-raw/cmd/restore-raw.go @@ -0,0 +1,95 @@ +package cmd + +import ( + //"context" + "fmt" + //"io/ioutil" + //"strconv" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + "github.com/spf13/cobra" + + "github.com/pingcap/tidb/config" + "github.com/pingcap/tidb/store/tikv" +) + +var restoreCmd = &cobra.Command{ + Use: "restore", + Short: "Restore He3DB", + Long: "Welcome to use hbr for He3DB restore", + Run: runRestore, +} + +func init() { + rootCmd.AddCommand(restoreCmd) +} + +func runRestore(cmd *cobra.Command, args []string) { + var sem = make(chan bool, concurrency) + restoreStart := time.Now() + access_key, _ := cmd.Flags().GetString("access_key") + secret_key, _ := cmd.Flags().GetString("secret_key") + endpoint, _ := cmd.Flags().GetString("endpoint") + region, _ := cmd.Flags().GetString("region") + bucket, _ := cmd.Flags().GetString("bucket") + pd, _ := cmd.Flags().GetString("pd") + backup_name, _ := cmd.Flags().GetString("name") + + if access_key == "" || secret_key == "" || endpoint == "" || region == "" || bucket == "" || pd == "" || backup_name == "" { + fmt.Printf("PARAMETER ERROR!\n") + return + } + + client, err := tikv.NewRawKVClient([]string{pd}, config.Security{}) + if err != nil { + fmt.Printf("Connect Tikv Error!\n%v\n", 
err) + return + } + + sess, err := session.NewSession(&aws.Config{ + Region: aws.String(region), + Endpoint: aws.String(endpoint), + Credentials: credentials.NewStaticCredentials(access_key, secret_key, ""), + S3ForcePathStyle: aws.Bool(true), + }) + if err != nil { + fmt.Printf("Connect S3 Error!\n%v\n", err) + return + } + s3_client := s3.New(sess) + + count := 0 + input := &s3.ListObjectsInput{ + Bucket: aws.String(bucket), + Prefix: aws.String(backup_name), + } + for { + resp, err := s3_client.ListObjects(input) + if err != nil { + fmt.Printf("S3 ListObjects Error!\n%v\n", err) + return + } + + for _, keys := range resp.Contents { + wg.Add(1) + sem <- true + go s3RestoreKVRaw(s3_client, bucket, backup_name, keys, client, sem) + count++ + } + + if resp.NextMarker == nil { + fmt.Printf("Done!\n") + break + } + input.Marker = resp.NextMarker + } + wg.Wait() + fmt.Printf("N:%v\n", count) + fmt.Println("restore time:", time.Since(restoreStart)) +} + + diff --git a/hbr-raw/cmd/scan.go b/hbr-raw/cmd/scan.go new file mode 100644 index 0000000..f904f75 --- /dev/null +++ b/hbr-raw/cmd/scan.go @@ -0,0 +1,92 @@ +package cmd + +import ( + "bytes" + "fmt" + "strconv" + "time" + + "github.com/pingcap/tidb/config" + "github.com/pingcap/tidb/store/tikv" + "github.com/spf13/cobra" +) + +var archive3Cmd = &cobra.Command{ + Use: "scan", + Short: "Archive He3DB Xlog KV", + Long: "Welcome to use hbr for He3DB xlog archive", + Run: runArchive3, +} + +func init() { + rootCmd.AddCommand(archive3Cmd) +} + +func runArchive3(cmd *cobra.Command, args []string) { + archiveStart := time.Now() + access_key, _ := cmd.Flags().GetString("access_key") + secret_key, _ := cmd.Flags().GetString("secret_key") + endpoint, _ := cmd.Flags().GetString("endpoint") + region, _ := cmd.Flags().GetString("region") + bucket, _ := cmd.Flags().GetString("bucket") + pd, _ := cmd.Flags().GetString("pd") + backup_name, _ := cmd.Flags().GetString("name") + archive_start_time_line, _ := 
cmd.Flags().GetString("archive_start_time_line") + archive_start_lsn, _ := cmd.Flags().GetString("archive_start_lsn") + + if access_key == "" || secret_key == "" || endpoint == "" || region == "" || bucket == "" || pd == "" || backup_name == "" || archive_start_time_line == "" || archive_start_lsn == "" { + fmt.Printf("PARAMETER ERROR!\n") + return + } + + client, err := tikv.NewRawKVClient([]string{pd}, config.Security{}) + if err != nil { + fmt.Printf("Connect Tikv Error!\n%v\n", err) + return + } + + wlCount := 0 + + // archive wal kv + fmt.Printf("archive wal kv!\n") + //0600000000000000010000000000000000 + retStartString := fmt.Sprintf("06%s%s", archive_start_time_line, archive_start_lsn) + retEndString := "06ffffffffffffffffffffffffffffffff" + + retStart := make([]byte, 17) + retEnd := make([]byte, 17) + index := 0 + for i := 0; i < len(retStartString); i += 2 { + value, _ := strconv.ParseUint(retStartString[i:i+2], 16, 8) + retStart[index] = byte(0xff & value) + value, _ = strconv.ParseUint(retEndString[i:i+2], 16, 8) + retEnd[index] = byte(0xff & value) + index++ + } + fmt.Printf("%x\n", retStart) + fmt.Printf("%x\n", retEnd) + + limit := 10240 + + for { + keys, _, _ := client.Scan(retStart, retEnd, limit) + for k, _ := range keys { + fmt.Printf("%x\n", keys[k]) + + if bytes.Compare(retStart, keys[k]) < 0 { + retStart = keys[k] + } + wlCount++ + } + if len(keys) < limit { + break + } + wlCount-- + } + + //wg.Wait() + client.Close() + + fmt.Printf("wal kv count:%v\n", wlCount) + fmt.Println("backup time:", time.Since(archiveStart)) +} diff --git a/hbr-raw/cmd/version.go b/hbr-raw/cmd/version.go new file mode 100644 index 0000000..343eec4 --- /dev/null +++ b/hbr-raw/cmd/version.go @@ -0,0 +1,21 @@ +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +var versionCmd = &cobra.Command{ + Use: "version", + Short: "Show Version", + Run: runVersion, +} + +func init() { + rootCmd.AddCommand(versionCmd) +} + +func runVersion(cmd *cobra.Command, args 
[]string) { + fmt.Println("Version 1.0.0 ") +} diff --git a/hbr-raw/go.mod b/hbr-raw/go.mod new file mode 100644 index 0000000..8e0a8e0 --- /dev/null +++ b/hbr-raw/go.mod @@ -0,0 +1,62 @@ +module hbr-raw + +go 1.18 + +require ( + github.com/aws/aws-sdk-go v1.30.24 + github.com/pingcap/tidb v1.1.0-beta.0.20210419034717-00632fb3c710 + github.com/spf13/cobra v1.0.0 +) + +require ( + github.com/BurntSushi/toml v0.3.1 // indirect + github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d // indirect + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.1.1 // indirect + github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f // indirect + github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f // indirect + github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548 // indirect + github.com/danjacques/gofslock v0.0.0-20191023191349-0a45f885bc37 // indirect + github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 // indirect + github.com/go-ole/go-ole v1.2.4 // indirect + github.com/gogo/protobuf v1.3.1 // indirect + github.com/golang/protobuf v1.3.4 // indirect + github.com/google/btree v1.0.0 // indirect + github.com/google/uuid v1.1.1 // indirect + github.com/grpc-ecosystem/go-grpc-middleware v1.1.0 // indirect + github.com/inconshreveable/mousetrap v1.0.0 // indirect + github.com/jmespath/go-jmespath v0.3.0 // indirect + github.com/konsorten/go-windows-terminal-sequences v1.0.3 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect + github.com/opentracing/opentracing-go v1.1.0 // indirect + github.com/pingcap/errors v0.11.5-0.20201126102027-b0a155152ca3 // indirect + github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce // indirect + github.com/pingcap/kvproto v0.0.0-20201126113434-70db5fb4b0dc // indirect + github.com/pingcap/log v0.0.0-20201112100606-8f1e84a3abc8 // indirect + github.com/pingcap/parser v0.0.0-20210107054750-53e33b4018fe // indirect + github.com/pingcap/tipb 
v0.0.0-20200618092958-4fad48b4c8c3 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/prometheus/client_golang v1.5.1 // indirect + github.com/prometheus/client_model v0.2.0 // indirect + github.com/prometheus/common v0.9.1 // indirect + github.com/prometheus/procfs v0.0.8 // indirect + github.com/remyoudompheng/bigfft v0.0.0-20190728182440-6a916e37a237 // indirect + github.com/shirou/gopsutil v2.20.3+incompatible // indirect + github.com/sirupsen/logrus v1.6.0 // indirect + github.com/spaolacci/murmur3 v1.1.0 // indirect + github.com/spf13/pflag v1.0.5 // indirect + github.com/tikv/pd v0.0.0-20210105112549-e5be7fd38659 // indirect + github.com/uber/jaeger-client-go v2.22.1+incompatible // indirect + github.com/uber/jaeger-lib v2.2.0+incompatible // indirect + go.etcd.io/etcd v0.5.0-alpha.5.0.20191023171146-3cf2f69b5738 // indirect + go.uber.org/atomic v1.7.0 // indirect + go.uber.org/multierr v1.6.0 // indirect + go.uber.org/zap v1.16.0 // indirect + golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc // indirect + golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208 // indirect + golang.org/x/sys v0.5.0 // indirect + golang.org/x/text v0.3.4 // indirect + google.golang.org/genproto v0.0.0-20200108215221-bd8f9a0ef82f // indirect + google.golang.org/grpc v1.26.0 // indirect + gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect +) diff --git a/hbr-raw/go.sum b/hbr-raw/go.sum new file mode 100644 index 0000000..d4a2c9e --- /dev/null +++ b/hbr-raw/go.sum @@ -0,0 +1,784 @@ +cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= +cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= +cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU= +cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY= +cloud.google.com/go v0.45.1/go.mod 
h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc= +cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0= +cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To= +cloud.google.com/go v0.51.0 h1:PvKAVQWCtlGUSlZkGW3QLelKaWq7KYv/MW1EboG8bfM= +cloud.google.com/go v0.51.0/go.mod h1:hWtGJ6gnXH+KgDv+V0zFGDvpi07n3z8ZNj3T1RW0Gcw= +cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= +cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= +cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= +cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= +cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= +cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= +cloud.google.com/go/storage v1.5.0 h1:RPUcBvDeYgQFMfQu1eBMq6piD1SXmLH+vK3qjewZPus= +cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= +github.com/Jeffail/gabs/v2 v2.5.1/go.mod h1:xCn81vdHKxFUuWWAaD5jCTQDNPBMh5pPs9IJ+NcziBI= +github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= +github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= +github.com/PuerkitoBio/purell v1.1.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= +github.com/PuerkitoBio/urlesc 
v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d h1:G0m3OIz70MZUWq3EgK3CesDbo8upS2Vm9/P3FtgI+Jk= +github.com/StackExchange/wmi v0.0.0-20190523213315-cbe66965904d/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= +github.com/VividCortex/ewma v1.1.1 h1:MnEK4VOv6n0RSY4vtRe3h11qjxL3+t0B8yOL8iMXdcM= +github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA= +github.com/VividCortex/mysqlerr v0.0.0-20200629151747-c28746d985dd/go.mod h1:f3HiCrHjHBdcm6E83vGaXh1KomZMA2P6aeo3hKx/wg0= +github.com/Xeoncross/go-aesctr-with-hmac v0.0.0-20200623134604-12b17a7ff502/go.mod h1:pmnBM9bxWSiHvC/gSWunUIyDvGn33EkP2CUjxFKtTTM= +github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/antihax/optional v0.0.0-20180407024304-ca021399b1a6/go.mod h1:V8iCPQYkqmusNa815XgQio277wI47sdRh1dUOLdyC6Q= +github.com/appleboy/gin-jwt/v2 v2.6.3/go.mod h1:MfPYA4ogzvOcVkRwAxT7quHOtQmVKDpTwxyUrC2DNw0= +github.com/appleboy/gofight/v2 v2.1.2/go.mod h1:frW+U1QZEdDgixycTj4CygQ48yLTUhplt43+Wczp3rw= +github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= +github.com/aws/aws-sdk-go v1.30.24 h1:y3JPD51VuEmVqN3BEDVm4amGpDma2cKJcDPuAU1OR58= +github.com/aws/aws-sdk-go v1.30.24/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZveU8YkpAk0= +github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= +github.com/beorn7/perks v1.0.0/go.mod 
h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= +github.com/blacktear23/go-proxyprotocol v0.0.0-20180807104634-af7a81e8dd0d/go.mod h1:VKt7CNAQxpFpSDz3sXyj9hY/GbVsQCr0sB3w59nE7lU= +github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5/go.mod h1:jtAfVaU/2cu1+wdSRPWE2c1N2qeAA3K4RH9pYgqwets= +github.com/cenkalti/backoff/v4 v4.0.2/go.mod h1:eEew/i+1Q6OrCDZh3WiXYv3+nJwBASZ8Bog/87DQnVg= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= +github.com/cespare/xxhash/v2 v2.1.0/go.mod h1:dgIUBU3pDso/gPgZ1osOZ0iQf77oPR28Tjxl5dIMyVM= +github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= +github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cheggaaa/pb/v3 v3.0.4 h1:QZEPYOj2ix6d5oEg63fbHmpolrnNiwjUsk+h74Yt4bM= +github.com/cheggaaa/pb/v3 v3.0.4/go.mod h1:7rgWxLrAUcFMkvJuv09+DYi7mMUYi8nO9iOWcvGJPfw= +github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= +github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= +github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa h1:OaNxuTZr7kxeODyLWsRMC+OD03aFUH+mW6r2d+MWa5Y= +github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:zn76sxSg3SzpJ0PPJaLDCu+Bu0Lg3sKTORVIj19EIF8= +github.com/codahale/hdrhistogram 
v0.0.0-20161010025455-3a0bb77429bd h1:qMd81Ts1T2OTKmB4acZcyKaMtRnY5Y44NuXGX2GFJ1w= +github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= +github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= +github.com/coreos/etcd v3.3.10+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc32PjwdhPthX9715RE= +github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= +github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmfM= +github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= +github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f h1:JOrtw2xFKzlg+cbHpyrpLDmnN1HqhBfnX7WDiW7eG2c= +github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= +github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f h1:lBNOc5arjvs8E5mO2tbpBpLoyyu8B6e44T7hJy6potg= +github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA= +github.com/corona10/goimagehash v1.0.2/go.mod h1:/l9umBhvcHQXVtQO1V6Gp1yD20STawkhRnnX0D1bvVI= +github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= +github.com/cznic/golex v0.0.0-20181122101858-9c343928389c/go.mod h1:+bmmJDNmKlhWNG+gwWCkaBoTy39Fs+bzRxVBzoTQbIc= 
+github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548 h1:iwZdTE0PVqJCos1vaoKsclOGD3ADKpshg3SRtYBbwso= +github.com/cznic/mathutil v0.0.0-20181122101859-297441e03548/go.mod h1:e6NPNENfs9mPDVNRekM7lKScauxd5kXTr1Mfyig6TDM= +github.com/cznic/parser v0.0.0-20160622100904-31edd927e5b1/go.mod h1:2B43mz36vGZNZEwkWi8ayRSSUXLfjL8OkbzwW4NcPMM= +github.com/cznic/sortutil v0.0.0-20181122101858-f5f958428db8 h1:LpMLYGyy67BoAFGda1NeOBQwqlv7nUXpm+rIVHGxZZ4= +github.com/cznic/sortutil v0.0.0-20181122101858-f5f958428db8/go.mod h1:q2w6Bg5jeox1B+QkJ6Wp/+Vn0G/bo3f1uY7Fn3vivIQ= +github.com/cznic/strutil v0.0.0-20171016134553-529a34b1c186/go.mod h1:AHHPPPXTw0h6pVabbcbyGRK1DckRn7r/STdZEeIDzZc= +github.com/cznic/y v0.0.0-20170802143616-045f81c6662a/go.mod h1:1rk5VM7oSnA4vjp+hrLQ3HWHa+Y4yPCa3/CsJrcNnvs= +github.com/danjacques/gofslock v0.0.0-20191023191349-0a45f885bc37 h1:X6mKGhCFOxrKeeHAjv/3UvT6e5RRxW6wRdlqlV6/H4w= +github.com/danjacques/gofslock v0.0.0-20191023191349-0a45f885bc37/go.mod h1:DC3JtzuG7kxMvJ6dZmf2ymjNyoXwgtklr7FN+Um2B0U= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/denisenkom/go-mssqldb v0.0.0-20191124224453-732737034ffd/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU= +github.com/dgrijalva/jwt-go v3.2.0+incompatible h1:7qlOGliEKZXTDg6OTjfoBKDXWrumCAMpl/TFQ4/5kLM= +github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= +github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA= +github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= +github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= +github.com/docker/go-units 
v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/dustin/go-humanize v0.0.0-20171111073723-bb3d318650d4/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= +github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= +github.com/eknkc/amber v0.0.0-20171010120322-cdade1c07385/go.mod h1:0vRUJqYpeSZifjYj7uP3BG/gKcuzL9xWVV/Y+cK33KM= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/erikstmartin/go-testdb v0.0.0-20160219214506-8d10e4a1bae5/go.mod h1:a2zkGnVExMxdzMo3M0Hi/3sEU+cWnZpSni0O6/Yb/P0= +github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= +github.com/fatih/color v1.9.0 h1:8xPHl4/q1VyqGIPif1F+1V3Y3lSmrq01EabUW3CoW5s= +github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= +github.com/fatih/structtag v1.2.0/go.mod h1:mBJUNpUnHmRKrKlQQlmCrh5PuhftFbNv8Ys4/aAZl94= +github.com/fogleman/gg v1.3.0/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/fsouza/fake-gcs-server v1.17.0/go.mod h1:D1rTE4YCyHFNa99oyJJ5HyclvN/0uQR+pM/VdlL83bw= +github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/gin-contrib/gzip v0.0.1/go.mod h1:fGBJBCdt6qCZuCAOwWuFhBB4OOq9EFqlo5dEaFhhu5w= +github.com/gin-contrib/sse v0.0.0-20170109093832-22d885f9ecc7/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= +github.com/gin-contrib/sse v0.0.0-20190301062529-5545eab6dad3/go.mod h1:VJ0WA2NBN22VlZ2dKZQPAPnyWw5XTlK1KymzLKsr59s= +github.com/gin-contrib/sse 
v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI= +github.com/gin-gonic/gin v1.3.0/go.mod h1:7cKuhb5qV2ggCFctp2fJQ+ErvciLZrIeoOSOm6mUr7Y= +github.com/gin-gonic/gin v1.4.0/go.mod h1:OW2EZn3DO8Ln9oIKOvM++LBO+5UPHJJDH72/q/3rZdM= +github.com/gin-gonic/gin v1.5.0/go.mod h1:Nd6IXA8m5kNZdNEHMBd93KT+mdY3+bewLgRvmCsR2Do= +github.com/go-chi/chi v4.0.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= +github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= +github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= +github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= +github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= +github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= +github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= +github.com/go-ole/go-ole v1.2.4 h1:nNBDSCOigTSiarFpYE9J/KtEA1IOW4CNeqT9TQDqCxI= +github.com/go-ole/go-ole v1.2.4/go.mod h1:XCwSNxSkXRo4vlyPy93sltvi/qJq0jqQhjqQNIwKuxM= +github.com/go-openapi/jsonpointer v0.17.0/go.mod h1:cOnomiV+CVVwFLk0A/MExoFMjwdsUdVpsRhURCKh+3M= +github.com/go-openapi/jsonpointer v0.19.2/go.mod h1:3akKfEdA7DF1sugOqz1dVQHBcuDBPKZGEoHC/NkiQRg= +github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonreference v0.17.0/go.mod h1:g4xxGn04lDIRh0GJb5QlpE3HfopLOL6uZrK/VgnsK9I= +github.com/go-openapi/jsonreference v0.19.0/go.mod h1:g4xxGn04lDIRh0GJb5QlpE3HfopLOL6uZrK/VgnsK9I= +github.com/go-openapi/jsonreference v0.19.2/go.mod h1:jMjeRr2HHw6nAVajTXJ4eiUwohSTlpa0o73RUL1owJc= +github.com/go-openapi/jsonreference v0.19.3/go.mod h1:rjx6GuL8TTa9VaixXglHmQmIL98+wF9xc8zWvFonSJ8= +github.com/go-openapi/spec 
v0.19.0/go.mod h1:XkF/MOi14NmjsfZ8VtAKf8pIlbZzyoTvZsdfssdxcBI= +github.com/go-openapi/spec v0.19.4/go.mod h1:FpwSN1ksY1eteniUU7X0N/BgJ7a4WvBFVA8Lj9mJglo= +github.com/go-openapi/swag v0.17.0/go.mod h1:AByQ+nYG6gQg71GINrmuDXCPWdL640yX49/kXLo40Tg= +github.com/go-openapi/swag v0.19.2/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= +github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= +github.com/go-playground/locales v0.12.1/go.mod h1:IUMDtCfWo/w/mtMfIE/IG2K+Ey3ygWanZIBtBW0W2TM= +github.com/go-playground/overalls v0.0.0-20180201144345-22ec1a223b7c/go.mod h1:UqxAgEOt89sCiXlrc/ycnx00LVvUO/eS8tMUkWX4R7w= +github.com/go-playground/universal-translator v0.16.0/go.mod h1:1AnU7NaIRDWWzGEKwgtJRd2xk99HeFyHw3yid4rvQIY= +github.com/go-sql-driver/mysql v1.4.1/go.mod h1:zAC/RDZ24gD3HViQzih4MyKcchzm+sOG5ZlKdlhCg5w= +github.com/go-sql-driver/mysql v1.5.0 h1:ozyZYNQW3x3HtqT1jira07DN2PArx2v7/mN66gGcHOs= +github.com/go-sql-driver/mysql v1.5.0/go.mod h1:DCzpHaOWr8IXmIStZouvnhqoel9Qv2LBy8hT2VhHyBg= +github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= +github.com/goccy/go-graphviz v0.0.5/go.mod h1:wXVsXxmyMQU6TN3zGRttjNn3h+iCAS7xQFC6TlNvLhk= +github.com/gogo/protobuf v0.0.0-20180717141946-636bf0302bc9/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= +github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= +github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= +github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls= +github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= +github.com/golang-sql/civil v0.0.0-20190719163853-cb61b32ac6fe/go.mod h1:8vg3r2VgvsThLBIFL93Qb5yWzgyZWhEmBwUJWevAkK0= +github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b 
h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7 h1:5ZkaAPbicIKTF2I64qf5Fh8Aa83Q/dnOafMYV0OMwjA= +github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= +github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= +github.com/golang/protobuf v0.0.0-20180814211427-aa810b61a9c7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.4 h1:87PNWwrRvUSnqS4dlcBU/ftvOIBep4sYuBLlh6rX2wk= +github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4= +github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/btree v1.0.0 h1:0udJVsspx3VBr5FwtLhQQtuAsVc79tTq0ocGIPAU6qo= +github.com/google/btree v1.0.0/go.mod 
h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0 h1:xsAVV57WRhGj6kEIi8ReJzQlHHqcBYCElAvkovg3B/4= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= +github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= +github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/pprof v0.0.0-20200407044318-7d83b28da2e9 h1:K+lX49/3eURCE1IjlaZN//u6c+9nfDAMnyQ9E2dsJbY= +github.com/google/pprof v0.0.0-20200407044318-7d83b28da2e9/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= +github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= +github.com/google/shlex v0.0.0-20181106134648-c34317bd91bf/go.mod h1:RpwtwJQFrIEPstU94h88MWPXP2ektJZ8cZ0YntAmXiE= +github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY= +github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= +github.com/googleapis/gax-go/v2 v2.0.5 h1:sjZBwGj9Jlw33ImPtvFviGYvseOtDM7hkSKB7+Tv3SM= +github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= 
+github.com/gorilla/handlers v1.4.2/go.mod h1:Qkdc/uu4tH4g6mTK6auzZ766c4CA0Ng8+o/OAirnOIQ= +github.com/gorilla/mux v1.7.3/go.mod h1:1lud6UwP+6orDFRuTfBEV8e9/aOM/c4fVVCaMa2zaAs= +github.com/gorilla/mux v1.7.4/go.mod h1:DVbg23sWSpFRCP0SfiEN6jmj59UnW/n46BH5rLB71So= +github.com/gorilla/websocket v0.0.0-20170926233335-4201258b820c/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= +github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= +github.com/gorilla/websocket v1.4.1 h1:q7AeDBpnBk8AogcD4DSag/Ukw/KV+YhzLj2bP5HvKCM= +github.com/gorilla/websocket v1.4.1/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= +github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de4/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= +github.com/grpc-ecosystem/go-grpc-middleware v1.1.0 h1:THDBEeQ9xZ8JEaCLyLQqXMMdRqNr0QAUJTIkQAUtFjg= +github.com/grpc-ecosystem/go-grpc-middleware v1.1.0/go.mod h1:f5nM7jw/oeRSadq3xCzHAvxcr8HZnzsqU6ILg/0NiiE= +github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 h1:Ovs26xHkKqVztRpIrF/92BcuyuQ/YW4NSIpoGtfXNho= +github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= +github.com/grpc-ecosystem/grpc-gateway v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= +github.com/grpc-ecosystem/grpc-gateway v1.9.5/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= +github.com/grpc-ecosystem/grpc-gateway v1.12.1/go.mod h1:8XEsbTttt/W+VvjtQhLACqCisSPWTxCZ7sBRjU6iH9c= +github.com/grpc-ecosystem/grpc-gateway v1.14.3 h1:OCJlWkOUoTnl0neNGlf4fUm3TmbEtguw7vR+nGtnDjY= +github.com/grpc-ecosystem/grpc-gateway v1.14.3/go.mod h1:6CwZWGDSPRJidgKAtJVvND6soZe6fT7iteq8wDPdhb0= +github.com/gtank/cryptopasta v0.0.0-20170601214702-1f550f6f2f69/go.mod h1:YLEMZOtU+AZ7dhN9T/IpGhXVGly2bvkJQ+zxj3WeVQo= +github.com/hashicorp/golang-lru v0.5.0/go.mod 
h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/hypnoglow/gormzap v0.3.0/go.mod h1:5Wom8B7Jl2oK0Im9hs6KQ+Kl92w4Y7gKCrj66rhyvw0= +github.com/iancoleman/strcase v0.0.0-20191112232945-16388991a334 h1:VHgatEHNcBFEB7inlalqfNqw65aNkM1lGX2yt3NmbS8= +github.com/iancoleman/strcase v0.0.0-20191112232945-16388991a334/go.mod h1:SK73tn/9oHe+/Y0h39VT4UCxmurVJkR5NA7kMEAOgSE= +github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= +github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= +github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= +github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= +github.com/jinzhu/gorm v1.9.12/go.mod h1:vhTjlKSJUTWNtcbQtrMBFCxy7eXTzeCAzfL5fBZT/Qs= +github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc= +github.com/jinzhu/now v1.0.1/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= +github.com/jmespath/go-jmespath v0.3.0 h1:OS12ieG61fsCg5+qLJ+SsW9NicxNkg3b25OyT2yCeUc= +github.com/jmespath/go-jmespath v0.3.0/go.mod h1:9QtRXoHjLGCJ5IBSaohpXITPlowMeeYCZ7fLUTSywik= +github.com/joho/godotenv v1.3.0/go.mod h1:7hK45KPybAkOC6peb+G5yklZfMxEjkZhHbwpqxOKXbg= +github.com/jonboulle/clockwork v0.1.0 h1:VKV+ZcuP6l3yW9doeqz6ziZGgcynBVQO+obU0+0hcPo= +github.com/jonboulle/clockwork v0.1.0/go.mod h1:Ii8DK3G1RaLaWxj9trq07+26W01tbo22gdxWY5EU2bo= +github.com/joomcode/errorx v1.0.1/go.mod h1:kgco15ekB6cs+4Xjzo7SPeXzx38PbJzBwbnu9qfVNHQ= +github.com/json-iterator/go v1.1.5/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.6/go.mod 
h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= +github.com/json-iterator/go v1.1.7/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/json-iterator/go v1.1.9 h1:9yzud/Ht36ygwatGx56VwCZtlI/2AD15T1X2sjSuGns= +github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= +github.com/jstemmer/go-junit-report v0.9.1 h1:6QPYqodiu3GuPL+7mfx+NwDdp2eTkp9IfEUpgAwUN0o= +github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= +github.com/juju/ratelimit v1.0.1 h1:+7AIFJVQ0EQgq/K9+0Krm7m530Du7tIz0METWzN0RgY= +github.com/juju/ratelimit v1.0.1/go.mod h1:qapgC/Gy+xNh9UxzV13HGGl/6UXNN+ct+vwSgWNm/qk= +github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= +github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvWXihfKN4Q= +github.com/kisielk/errcheck v1.2.0/go.mod h1:/BMXB+zMLi60iA8Vv6Ksmxu/1UDYcXs4uQLJ+jE2L00= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/cpuid v1.2.1 h1:vJi+O/nMdFt0vqm8NZBI6wzALWdA2X+egi0ogNyrC/w= +github.com/klauspost/cpuid v1.2.1/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8= +github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod 
h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/leodido/go-urn v1.1.0/go.mod h1:+cyI34gQWZcE1eQU7NVgKkkzdXDQHr1dBMtdAPozLkw= +github.com/lib/pq v1.1.1/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= +github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= +github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= +github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= +github.com/mattn/go-colorable v0.1.2/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= +github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= +github.com/mattn/go-colorable v0.1.7 h1:bQGKb3vps/j0E9GfJQ03JyhRuxsvdAanXlT9BTw3mdw= +github.com/mattn/go-colorable v0.1.7/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= +github.com/mattn/go-isatty v0.0.4/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= +github.com/mattn/go-isatty v0.0.7/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.8/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= +github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2yME+cCiQ= +github.com/mattn/go-isatty v0.0.10/go.mod h1:qgIWMr58cqv1PHHyhnkY9lrL7etaEgOFcMEpPG5Rm84= +github.com/mattn/go-isatty v0.0.11/go.mod h1:PhnuNfih5lzO57/f3n+odYbM4JtupLOxQOAqxQCu2WE= +github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= +github.com/mattn/go-isatty v0.0.12/go.mod 
h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= +github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= +github.com/mattn/go-runewidth v0.0.7/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/mattn/go-runewidth v0.0.9 h1:Lm995f3rfxdpd6TSmuVCHVb/QhupuXlYr8sCI/QdE+0= +github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/mattn/go-shellwords v1.0.3/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= +github.com/mattn/go-sqlite3 v2.0.1+incompatible/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsOqkbpncsNc= +github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= +github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= +github.com/mgechev/dots v0.0.0-20190921121421-c36f7dcfbb81/go.mod h1:KQ7+USdGKfpPjXk4Ga+5XxQM4Lm4e3gAogrreFAYpOg= +github.com/mgechev/revive v1.0.2/go.mod h1:rb0dQy1LVAxW9SWy5R3LPUjevzUbUS316U5MFySA2lo= +github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= +github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI= +github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/montanaflynn/stats v0.5.0 h1:2EkzeTSqBB4V4bJwWrt5gIIrZmpJBcoIRGS2kWLgzmk= 
+github.com/montanaflynn/stats v0.5.0/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= +github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= +github.com/nfnt/resize v0.0.0-20160724205520-891127d8d1b5/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8= +github.com/ngaut/pools v0.0.0-20180318154953-b7bc8c42aac7 h1:7KAv7KMGTTqSmYZtNdcNTgsos+vFzULLwyElndwn+5c= +github.com/ngaut/pools v0.0.0-20180318154953-b7bc8c42aac7/go.mod h1:iWMfgwqYW+e8n5lC/jjNEhwcjbRDpl5NT7n2h+4UNcI= +github.com/ngaut/sync2 v0.0.0-20141008032647-7a24ed77b2ef h1:K0Fn+DoFqNqktdZtdV3bPQ/0cuYh2H4rkg0tytX/07k= +github.com/ngaut/sync2 v0.0.0-20141008032647-7a24ed77b2ef/go.mod h1:7WjlapSfwQyo6LNmIvEWzsW1hbBQfpUO4JWnuQRmva8= +github.com/nicksnyder/go-i18n v1.10.0/go.mod h1:HrK7VCrbOvQoUAQ7Vpy7i87N7JZZZ7R2xBGjv0j365Q= +github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= +github.com/oleiade/reflections v1.0.0/go.mod h1:RbATFBbKYkVdqmSFtx13Bb/tVhR0lgOBXunWTZKeL4w= +github.com/olekukonko/tablewriter v0.0.0-20170122224234-a0225b3f23b5/go.mod h1:vsDQFd/mU46D+Z4whnwzcISnGGzXWMclvtLoiIKAKIo= +github.com/olekukonko/tablewriter v0.0.4/go.mod h1:zq6QwlOf5SlnkVbMSr5EoBv3636FWnp+qbPhuoO21uA= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/onsi/gomega v1.8.1/go.mod h1:Ho0h+IUsWyvy1OpqCwxlQ/21gkhVunqlU8fDGcoTdcA= +github.com/opentracing/basictracer-go v1.0.0 h1:YyUAhaEfjoWXclZVJ9sGoNct7j4TVk7lZWlQw5UXuoo= +github.com/opentracing/basictracer-go v1.0.0/go.mod h1:QfBfYuafItcjQuMwinw9GhYKwFXS9KnPs5lxoYwgW74= +github.com/opentracing/opentracing-go v1.1.0 h1:pWlfV3Bxv7k65HYwkikxat0+s3pV4bsqf19k25Ur8rU= 
+github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/pelletier/go-toml v1.2.0/go.mod h1:5z9KED0ma1S8pY6P1sdut58dfprrGBbd/94hg7ilaic= +github.com/pelletier/go-toml v1.3.0/go.mod h1:PN7xzY2wHTK0K9p34ErDQMlFxa51Fk0OUruD3k1mMwo= +github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5/go.mod h1:jvVRKCrJTQWu0XVbaOlby/2lO20uSCHEMzzplHXte1o= +github.com/phf/go-queue v0.0.0-20170504031614-9abe38d0371d h1:U+PMnTlV2tu7RuMK5etusZG3Cf+rpow5hqQByeCzJ2g= +github.com/phf/go-queue v0.0.0-20170504031614-9abe38d0371d/go.mod h1:lXfE4PvvTW5xOjO6Mba8zDPyw8M93B6AQ7frTGnMlA8= +github.com/pingcap-incubator/tidb-dashboard v0.0.0-20201126111827-6c8be2240067/go.mod h1:EONGys2gM5n14pII2vjmU/5VG3Dtj6kpqUT1GUZ4ysw= +github.com/pingcap/br v4.0.9-0.20201215065036-804aa9087197+incompatible h1:Ceeu3/hX1LSdKpcaI8Sc6STOAxurxa9tDo0mqHmQ/Yc= +github.com/pingcap/br v4.0.9-0.20201215065036-804aa9087197+incompatible/go.mod h1:ymVmo50lQydxib0tmK5hHk4oteB7hZ0IMCArunwy3UQ= +github.com/pingcap/check v0.0.0-20190102082844-67f458068fc8/go.mod h1:B1+S9LNcuMyLH/4HMTViQOJevkGiik3wW2AN9zb2fNQ= +github.com/pingcap/check v0.0.0-20191107115940-caf2b9e6ccf4/go.mod h1:PYMCGwN0JHjoqGr3HrZoD+b8Tgx8bKnArhSq8YVzUMc= +github.com/pingcap/check v0.0.0-20191216031241-8a5a85928f12/go.mod h1:PYMCGwN0JHjoqGr3HrZoD+b8Tgx8bKnArhSq8YVzUMc= +github.com/pingcap/check v0.0.0-20200212061837-5e12011dc712 h1:R8gStypOBmpnHEx1qi//SaqxJVI4inOqljg/Aj5/390= +github.com/pingcap/check v0.0.0-20200212061837-5e12011dc712/go.mod h1:PYMCGwN0JHjoqGr3HrZoD+b8Tgx8bKnArhSq8YVzUMc= +github.com/pingcap/errcode v0.3.0/go.mod h1:4b2X8xSqxIroj/IZ9MX/VGZhAwc11wB9wRIzHvz6SeM= +github.com/pingcap/errors v0.11.0/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= +github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= +github.com/pingcap/errors v0.11.5-0.20190809092503-95897b64e011/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= 
+github.com/pingcap/errors v0.11.5-0.20201029093017-5a7df2af2ac7/go.mod h1:G7x87le1poQzLB/TqvTJI2ILrSgobnq4Ut7luOwvfvI= +github.com/pingcap/errors v0.11.5-0.20201126102027-b0a155152ca3 h1:LllgC9eGfqzkfubMgjKIDyZYaa609nNWAyNZtpy2B3M= +github.com/pingcap/errors v0.11.5-0.20201126102027-b0a155152ca3/go.mod h1:G7x87le1poQzLB/TqvTJI2ILrSgobnq4Ut7luOwvfvI= +github.com/pingcap/failpoint v0.0.0-20191029060244-12f4ac2fd11d/go.mod h1:DNS3Qg7bEDhU6EXNHF+XSv/PGznQaMJ5FWvctpm6pQI= +github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce h1:Y1kCxlCtlPTMtVcOkjUcuQKh+YrluSo7+7YMCQSzy30= +github.com/pingcap/failpoint v0.0.0-20200702092429-9f69995143ce/go.mod h1:w4PEZ5y16LeofeeGwdgZB4ddv9bLyDuIX+ljstgKZyk= +github.com/pingcap/fn v0.0.0-20200306044125-d5540d389059/go.mod h1:fMRU1BA1y+r89AxUoaAar4JjrhUkVDt0o0Np6V8XbDQ= +github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989 h1:surzm05a8C9dN8dIUmo4Be2+pMRb6f55i+UIYrluu2E= +github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989/go.mod h1:O17XtbryoCJhkKGbT62+L2OlrniwqiGLSqrmdHCMzZw= +github.com/pingcap/kvproto v0.0.0-20191211054548-3c6b38ea5107/go.mod h1:WWLmULLO7l8IOcQG+t+ItJ3fEcrL5FxF0Wu+HrMy26w= +github.com/pingcap/kvproto v0.0.0-20200411081810-b85805c9476c/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI= +github.com/pingcap/kvproto v0.0.0-20200907074027-32a3a0accf7d/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI= +github.com/pingcap/kvproto v0.0.0-20201126113434-70db5fb4b0dc h1:BtszN3YR5EScxiGGTD3tAf4CQE90bczkOY0lLa07EJA= +github.com/pingcap/kvproto v0.0.0-20201126113434-70db5fb4b0dc/go.mod h1:IOdRDPLyda8GX2hE/jO7gqaCV/PNFh8BZQCQZXfIOqI= +github.com/pingcap/log v0.0.0-20191012051959-b742a5d432e9/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8= +github.com/pingcap/log v0.0.0-20200117041106-d28c14d3b1cd/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8= +github.com/pingcap/log v0.0.0-20200511115504-543df19646ad/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8= 
+github.com/pingcap/log v0.0.0-20201112100606-8f1e84a3abc8 h1:M+DNpOu/I3uDmwee6vcnoPd6GgSMqND4gxvDQ/W584U= +github.com/pingcap/log v0.0.0-20201112100606-8f1e84a3abc8/go.mod h1:4rbK1p9ILyIfb6hU7OG2CiWSqMXnp3JMbiaVJ6mvoY8= +github.com/pingcap/parser v0.0.0-20210107054750-53e33b4018fe h1:sukVKRva68HNGZ4nuPvQS/wMvH7NMxTXV2NIhmoYP4Y= +github.com/pingcap/parser v0.0.0-20210107054750-53e33b4018fe/go.mod h1:GbEr2PgY72/4XqPZzmzstlOU/+il/wrjeTNFs6ihsSE= +github.com/pingcap/sysutil v0.0.0-20200206130906-2bfa6dc40bcd/go.mod h1:EB/852NMQ+aRKioCpToQ94Wl7fktV+FNnxf3CX/TTXI= +github.com/pingcap/sysutil v0.0.0-20201130064824-f0c8aa6a6966 h1:JI0wOAb8aQML0vAVLHcxTEEC0VIwrk6gtw3WjbHvJLA= +github.com/pingcap/sysutil v0.0.0-20201130064824-f0c8aa6a6966/go.mod h1:EB/852NMQ+aRKioCpToQ94Wl7fktV+FNnxf3CX/TTXI= +github.com/pingcap/tidb v1.1.0-beta.0.20210419034717-00632fb3c710 h1:PlH7u1SkJNXlUtFzh+NHkM8fgXoDsT7BIzX/7+sOZcg= +github.com/pingcap/tidb v1.1.0-beta.0.20210419034717-00632fb3c710/go.mod h1:WbISBEy4rQRvGhvFJsjK3WHYl14OpZeqchjrlQbIeHc= +github.com/pingcap/tidb-tools v4.0.9-0.20201127090955-2707c97b3853+incompatible h1:ceznmu/lLseGHP/jKyOa/3u/5H3wtLLLqkH2V3ssSjg= +github.com/pingcap/tidb-tools v4.0.9-0.20201127090955-2707c97b3853+incompatible/go.mod h1:XGdcy9+yqlDSEMTpOXnwf3hiTeqrV6MN/u1se9N8yIM= +github.com/pingcap/tipb v0.0.0-20200618092958-4fad48b4c8c3 h1:ESL3eIt1kUt8IMvR1011ejZlAyDcOzw89ARvVHvpD5k= +github.com/pingcap/tipb v0.0.0-20200618092958-4fad48b4c8c3/go.mod h1:RtkHW8WbcNxj8lsbzjaILci01CtYnYbIkQhjyZWrWVI= +github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= +github.com/prometheus/client_golang v0.9.3/go.mod h1:/TN21ttK/J9q6uSwhBd54HahCDft0ttaMvbicHlPoso= +github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= +github.com/prometheus/client_golang v1.2.1/go.mod h1:XMU6Z2MjaRKVu/dC1qupJI9SiNkDYzz3xecMgSW/F+U= +github.com/prometheus/client_golang v1.5.1 h1:bdHYieyGlH+6OLEk2YQha8THib30KP0/yD0YH9m6xcA= +github.com/prometheus/client_golang v1.5.1/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= +github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= +github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M= +github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/common v0.0.0-20181113130724-41aa239b4cce/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro= +github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= +github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= +github.com/prometheus/common v0.7.0/go.mod h1:DjGbpBbp5NYNiECxcL/VnbXCCaQpKd3tt26CguLLsqA= +github.com/prometheus/common v0.9.1 h1:KOMtN28tlbam3/7ZKEYKHhKoJZYYj3gMH4uc62x7X7U= +github.com/prometheus/common v0.9.1/go.mod h1:yhUN8i9wzaXS3w1O07YhxHEBxD+W35wd8bs7vj7HSQ4= +github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= +github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod 
h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= +github.com/prometheus/procfs v0.0.5/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ= +github.com/prometheus/procfs v0.0.8 h1:+fpWZdT24pJBiqJdAwYBjPSk+5YmQzYNPYzQsdzLkt8= +github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= +github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= +github.com/remyoudompheng/bigfft v0.0.0-20190728182440-6a916e37a237 h1:HQagqIiBmr8YXawX/le3+O26N+vPPC1PtjaF3mwnook= +github.com/remyoudompheng/bigfft v0.0.0-20190728182440-6a916e37a237/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= +github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= +github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= +github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/rs/cors v1.7.0/go.mod h1:gFx+x8UowdsKA9AchylcLynDq+nNFfI8FkUZdN/jGCU= +github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sasha-s/go-deadlock v0.2.0/go.mod h1:StQn567HiB1fF2yJ44N9au7wOhrPS3iZqiDbRupzT10= +github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= +github.com/sergi/go-diff v1.0.1-0.20180205163309-da645544ed44/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= +github.com/shirou/gopsutil v2.19.10+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= +github.com/shirou/gopsutil v2.20.3+incompatible h1:0JVooMPsT7A7HqEYdydp/OfjSOYSjhXV7w1hkKj/NPQ= +github.com/shirou/gopsutil v2.20.3+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= +github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc= +github.com/shurcooL/httpfs 
v0.0.0-20190707220628-8d4bc4ba7749/go.mod h1:ZY1cvUeJuFPAdZ/B6v7RHavJWZn2YPVFQ1OSXhCGOkg= +github.com/shurcooL/httpgzip v0.0.0-20190720172056-320755c1c1b0/go.mod h1:919LwcH0M7/W4fcZ0/jy0qGght1GIhqyS/EgWGH2j5Q= +github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= +github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd/go.mod h1:TrYk7fJVaAttu97ZZKrO9UbRa8izdowaMIZcxYMbVaw= +github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= +github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I= +github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= +github.com/soheilhy/cmux v0.1.4 h1:0HKaf1o97UwFjHH9o5XsHUOF+tqmdA7KEzXLpiyaw0E= +github.com/soheilhy/cmux v0.1.4/go.mod h1:IM3LyeVVIOuxMH7sFAkER9+bJ4dT7Ms6E4xg4kGIyLM= +github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= +github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= +github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= +github.com/spf13/cobra v0.0.3/go.mod h1:1l0Ry5zgKvJasoi3XT1TypsSe7PqH0Sj9dhYf7v3XqQ= +github.com/spf13/cobra v1.0.0 h1:6m/oheQuQ13N9ks4hubMG6BnvwOeaJrqSPLahSnczz8= +github.com/spf13/cobra v1.0.0/go.mod h1:/6GTrnGXV9HjY+aR4k0oJ5tcvakLuG6EuKReYlHNrgE= +github.com/spf13/jwalterweatherman v1.0.0/go.mod h1:cQK4TGJAtQXfYWX+Ddv3mKDzgVb68N+wFjFa4jdeBTo= +github.com/spf13/pflag v1.0.1/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/spf13/pflag v1.0.5 
h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= +github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/viper v1.4.0/go.mod h1:PTJ7Z/lr49W6bUbkmS1V3by4uWynFiR9p7+dSq/yZzE= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/swaggo/files v0.0.0-20190704085106-630677cd5c14/go.mod h1:gxQT6pBGRuIGunNf/+tSOB5OHvguWi8Tbt82WOkf35E= +github.com/swaggo/gin-swagger v1.2.0/go.mod h1:qlH2+W7zXGZkczuL+r2nEBR2JTT+/lX05Nn6vPhc7OI= +github.com/swaggo/http-swagger v0.0.0-20200308142732-58ac5e232fba/go.mod h1:O1lAbCgAAX/KZ80LM/OXwtWFI/5TvZlwxSg8Cq08PV0= +github.com/swaggo/swag v1.5.1/go.mod h1:1Bl9F/ZBpVWh22nY0zmYyASPO1lI/zIwRDrpZU+tv8Y= +github.com/swaggo/swag v1.6.3/go.mod h1:wcc83tB4Mb2aNiL/HP4MFeQdpHUrca+Rp/DRNgWAUio= +github.com/swaggo/swag v1.6.6-0.20200529100950-7c765ddd0476/go.mod h1:xDhTyuFIujYiN3DKWC/H/83xcfHp+UE/IzWWampG7Zc= +github.com/syndtr/goleveldb v1.0.1-0.20190318030020-c3a204f8e965/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= +github.com/syndtr/goleveldb v1.0.1-0.20190625010220-02440ea7a285 h1:uSDYjYejelKyceA6DiCsngFof9jAyeaSyX9XC5a1a7Q= +github.com/syndtr/goleveldb v1.0.1-0.20190625010220-02440ea7a285/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= +github.com/thoas/go-funk v0.7.0/go.mod h1:+IWnUfUmFO1+WVYQWQtIJHeRRdaIyyYglZN7xzUPe4Q= 
+github.com/tiancaiamao/appdash v0.0.0-20181126055449-889f96f722a2/go.mod h1:2PfKggNGDuadAa0LElHrByyrz4JPZ9fFx6Gs7nx7ZZU= +github.com/tidwall/gjson v1.3.5/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls= +github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E= +github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= +github.com/tikv/pd v0.0.0-20210105112549-e5be7fd38659 h1:k7pQD4T2iTVphdaYRjRhv7lZ+dlUpsdAK+ogDVYkBbk= +github.com/tikv/pd v0.0.0-20210105112549-e5be7fd38659/go.mod h1:Zh9gNK7Q02Q0DByC05P+HJETLelP8R8RMYeyQ1EMMJA= +github.com/tmc/grpc-websocket-proxy v0.0.0-20170815181823-89b8d40f7ca8/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= +github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5 h1:LnC5Kc/wtumK+WB441p7ynQJzVuNRJiqddSIE3IlSEQ= +github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= +github.com/uber-go/atomic v1.4.0 h1:yOuPqEq4ovnhEjpHmfFwsqBXDYbQeT6Nb0bwD6XnD5o= +github.com/uber-go/atomic v1.4.0/go.mod h1:/Ct5t2lcmbJ4OSe/waGBoaVvVqtO0bmtfVNex1PFV8g= +github.com/uber/jaeger-client-go v2.22.1+incompatible h1:NHcubEkVbahf9t3p75TOCR83gdUHXjRJvjoBh1yACsM= +github.com/uber/jaeger-client-go v2.22.1+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= +github.com/uber/jaeger-lib v2.2.0+incompatible h1:MxZXOiR2JuoANZ3J6DE/U0kSFv/eJ/GfSYVCjK7dyaw= +github.com/uber/jaeger-lib v2.2.0+incompatible/go.mod h1:ComeNDZlWwrWnDv8aPp0Ba6+uUTzImX/AauajbLI56U= +github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= +github.com/ugorji/go v1.1.5-pre/go.mod h1:FwP/aQVg39TXzItUBMwnWp9T9gPQnXw4Poh4/oBQZ/0= +github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= +github.com/ugorji/go/codec v0.0.0-20181022190402-e5e69e061d4f/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= +github.com/ugorji/go/codec v1.1.5-pre/go.mod 
h1:tULtS6Gy1AE1yCENaw4Vb//HLH5njI2tfCQDUqRd8fI= +github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= +github.com/unrolled/render v1.0.1/go.mod h1:gN9T0NhL4Bfbwu8ann7Ry/TGHYfosul+J0obPf6NBdM= +github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA= +github.com/urfave/cli/v2 v2.1.1/go.mod h1:SE9GqnLQmjVa0iPEY0f1w3ygNIYcIJ0OKPMoW2caLfQ= +github.com/urfave/negroni v0.3.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= +github.com/vmihailenco/msgpack/v4 v4.3.11/go.mod h1:gborTTJjAo/GWTqqRjrLCn9pgNN+NXzzngzBKDPIqw4= +github.com/vmihailenco/msgpack/v5 v5.0.0-beta.1/go.mod h1:xlngVLeyQ/Qi05oQxhQ+oTuqa03RjMwMfk/7/TCs+QI= +github.com/vmihailenco/tagparser v0.1.1/go.mod h1:OeAg3pn3UbLjkWt+rN9oFYB6u/cQgqMEUPoW2WPyhdI= +github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8= +github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= +github.com/xordataexchange/crypt v0.0.3-0.20170626215501-b2862e3d0a77/go.mod h1:aYKd//L2LvnjZzWKhF00oedf4jCCReLcmhLdhm1A27Q= +github.com/yookoala/realpath v1.0.0/go.mod h1:gJJMA9wuX7AcqLy1+ffPatSCySA1FQ2S8Ya9AIoYBpE= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.etcd.io/bbolt v1.3.2/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= +go.etcd.io/bbolt v1.3.3/go.mod h1:IbVyRI1SCnLcuJnV2u8VeU0CEYM7e686BmAb1XKL+uU= +go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0= +go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= +go.etcd.io/etcd v0.0.0-20191023171146-3cf2f69b5738/go.mod h1:dnLIgRNXwCJa5e+c6mIZCrds/GIG4ncV9HhK5PX7jPg= +go.etcd.io/etcd v0.5.0-alpha.5.0.20191023171146-3cf2f69b5738 h1:lWF4f9Nypl1ZqSb4gLeh/DGvBYVaUYHuiB93teOmwgc= +go.etcd.io/etcd 
v0.5.0-alpha.5.0.20191023171146-3cf2f69b5738/go.mod h1:dnLIgRNXwCJa5e+c6mIZCrds/GIG4ncV9HhK5PX7jPg= +go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= +go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= +go.opencensus.io v0.22.2 h1:75k/FF0Q2YM8QYo07VPddOLBslDt1MZOdEslOHvmzAs= +go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= +go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= +go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= +go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= +go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= +go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/automaxprocs v1.2.0/go.mod h1:YfO3fm683kQpzETxlTGZhGIVmXAhaw3gxeBADbpZtnU= +go.uber.org/dig v1.8.0/go.mod h1:X34SnWGr8Fyla9zQNO2GSO2D+TIuqB14OS8JhYocIyw= +go.uber.org/fx v1.10.0/go.mod h1:vLRicqpG/qQEzno4SYU86iCwfT95EZza+Eba0ItuxqY= +go.uber.org/goleak v0.10.0 h1:G3eWbSNIskeRqtsN/1uI5B+eP73y3JUuBsv9AZjehb4= +go.uber.org/goleak v0.10.0/go.mod h1:VCZuO8V8mFPlL0F5J5GK1rtHV3DrFcQ1R8ryq7FK0aI= +go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= +go.uber.org/multierr v1.3.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= +go.uber.org/multierr v1.4.0/go.mod h1:VgVr7evmIr6uPjLBxg28wmKNXyqE9akIJ5XnfpiKl+4= +go.uber.org/multierr v1.5.0/go.mod h1:FeouvMocqHpRaaGuG9EjoKcStLC43Zu/fmqdUMPcKYU= +go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= +go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= +go.uber.org/tools v0.0.0-20190618225709-2cfd321de3ee/go.mod h1:vJERXedbb3MVM5f9Ejo0C68/HhF8uaILCdgjnY+goOA= +go.uber.org/zap v1.8.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= 
+go.uber.org/zap v1.9.1/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= +go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= +go.uber.org/zap v1.12.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= +go.uber.org/zap v1.13.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= +go.uber.org/zap v1.15.0/go.mod h1:Mb2vm2krFEG5DV0W9qcHBYFtp/Wku1cvYaqPsS/WYfc= +go.uber.org/zap v1.16.0 h1:uFRZXykJGK9lLY4HtgSw44DnIcAM+kRBP7x5m+NpAOM= +go.uber.org/zap v1.16.0/go.mod h1:MA8QOfq0BHJwdXa996Y4dYkAqRKB8/1K1QMMZVaNZjQ= +golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190325154230-a5d413f7728c/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20190611184440-5c40567a22f8/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191205180655-e7c4368fe9dd/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200204104054-c9f3fb736b72/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9 h1:psW17arqaxU48Z5kZ0CQnkZWQJsqcURM6tKiBApRjXI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp 
v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= +golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= +golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= +golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299 h1:zQpM52jfKHG6II1ISZY1ZcpygvuSFZpLwfluuF89XOg= +golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/image v0.0.0-20200119044424-58c23975cae1/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b h1:Wh+f8QHJXR411sJR8/vRBTZ7YapZaRvUcLFFJhusH0k= +golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod 
h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= +golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181005035420-146acd28ed58/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20181220203305-927f97764cc3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 
+golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= +golang.org/x/net v0.0.0-20190611141213-3f473d35a33a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191002035440-2ec189313ef0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc h1:zK/HqS5bZxDptfPJNq8v7vJfXtkU7r9TLIoSr1bXaP4= +golang.org/x/net v0.0.0-20200813134508-3edf25e44fcc/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod 
h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d h1:TzXSXBo42m9gQenoE3b9BGiEpg5IG2JkU5FkPIawgtw= +golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208 h1:qwRHBd0NqMbJxfbotnDhm2ByMI1Shq4Y6oRJo21SGJA= +golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181107165924-66b7b1311ac8/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod 
h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20181228144115-9a3f9b0469bb/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190610200419-93c9922d18ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190616124812-15dcb6c0061f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190813064441-fde4db37ae7a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 
+golang.org/x/sys v0.0.0-20191128015809-6d18c012aee9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200819171115-d785dc25833f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.4 h1:0YWbFKbhXG/wIiuHDSKpS0Iy7FSA+u45VtBMfQcFTTc= +golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.0.0-20191024005414-555d28b269f0 h1:/5xXl8Y5W96D+TtHSlonuFqGHIWVuyCkGJLwGh9JJFs= 
+golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180221164845-07fd8470d635/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20181030221726-6c7e314b6563/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20190606050223-4d9ae51c2468/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190611222205-d73e1c7e250b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190614205625-5aca471b1d59/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools 
v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191029041327-9cc4af7d6b2c/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191029190741-b9c20aec41a5/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191030062658-86caa796c7ab/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191107010934-f79515f33823/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191114200427-caa0b0f7d508/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200225230052-807dcd883420/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= +golang.org/x/tools v0.0.0-20200410194907-79a7a3126eef/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20200527183253-8e7acdbce89d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools 
v0.0.0-20200820010801-b793a1359eac h1:DugppSxw0LSF8lcjaODPJZoDzq0ElTGskTst3ZaBkHI= +golang.org/x/tools v0.0.0-20200820010801-b793a1359eac/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= +google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= +google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= +google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/api v0.15.1 h1:5mMS6mYvK5LVB8+ujVBC33Y8gltBo/kT6HBm6kU80G4= +google.golang.org/api v0.15.1/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= +google.golang.org/appengine v1.6.5 h1:tycE03LOZYQNhDpS27tcQdAzLCVMaj7QT2SXxebnpCM= +google.golang.org/appengine v1.6.5/go.mod 
h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20181004005441-af9cb2a35e7f/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= +google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20190927181202-20e1ac93f88c/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= +google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/genproto v0.0.0-20200108215221-bd8f9a0ef82f h1:2wh8dWY8959cBGQvk1RD+/eQBgRYYDaZ+hT0/zsARoA= +google.golang.org/genproto v0.0.0-20200108215221-bd8f9a0ef82f/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= +google.golang.org/grpc v0.0.0-20180607172857-7a6a684ca69e/go.mod 
h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= +google.golang.org/grpc v1.21.0/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.23.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.24.0/go.mod h1:XDChyiUovWa60DnaeDeZmSW86xtLtjtZbwvSiRnRtcA= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.26.0 h1:2dTRdpdFEEhJYQD8EMLB61nnrzSCTbG38PhqdhvOltg= +google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= +gopkg.in/alecthomas/gometalinter.v2 v2.0.12/go.mod h1:NDRytsqEZyolNuAgTzJkZMkSQM7FIKyzVzGhjB/qfYo= +gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= +gopkg.in/alecthomas/kingpin.v3-unstable v3.0.0-20180810215634-df19058c872c/go.mod h1:3HH7i1SgMqlzxCcBmUHW657sD4Kvv9sC3HpL3YukzwA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/cheggaaa/pb.v1 v1.0.25/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qStrOgw= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/go-playground/assert.v1 v1.2.1/go.mod h1:9RXL0bg/zibRAgZUYszZSwO/z8Y/a8bDuhia5mkpMnE= +gopkg.in/go-playground/validator.v8 v8.18.2/go.mod 
h1:RX2a/7Ha8BgOhfk7j780h4/u/RRjR0eouCJSH80/M2Y= +gopkg.in/go-playground/validator.v9 v9.29.1/go.mod h1:+c9/zcJMFNgbLvly1L1V+PpxWdVbfP1avr/N00E2vyQ= +gopkg.in/natefinch/lumberjack.v2 v2.0.0 h1:1Lc07Kr7qY4U2YPouBjpCLxpiyxIVoxqXgkXLknAOE8= +gopkg.in/natefinch/lumberjack.v2 v2.0.0/go.mod h1:l0ndWWf7gzL7RNwBG7wST/UCcT4T24xpD6X8LsfU/+k= +gopkg.in/oleiade/reflections.v1 v1.0.0/go.mod h1:SpA8pv+LUnF0FbB2hyRxc8XSng78D6iLBZ11PDb8Z5g= +gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= +gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.7/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= +gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= +honnef.co/go/tools 
v0.0.1-2020.1.6 h1:W18jzjh8mfPez+AwGLxmOImucz/IFjpNlrKVnaj2YVc= +honnef.co/go/tools v0.0.1-2020.1.6/go.mod h1:pyyisuGw24ruLjrr1ddx39WE0y9OooInRzEYLhQB2YY= +k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= +rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= +sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= +sigs.k8s.io/yaml v1.2.0 h1:kr/MCeFWJWTwyaHoR9c8EjH9OumOmoF9YGiZd7lFm/Q= +sigs.k8s.io/yaml v1.2.0/go.mod h1:yfXDCHCao9+ENCvLSE62v9VSji2MKu5jeNfTrofGhJc= +sourcegraph.com/sourcegraph/appdash v0.0.0-20190731080439-ebfcffb1b5c0 h1:ucqkfpjg9WzSUubAO62csmucvxl4/JeW3F4I4909XkM= +sourcegraph.com/sourcegraph/appdash v0.0.0-20190731080439-ebfcffb1b5c0/go.mod h1:hI742Nqp5OhwiqlzhgfbWU4mW4yO10fP+LoT9WOswdU= +sourcegraph.com/sourcegraph/appdash-data v0.0.0-20151005221446-73f23eafcf67/go.mod h1:L5q+DGLGOQFpo1snNEkLOJT2d1YTW66rWNzatr3He1k= diff --git a/hbr-raw/main.go b/hbr-raw/main.go new file mode 100644 index 0000000..a3cc3b0 --- /dev/null +++ b/hbr-raw/main.go @@ -0,0 +1,9 @@ +package main + +import ( + "hbr-raw/cmd" +) + +func main() { + cmd.Execute() +} diff --git a/hbr/cmd/archive.go b/hbr/cmd/archive.go new file mode 100644 index 0000000..4e2e200 --- /dev/null +++ b/hbr/cmd/archive.go @@ -0,0 +1,313 @@ +package cmd + +import ( + "bytes" + "context" + "encoding/binary" + "encoding/json" + "fmt" + "strconv" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + "github.com/spf13/cobra" + "github.com/tikv/client-go/txnkv" +) + +var archiveCmd = &cobra.Command{ + Use: "archive", + Short: "Archive He3DB Xlog", + Long: "Welcome to use hbr for He3DB xlog archive", + Run: runArchive, +} + +func init() { + rootCmd.AddCommand(archiveCmd) +} + +type Inode struct { + Ino uint64 `json:"ino"` +} + +func Unt64ToBytes(n uint64) []byte { + x := uint64(n) + bytesBuffer := 
bytes.NewBuffer([]byte{}) + binary.Write(bytesBuffer, binary.BigEndian, x) + return bytesBuffer.Bytes() +} + +func runArchive(cmd *cobra.Command, args []string) { + access_key, _ := cmd.Flags().GetString("access_key") + secret_key, _ := cmd.Flags().GetString("secret_key") + endpoint, _ := cmd.Flags().GetString("endpoint") + region, _ := cmd.Flags().GetString("region") + bucket, _ := cmd.Flags().GetString("bucket") + pd, _ := cmd.Flags().GetString("pd") + backup_name, _ := cmd.Flags().GetString("name") + archive_start_file, _ := cmd.Flags().GetString("archive_start_file") + + if access_key == "" || secret_key == "" || endpoint == "" || region == "" || bucket == "" || pd == "" || backup_name == "" || archive_start_file == "" { + fmt.Printf("PARAMETER ERROR!\n") + return + } + + client, err := txnkv.NewClient([]string{pd}) + if err != nil { + fmt.Printf("Connect Tikv Error!\n%v\n", err) + return + } + txn, err := client.Begin() + if err != nil { + fmt.Printf("Tikv Transaction Begin Error!\n%v\n", err) + return + } + + sess, err := session.NewSession(&aws.Config{ + Region: aws.String(region), + Endpoint: aws.String(endpoint), + Credentials: credentials.NewStaticCredentials(access_key, secret_key, ""), + S3ForcePathStyle: aws.Bool(true), + }) + if err != nil { + fmt.Printf("Connect S3 Error!\n%v\n", err) + return + } + s3_client := s3.New(sess) + + var filename string = "" + allCount := 0 + //1:meta + fmt.Printf("Backup meta!\n") + + ret := make([]byte, 1) + value, _ := strconv.ParseUint("00", 16, 8) + ret[0] = byte(0xff & value) + + metaValue, err := txn.Get(context.TODO(), ret) + if err != nil { + fmt.Printf("Client Get meta Error!\n%v\n", err) + return + } + + filename = fmt.Sprintf("%x", ret) + fmt.Printf("filename:%s\n", filename) + + _, err = s3_client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(backup_name + "/" + filename), + Body: bytes.NewReader(metaValue), + }) + if err != nil { + fmt.Printf("S3 Put meta Error!\n%v\n", err) + 
return + } + allCount++ + + //2:pg_wal inode + fmt.Printf("Backup pg_wal inode!\n") + + pwValue, err := txn.Get(context.TODO(), []byte("pg_wal")) + if err != nil { + fmt.Printf("Client Get pg_wal Error!\n%v\n", err) + return + } + + var pwi Inode + json.Unmarshal(pwValue, &pwi) + pwiKeyString := fmt.Sprintf("01%x", Unt64ToBytes(pwi.Ino)) + fmt.Printf("%v\n", pwiKeyString) + ret = make([]byte, 9) + index := 0 + for i := 0; i < len(pwiKeyString); i += 2 { + value, _ := strconv.ParseUint(pwiKeyString[i:i+2], 16, 8) + ret[index] = byte(0xff & value) + index++ + } + + pwiValue, err := txn.Get(context.TODO(), ret) + if err != nil { + fmt.Printf("Client Get pwiValue Error!\n%v\n", err) + return + } + + filename = fmt.Sprintf("%x", ret) + fmt.Printf("filename:%s\n", filename) + + _, err = s3_client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(backup_name + "/" + filename), + Body: bytes.NewReader(pwiValue), + }) + if err != nil { + fmt.Printf("S3 Put pwiValue Error!\n%v\n", err) + return + } + allCount++ + + //3:pg_wal directory block + fmt.Printf("Backup pg_wal directory block!\n") + pwbKeyString := fmt.Sprintf("02%x0000000000000000", Unt64ToBytes(pwi.Ino)) + ret = make([]byte, 17) + index = 0 + for i := 0; i < len(pwbKeyString); i += 2 { + value, _ := strconv.ParseUint(pwbKeyString[i:i+2], 16, 8) + ret[index] = byte(0xff & value) + index++ + } + + pwbValue, err := txn.Get(context.TODO(), ret) + if err != nil { + fmt.Printf("Client Get pwbValue Error!\n%v\n", err) + return + } + + filename = fmt.Sprintf("%x", ret) + fmt.Printf("filename:%s\n", filename) + + _, err = s3_client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), // bucket名称 + Key: aws.String(backup_name + "/" + filename), + Body: bytes.NewReader(pwbValue), + }) + if err != nil { + fmt.Printf("S3 Put pwbValue Error!\n%v\n", err) + return + } + allCount++ + //4:new file index + fmt.Printf("Backup new file index!\n") + pwiPrefixString := fmt.Sprintf("04%x", 
Unt64ToBytes(pwi.Ino)) + retStart := []byte(fmt.Sprintf("400000002%s", archive_start_file)) + retEnd := []byte("400000002ffffffffffffffffffffffff") + index = 0 + for i := 0; i < len(pwiPrefixString); i += 2 { + value, _ := strconv.ParseUint(pwiPrefixString[i:i+2], 16, 8) + retStart[index] = byte(0xff & value) + retEnd[index] = byte(0xff & value) + index++ + } + + fiiter, err := txn.Iter(retStart, retEnd) + if err != nil { + fmt.Printf("new file index Iter Error!\n%v\n", err) + return + } + + newFileIndexCount := 0 + for fiiter.Valid() { + k, v := fiiter.Key(), fiiter.Value() + + filename = fmt.Sprintf("%x", k) + fmt.Printf("filename:%s\n", filename) + + _, err = s3_client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(backup_name + "/" + filename), + Body: bytes.NewReader(v), + }) + if err != nil { + fmt.Printf("S3 PutObject Error!\n%v\n", err) + return + } + + if err := fiiter.Next(); err != nil { + fmt.Printf("Iter Next Error!\n%v\n", err) + return + } + newFileIndexCount++ + allCount++ + + //5:new xlog inode + fmt.Printf("Backup new xlog inode!\n") + var wali Inode + json.Unmarshal(v, &wali) + waliKeyString := fmt.Sprintf("01%x", Unt64ToBytes(wali.Ino)) + ret = make([]byte, 9) + index = 0 + for i := 0; i < len(waliKeyString); i += 2 { + value, _ := strconv.ParseUint(waliKeyString[i:i+2], 16, 8) + ret[index] = byte(0xff & value) + index++ + } + + waliValue, err := txn.Get(context.TODO(), ret) + if err != nil { + fmt.Printf("Client Get waliValue Error!\n%v\n", err) + return + } + + filename = fmt.Sprintf("%x", ret) + fmt.Printf("filename:%s\n", filename) + + _, err = s3_client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(backup_name + "/" + filename), + Body: bytes.NewReader(waliValue), + }) + if err != nil { + fmt.Printf("S3 Put waliValue Error!\n%v\n", err) + return + } + allCount++ + + //6:new file block + fmt.Printf("Backup new file block!\n") + walbPrefixString := fmt.Sprintf("02%x", 
Unt64ToBytes(wali.Ino)) + retStartString := fmt.Sprintf("%s0000000000000000", walbPrefixString) + retEndString := fmt.Sprintf("%s0000000000000100", walbPrefixString) + + retStart := make([]byte, 17) + retEnd := make([]byte, 17) + index = 0 + for i := 0; i < len(retStartString); i += 2 { + value, _ := strconv.ParseUint(retStartString[i:i+2], 16, 8) + retStart[index] = byte(0xff & value) + value, _ = strconv.ParseUint(retEndString[i:i+2], 16, 8) + retEnd[index] = byte(0xff & value) + index++ + } + + walbIter, err := txn.Iter(retStart, retEnd) + if err != nil { + fmt.Printf("walbIter Error!\n%v\n", err) + return + } + + walbCount := 0 + for walbIter.Valid() { + k1, v1 := walbIter.Key(), walbIter.Value() + + filename = fmt.Sprintf("%x", k1) + fmt.Printf("filename:%s\n", filename) + + _, err = s3_client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), // bucket名称 + Key: aws.String(backup_name + "/" + filename), + Body: bytes.NewReader(v1), + }) + if err != nil { + fmt.Printf("S3 PutObject Error!\n%v\n", err) + return + } + + if err := walbIter.Next(); err != nil { + fmt.Printf("walbIter Next Error!\n%v\n", err) + return + } + walbCount++ + allCount++ + } + walbIter.Close() + fmt.Printf("walbCount:%v\n", walbCount) + } + if err := txn.Commit(context.TODO()); err != nil { + fmt.Printf("Tikv Transaction Commit Error!\n%v\n", err) + return + } + fiiter.Close() + fmt.Printf("new file index count:%v\n", newFileIndexCount) + fmt.Printf("allCount:%v\n", allCount) + client.Close() +} diff --git a/hbr/cmd/backup.go b/hbr/cmd/backup.go new file mode 100644 index 0000000..86c0e37 --- /dev/null +++ b/hbr/cmd/backup.go @@ -0,0 +1,100 @@ +package cmd + +import ( + "bytes" + "fmt" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + "github.com/spf13/cobra" + "github.com/tikv/client-go/txnkv" +) + +var backupCmd = &cobra.Command{ + Use: "backup", + Short: "Backup 
He3DB", + Long: "Welcome to use hbr for He3DB backup", + Run: runBackup, +} + +func init() { + + rootCmd.AddCommand(backupCmd) +} + +func runBackup(cmd *cobra.Command, args []string) { + access_key, _ := cmd.Flags().GetString("access_key") + secret_key, _ := cmd.Flags().GetString("secret_key") + endpoint, _ := cmd.Flags().GetString("endpoint") + region, _ := cmd.Flags().GetString("region") + bucket, _ := cmd.Flags().GetString("bucket") + pd, _ := cmd.Flags().GetString("pd") + backup_name, _ := cmd.Flags().GetString("name") + + if access_key == "" || secret_key == "" || endpoint == "" || region == "" || bucket == "" || pd == "" || backup_name == "" { + fmt.Printf("PARAMETER ERROR!\n") + return + } + + client, err := txnkv.NewClient([]string{pd}) + if err != nil { + fmt.Printf("Connect Tikv Error!\n%v\n", err) + return + } + defer func() { + client.Close() + }() + + sess, err := session.NewSession(&aws.Config{ + Region: aws.String(region), + Endpoint: aws.String(endpoint), + Credentials: credentials.NewStaticCredentials(access_key, secret_key, ""), + S3ForcePathStyle: aws.Bool(true), + }) + if err != nil { + fmt.Printf("Connect S3 Error!\n%v\n", err) + return + } + s3_client := s3.New(sess) + + ts, err := client.CurrentTimestamp("global") + if err != nil { + fmt.Printf("Set Timestamp Error!\n%v\n", err) + return + } + snapshot := client.GetSnapshot(ts) + + iter, err := snapshot.Iter([]byte(""), []byte("")) + if err != nil { + fmt.Printf("Iter Error!\n%v\n", err) + return + } + defer iter.Close() + count := 0 + for iter.Valid() { + k, v := iter.Key(), iter.Value() + var filename string = "" + + filename = fmt.Sprintf("%x", k) + fmt.Printf("filename:%s\n", filename) + + _, err = s3_client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), // bucket名称 + Key: aws.String(backup_name + "/" + filename), + Body: bytes.NewReader(v), + }) + if err != nil { + fmt.Printf("S3 PutObject Error!\n%v\n", err) + return + } + + if err := iter.Next(); err != nil { + 
fmt.Printf("Iter Next Error!\n%v\n", err) + return + } + count++ + } + fmt.Printf("N:%v\n", count) +} diff --git a/hbr/cmd/help.go b/hbr/cmd/help.go new file mode 100644 index 0000000..8156f7d --- /dev/null +++ b/hbr/cmd/help.go @@ -0,0 +1,35 @@ +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +var rootCmd = &cobra.Command{ + Use: "hbr", + Short: "He3DB backup&restore", + Long: "Welcome to use hbr for He3DB backup&restore", + Run: runRoot, +} + +func init() { + rootCmd.PersistentFlags().String("access_key", "", "S3 Access Key") + rootCmd.PersistentFlags().String("secret_key", "", "S3 Secret Key") + rootCmd.PersistentFlags().String("endpoint", "", "S3 endpoint") + rootCmd.PersistentFlags().String("region", "", "S3 region") + rootCmd.PersistentFlags().String("bucket", "", "S3 bucket") + rootCmd.PersistentFlags().String("pd", "http://127.0.0.1:2379", "Tikv placement driber") + rootCmd.PersistentFlags().String("name", "", "Backup name") + rootCmd.PersistentFlags().String("archive_start_file", "000000010000000000000001", "start key of archive[included]") +} + +func Execute() { + if err := rootCmd.Execute(); err != nil { + panic(err) + } +} + +func runRoot(cmd *cobra.Command, args []string) { + fmt.Printf("Welcome to use hbr for He3DB backup&restore\n") +} diff --git a/hbr/cmd/restore.go b/hbr/cmd/restore.go new file mode 100644 index 0000000..6ec2eb4 --- /dev/null +++ b/hbr/cmd/restore.go @@ -0,0 +1,128 @@ +package cmd + +import ( + "context" + "fmt" + "io/ioutil" + "strconv" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + "github.com/spf13/cobra" + "github.com/tikv/client-go/txnkv" +) + +var restoreCmd = &cobra.Command{ + Use: "restore", + Short: "Restore He3DB", + Long: "Welcome to use hbr for He3DB restore", + Run: runRestore, +} + +func init() { + rootCmd.AddCommand(restoreCmd) +} + +func runRestore(cmd *cobra.Command, args 
[]string) { + access_key, _ := cmd.Flags().GetString("access_key") + secret_key, _ := cmd.Flags().GetString("secret_key") + endpoint, _ := cmd.Flags().GetString("endpoint") + region, _ := cmd.Flags().GetString("region") + bucket, _ := cmd.Flags().GetString("bucket") + pd, _ := cmd.Flags().GetString("pd") + backup_name, _ := cmd.Flags().GetString("name") + + if access_key == "" || secret_key == "" || endpoint == "" || region == "" || bucket == "" || pd == "" || backup_name == "" { + fmt.Printf("PARAMETER ERROR!\n") + return + } + + client, err := txnkv.NewClient([]string{pd}) + if err != nil { + fmt.Printf("Connect Tikv Error!\n%v\n", err) + return + } + defer func() { + client.Close() + }() + + sess, err := session.NewSession(&aws.Config{ + Region: aws.String(region), + Endpoint: aws.String(endpoint), + Credentials: credentials.NewStaticCredentials(access_key, secret_key, ""), + S3ForcePathStyle: aws.Bool(true), + }) + if err != nil { + fmt.Printf("Connect S3 Error!\n%v\n", err) + return + } + s3_client := s3.New(sess) + + count := 0 + input := &s3.ListObjectsInput{ + Bucket: aws.String(bucket), + Prefix: aws.String(backup_name), + } + for { + resp, err := s3_client.ListObjects(input) + if err != nil { + fmt.Printf("S3 ListObjects Error!\n%v\n", err) + return + } + + for _, keys := range resp.Contents { + out, err := s3_client.GetObject(&s3.GetObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(*keys.Key), + }) + if err != nil { + fmt.Printf("S3 GetObject Error!\n%v\n", err) + return + } + defer out.Body.Close() + + data, err := ioutil.ReadAll(out.Body) + if err != nil { + fmt.Printf("out.Body.Read!\n%v\n", err) + return + } + + txn, err := client.Begin() + if err != nil { + fmt.Printf("Tikv Transaction Begin Error!\n%v\n", err) + return + } + + fmt.Printf("filename:%s\n", (*keys.Key)[len(backup_name)+1:]) + + ret := make([]byte, (len(*keys.Key)-len(backup_name)-1)/2) + index := 0 + for i := len(backup_name) + 1; i < len(*keys.Key); i += 2 { + value, _ := 
strconv.ParseUint((*keys.Key)[i:i+2], 16, 8) + ret[index] = byte(0xff & value) + index++ + + } + + if err := txn.Set(ret, data); err != nil { + fmt.Printf("Tikv Set Error!\n%v\n", err) + return + } + + if err := txn.Commit(context.TODO()); err != nil { + fmt.Printf("Tikv Transaction Commit Error!\n%v\n", err) + return + } + count++ + } + if resp.NextMarker == nil { + fmt.Printf("Done!\n") + break + } + input.Marker = resp.NextMarker + } + fmt.Printf("N:%v\n", count) + fmt.Printf("Done!\n") +} diff --git a/hbr/cmd/version.go b/hbr/cmd/version.go new file mode 100644 index 0000000..7bc6ae0 --- /dev/null +++ b/hbr/cmd/version.go @@ -0,0 +1,21 @@ +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +var versionCmd = &cobra.Command{ + Use: "version", + Short: "Show Version", + Run: runVersion, +} + +func init() { + rootCmd.AddCommand(versionCmd) +} + +func runVersion(cmd *cobra.Command, args []string) { + fmt.Println("Version 1.0.0 ") +} diff --git a/hbr/main.go b/hbr/main.go new file mode 100644 index 0000000..1c16660 --- /dev/null +++ b/hbr/main.go @@ -0,0 +1,9 @@ +package main + +import ( + "hbr/cmd" +) + +func main() { + cmd.Execute() +} diff --git a/initdata/masterconf/postgresql.conf b/initdata/masterconf/postgresql.conf index 9f4052b..c12c9ed 100644 --- a/initdata/masterconf/postgresql.conf +++ b/initdata/masterconf/postgresql.conf @@ -93,7 +93,7 @@ max_connections = 100 # (change requires restart) # - Authentication - #authentication_timeout = 1min # 1s-600s -#password_encryption = scram-sha-256 # scram-sha-256 or md5 + #db_user_namespace = off # GSSAPI using Kerberos diff --git a/initdata/pushconf/postgresql.conf b/initdata/pushconf/postgresql.conf index a816004..27f6089 100644 --- a/initdata/pushconf/postgresql.conf +++ b/initdata/pushconf/postgresql.conf @@ -93,7 +93,7 @@ max_connections = 100 # (change requires restart) # - Authentication - #authentication_timeout = 1min # 1s-600s -password_encryption = md5 # scram-sha-256 or md5 + 
#db_user_namespace = off # GSSAPI using Kerberos diff --git a/script/activehe3pgfromhe3pg.sh b/script/activehe3pgfromhe3pg.sh new file mode 100644 index 0000000..9d75eb6 --- /dev/null +++ b/script/activehe3pgfromhe3pg.sh @@ -0,0 +1,46 @@ +#!/bin/bash +export PATH=/home/postgres/psql14/bin:$PATH +export PGDATABASE=postgres +export PGHOST=127.0.0.1 +export PGUSER=postgres +export PGPORT=15433 +export PGPASSWORD=123456 +slaveDataDir=/home/postgres/slavedata/pgdata +slaveConninfo='application_name=pushstandby user=repl password=123456 host=127.0.0.1 port=15433 sslmode=disable sslcompression=0 gssencmode=disable target_session_attrs=any' +pushDataDir=/home/postgres/data/pgdata +pushImdbPageDirectory=/tmp/pushpagedb +pushImdbWalDirectory=/tmp/pushwaldb +pushLogfile=/home/postgres/logfile2 + +pg_ctl -D $pushDataDir -l $pushLogfile stop +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB push instance stop failed!" + exit 1 +fi + +sed -i 's/^primary_conninfo/#primary_conninfo/g' $pushDataDir/postgresql.auto.conf +sed -i 's/^primary_conninfo/#primary_conninfo/g' $pushDataDir/postgresql.conf + +echo -e "primary_conninfo = '$slaveConninfo'" >> $pushDataDir/postgresql.conf + +sed -i 's/^hot_standby/#hot_standby/g' $slaveDataDir/postgresql.conf +sed -i 's/^primary_conninfo/#primary_conninfo/g' $slaveDataDir/postgresql.auto.conf +sed -i 's/^primary_conninfo/#primary_conninfo/g' $slaveDataDir/postgresql.conf + +echo -e "hot_standby=off" >> $slaveDataDir/postgresql.conf + +psql -c 'SELECT pg_promote(true, 30)' +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB slave instance promote failed!" + exit 1 +fi + +rm -rf $pushImdbPageDirectory $pushImdbWalDirectory +pg_ctl -D $pushDataDir -l $pushLogfile start +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB push instance start failed!" 
+ exit 1 +fi \ No newline at end of file diff --git a/script/activehe3pgfromhe3pgforprivate.sh b/script/activehe3pgfromhe3pgforprivate.sh new file mode 100644 index 0000000..b78f566 --- /dev/null +++ b/script/activehe3pgfromhe3pgforprivate.sh @@ -0,0 +1,44 @@ +#!/bin/bash +export PATH=/home/postgres/psql14/bin:$PATH +export PGDATABASE=postgres +export PGHOST=127.0.0.1 +export PGUSER=postgres +export PGPORT=15433 +export PGPASSWORD=123456 +slaveDataDir=/home/postgres/slavedata/pgdata +slavepushDataDir=/home/postgres/slavepushdata/pgdata + +sed -i 's/^he3share/#he3share/g' $slaveDataDir/postgresql.auto.conf +sed -i 's/^he3share/#he3share/g' $slaveDataDir/postgresql.conf +sed -i 's/^hot_standby/#hot_standby/g' $slaveDataDir/postgresql.conf +sed -i 's/^primary_conninfo/#primary_conninfo/g' $slaveDataDir/postgresql.auto.conf +sed -i 's/^primary_conninfo/#primary_conninfo/g' $slaveDataDir/postgresql.conf + +echo -e "he3share = on" >> $slaveDataDir/postgresql.conf +echo -e "hot_standby=off" >> $slaveDataDir/postgresql.conf + +sed -i 's/^mpush/#mpush/g' $slavepushDataDir/postgresql.conf +sed -i 's/^mpush/#mpush/g' $slavepushDataDir/postgresql.auto.conf + +echo -e "mpush=on" >> $slavepushDataDir/postgresql.conf + +psql -c 'SELECT pg_promote(true, 30)' +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB slave instance promote failed!" + exit 1 +fi + +pg_ctl -D $slaveDataDir reload +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB slave instance reload failed!" + exit 1 +fi + +pg_ctl -D $slavepushDataDir reload +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB push instance reload failed!" 
+ exit 1 +fi \ No newline at end of file diff --git a/script/activehe3pgfrompg.sh b/script/activehe3pgfrompg.sh new file mode 100644 index 0000000..9cd538c --- /dev/null +++ b/script/activehe3pgfrompg.sh @@ -0,0 +1,75 @@ +#!/bin/bash +export PATH=/home/postgres/psql14/bin:$PATH +primaryDataDir=/home/postgres/primary/pgdata +primaryImdbPageDirectory=/tmp/primarypagedb +primaryImdbWalDirectory=/tmp/primarywaldb +primaryLogfile=/home/postgres/primarylogfile +primaryPort=15432 +primaryConninfo='application_name=pushstandby user=repl password=123456 host=127.0.0.1 port=15432 sslmode=disable sslcompression=0 gssencmode=disable target_session_attrs=any' +pushDataDir=/home/postgres/push/pgdata +pushImdbPageDirectory=/tmp/pushpagedb +pushImdbWalDirectory=/tmp/pushwaldb +pushLogfile=/home/postgres/pushlogfile + +if [ ! -d "$primaryDataDir" ]; then + echo "$primaryDataDir does not exist!" + exit 1 +fi + +if [ "`ls -A $primaryDataDir`" != "" ]; then + echo "$primaryDataDir is not enpty!" + exit 1 +fi + +pg_ctl -D $pushDataDir -l $pushLogfile stop +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB stop failed!" + exit 1 +fi + +sed -i 's/^primary_conninfo/#primary_conninfo/g' $pushDataDir/postgresql.auto.conf +sed -i 's/^primary_conninfo/#primary_conninfo/g' $pushDataDir/postgresql.conf +sed -i 's/^he3mirror/#he3mirror/g' $pushDataDir/postgresql.conf + +rsync -av --exclude base --exclude global --exclude standby.signal --exclude backup_label.old --exclude backup_manifest $pushDataDir/* $primaryDataDir/ +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): sync data file failed!" 
+ exit 1 +fi + +ln -s $pushDataDir/base $primaryDataDir/base +ln -s $pushDataDir/global $primaryDataDir/global + +echo -e "primary_conninfo = '$primaryConninfo'" >> $pushDataDir/postgresql.conf +echo -e "he3mirror=false" >> $pushDataDir/postgresql.conf + +sed -i 's/^push_standby/#push_standby/g' $primaryDataDir/postgresql.conf +sed -i 's/^hot_standby/#hot_standby/g' $primaryDataDir/postgresql.conf +sed -i 's/^port/#port/g' $primaryDataDir/postgresql.conf +sed -i 's/^lmdb_page_directory/#lmdb_page_directory/g' $primaryDataDir/postgresql.conf +sed -i 's/^lmdb_wal_directory/#lmdb_wal_directory/g' $primaryDataDir/postgresql.conf + +echo -e "push_standby=off" >> $primaryDataDir/postgresql.conf +echo -e "hot_standby=off" >> $primaryDataDir/postgresql.conf +echo -e "port=$primaryPort" >> $primaryDataDir/postgresql.conf +echo -e "he3mirror=false" >> $primaryDataDir/postgresql.conf +echo -e "lmdb_page_directory='$primaryImdbPageDirectory'" >> $primaryDataDir/postgresql.conf +echo -e "lmdb_wal_directory='$primaryImdbWalDirectory'" >> $primaryDataDir/postgresql.conf + +rm -rf $primaryImdbPageDirectory $primaryImdbWalDirectory $pushImdbPageDirectory $pushImdbWalDirectory + +pg_ctl -D $primaryDataDir -l $primaryLogfile start +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB primary instance start failed!" + exit 1 +fi + +pg_ctl -D $pushDataDir -l $pushLogfile start +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB push instance start failed!" 
+ exit 1 +fi \ No newline at end of file diff --git a/script/activehe3pgfrompgforprivate.sh b/script/activehe3pgfrompgforprivate.sh new file mode 100644 index 0000000..ea51033 --- /dev/null +++ b/script/activehe3pgfrompgforprivate.sh @@ -0,0 +1,76 @@ +#!/bin/bash +export PATH=/home/postgres/psql14/bin:$PATH +primaryDataDir=/home/postgres/primary/pgdata +primaryImdbPageDirectory=/tmp/primarypagedb +primaryImdbWalDirectory=/tmp/primarywaldb +primaryLogfile=/home/postgres/primarylogfile +primaryPort=15432 +primaryConninfo='application_name=pushstandby user=repl password=123456 host=127.0.0.1 port=15432 sslmode=disable sslcompression=0 gssencmode=disable target_session_attrs=any' +pushDataDir=/home/postgres/push/pgdata +pushImdbPageDirectory=/tmp/pushpagedb +pushImdbWalDirectory=/tmp/pushwaldb +pushLogfile=/home/postgres/pushlogfile + +if [ ! -d "$primaryDataDir" ]; then + echo "$primaryDataDir does not exist!" + exit 1 +fi + +if [ "`ls -A $primaryDataDir`" != "" ]; then + echo "$primaryDataDir is not enpty!" + exit 1 +fi + +pg_ctl -D $pushDataDir -l $pushLogfile stop +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB stop failed!" + exit 1 +fi + +sed -i 's/^primary_conninfo/#primary_conninfo/g' $pushDataDir/postgresql.auto.conf +sed -i 's/^primary_conninfo/#primary_conninfo/g' $pushDataDir/postgresql.conf +sed -i 's/^he3mirror/#he3mirror/g' $pushDataDir/postgresql.conf + +rsync -av --exclude base --exclude global --exclude standby.signal --exclude backup_label.old --exclude backup_manifest $pushDataDir/* $primaryDataDir/ +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): sync data file failed!" 
+ exit 1 +fi + +ln -s $pushDataDir/base $primaryDataDir/base +ln -s $pushDataDir/global $primaryDataDir/global + +echo -e "primary_conninfo = '$primaryConninfo'" >> $pushDataDir/postgresql.conf +echo -e "he3mirror=false" >> $pushDataDir/postgresql.conf +echo -e "mpush=on" >> $pushDataDir/postgresql.conf + +sed -i 's/^push_standby/#push_standby/g' $primaryDataDir/postgresql.conf +sed -i 's/^hot_standby/#hot_standby/g' $primaryDataDir/postgresql.conf +sed -i 's/^port/#port/g' $primaryDataDir/postgresql.conf +sed -i 's/^lmdb_page_directory/#lmdb_page_directory/g' $primaryDataDir/postgresql.conf +sed -i 's/^lmdb_wal_directory/#lmdb_wal_directory/g' $primaryDataDir/postgresql.conf + +echo -e "push_standby=off" >> $primaryDataDir/postgresql.conf +echo -e "hot_standby=off" >> $primaryDataDir/postgresql.conf +echo -e "port=$primaryPort" >> $primaryDataDir/postgresql.conf +echo -e "he3mirror=false" >> $primaryDataDir/postgresql.conf +echo -e "lmdb_page_directory='$primaryImdbPageDirectory'" >> $primaryDataDir/postgresql.conf +echo -e "lmdb_wal_directory='$primaryImdbWalDirectory'" >> $primaryDataDir/postgresql.conf + +rm -rf $primaryImdbPageDirectory $primaryImdbWalDirectory $pushImdbPageDirectory $pushImdbWalDirectory + +pg_ctl -D $primaryDataDir -l $primaryLogfile start +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB primary instance start failed!" + exit 1 +fi + +pg_ctl -D $pushDataDir -l $pushLogfile start +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB push instance start failed!" 
+ exit 1 +fi \ No newline at end of file diff --git a/script/addslaveandpusherforprivate.sh b/script/addslaveandpusherforprivate.sh new file mode 100644 index 0000000..68ab403 --- /dev/null +++ b/script/addslaveandpusherforprivate.sh @@ -0,0 +1,115 @@ +#!/bin/bash +export PATH=/home/postgres/psql14/bin:$PATH +export PGPASSWORD=123456 + +bakDataDir=/home/postgres/bak1 +bakInstancePort=15432 +bakInstanceUser=repl +bakInstanceHost=127.0.0.1 + +slaveDataDir=/home/postgres/slavedata2/pgdata +slaveImdbPageDirectory=/tmp/slave2pagedb +slaveImdbWalDirectory=/tmp/slave2waldb +slaveLogfile=/home/postgres/slave2logfile +slavePort=15433 +slaveConninfo='application_name=pushstandby2 user=repl password=123456 host=127.0.0.1 port=15433 sslmode=disable sslcompression=0 gssencmode=disable target_session_attrs=any' + +pushDataDir=/home/postgres/pushdata2/pgdata +pushImdbPageDirectory=/tmp/push2pagedb +pushImdbWalDirectory=/tmp/push2waldb +pushLogfile=/home/postgres/push2logfile +pushPort=15434 + +if [ ! -d "$bakDataDir" ]; then + echo "$bakDataDir does not exist!" + exit 1 +fi + +if [ "`ls -A $bakDataDir`" != "" ]; then + echo "$bakDataDir is not enpty!" + exit 1 +fi + +if [ ! -d "$slaveDataDir" ]; then + echo "$slaveDataDir does not exist!" + exit 1 +fi + +if [ "`ls -A $slaveDataDir`" != "" ]; then + echo "$slaveDataDir is not enpty!" + exit 1 +fi + +if [ ! -d "$pushDataDir" ]; then + echo "$pushDataDir does not exist!" + exit 1 +fi + +if [ "`ls -A $pushDataDir`" != "" ]; then + echo "$pushDataDir is not enpty!" + exit 1 +fi + +pg_basebackup -F p --progress -X none -h $bakInstanceHost -p $bakInstancePort -U $bakInstanceUser -v -D $bakDataDir +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB pg_basebackup failed!" 
+ exit 1 +fi + +rsync -av $bakDataDir/* $pushDataDir/ + +sed -i 's/^push_standby/#push_standby/g' $pushDataDir/postgresql.conf +sed -i 's/^port/#port/g' $pushDataDir/postgresql.conf +sed -i 's/^lmdb_page_directory/#lmdb_page_directory/g' $pushDataDir/postgresql.conf +sed -i 's/^lmdb_wal_directory/#lmdb_wal_directory/g' $pushDataDir/postgresql.conf +sed -i 's/^he3share/#he3share/g' $pushDataDir/postgresql.conf +sed -i 's/^mpush/#mpush/g' $pushDataDir/postgresql.conf + +echo -e "he3_point_in_time_recovery = on" >> $pushDataDir/postgresql.auto.conf + +rsync -av --exclude base --exclude global $pushDataDir/* $slaveDataDir/ +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): sync data file failed!" + exit 1 +fi + +ln -s $pushDataDir/base $slaveDataDir/base +ln -s $pushDataDir/global $slaveDataDir/global + +echo -e "push_standby=off" >> $slaveDataDir/postgresql.conf +echo -e "port=$slavePort" >> $slaveDataDir/postgresql.conf +echo -e "lmdb_page_directory='$slaveImdbPageDirectory'" >> $slaveDataDir/postgresql.conf +echo -e "lmdb_wal_directory='$slaveImdbWalDirectory'" >> $slaveDataDir/postgresql.conf +echo -e "he3share=off" >> $slaveDataDir/postgresql.conf + +sed -i 's/^primary_conninfo/#primary_conninfo/g' $pushDataDir/postgresql.auto.conf +sed -i 's/^primary_conninfo/#primary_conninfo/g' $pushDataDir/postgresql.conf +echo -e "primary_conninfo = '$slaveConninfo'" >> $pushDataDir/postgresql.conf +echo -e "push_standby=on" >> $pushDataDir/postgresql.conf +echo -e "port=$pushPort" >> $pushDataDir/postgresql.conf +echo -e "lmdb_page_directory='$pushImdbPageDirectory'" >> $pushDataDir/postgresql.conf +echo -e "lmdb_wal_directory='$pushImdbWalDirectory'" >> $pushDataDir/postgresql.conf + +rm -rf $slaveImdbPageDirectory $slaveImdbWalDirectory $pushImdbPageDirectory $pushImdbWalDirectory + +chmod 0750 $slaveDataDir -R +chmod 0750 $pushDataDir -R + +pg_ctl -D $slaveDataDir -l $slaveLogfile start +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB slave instance start failed!" 
+ exit 1 +fi + +pg_ctl -D $pushDataDir -l $pushLogfile start +if [ $? -ne 0 ] +then + echo "$(date "+%F %T"): He3DB push instance start failed!" + exit 1 +fi + +sed -i 's/^he3_point_in_time_recovery/#he3_point_in_time_recovery/g' $slaveDataDir/postgresql.auto.conf +sed -i 's/^he3_point_in_time_recovery/#he3_point_in_time_recovery/g' $pushDataDir/postgresql.auto.conf \ No newline at end of file diff --git a/src/backend/Makefile b/src/backend/Makefile index f79bbf6..5cb6726 100644 --- a/src/backend/Makefile +++ b/src/backend/Makefile @@ -120,10 +120,10 @@ $(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport # The postgres.o target is needed by the rule in Makefile.global that # creates the exports file when MAKE_EXPORTS = true. LIBS += $(libpq) -libredis = -L$(top_builddir)/src/backend/access/transam/ -lhiredis -LIBS += $(libredis) -libfs = -L$(top_builddir)/src/backend/storage/file/ -lfs -lz -lpthread -lm -LIBS += $(libfs) +librust_log = -L$(top_builddir)/src/backend/storage/file/ -lrust_log -lstdc++ -lm -ldl -lpthread -lfuse3 -Wl,-gc-section +LIBS += $(librust_log) +libglib = -L/usr/lib/x86_64-linux-gnu/ -lglib-2.0 -I/usr/include/glib-2.0/ -I/usr/lib/x86_64-linux-gnu/glib-2.0/include/ -lpthread -llmdb +LIBS += $(libglib) postgres.o: $(OBJS) $(CC) $(LDREL) $(call expand_subsys,$^) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@ diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index fee6fce..5c02e2d 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -211,9 +211,6 @@ brin_xlog_update(XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; xl_brin_update *xlrec = (xl_brin_update *) XLogRecGetData(record); Buffer buffer; - RelFileNode rnode; - BlockNumber blkno; - ForkNumber forknum; XLogRedoAction action; //XLogRecGetBlockTag(record, 0, &rnode, &forknum, &blkno); @@ -504,59 +501,10 @@ he3_brin_xlog_desummarize_page(XLogReaderState *record) } } -static void 
-brinRedoCommon(XLogReaderState *record){ - XLogRecPtr lsn = record->currRecPtr; //最终要推进到的lsn - Buffer buffer; //lsn要修改的页的buffer - uint8 blockNum = (uint8) XLogRecGetBlockNum(record); - - RelFileNode rnode; - ForkNumber forkNum; - BlockNumber blkno; - - Page page; - - XLogRecPtr procLsn; //从page的lsn到最终要推进到的lsn中间的lsn - char *errormsg; - - /* - * 获取lsn所要修改的page - */ - XLogRecGetBlockTag(record, blockNum, &rnode, &forkNum, &blkno); - buffer = XLogReadBufferExtended(rnode, forkNum, blkno,RBM_NORMAL); - - page = BufferGetPage(buffer); - procLsn = PageGetLSN(page); //获取页的lsn - XLogBeginRead(record, procLsn); - - while (procLsn < lsn) { - BlockNumber tmpBlockNo; //页面的lsn+1的lsn修改的页面的页号 - uint8 tmpBlockId; //页面的lsn+1的lsn的block_id - - //定位到page的lsn - - XLogReadRecord(record, &errormsg); - - tmpBlockId = (uint8) XLogRecGetBlockNum(record); - tmpBlockNo = record->blocks[tmpBlockId].blkno; - //page的lsn后的第一个lsn是不是还是修改这个page? - if (tmpBlockNo != blkno) { - procLsn = record->EndRecPtr; - continue; - }else{ - brin_redo(record); - } - } - -} void brin_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info & XLOG_BRIN_OPMASK) diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 4c41126..e437953 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -972,7 +972,6 @@ ginRedoDeleteListPages(XLogReaderState *record) Buffer metabuffer; Page metapage; int i; - uint8 blocknum = XLogRecGetBlockNum(record); metabuffer = XLogInitBufferForRedo(record, 0); Assert(BufferGetBlockNumber(metabuffer) == GIN_METAPAGE_BLKNO); metapage = BufferGetPage(metabuffer); @@ -1052,10 +1051,6 @@ he3GinRedoDeleteListPages(XLogReaderState *record) void gin_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; MemoryContext oldCtx; diff --git 
a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index bd4bf09..8203c38 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -276,7 +276,6 @@ he3gistRedoPageSplitRecord(XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; gistxlogPageSplit *xldata = (gistxlogPageSplit *) XLogRecGetData(record); - Buffer firstbuffer = InvalidBuffer; Buffer buffer; Page page; bool isrootsplit = false; @@ -319,7 +318,7 @@ he3gistRedoPageSplitRecord(XLogReaderState *record) nextblkno = decodeNextBlockNumber(data, datalen); } else { offset = sizeof(Buffer); - firstbuffer = decodeFirstBuffer(data,datalen); + decodeFirstBuffer(data,datalen); } tuples = decodePageSplitRecord(data+offset, datalen-offset, &num); @@ -427,10 +426,6 @@ gistRedoPageReuse(XLogReaderState *record) void gist_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; MemoryContext oldCxt; diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c index db3839b..f58e258 100644 --- a/src/backend/access/hash/hash_xlog.c +++ b/src/backend/access/hash/hash_xlog.c @@ -34,7 +34,7 @@ hash_xlog_init_meta_page(XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; Page page; Buffer metabuf; - ForkNumber forknum; + // ForkNumber forknum; xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record); @@ -53,9 +53,9 @@ hash_xlog_init_meta_page(XLogReaderState *record) * special handling for init forks as create index operations don't log a * full page image of the metapage. 
*/ - XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(metabuf); + // XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); + // if (forknum == INIT_FORKNUM) + // FlushOneBuffer(metabuf); /* all done */ UnlockReleaseBuffer(metabuf); @@ -97,8 +97,8 @@ he3hash_xlog_init_bitmap_page(XLogReaderState *record) * full page image of the metapage. */ XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(bitmapbuf); + // if (forknum == INIT_FORKNUM) + // FlushOneBuffer(bitmapbuf); UnlockReleaseBuffer(bitmapbuf); break; } @@ -122,9 +122,9 @@ he3hash_xlog_init_bitmap_page(XLogReaderState *record) PageSetLSN(page, lsn); MarkBufferDirty(metabuf); - XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); - if (forknum == INIT_FORKNUM) - FlushOneBuffer(metabuf); + // XLogRecGetBlockTag(record, 0, NULL, &forknum, NULL); + // if (forknum == INIT_FORKNUM) + // FlushOneBuffer(metabuf); } if (BufferIsValid(metabuf)) UnlockReleaseBuffer(metabuf); @@ -408,8 +408,6 @@ he3hash_xlog_split_allocate_page(XLogReaderState *record) newbuf = XLogInitBufferForRedo(record, 0); _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket, xlrec->new_bucket_flag, true); - if (!IsBufferCleanupOK(newbuf)) - elog(PANIC, "hash_xlog_split_allocate_page: failed to acquire cleanup lock"); MarkBufferDirty(newbuf); PageSetLSN(BufferGetPage(newbuf), lsn); @@ -749,6 +747,10 @@ he3hash_xlog_squeeze_page(XLogReaderState *record) * is to ensure a cleanup lock on primary bucket page. 
*/ (void) XLogReadBufferForRedoExtended(record, 0, mode, true, &bucketbuf); + Page writepage; + writepage = (Page) BufferGetPage(bucketbuf); + PageSetLSN(writepage, lsn); + MarkBufferDirty(bucketbuf); } if (mode != RBM_NORMAL_VALID && BufferIsValid(bucketbuf)) UnlockReleaseBuffer(bucketbuf); @@ -765,6 +767,7 @@ he3hash_xlog_squeeze_page(XLogReaderState *record) action = XLogReadBufferForRedoExtended(record, 0, mode, true, &writebuf); else { + mode = RBM_NORMAL; action = XLogReadBufferForRedo(record, 0, &writebuf); } @@ -962,7 +965,6 @@ he3hash_xlog_delete(XLogReaderState *record) Buffer deletebuf = InvalidBuffer; Page page; XLogRedoAction action; - Buffer buffer = InvalidBuffer; RelFileNode rnode; BlockNumber blkno; ForkNumber forknum; @@ -1007,6 +1009,7 @@ he3hash_xlog_delete(XLogReaderState *record) action = XLogReadBufferForRedoExtended(record, 0, mode, true, &deletebuf); else { + mode = RBM_NORMAL; action = XLogReadBufferForRedo(record, 0, &deletebuf); } /* replay the record for deleting entries in bucket page */ @@ -1209,10 +1212,6 @@ he3hash_xlog_vacuum_one_page(XLogReaderState *record) void hash_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 42496be..1fb5a3b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6065,7 +6065,7 @@ heap_inplace_update(Relation relation, HeapTuple tuple) /* inplace updates aren't decoded atm, don't log the origin */ recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE); - XLogFlush(recptr); +// XLogFlush(recptr); PageSetLSN(page, recptr); } @@ -8491,6 +8491,11 @@ heap_xlog_prune(XLogReaderState *record) } if (mode != RBM_NORMAL_VALID && BufferIsValid(buffer)) + { + UnlockReleaseBuffer(buffer); + } + + /*if (mode != RBM_NORMAL_VALID && BufferIsValid(buffer)) { Size freespace = 
PageGetHeapFreeSpace(BufferGetPage(buffer)); @@ -8505,8 +8510,8 @@ heap_xlog_prune(XLogReaderState *record) * Do this regardless of a full-page image being applied, since the * FSM data is not in the page anyway. */ - XLogRecordPageWithFreeSpace(rnode, blkno, freespace); - } + /*XLogRecordPageWithFreeSpace(rnode, blkno, freespace); + }*/ } /* @@ -8560,6 +8565,11 @@ heap_xlog_vacuum(XLogReaderState *record) } if (BufferIsValid(buffer)) + { + UnlockReleaseBuffer(buffer); + } + + /*if (BufferIsValid(buffer)) { Size freespace = PageGetHeapFreeSpace(BufferGetPage(buffer)); RelFileNode rnode; @@ -8577,8 +8587,8 @@ heap_xlog_vacuum(XLogReaderState *record) * Do this regardless of a full-page image being applied, since the * FSM data is not in the page anyway. */ - XLogRecordPageWithFreeSpace(rnode, blkno, freespace); - } + /*XLogRecordPageWithFreeSpace(rnode, blkno, freespace); + }*/ } /* @@ -8642,6 +8652,8 @@ heap_xlog_visible(XLogReaderState *record) PageSetAllVisible(page); MarkBufferDirty(buffer); + + PageSetLSN(page, lsn); } else if (action == BLK_RESTORED) { @@ -8653,6 +8665,11 @@ heap_xlog_visible(XLogReaderState *record) } if (BufferIsValid(buffer)) + { + UnlockReleaseBuffer(buffer); + } + + /*if (BufferIsValid(buffer)) { Size space = PageGetFreeSpace(BufferGetPage(buffer)); @@ -8675,9 +8692,9 @@ heap_xlog_visible(XLogReaderState *record) * Do this regardless of a full-page image being applied, since the * FSM data is not in the page anyway. 
*/ - if (xlrec->flags & VISIBILITYMAP_VALID_BITS) + /*if (xlrec->flags & VISIBILITYMAP_VALID_BITS) XLogRecordPageWithFreeSpace(rnode, blkno, space); - } + }*/ /* * Even if we skipped the heap page update due to the LSN interlock, it's @@ -8842,6 +8859,11 @@ he3_heap_xlog_visible(XLogReaderState *record) } if (BufferIsValid(buffer)) + { + UnlockReleaseBuffer(buffer); + } + + /*if (BufferIsValid(buffer)) { Size space = PageGetFreeSpace(BufferGetPage(buffer)); @@ -8864,18 +8886,11 @@ he3_heap_xlog_visible(XLogReaderState *record) * Do this regardless of a full-page image being applied, since the * FSM data is not in the page anyway. */ - if (xlrec->flags & VISIBILITYMAP_VALID_BITS) + /*if (xlrec->flags & VISIBILITYMAP_VALID_BITS) XLogRecordPageWithFreeSpace(rnode, blkno, space); - } + }*/ break; - - - } - - - - - + } } @@ -9156,8 +9171,8 @@ heap_xlog_insert(XLogReaderState *record) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. */ - if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(target_node, blkno, freespace); + // if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + // XLogRecordPageWithFreeSpace(target_node, blkno, freespace); } /* @@ -9312,8 +9327,8 @@ heap_xlog_multi_insert(XLogReaderState *record) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. */ - if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(rnode, blkno, freespace); + // if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5) + // XLogRecordPageWithFreeSpace(rnode, blkno, freespace); } /* @@ -9592,8 +9607,8 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) * don't bother to update the FSM in that case, it doesn't need to be * totally accurate anyway. 
*/ - if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) - XLogRecordPageWithFreeSpace(rnode, newblk, freespace); + // if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5) + // XLogRecordPageWithFreeSpace(rnode, newblk, freespace); break; } @@ -9886,10 +9901,6 @@ heap_xlog_inplace(XLogReaderState *record) void heap_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; /* @@ -9936,10 +9947,6 @@ heap_redo(XLogReaderState *record) void heap2_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; switch (info & XLOG_HEAP_OPMASK) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index e198df6..47b530c 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -91,6 +91,7 @@ #include "access/xlog.h" #include "miscadmin.h" #include "port/pg_bitutils.h" +#include "postmaster/secondbuffer.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "storage/smgr.h" @@ -254,8 +255,8 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk); #endif - Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); - Assert(InRecovery || BufferIsValid(heapBuf)); + //Assert(InRecovery || XLogRecPtrIsInvalid(recptr)); + //Assert(InRecovery || BufferIsValid(heapBuf)); Assert(flags & VISIBILITYMAP_VALID_BITS); /* Check that we have the right heap page pinned, if present */ @@ -656,6 +657,18 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now, pg.data, false); + if (!(InitdbSingle || IsBootstrapProcessingMode() == true) && !push_standby && !he3mirror) + { + PageKey pageKey; + pageKey.relfileNode.dbNode = 
rel->rd_smgr->smgr_rnode.node.dbNode; + pageKey.relfileNode.relNode = rel->rd_smgr->smgr_rnode.node.relNode; + + pageKey.blkNo = vm_nblocks_now; + pageKey.forkNo = VISIBILITYMAP_FORKNUM; + pageKey.pageLsn = 0; + + ReceivePageFromDataBuffer(&pageKey, (uint8_t *) pg.data); + } vm_nblocks_now++; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 78f78e7..5725812 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -681,8 +681,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) else { /* overwriting a block we zero-filled before */ - smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, - (char *) page, true); + XLogRecPtr lsn = PageGetLSN(page); + he3dbsmgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno, + (char *) page, true, lsn); } pfree(page); diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 4479d0a..153095d 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -1505,9 +1505,6 @@ spgRedoVacuumRedirect(XLogReaderState *record) void spg_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; MemoryContext oldCxt; diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index edf6483..1e6354a 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -11,7 +11,7 @@ subdir = src/backend/access/transam top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -override CPPFLAGS := -I$(srcdir) -I$(libpq_srcdir) -lhiredis $(CPPFLAGS) +override CPPFLAGS := -I$(srcdir) -I/usr/include/glib-2.0/ -I/usr/lib/x86_64-linux-gnu/glib-2.0/include/ -L/usr/lib/x86_64-linux-gnu/ -lglib-2.0 -lpthread -I$(libpq_srcdir) $(CPPFLAGS) OBJS = \ clog.o \ @@ -34,7 +34,11 @@ OBJS = \ xlogfuncs.o \ xloginsert.o \ xlogreader.o \ - xlogutils.o + xlogutils.o \ + pagehashqueue.o \ + ringbuffer.o \ + pthreadpool.o \ + pg_mirror.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index 305238a..63301a1 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -477,9 +477,6 @@ applyPageRedo(Page page, const char *delta, Size deltaSize) void generic_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } XLogRecPtr lsn = record->EndRecPtr; Buffer buffers[MAX_GENERIC_XLOG_PAGES]; uint8 block_id; diff --git a/src/backend/access/transam/libhiredis.a b/src/backend/access/transam/libhiredis.a deleted file mode 100644 index e04fc0f..0000000 Binary files a/src/backend/access/transam/libhiredis.a and /dev/null differ diff --git a/src/backend/access/transam/pagehashqueue.c b/src/backend/access/transam/pagehashqueue.c new file mode 100644 index 0000000..0b7fcd1 --- /dev/null +++ b/src/backend/access/transam/pagehashqueue.c @@ -0,0 +1,909 @@ +#include "access/pagehashqueue.h" +#include +#include "utils/palloc.h" +#include "utils/hsearch.h" +#include "storage/pmsignal.h" +#include "miscadmin.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "postmaster/interrupt.h" +#include "libpq/pqsignal.h" +#include "storage/ipc.h" +#include "utils/wait_event.h" +#include "c.h" +#include "utils/ps_status.h" +#include "storage/procsignal.h" +#include "utils/memutils.h" +#include "postmaster/fork_process.h" +#include 
"postmaster/postmaster.h" +#include "storage/proc.h" +#include "access/pushpage.h" +#include "storage/buf_internals.h" +#include "utils/guc.h" +#include "storage/he3db_logindex.h" +#include "utils/hfs.h" + +static void WakeupFlushWork(void); +XLogRecPtr *g_redoStartLsn; +static HTAB *PageLogindexHash = NULL; +static int MaxNum(int num) { + if (num <= 0) return 1; + if ((num & (num - 1)) == 0) return num; + num |= num >> 1; + num |= num >> 2; + num |= num >> 4; + num |= num >> 8; + num |= num >> 16; + return num + 1; +} +#define FREELISTBRUCKET 32 +typedef struct FreelistManage { + slock_t mutex; + int curNum; + lsn_list_t*head; +}FreelistManage; +static FreelistManage* FreeList; + +const int multiple = 1; +static Size freesize = 0; +#define FREELISTSIZE (freesize?freesize:(freesize = MaxNum((NBuffers + NUM_BUFFER_PARTITIONS) * multiple))) +static Size +LogindexFreeListShmemSize(void) { + Size size; + size = 0; + size = add_size(size, mul_size(FREELISTSIZE, sizeof(lsn_list_t))); + return size; +} + +static Size +FreeListManageShmemSize(void) { + Size size; + size = 0; + size = add_size(size,sizeof(FreelistManage)); + return size; +} + +static Size LogindexFreeListAllShmemSize(void) { + Size size; + size = 0; + for (int i = 0; i < FREELISTBRUCKET;i++) { + size = add_size(size, LogindexFreeListShmemSize()); + size = add_size(size, FreeListManageShmemSize()); + } + return size; +} + +static Size LogindexHashShmemSize(void) { + return hash_estimate_size(NBuffers + NUM_BUFFER_PARTITIONS,sizeof(page_head_list_t)); +} + +Size LogindexHashAllShmemSize(void) { + Size size; + return LogindexFreeListAllShmemSize() + LogindexHashShmemSize(); +} + + +static void +LogindexFreeListShmemInit(void) +{ + Size size = LogindexFreeListAllShmemSize(); + bool found; + FreeList = (FreelistManage*) + ShmemInitStruct("LogindexSpace", + size, + &found); + if (!found) + { + for (Size i = 0; i < FREELISTBRUCKET;i++) { + FreelistManage* FreePos = (FreelistManage*)(((char*)FreeList)+ i * 
(LogindexFreeListShmemSize()+FreeListManageShmemSize())); + lsn_list_t* begin = (lsn_list_t*)(((char*)FreePos) + FreeListManageShmemSize()); + FreePos->head = begin; + FreePos->curNum = 0; + SpinLockInit(&FreePos->mutex); + int j = 0; + for (;j < FREELISTSIZE-1; j++) { + begin[j].next = &begin[j+1]; + } + begin[j].next = NULL; + } + } + +} + +static FreelistManage* getFreeList(uint32 hashcode) { + uint32 idx = hashcode % FREELISTBRUCKET; + return (FreelistManage*)(((char*)FreeList) + ((Size)idx) * (LogindexFreeListShmemSize()+FreeListManageShmemSize())); +} + +static int popLsnListElem(uint32 hashcode,lsn_list_t**data) { + FreelistManage* curFreelist = getFreeList(hashcode); + SpinLockAcquire(&curFreelist->mutex); + if (curFreelist->curNum == FREELISTSIZE) { + SpinLockRelease(&curFreelist->mutex); + return 0; + } + curFreelist->curNum++; + *data = curFreelist->head; + curFreelist->head = curFreelist->head->next; + SpinLockRelease(&curFreelist->mutex); + return 1; +} + +static int pushLsnListElemArr(uint32 hashcode,lsn_list_t*head,lsn_list_t*tail,int num) { + FreelistManage* curFreelist = getFreeList(hashcode); + SpinLockAcquire(&curFreelist->mutex); + if (curFreelist->curNum == 0) { + SpinLockRelease(&curFreelist->mutex); + return 0; + } + curFreelist->curNum -= num; + tail->next = curFreelist->head; + curFreelist->head = head; + SpinLockRelease(&curFreelist->mutex); + return 1; +} + +page_head_list_t* +PageLogindexInsert(BufferTag *tagPtr, uint32 hashcode, XLogRecPtr lsn,XLogRecPtr endlsn) +{ + page_head_list_t *result; + bool found; + lsn_list_t *data = NULL; + int re; + re = popLsnListElem(hashcode,(void**)&data); + if (re == 0) { + return NULL; + } + data->lsn = lsn; + data->endlsn = endlsn; + data->next = NULL; + result = (page_head_list_t *) + hash_search_with_hash_value(PageLogindexHash, + (void *) tagPtr, + hashcode, + HASH_ENTER, + &found); + + if (found && result->tail->lsn >= lsn) { + return result; + } + + if (found) { + result->count++; + 
result->tail->next = data; + result->tail = data; + } else { + result->count = 1; + result->tail = data; + result->head = data; + } + return result; +} + +void +PageLogindexDelete(BufferTag *tagPtr, uint32 hashcode,XLogRecPtr lsn) +{ + page_head_list_t *result; + + result = (page_head_list_t *) + hash_search_with_hash_value(PageLogindexHash, + (void *) tagPtr, + hashcode, + HASH_FIND, + NULL); + if (!result) { + return; + } else { + lsn_list_t* tail,*next; + next = result->head; + int delNum = 0; + while(next != NULL && next->lsn < lsn) { + delNum++; + tail = next; + next = tail->next; + result->count--; + } + if (delNum != 0) { + pushLsnListElemArr(hashcode,result->head,tail,delNum); + } + if (next == NULL) { + result = (page_head_list_t *) + hash_search_with_hash_value(PageLogindexHash, + (void *) tagPtr, + hashcode, + HASH_REMOVE, + NULL); + + if (!result) /* shouldn't happen */ + elog(ERROR, "PageLogindexHash hash table corrupted"); + } else { + result->head = next; + } + } +} + +uint32 +PageLogindexHashCode(BufferTag *tagPtr) +{ + return get_hash_value(PageLogindexHash, (void *) tagPtr); +} + +void +InitLogindexHashBrucket(void) +{ + HASHCTL info; + long init_table_size, + max_table_size; + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(page_head_list_t); + init_table_size = NBuffers + NUM_BUFFER_PARTITIONS; + max_table_size = NBuffers + NUM_BUFFER_PARTITIONS; + info.num_partitions = NUM_BUFFER_PARTITIONS; + PageLogindexHash = ShmemInitHash("PageLogindexHash", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS| HASH_PARTITION); + LogindexFreeListShmemInit(); +} + +page_head_list_t * +PageLogindexLookup(BufferTag *tagPtr,uint32_t hashcode) +{ + page_head_list_t *result; + result = (page_head_list_t *) + hash_search_with_hash_value(PageLogindexHash, + (void *) tagPtr, + hashcode, + HASH_FIND, + NULL); + return result; +} + +void cleanOneList(BufferTag *tagPtr,XLogRecPtr cleanLsn) { + uint32 hashcode = 
PageLogindexHashCode(tagPtr); + LWLock *partition_lock = LOGIndexPartitionLock(hashcode); + LWLockAcquire(partition_lock, LW_EXCLUSIVE); + PageLogindexDelete(tagPtr,hashcode,cleanLsn); + LWLockRelease(partition_lock); +} + +static void threadCleanLogIndex(XLogRecPtr cleanLsn) +{ + HASH_SEQ_STATUS scan_status; + page_head_list_t *item; + hash_seq_init(&scan_status, PageLogindexHash); + while ((item = (page_head_list_t *) hash_seq_search(&scan_status)) != NULL) + { + uint32 hash = PageLogindexHashCode(&item->tag); + LWLock *partition_lock = LOGIndexPartitionLock(hash); + LWLockAcquire(partition_lock, LW_EXCLUSIVE); + PageLogindexDelete(&item->tag,hash,cleanLsn); + LWLockRelease(partition_lock); + } +} + +static void +LogIndexProcShutdownHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + ShutdownRequestPending = true; + + SetLatch(MyLatch); + + errno = save_errno; +} + +/* + * CleanLogIndexMain + */ +void +CleanLogIndexMain(int argc, char *argv[]) +{ + sigjmp_buf local_sigjmp_buf; + + MyBackendType = B_CLEAN_LOGINDEX; + MemoryContext CleanLogIndex_context; + init_ps_display(NULL); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, SIG_IGN); + + /* + * SIGINT is used to signal canceling the current table's vacuum; SIGTERM + * means abort and exit cleanly, and SIGQUIT means abandon ship. + */ + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, LogIndexProcShutdownHandler); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGCHLD, SIG_DFL); + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. 
Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. + */ + CleanLogIndex_context = AllocSetContextCreate(TopMemoryContext, + "CleanLogIndexFlush", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(CleanLogIndex_context); + + /* + * If an exception is encountered, processing resumes here. + * + * Unlike most auxiliary processes, we don't attempt to continue + * processing after an error; we just clean up and exit. The autovac + * launcher is responsible for spawning another worker later. + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, + * signals other than SIGQUIT will be blocked until we exit. It might + * seem that this policy makes the HOLD_INTERRUPTS() call redundant, but + * it is not since InterruptPending might be set already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevents interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * We can now go away. Note that because we called InitProcess, a + * callback was registered to do ProcKill, which will clean up + * necessary state. 
+ */ + proc_exit(0); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + PG_SETMASK(&UnBlockSig); + char strname[128]; + char *prefix = "clean logindex "; + int n = pg_snprintf(strname,sizeof(strname),prefix,strlen(prefix)); + /* + * Loop forever + */ + SetProcessingMode(NormalProcessing); + XLogRecPtr pushStandbyPoint = 0; + XLogRecPtr pushStandbyPrePoint = 0; + for (;;) + { + /* Clear any already-pending wakeups */ + ResetLatch(MyLatch); + + if (ShutdownRequestPending) + proc_exit(0); + int hasData = 0; + + pushStandbyPrePoint = pushStandbyPoint; + if (push_standby == true || EnableHotStandby == false || *isPromoteIsTriggered) { + pushStandbyPoint = GetXLogPushToDisk(); + if (pushStandbyPrePoint < pushStandbyPoint) { + hasData++; + } + } else { + if (LastPushPoint == 0) { + PrevPushPoint = *g_redoStartLsn; + } + if (PrevPushPoint != 0) { + XLogRecPtr lastReplPtr = GetXLogReplayRecPtr(NULL); +// elog(LOG, "deal page from %x to %x", PrevPushPoint, lastReplPtr); + TagNode *tagList = GetBufTagByLsnRange(PrevPushPoint,lastReplPtr-1); + if (tagList->next != NULL && tagList->tag.lsn >= PrevPushPoint) { + LastPushPoint = tagList->tag.lsn; + TagNode *next = tagList->next; + int pageNum = 0; + while(next!=NULL) { + // elog(LOG,"add tag rel %d, fork %d, blk %d", + // next->tag.tag.rnode.relNode, next->tag.tag.forkNum, next->tag.tag.blockNum); + addFileKey(&next->tag.tag); + next = next->next; + pageNum++; + } + FreeTagNode(tagList); + pushSlaveReplayQueue(pageNum); + hasData++; + PrevPushPoint = LastPushPoint+1; + SetXLogPushToDisk(PrevPushPoint); + pushStandbyPoint = GetConsistLsn(PrevPushPoint); + } else { + LastPushPoint = PrevPushPoint = lastReplPtr; + if (pushStandbyPrePoint < PrevPushPoint) { + SetXLogPushToDisk(PrevPushPoint); + pushStandbyPoint = GetConsistLsn(PrevPushPoint+1); + } + } + } + } + int pos; + if (pushStandbyPrePoint < pushStandbyPoint) { + pos = pg_snprintf(strname+n,sizeof(strname)-n,"lsn from %X/%X to %X/%X 
tasking",LSN_FORMAT_ARGS(pushStandbyPrePoint),LSN_FORMAT_ARGS(pushStandbyPoint)); + strname[n+pos] = '\0'; + set_ps_display(strname); + } + if (pushStandbyPrePoint < pushStandbyPoint) { + elog(LOG,"start threadCleanLogIndex lsn from %X/%X to %X/%X",LSN_FORMAT_ARGS(pushStandbyPrePoint),LSN_FORMAT_ARGS(pushStandbyPoint)); + CleanLogIndexByPage(pushStandbyPoint); + //threadCleanLogIndex(LastPushPoint); + elog(LOG,"end threadCleanLogIndex lsn from %X/%X to %X/%X",LSN_FORMAT_ARGS(pushStandbyPrePoint),LSN_FORMAT_ARGS(pushStandbyPoint)); + } + if (hasData != 0) { + continue; + } + pos = pg_snprintf(strname+n,sizeof(strname)-n,"to lsn: %X/%X idle",LSN_FORMAT_ARGS(pushStandbyPoint)); + strname[n+pos] = '\0'; + set_ps_display(strname); + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 50L /* convert to ms */ , + WAIT_EVENT_CLEAN_LOGINDEX_MAIN); + } +} + +void SignalStartCleanLogIndexWork(void) { + SendPostmasterSignal(PMSIGNAL_CLEAN_LOGINDEX_WORKER); +} + +typedef struct PageValue { + BufferTag tag; + uint16_t num; +} PageValue; + +static HTAB *PageCountHash = NULL; +static uint32_t curLatchPos = 0; + +typedef struct { + slock_t mutex; + volatile uint32 gpushpos; + volatile bool ready; + volatile uint32 gpos; + pg_atomic_uint32 latchPos; + pg_atomic_uint32 taskNum; + uint32 modifyNum; + Latch pageFlushWakeupLatch[PARALLEL_NUM]; + PageValue*gtag[G_QUEUE_LEN]; +}PageHashQueueShmemStruct; +static PageHashQueueShmemStruct *PageHashQueueShmem; + +void pushSlaveReplayQueue(int pageNum) { + + if (PageHashQueueShmem->gpos != 0 && PageHashQueueShmem->ready == false) { + SpinLockAcquire(&PageHashQueueShmem->mutex); + PageHashQueueShmem->ready = true; + SpinLockRelease(&PageHashQueueShmem->mutex); + WakeupFlushWork(); + } + + while(pageNum > CompletedTaskNum()) { + pg_usleep(1000L); + } + cleanMap(); +} + +Latch* GetCurrentLatch(uint32_t pos) { + return &PageHashQueueShmem->pageFlushWakeupLatch[pos]; +} + +void WakeupOneFlushWork(uint32_t pos) { + 
SetLatch(&PageHashQueueShmem->pageFlushWakeupLatch[pos]); +} + +static void WakeupFlushWork(void) +{ + for (int i = 0;ipageFlushWakeupLatch[i]); + } +} + +uint32_t AssignLatchPos(void) { + return pg_atomic_fetch_add_u32(&PageHashQueueShmem->latchPos,1); +} + +void ResetFlushLatch(uint32_t pos) { + ResetLatch(&PageHashQueueShmem->pageFlushWakeupLatch[pos]); +} + +void OwnFlushLatch(uint32_t pos) { + OwnLatch(&PageHashQueueShmem->pageFlushWakeupLatch[pos]); +} + +Size +PageHashQueueShmemSize(void) +{ + Size size; + + /* + * Currently, the size of the gtag[] array is arbitrarily set equal to + * NBuffers. This may prove too large or small ... + */ + size = offsetof(PageHashQueueShmemStruct, gtag); + size = add_size(size, mul_size(G_QUEUE_LEN, sizeof(PageValue*))); + + return size; +} + +void +PageHashQueueShmemInit(void) +{ + Size size = PageHashQueueShmemSize(); + bool found; + + PageHashQueueShmem = (PageHashQueueShmemStruct *) + ShmemInitStruct("PageHashQueue", + size, + &found); + + if (!found) + { + SpinLockInit(&PageHashQueueShmem->mutex); + SpinLockAcquire(&PageHashQueueShmem->mutex); + PageHashQueueShmem->ready = false; + PageHashQueueShmem->gpushpos = 0; + SpinLockRelease(&PageHashQueueShmem->mutex); + PageHashQueueShmem->gpos = 0; + pg_atomic_init_u32(&PageHashQueueShmem->taskNum,0); + pg_atomic_init_u32(&PageHashQueueShmem->latchPos, 0); + PageHashQueueShmem->modifyNum = 0; + for (int i = 0;ipageFlushWakeupLatch[i]); + } + } +} + +static Size RedoStartPointSize(void) { + return sizeof(XLogRecPtr); +} + +Size PageHashMapSize(void) { + return RedoStartPointSize() + hash_estimate_size(G_QUEUE_LEN,sizeof(PageValue)); +} + +void +InitBufferPoolHashMap(void) +{ + HASHCTL info; + long init_table_size, + max_table_size; + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(PageValue); + init_table_size = G_QUEUE_LEN; + max_table_size = G_QUEUE_LEN; + PageCountHash = ShmemInitHash("PageHashCount", + init_table_size, + max_table_size, + &info, + HASH_ELEM | 
HASH_BLOBS); + bool found; + g_redoStartLsn = (XLogRecPtr*)ShmemInitStruct("redoStartPoint", + RedoStartPointSize(), + &found); + if (!found) + { + memset(g_redoStartLsn,0,RedoStartPointSize()); + } +} + +uint32_t addFileKey(BufferTag*onePage) { + PageValue *result; + bool found; + uint32_t newHash = get_hash_value(PageCountHash,onePage); + result = (PageValue*) + hash_search_with_hash_value(PageCountHash, + (void *) onePage, + newHash, + HASH_ENTER, + &found); + if (found == false) { + result->num = 0; + uint32_t gpos = PageHashQueueShmem->gpos++; + PageHashQueueShmem->gtag[gpos] = result; + } + result->num++; + PageHashQueueShmem->modifyNum++; + return PageHashQueueShmem->modifyNum; +} + +void cleanMap(void) { + HASH_SEQ_STATUS scan_status; + PageValue *item; + + hash_seq_init(&scan_status, PageCountHash); + while ((item = (PageValue *) hash_seq_search(&scan_status)) != NULL) + { + + if (hash_search(PageCountHash, (const void *) &item->tag, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "hash table corrupted"); + } + SpinLockAcquire(&PageHashQueueShmem->mutex); + PageHashQueueShmem->ready = false; + PageHashQueueShmem->gpushpos = 0; + SpinLockRelease(&PageHashQueueShmem->mutex); + PageHashQueueShmem->gpos = 0; + pg_atomic_init_u32(&PageHashQueueShmem->taskNum,0); + PageHashQueueShmem->modifyNum = 0; +} + +uint32_t hashMapSize(void) { + return hash_get_num_entries(PageCountHash); +} + +static int cmp(const void* a,const void* b) { + return (*((const PageValue**)b))->num - (*((const PageValue**)a))->num; +} + +void SortPageQueue(void) { + if (PageHashQueueShmem->gpos != 0 && PageHashQueueShmem->ready == false) { + qsort(PageHashQueueShmem->gtag,PageHashQueueShmem->gpos,sizeof(PageValue*),cmp); + SpinLockAcquire(&PageHashQueueShmem->mutex); + PageHashQueueShmem->ready = true; + SpinLockRelease(&PageHashQueueShmem->mutex); + WakeupFlushWork(); + return; + } +} + +BufferTag* QueuePushPage(void) { + uint32_t gpushpos; + bool hasData = false; + if 
(PageHashQueueShmem->ready == true) { + SpinLockAcquire(&PageHashQueueShmem->mutex); + if (PageHashQueueShmem->ready == true && PageHashQueueShmem->gpushpos < PageHashQueueShmem->gpos) { + hasData = true; + gpushpos = PageHashQueueShmem->gpushpos++; + } + SpinLockRelease(&PageHashQueueShmem->mutex); + } + if (hasData == false) { + return NULL; + } else { + return &(PageHashQueueShmem->gtag[gpushpos]->tag); + } +} + +void ProcFlushBufferToDisk(BufferTag*tag) { + Buffer buffer = XLogReadBufferExtended(tag->rnode, tag->forkNum, tag->blockNum, + RBM_NORMAL); + if (!BufferIsValid(buffer)) + { + elog(PANIC,"ProcFlushBufferToDisk is invalid rel %d,flk %d,blk %d",tag->rnode.relNode,tag->forkNum,tag->blockNum); + pg_atomic_fetch_add_u32(&PageHashQueueShmem->taskNum,1); + return; + } + + // elog(LOG, "replay rel %d, fork %d, blkno %d, pagelsn %X/%X", tag->rnode.relNode, + // tag->forkNum,tag->blockNum, LSN_FORMAT_ARGS(PageGetLSN(BufferGetPage(buffer)))); + //slave no need to flush disk + if (push_standby == true) { + BufferDesc *buf; + buf = GetBufferDescriptor(buffer-1); + uint32 buf_state = pg_atomic_read_u32(&buf->state); + if (buf_state & BM_DIRTY) { + LWLockAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED); + FlushOneBuffer(buffer); + LWLockRelease(BufferDescriptorGetContentLock(buf)); + ScheduleBufferTagForWriteback(&BackendWritebackContext, + &buf->tag); + } + } + ReleaseBuffer(buffer); + pg_atomic_fetch_add_u32(&PageHashQueueShmem->taskNum,1); +} + +uint32_t CompletedTaskNum(void) { + return pg_atomic_read_u32(&PageHashQueueShmem->taskNum); +} + + +static void +ParallelFlushProcShutdownHandler(SIGNAL_ARGS) +{ + int save_errno = errno; + + ShutdownRequestPending = true; + + WakeupOneFlushWork(curLatchPos); + + errno = save_errno; +} + +/* + * PageFlushWorkerMain + */ +NON_EXEC_STATIC void +PageFlushWorkerMain(int argc, char *argv[]) +{ + sigjmp_buf local_sigjmp_buf; + InRecovery = true; + MyBackendType = B_PARALLEL_FLUSH; + MemoryContext parallelflush_context; 
+ init_ps_display(NULL); + + SetProcessingMode(InitProcessing); + + /* + * Set up signal handlers. We operate on databases much like a regular + * backend, so we use the same signal handling. See equivalent code in + * tcop/postgres.c. + */ + pqsignal(SIGHUP, SIG_IGN); + + /* + * SIGINT is used to signal canceling the current table's vacuum; SIGTERM + * means abort and exit cleanly, and SIGQUIT means abandon ship. + */ + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, ParallelFlushProcShutdownHandler); + /* SIGQUIT handler was already set up by InitPostmasterChild */ + + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + pqsignal(SIGCHLD, SIG_DFL); + /* + * Create a memory context that we will do all our work in. We do this so + * that we can reset the context during error recovery and thereby avoid + * possible memory leaks. Formerly this code just ran in + * TopMemoryContext, but resetting that would be a really bad idea. + */ + parallelflush_context = AllocSetContextCreate(TopMemoryContext, + "ParallelFlush", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(parallelflush_context); + + /* Early initialization */ + BaseInit(); + + /* + * Create a per-backend PGPROC struct in shared memory, except in the + * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do + * this before we can use LWLocks (and in the EXEC_BACKEND case we already + * had to do some stuff with LWLocks). + */ +#ifndef EXEC_BACKEND + InitProcess(); +#endif + + /* + * If an exception is encountered, processing resumes here. + * + * Unlike most auxiliary processes, we don't attempt to continue + * processing after an error; we just clean up and exit. The autovac + * launcher is responsible for spawning another worker later. + * + * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask + * (to wit, BlockSig) will be restored when longjmp'ing to here. 
Thus, + * signals other than SIGQUIT will be blocked until we exit. It might + * seem that this policy makes the HOLD_INTERRUPTS() call redundant, but + * it is not since InterruptPending might be set already. + */ + if (sigsetjmp(local_sigjmp_buf, 1) != 0) + { + /* since not using PG_TRY, must reset error stack by hand */ + error_context_stack = NULL; + + /* Prevents interrupts while cleaning up */ + HOLD_INTERRUPTS(); + + /* Report the error to the server log */ + EmitErrorReport(); + + /* + * We can now go away. Note that because we called InitProcess, a + * callback was registered to do ProcKill, which will clean up + * necessary state. + */ + proc_exit(0); + } + + /* We can now handle ereport(ERROR) */ + PG_exception_stack = &local_sigjmp_buf; + + PG_SETMASK(&UnBlockSig); + + curLatchPos = AssignLatchPos(); + + OwnFlushLatch(curLatchPos); + char strname[64]; + char *prefix = "parallel flush workid: "; + int n = pg_snprintf(strname,sizeof(strname),prefix,strlen(prefix)); + /* + * Loop forever + */ + for (;;) + { + /* Clear any already-pending wakeups */ + ResetFlushLatch(curLatchPos); + + if (ShutdownRequestPending) + proc_exit(0); + BufferTag *tag = NULL; + SetProcessingMode(NormalProcessing); + int pos = pg_snprintf(strname+n,sizeof(strname)-n,"%d tasking",curLatchPos); + strname[n+pos] = '\0'; + set_ps_display(strname); + while((tag=QueuePushPage())!=NULL) { + ProcFlushBufferToDisk(tag); + } + pos = pg_snprintf(strname+n,sizeof(strname)-n,"%d idle",curLatchPos); + strname[n+pos] = '\0'; + set_ps_display(strname); + (void) WaitLatch(GetCurrentLatch(curLatchPos), + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 1000L /* convert to ms */ , + WAIT_EVENT_PAGEFLUSH_MAIN); + } +} + +void SignalStartFlushWork(void) { + SendPostmasterSignal(PMSIGNAL_PARALLEL_FLUSH_WORKER); + //sleep wait flush work startup + usleep(200000); +} + +/* + * Main entry point for autovacuum worker process. + * + * This code is heavily based on pgarch.c, q.v. 
+ */ +int +StartPageFlushWorker(void) +{ + pid_t worker_pid; + +#ifdef EXEC_BACKEND + switch ((worker_pid = avworker_forkexec())) +#else + switch ((worker_pid = fork_process())) +#endif + { + case -1: + ereport(LOG, + (errmsg("could not fork autovacuum worker process: %m"))); + return 0; + +#ifndef EXEC_BACKEND + case 0: + /* in postmaster child ... */ + InitPostmasterChild(); + + /* Close the postmaster's sockets */ + ClosePostmasterPorts(false); + + CreateAuxProcessResourceOwner(); + + //MyPMChildSlot = AssignPostmasterChildSlot(); + + IsParallelFlushWorker = true; + + PageFlushWorkerMain(0, NULL); + break; +#endif + default: + return (int) worker_pid; + } + + /* shouldn't get here */ + return 0; +} + + diff --git a/src/backend/access/transam/pg_mirror.c b/src/backend/access/transam/pg_mirror.c new file mode 100644 index 0000000..e173384 --- /dev/null +++ b/src/backend/access/transam/pg_mirror.c @@ -0,0 +1,742 @@ +#include "access/pg_mirror.h" +#include "postgres.h" +#include "access/xlogrecord.h" +#include "access/heapam_xlog.h" +#include "access/nbtxlog.h" +#include "access/gistxlog.h" +#include "access/spgxlog.h" +#include "access/brin_xlog.h" +#include "assert.h" +#include "common/controldata_utils.h" +#include "miscadmin.h" +#define INSERT_FREESPACE_MIRROR(endptr) \ + (((endptr) % XLOG_BLCKSZ == 0) ? 
0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ)) + +static ControlFileData *ControlFile = NULL; +//default 16MB +static int WalSegSz = 16777216; + +//muti block to one record +typedef struct XLogHe3ToPg { + uint64 CurrBytePos; + uint64 PrevBytePos; +}XLogHe3ToPg; +static XLogHe3ToPg g_walHe3ToPg; + +static void ReConvertMainData(XLogRecord* sRecord, char*sMainData, uint32_t*sLen, char* dMainData, uint32_t* dLen) { + RmgrId rmid = sRecord->xl_rmid; + uint8 info = (sRecord->xl_info & ~XLR_INFO_MASK); + bool hasChange = false; + switch(rmid) { + case RM_HEAP2_ID: + { + if ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP2_VISIBLE) { + xl_heap_visible *xlrec = (xl_heap_visible *)sMainData; + xl_old_heap_visible xlrecOld; + xlrecOld.cutoff_xid = xlrec->cutoff_xid; + xlrecOld.flags = xlrec->flags; + *dLen = sizeof(xl_old_heap_visible); + memcpy(dMainData,&xlrecOld,*dLen); + hasChange = true; + } + break; + } + case RM_HEAP_ID: + { + if (((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_UPDATE) || + ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_HOT_UPDATE)) { + xl_heap_update *xlrec = (xl_heap_update *)sMainData; + xl_old_heap_update xlrecOld; + xlrecOld.old_xmax = xlrec->old_xmax; + xlrecOld.old_offnum = xlrec->old_offnum; + xlrecOld.old_infobits_set = xlrec->old_infobits_set; + xlrecOld.flags = xlrec->flags; + xlrecOld.new_xmax = xlrec->new_xmax; + xlrecOld.new_offnum = xlrec->new_offnum; + *dLen = sizeof(xl_old_heap_update); + memcpy(dMainData,&xlrecOld,*dLen); + hasChange = true; + } + break; + } + case RM_BTREE_ID: + { + if (info == XLOG_BTREE_SPLIT_L || info == XLOG_BTREE_SPLIT_R) { + xl_btree_split *xlrec = (xl_btree_split *)sMainData; + xl_old_btree_split xlrecOld; + xlrecOld.level = xlrec->level; + xlrecOld.firstrightoff = xlrec->firstrightoff; + xlrecOld.newitemoff = xlrec->newitemoff; + xlrecOld.postingoff = xlrec->postingoff; + *dLen = sizeof(xl_old_btree_split); + memcpy(dMainData,&xlrecOld,*dLen); + hasChange = true; + } + break; + } + case RM_GIST_ID: + { + if (info == 
XLOG_GIST_PAGE_SPLIT) { + gistxlogPageSplit *xlrec = (gistxlogPageSplit *)sMainData; + gistoldxlogPageSplit xlrecOld; + xlrecOld.origrlink = xlrec->origrlink; + xlrecOld.orignsn = xlrec->orignsn; + xlrecOld.origleaf = xlrec->origleaf; + xlrecOld.npage = xlrec->npage; + xlrecOld.markfollowright = xlrec->markfollowright; + *dLen = sizeof(gistoldxlogPageSplit); + memcpy(dMainData,&xlrecOld,*dLen); + hasChange = true; + } + break; + } + case RM_SPGIST_ID: + { + if (info == XLOG_SPGIST_ADD_LEAF) { + spgxlogAddLeaf *xlrec = (spgxlogAddLeaf *)sMainData; + spgoldxlogAddLeaf xlrecOld; + xlrecOld.newPage = xlrec->newPage; + xlrecOld.storesNulls = xlrec->storesNulls; + xlrecOld.offnumLeaf = xlrec->offnumLeaf; + xlrecOld.offnumHeadLeaf = xlrec->offnumHeadLeaf; + xlrecOld.offnumParent = xlrec->offnumParent; + xlrecOld.nodeI = xlrec->nodeI; + *dLen = sizeof(spgoldxlogAddLeaf); + memcpy(dMainData,&xlrecOld,*dLen); + hasChange = true; + } else if (info == XLOG_SPGIST_MOVE_LEAFS) { + spgxlogMoveLeafs *xlrec = (spgxlogMoveLeafs *)sMainData; + spgoldxlogMoveLeafs xlrecOld; + xlrecOld.nMoves = xlrec->nMoves; + xlrecOld.newPage = xlrec->newPage; + xlrecOld.replaceDead = xlrec->replaceDead; + xlrecOld.storesNulls = xlrec->storesNulls; + xlrecOld.offnumParent = xlrec->offnumParent; + xlrecOld.nodeI = xlrec->nodeI; + xlrecOld.stateSrc = xlrec->stateSrc; + *dLen = SizeOfOldSpgxlogMoveLeafs; + memcpy(dMainData,&xlrecOld,*dLen); + memcpy(dMainData+*dLen,xlrec->offsets,*sLen-SizeOfSpgxlogMoveLeafs); + *dLen += *sLen-SizeOfSpgxlogMoveLeafs; + hasChange = true; + } else if (info == XLOG_SPGIST_ADD_NODE) { + spgxlogAddNode *xlrec = (spgxlogAddNode *)sMainData; + spgoldxlogAddNode xlrecOld; + xlrecOld.offnum = xlrec->offnum; + xlrecOld.offnumNew = xlrec->offnumNew; + xlrecOld.newPage = xlrec->newPage; + xlrecOld.parentBlk = xlrec->parentBlk; + xlrecOld.offnumParent = xlrec->offnumParent; + xlrecOld.nodeI = xlrec->nodeI; + xlrecOld.stateSrc = xlrec->stateSrc; + *dLen = sizeof(spgoldxlogAddNode); + 
memcpy(dMainData,&xlrecOld,*dLen); + hasChange = true; + } else if (info == XLOG_SPGIST_PICKSPLIT) { + spgxlogPickSplit *xlrec = (spgxlogPickSplit *)sMainData; + spgoldxlogPickSplit xlrecOld; + xlrecOld.isRootSplit = xlrec->isRootSplit; + xlrecOld.nDelete = xlrec->nDelete; + xlrecOld.nInsert = xlrec->nInsert; + xlrecOld.initSrc = xlrec->initSrc; + xlrecOld.initDest = xlrec->initDest; + xlrecOld.offnumInner = xlrec->offnumInner; + xlrecOld.initInner = xlrec->initInner; + xlrecOld.storesNulls = xlrec->storesNulls; + xlrecOld.innerIsParent = xlrec->innerIsParent; + xlrecOld.offnumParent = xlrec->offnumParent; + xlrecOld.nodeI = xlrec->nodeI; + xlrecOld.stateSrc = xlrec->stateSrc; + *dLen = SizeOfOldSpgxlogPickSplit; + memcpy(dMainData,&xlrecOld,*dLen); + memcpy(dMainData+*dLen,xlrec->offsets,*sLen-SizeOfSpgxlogPickSplit); + *dLen += *sLen-SizeOfSpgxlogPickSplit; + hasChange = true; + } + break; + } + case RM_BRIN_ID: + { + if (info == XLOG_BRIN_INSERT) { + xl_brin_insert *xlrec = (xl_brin_insert *)sMainData; + xl_old_brin_insert xlrecOld; + xlrecOld.heapBlk = xlrec->heapBlk; + /* extra information needed to update the revmap */ + xlrecOld.pagesPerRange = xlrec->pagesPerRange; + xlrecOld.offnum = xlrec->offnum; + *dLen = sizeof(xl_old_brin_insert); + memcpy(dMainData,&xlrecOld,*dLen); + hasChange = true; + } else if ( info == XLOG_BRIN_UPDATE) { + xl_brin_update *xlrec = (xl_brin_update *) sMainData; + xl_old_brin_update xlrecUpdate; + xl_brin_insert *xlrecInsert = &xlrec->insert; + xl_old_brin_insert xlrecOld; + xlrecOld.heapBlk = xlrecInsert->heapBlk; + /* extra information needed to update the revmap */ + xlrecOld.pagesPerRange = xlrecInsert->pagesPerRange; + xlrecOld.offnum = xlrecInsert->offnum; + /* offset number of old tuple on old page */ + xlrecUpdate.oldOffnum = xlrec->oldOffnum; + xlrecUpdate.insert = xlrecOld; + *dLen = sizeof(xl_old_brin_update); + memcpy(dMainData,&xlrecUpdate,*dLen); + hasChange = true; + } + break; + } + default: + { + break; + } + } + 
if (hasChange == false) { + *dLen = *sLen; + memcpy(dMainData,sMainData,*dLen); + } +} + +static int XlogHe3ToPg(XLogRecord*newRecord[],int n, OldXLogRecord*oldRecord) { + oldRecord->xl_xid = newRecord[0]->xl_xid; + oldRecord->xl_info = newRecord[0]->xl_info; + oldRecord->xl_rmid = newRecord[0]->xl_rmid; + char d_main_data[8192]; + int dPos = 0; + char* dst = (char*)oldRecord; + dPos += sizeof(OldXLogRecord); + uint32_t d_main_data_len = 0; + uint32 main_data_len = 0; + uint8_t blkNum = 0; + bool hasblk = false; + char*img_ptr[XLR_MAX_BLOCK_ID + 1] = {0}; + char*data_ptr[XLR_MAX_BLOCK_ID + 1] = {0}; + uint16_t bimg_len[XLR_MAX_BLOCK_ID + 1] = {0}; + uint16_t data_len[XLR_MAX_BLOCK_ID + 1] = {0}; + for(int i = 0;ixl_tot_len - sizeof(XLogRecord); + uint32 datatotal = 0; + sPos += sizeof(XLogRecord); + while(remaining > datatotal) { + uint8_t block_id = *(src + sPos); + if (block_id == XLR_BLOCK_ID_DATA_SHORT) { + sPos += sizeof(block_id); + remaining -= sizeof(block_id); + if (i == n-1) { + memcpy(dst + dPos,&block_id,sizeof(block_id)); + dPos += sizeof(block_id); + } + main_data_len = *((uint8_t*)(src + sPos)); + //main_data_len type XLR_BLOCK_ID_DATA_SHORT + uint8 d_len; + if (i == n-1) { + ReConvertMainData(newRecord[i],src + sPos + sizeof(d_len)+bimg_len[blkNum]+data_len[blkNum],&main_data_len,d_main_data,&d_main_data_len); + d_len = d_main_data_len; + memcpy(dst + dPos,&d_len,sizeof(d_len)); + dPos += sizeof(d_len); + } + sPos += sizeof(d_len); + remaining -= sizeof(d_len); + datatotal += main_data_len; + break; + } else if (block_id == XLR_BLOCK_ID_DATA_LONG) { + sPos += sizeof(block_id); + remaining -= sizeof(block_id); + if (i == n-1) { + memcpy((dst + dPos),&block_id,sizeof(block_id)); + dPos += sizeof(block_id); + } + memcpy(&main_data_len,src + sPos,sizeof(uint32)); + if (i == n-1) { + ReConvertMainData(newRecord[i],src + sPos + sizeof(main_data_len)+bimg_len[blkNum]+data_len[blkNum],&main_data_len,d_main_data,&d_main_data_len); + if (d_main_data_len > 
255) { + memcpy(dst + dPos,&d_main_data_len,sizeof(d_main_data_len)); + dPos += sizeof(d_main_data_len); + } else { + *(dst + dPos - 1) = XLR_BLOCK_ID_DATA_SHORT; + uint8_t d_len = d_main_data_len; + memcpy(dst + dPos,&d_len,sizeof(d_len)); + dPos += sizeof(d_len); + } + } + sPos += sizeof(main_data_len); + remaining -= sizeof(main_data_len); + datatotal += main_data_len; + break; + } else if (block_id == XLR_BLOCK_ID_ORIGIN) { + sPos += sizeof(block_id); + remaining -= sizeof(block_id); + if (i == n-1) { + memcpy(dst + dPos,&block_id,sizeof(block_id)); + dPos += sizeof(block_id); + memcpy(dst + dPos,src+sPos,sizeof(RepOriginId)); + dPos += sizeof(RepOriginId); + } + sPos += sizeof(RepOriginId); + remaining -= sizeof(RepOriginId); + } else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) { + sPos += sizeof(block_id); + remaining -= sizeof(block_id); + if (i == n - 1) { + memcpy(dst + dPos,&block_id,sizeof(block_id)); + dPos += sizeof(block_id); + memcpy(dst + dPos,src+sPos,sizeof(TransactionId)); + dPos += sizeof(TransactionId); + } + sPos += sizeof(TransactionId); + remaining -= sizeof(TransactionId); + } else if (block_id <= XLR_MAX_BLOCK_ID) { + memcpy(dst + dPos, src + sPos, SizeOfXLogRecordBlockHeader); + uint8_t fork_flags = *(src + sPos + sizeof(block_id)); + *(dst + dPos) = blkNum; + hasblk = true; + data_len[blkNum] = *((uint16_t*)(src + sPos + sizeof(block_id) + sizeof(fork_flags))); + datatotal += data_len[blkNum]; + sPos += SizeOfXLogRecordBlockHeader; + dPos += SizeOfXLogRecordBlockHeader; + remaining -= SizeOfXLogRecordBlockHeader; + if ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0) { + bimg_len[blkNum] = *((uint16_t*)(src + sPos)); + datatotal += bimg_len[blkNum]; + uint16_t hole_offset = *((uint16_t*)(src + sPos + sizeof(bimg_len))); + uint8_t bimg_info = *((uint16_t*)(src + sPos + sizeof(bimg_len) + sizeof(hole_offset))); + memcpy(dst + dPos, src + sPos, SizeOfXLogRecordBlockImageHeader); + sPos += SizeOfXLogRecordBlockImageHeader; + dPos += 
SizeOfXLogRecordBlockImageHeader; + remaining -= SizeOfXLogRecordBlockImageHeader; + if ((bimg_info & BKPIMAGE_IS_COMPRESSED) != 0) { + if ((bimg_info & BKPIMAGE_HAS_HOLE) != 0) { + memcpy(dst + dPos, src + sPos, SizeOfXLogRecordBlockCompressHeader); + sPos += SizeOfXLogRecordBlockCompressHeader; + dPos += SizeOfXLogRecordBlockCompressHeader; + remaining -= SizeOfXLogRecordBlockCompressHeader; + } + } + } + if (!(fork_flags & BKPBLOCK_SAME_REL)) { + memcpy(dst + dPos, src + sPos, sizeof(RelFileNode)); + sPos += sizeof(RelFileNode); + dPos += sizeof(RelFileNode); + remaining -= sizeof(RelFileNode); + } + memcpy(dst + dPos, src + sPos, sizeof(BlockNumber)); + sPos += sizeof(BlockNumber); + dPos += sizeof(BlockNumber); + remaining -= sizeof(BlockNumber); + } else { + printf("invalid block_id %u",block_id); + } + } + assert(remaining == datatotal); + if (bimg_len[blkNum] != 0 ) { + img_ptr[blkNum] = src + sPos; + sPos += bimg_len[blkNum]; + } + if (data_len[blkNum] != 0) { + data_ptr[blkNum] = src + sPos; + sPos += data_len[blkNum]; + } + if (hasblk == true) { + blkNum++; + } + + sPos += main_data_len; + assert(sPos == newRecord[i]->xl_tot_len); + } + int idx = 0; + while(idx < blkNum) { + if (bimg_len[idx] != 0) { + memcpy(dst + dPos, img_ptr[idx], bimg_len[idx]); + dPos += bimg_len[idx]; + } + if (data_len[idx] != 0){ + memcpy(dst + dPos, data_ptr[idx], data_len[idx]); + dPos += data_len[idx]; + } + idx++; + } + memcpy(dst + dPos, d_main_data, d_main_data_len); + dPos += d_main_data_len; + oldRecord->xl_tot_len = dPos; + return dPos; +} + +static int OldUsableBytesInSegment = + (DEFAULT_XLOG_SEG_SIZE / XLOG_BLCKSZ * (XLOG_BLCKSZ - SizeOfXLogShortPHD)) - + (SizeOfXLogLongPHD - SizeOfXLogShortPHD); + + +static XLogRecPtr +OldXLogBytePosToRecPtr(uint64 bytepos) +{ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / OldUsableBytesInSegment; + bytesleft = bytepos % OldUsableBytesInSegment; + + if 
(bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / (XLOG_BLCKSZ - SizeOfXLogShortPHD); + bytesleft = bytesleft % (XLOG_BLCKSZ - SizeOfXLogShortPHD); + + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, DEFAULT_XLOG_SEG_SIZE, result); + + return result; +} + +static XLogRecPtr +OldXLogBytePosToEndRecPtr(uint64 bytepos) +{ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / OldUsableBytesInSegment; + bytesleft = bytepos % OldUsableBytesInSegment; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + if (bytesleft == 0) + seg_offset = 0; + else + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / (XLOG_BLCKSZ - SizeOfXLogShortPHD); + bytesleft = bytesleft % (XLOG_BLCKSZ - SizeOfXLogShortPHD); + + if (bytesleft == 0) + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft; + else + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, DEFAULT_XLOG_SEG_SIZE, result); + + return result; +} + +static uint64 +OldXLogRecPtrToBytePos(XLogRecPtr ptr) +{ + uint64 fullsegs; + uint32 fullpages; + uint32 offset; + uint64 result; + + XLByteToSeg(ptr, fullsegs, DEFAULT_XLOG_SEG_SIZE); + + fullpages = (XLogSegmentOffset(ptr, DEFAULT_XLOG_SEG_SIZE)) / XLOG_BLCKSZ; + offset = ptr % XLOG_BLCKSZ; + + if (fullpages == 0) + { + result = fullsegs * OldUsableBytesInSegment; + if (offset > 0) + { + 
Assert(offset >= SizeOfXLogLongPHD); + result += offset - SizeOfXLogLongPHD; + } + } + else + { + result = fullsegs * OldUsableBytesInSegment + + (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */ + (fullpages - 1) * (XLOG_BLCKSZ - SizeOfXLogShortPHD); /* full pages */ + if (offset > 0) + { + Assert(offset >= SizeOfXLogShortPHD); + result += offset - SizeOfXLogShortPHD; + } + } + + return result; +} + +static bool +ReserveXLogWalSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) +{ + XLogHe3ToPg *Insert = &g_walHe3ToPg; + uint64 startbytepos; + uint64 endbytepos; + uint64 prevbytepos; + uint32 size = MAXALIGN(SizeOfOldXLogRecord); + XLogRecPtr ptr; + uint32 segleft; + + startbytepos = Insert->CurrBytePos; + + ptr = OldXLogBytePosToEndRecPtr(startbytepos); + if (XLogSegmentOffset(ptr, DEFAULT_XLOG_SEG_SIZE) == 0) + { + *EndPos = *StartPos = ptr; + return false; + } + + endbytepos = startbytepos + size; + prevbytepos = Insert->PrevBytePos; + + *StartPos = OldXLogBytePosToRecPtr(startbytepos); + *EndPos = OldXLogBytePosToEndRecPtr(endbytepos); + + segleft = DEFAULT_XLOG_SEG_SIZE - XLogSegmentOffset(*EndPos, DEFAULT_XLOG_SEG_SIZE); + if (segleft != DEFAULT_XLOG_SEG_SIZE) + { + /* consume the rest of the segment */ + *EndPos += segleft; + endbytepos = OldXLogRecPtrToBytePos(*EndPos); + } + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = startbytepos; + + *PrevPtr = OldXLogBytePosToRecPtr(prevbytepos); + + Assert(XLogSegmentOffset(*EndPos, DEFAULT_XLOG_SEG_SIZE) == 0); + Assert(OldXLogRecPtrToBytePos(*EndPos) == endbytepos); + Assert(OldXLogRecPtrToBytePos(*StartPos) == startbytepos); + Assert(OldXLogRecPtrToBytePos(*PrevPtr) == prevbytepos); + + return true; +} + +static void +ReserveXLogWalInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, + XLogRecPtr *PrevPtr) +{ + XLogHe3ToPg *Insert = &g_walHe3ToPg; + uint64 startbytepos; + uint64 endbytepos; + uint64 prevbytepos; + + size = MAXALIGN(size); + + /* All (non 
xlog-switch) records should contain data. */ + Assert(size > SizeOfOldXLogRecord); + + /* + * The duration the spinlock needs to be held is minimized by minimizing + * the calculations that have to be done while holding the lock. The + * current tip of reserved WAL is kept in CurrBytePos, as a byte position + * that only counts "usable" bytes in WAL, that is, it excludes all WAL + * page headers. The mapping between "usable" byte positions and physical + * positions (XLogRecPtrs) can be done outside the locked region, and + * because the usable byte position doesn't include any headers, reserving + * X bytes from WAL is almost as simple as "CurrBytePos += X". + */ + + startbytepos = Insert->CurrBytePos; + endbytepos = startbytepos + size; + prevbytepos = Insert->PrevBytePos; + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = startbytepos; + + *StartPos = OldXLogBytePosToRecPtr(startbytepos); + *EndPos = OldXLogBytePosToEndRecPtr(endbytepos); + *PrevPtr = OldXLogBytePosToRecPtr(prevbytepos); + + /* + * Check that the conversions between "usable byte positions" and + * XLogRecPtrs work consistently in both directions. 
+ */ + Assert(OldXLogRecPtrToBytePos(*StartPos) == startbytepos); + Assert(OldXLogRecPtrToBytePos(*EndPos) == endbytepos); + Assert(OldXLogRecPtrToBytePos(*PrevPtr) == prevbytepos); +} + +static void CopyXLogRecordToPgWAL(int write_len,OldXLogRecord* rechdr,XLogRecPtr StartPos, XLogRecPtr EndPos, +char*dBuf,int* dLen) { + + char *currpos; + int freespace; + int written; + XLogRecPtr CurrPos; + XLogPageHeader pagehdr; + CurrPos = StartPos; + XLogPageHeader page; + XLogLongPageHeader longpage; + currpos = dBuf; + if (CurrPos % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(CurrPos, DEFAULT_XLOG_SEG_SIZE) > XLOG_BLCKSZ) { + page = (XLogPageHeader)currpos; + page->xlp_magic = XLOG_PAGE_MAGIC; + page->xlp_info = 0; + page->xlp_tli = ControlFile->checkPointCopy.ThisTimeLineID; + page->xlp_pageaddr = CurrPos - (CurrPos % XLOG_BLCKSZ); + currpos += SizeOfXLogShortPHD; + } + else if (CurrPos % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(CurrPos, DEFAULT_XLOG_SEG_SIZE) < XLOG_BLCKSZ) { + page = (XLogPageHeader)currpos; + page->xlp_magic = XLOG_PAGE_MAGIC; + page->xlp_info = XLP_LONG_HEADER; + page->xlp_tli = ControlFile->checkPointCopy.ThisTimeLineID; + page->xlp_pageaddr = CurrPos - (CurrPos % XLOG_BLCKSZ); + longpage = (XLogLongPageHeader) page; + longpage->xlp_sysid = ControlFile->system_identifier; + longpage->xlp_seg_size = WalSegSz; + longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; + currpos += SizeOfXLogLongPHD; + } + + freespace = INSERT_FREESPACE_MIRROR(CurrPos); + Assert(freespace >= sizeof(uint32)); + /* Copy record data */ + written = 0; + if (rechdr != NULL) { + char *rdata_data = rechdr; + int rdata_len = rechdr->xl_tot_len; + while (rdata_len > freespace) + { + Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0); + memcpy(currpos, rdata_data, freespace); + rdata_data += freespace; + rdata_len -= freespace; + written += freespace; + CurrPos += freespace; + currpos += freespace; + + pagehdr = (XLogPageHeader) currpos; + 
pagehdr->xlp_info = 0; + pagehdr->xlp_tli = ControlFile->checkPointCopy.ThisTimeLineID; + pagehdr->xlp_magic = XLOG_PAGE_MAGIC; + pagehdr->xlp_pageaddr = CurrPos - (CurrPos % XLOG_BLCKSZ); + pagehdr->xlp_rem_len = write_len - written; + pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD; + if (XLogSegmentOffset(CurrPos, DEFAULT_XLOG_SEG_SIZE) == 0) { + CurrPos += SizeOfXLogLongPHD; + currpos += SizeOfXLogLongPHD; + pagehdr->xlp_info |= XLP_LONG_HEADER; + longpage = (XLogLongPageHeader) pagehdr; + longpage->xlp_sysid = ControlFile->system_identifier; + longpage->xlp_seg_size = WalSegSz; + longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; + } else { + CurrPos += SizeOfXLogShortPHD; + currpos += SizeOfXLogShortPHD; + } + freespace = INSERT_FREESPACE_MIRROR(CurrPos); + } + Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0); + memcpy(currpos, rdata_data, rdata_len); + currpos += rdata_len; + CurrPos += rdata_len; + freespace -= rdata_len; + written += rdata_len; + } + Assert(written == write_len); + int extra_space = MAXALIGN64(CurrPos) - CurrPos; + CurrPos = MAXALIGN64(CurrPos); + if (CurrPos != EndPos) + printf("ERROR space reserved for WAL record does not match what was written"); + currpos += extra_space; + *dLen = (int)(currpos - dBuf); +} + +void readControlFile(char*pathstr) { + bool crc_ok; + + ControlFile = get_controlfile(pathstr,&crc_ok); + if (!crc_ok) + printf(_("WARNING: Calculated CRC checksum does not match value stored in file.\n" + "Either the file is corrupt, or it has a different layout than this program\n" + "is expecting. 
The results below are untrustworthy.\n\n")); + + /* set wal segment size */ + WalSegSz = ControlFile->xlog_seg_size; +} + +void setControlFile(ControlFileData *cfile) { + ControlFile = cfile; +} + + +int ArrayXlogHe3ToPg(char*sBuf,int sLen, char*dBuf,int* dLen,uint64 *startLsn,uint64 *endLsn) { + XLogRecord*one = (XLogRecord*)sBuf; + //32kB + static char tBuf[32768]; + int tLen = 0; + int MtrLen = 0; + int iLen = 0; + int oLen = 0; + *dLen = 0; + for(;iLenmtr == false) { + newRecord[n++] = one; + iLen += one->xl_tot_len; + one = (((char*)one) + one->xl_tot_len); + if (iLen > sLen) { + break; + } + } + newRecord[n++] = one; + iLen += one->xl_tot_len; + one = (((char*)one) + one->xl_tot_len); + if (iLen > sLen) { + break; + } + XlogHe3ToPg(newRecord,n,tBuf+tLen); + uint64 StartPos,EndPos; + XLogRecPtr reduceV = 0; + if (g_walHe3ToPg.PrevBytePos == 0) { + uint64 xl_prev = newRecord[0]->xl_end - newRecord[0]->xl_tot_len; + g_walHe3ToPg.PrevBytePos = g_walHe3ToPg.CurrBytePos = xl_prev; + bool Insert = ReserveXLogWalSwitch(&StartPos,&EndPos,&xl_prev); + g_walHe3ToPg.PrevBytePos = g_walHe3ToPg.CurrBytePos; + reduceV = 1; + } + OldXLogRecord* rechdr = (OldXLogRecord*)(tBuf + tLen); + ReserveXLogWalInsertLocation(rechdr->xl_tot_len,&StartPos,&EndPos,&rechdr->xl_prev); + //for pg check + if (rechdr->xl_rmid == RM_XLOG_ID && + (rechdr->xl_info == XLOG_CHECKPOINT_SHUTDOWN || rechdr->xl_info == XLOG_CHECKPOINT_ONLINE)) { + CheckPoint*cp = (CheckPoint*)(((char*)rechdr)+SizeOfOldXLogRecord + SizeOfXLogRecordDataHeaderShort); + cp->redo = StartPos; + rechdr->xl_prev = rechdr->xl_prev-reduceV; + } + pg_crc32c rdata_crc; + INIT_CRC32C(rdata_crc); + COMP_CRC32C(rdata_crc, ((char*)rechdr) + SizeOfOldXLogRecord, rechdr->xl_tot_len - SizeOfOldXLogRecord); + COMP_CRC32C(rdata_crc, rechdr, offsetof(OldXLogRecord, xl_crc)); + FIN_CRC32C(rdata_crc); + rechdr->xl_crc = rdata_crc; + CopyXLogRecordToPgWAL(rechdr->xl_tot_len,rechdr,StartPos,EndPos,dBuf+*dLen,&oLen); + if (*startLsn == 0) { + 
*startLsn = StartPos; + } + *endLsn = EndPos; + *dLen += oLen; + tLen += rechdr->xl_tot_len; + MtrLen = iLen; + } + return MtrLen; +} + + + diff --git a/src/backend/access/transam/pthreadpool.c b/src/backend/access/transam/pthreadpool.c new file mode 100644 index 0000000..b80f020 --- /dev/null +++ b/src/backend/access/transam/pthreadpool.c @@ -0,0 +1,86 @@ +#include +#include "postgres.h" +#include "access/pthreadpool.h" +#include "access/xlog.h" +#include "access/xlogrecord.h" +#include +#include "utils/guc.h" +#include "utils/hfs.h" +GThreadPool *gpool = NULL; +static __thread GError *gerr = NULL; +static bool IsInitPool = false; +static void getWalFunc(gpointer data, gpointer user_data) { + bool walStoreToLocal = false; + if (EnableHotStandby && *isPromoteIsTriggered == false && !push_standby) + walStoreToLocal = true; + wal_batch_t* elem = (wal_batch_t*)data; + if (elem != NULL) { + //elem->status = STARTSTATUS; + int r; + //clock_t start = clock(); + r = batchRead((uint8_t *) elem->data, ThisTimeLineID2>ThisTimeLineID?ThisTimeLineID2:ThisTimeLineID, elem->startLsn, elem->endLsn, walStoreToLocal); + //clock_t end = clock(); + //printf("====LSN %X/%X==pid %d==len %d===time %u\n",LSN_FORMAT_ARGS(elem->startLsn),pthread_self(),r,end-start); + elem->dataLen = r; + if (r > sizeof(XLogRecord)) { + XLogRecord* record = ((XLogRecord*)elem->data); + elem->startLsn = record->xl_end - record->xl_tot_len; + } else { + elem->startLsn = 0; + } + pg_atomic_exchange_u32(&elem->status,(uint32_t)COMPLETEDSTATUS); + } +} + +static void produceWalFunc(gpointer data, gpointer user_data) { + wal_batch_t* elem = (wal_batch_t*)data; + elem->dataLen = elem->endLsn - elem->startLsn; + if (elem->dataLen != 0) { + pushXlogToTikv(elem->data,elem->dataLen); + } + pg_atomic_exchange_u32(&elem->status,(uint32_t)COMPLETEDSTATUS); +} + +int initPthreadPool(void) { + if (IsInitPool == true) { + return 0; + } + if (!g_thread_supported()) + { + elog(FATAL,"Not support g_thread!"); + return -1; + } 
+ //default 8 thread read + if(he3mirror){ + gpool = g_thread_pool_new(produceWalFunc,NULL,4,FALSE,NULL); + } else{ + gpool = g_thread_pool_new(getWalFunc,NULL,8,FALSE,NULL); + } + elog(LOG,"thread pool max threads is %d,num thread is %d", + g_thread_pool_get_max_threads(gpool),g_thread_pool_get_num_threads(gpool)); + return 0; +} + +int WalTaskPool(wal_batch_t*data) { + g_thread_pool_push(gpool,(gpointer)data,&gerr); + if (gerr != NULL) { + elog(FATAL,"WalTaskPool err %s",gerr->message); + return -1; + } + return 0; +} + +void WalTaskFree(void) { + return g_thread_pool_free(gpool,FALSE,TRUE); +} + +void WalTaskImmediateFree(void) { + g_thread_pool_free(gpool,TRUE,TRUE); + gpool = NULL; +} + +bool IsFreePthreadPool(void) { + return gpool == NULL; +} + + diff --git a/src/backend/access/transam/pushpage.c b/src/backend/access/transam/pushpage.c index fdedda0..1e96aa9 100644 --- a/src/backend/access/transam/pushpage.c +++ b/src/backend/access/transam/pushpage.c @@ -14,6 +14,13 @@ clock_t start_time; XLogRecPtr PushPtr = 0; XLogRecPtr ApplyLsn = 0; XLogRecPtr PrePushPtr = 0; +XLogRecPtr CheckPointPtr = InvalidXLogRecPtr; +XLogRecPtr FileCheckPointPtr = InvalidXLogRecPtr; + +//this for cut logindex +XLogRecPtr PrevPushPoint = InvalidXLogRecPtr; +XLogRecPtr LastPushPoint = InvalidXLogRecPtr; + CheckPoint GlobalCheckPoint; uint8 GlobalState; @@ -22,83 +29,31 @@ uint8 GlobalState; static PGconn *pushconn = NULL; static PGconn *connToPushStandby = NULL; pid_t startupPid = 0; -static redisContext *redisconn = NULL; -static bool ConnectRedis() { - redisconn = redisConnect("127.0.0.1", 6379); - if (redisconn == NULL) { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("connect to redis failed"))); - return false; - } - if (redisconn->err) { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("connect to redis failed: %s",redisconn->errstr))); - redisconn = NULL; - return false; - } - char*redis_password = "VlJi7uBV"; - redisReply *reply = 
(redisReply *)redisCommand(redisconn, "AUTH %s", redis_password); - if (reply->type == REDIS_REPLY_ERROR) { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("connect to redis passwd failed: %s",redisconn->errstr))); - redisconn = NULL; - return false; - } - return true; -} - -bool pushRedisList(const char*str) { - if (redisconn == NULL) { - if (ConnectRedis() == false) { - return false; - } - } - redisReply* r = (redisReply*)redisCommand(redisconn, str); - if (NULL == r) { - redisFree(redisconn); - redisconn = NULL; - return false; - } - if (!(r->type == REDIS_REPLY_STATUS && strcasecmp(r->str,"OK") == 0) && (r->type!=REDIS_REPLY_INTEGER)) { - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("connect to redis failed1: %s",r->str))); - freeReplyObject(r); - redisFree(redisconn); - redisconn = NULL; - return false; - } - freeReplyObject(r); - return true; -} -static bool ConnectPushStandbyDB() { - char *err; - const char *keys[] = {"dbname","user","password","host","port",NULL}; - const char *vals[] = {"postgres","repl","123456","127.0.0.1","15431",NULL}; - connToPushStandby = PQconnectdbParams(keys, vals, false); - if (PQstatus(connToPushStandby) == CONNECTION_BAD) - { - err = pchomp(PQerrorMessage(connToPushStandby)); - ereport(ERROR, - (errcode(ERRCODE_CONNECTION_FAILURE), - errmsg("push standby could not connect to the push standby server: %s", err))); - return false; - } - return true; +// static bool ConnectPushStandbyDB() { +// char *err; +// const char *keys[] = {"dbname","user","password","host","port",NULL}; +// const char *vals[] = {"postgres","repl","123456","100.73.36.123","15431",NULL}; +// connToPushStandby = PQconnectdbParams(keys, vals, false); +// if (PQstatus(connToPushStandby) == CONNECTION_BAD) +// { +// err = pchomp(PQerrorMessage(connToPushStandby)); +// ereport(ERROR, +// (errcode(ERRCODE_CONNECTION_FAILURE), +// errmsg("push standby could not connect to the push standby server: %s", err))); +// return false; 
+// } +// return true; -} +// } +static bool ConnectPrimaryDB(void); - -static bool ConnectPrimaryDB() { +static bool ConnectPrimaryDB(void) { char *err; char conninfo[maxconnlen]; - const char *keys[] = {"dbname","user","password","host","port",NULL}; - const char *vals[] = {"postgres","repl","123456","127.0.0.1","15432",NULL}; + // const char *keys[] = {"dbname","user","password","host","port",NULL}; + // const char *vals[] = {"postgres","repl","123456","100.73.36.123","15432",NULL}; strlcpy(conninfo, (char *) PrimaryConnInfo, maxconnlen); /* Establish the connection to the primary for query Min Lsn*/ /* @@ -106,11 +61,12 @@ static bool ConnectPrimaryDB() { * URI), and pass some extra options. */ /* Note we do not want libpq to re-expand the dbname parameter */ - pushconn = PQconnectdbParams(keys, vals, true); + pushconn = PQconnectdb(conninfo); + // pushconn = PQconnectdbParams(keys, vals, true); if (PQstatus(pushconn) == CONNECTION_BAD) { err = pchomp(PQerrorMessage(pushconn)); - ereport(ERROR, + ereport(WARNING, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("push standby could not connect to the primary server: %s", err))); return false; @@ -118,41 +74,77 @@ static bool ConnectPrimaryDB() { return true; } -XLogRecPtr QueryPushLsn() -{ - StringInfoData cmd; - XLogRecPtr replylsn = InvalidXLogRecPtr; - char *replyptr; - initStringInfo(&cmd); - appendStringInfoString(&cmd,"select pg_last_wal_replay_lsn()"); - replylsn = InvalidXLogRecPtr; - if (connToPushStandby == NULL) { - if (ConnectPushStandbyDB() == false) { - return InvalidXLogRecPtr; - } - } - PGresult *pgres = NULL; - pgres = PQexec(connToPushStandby, cmd.data); - if (PQresultStatus(pgres) == PGRES_TUPLES_OK && PQntuples(pgres) == 1) { - replyptr = PQgetvalue(pgres, 0, 0); - bool flag; - replylsn = pg_lsn_in_internal(replyptr,&flag); - +bool ReConnectPrimaryDB(void) { + if (push_standby == true && pushconn!=NULL) { + PQfinish(pushconn); + pushconn = NULL; + if (ConnectPrimaryDB() == true) { + return true; 
+ } } - PQfinish(connToPushStandby); - connToPushStandby = NULL; - PQclear(pgres); - return replylsn; - + return false; } -XLogRecPtr QueryPushChkpointLsn() + +// static bool ConnectPrimaryDB4ReplyLSN() { +// char *err; +// char conninfo[maxconnlen]; +// const char *keys[] = {"dbname","user","password","host","port",NULL}; +// const char *vals[] = {"postgres","postgres","","100.73.36.123","15432",NULL}; +// strlcpy(conninfo, (char *) PrimaryConnInfo, maxconnlen); +// /* Establish the connection to the primary for query Min Lsn*/ +// /* +// * We use the expand_dbname parameter to process the connection string (or +// * URI), and pass some extra options. +// */ +// /* Note we do not want libpq to re-expand the dbname parameter */ +// pushconn = PQconnectdbParams(keys, vals, true); +// if (PQstatus(pushconn) == CONNECTION_BAD) +// { +// err = pchomp(PQerrorMessage(pushconn)); +// ereport(WARNING, +// (errcode(ERRCODE_CONNECTION_FAILURE), +// errmsg("push standby could not connect to the primary server: %s", err))); +// return false; +// } +// return true; +// } + + +// XLogRecPtr QueryPushLsn() +// { +// StringInfoData cmd; +// XLogRecPtr replylsn = InvalidXLogRecPtr; +// char *replyptr; +// initStringInfo(&cmd); +// appendStringInfoString(&cmd,"select pg_last_wal_replay_lsn()"); +// replylsn = InvalidXLogRecPtr; +// if (connToPushStandby == NULL) { +// if (ConnectPushStandbyDB() == false) { +// return InvalidXLogRecPtr; +// } +// } +// PGresult *pgres = NULL; +// pgres = PQexec(connToPushStandby, cmd.data); +// if (PQresultStatus(pgres) == PGRES_TUPLES_OK && PQntuples(pgres) == 1) { +// replyptr = PQgetvalue(pgres, 0, 0); +// bool flag; +// replylsn = pg_lsn_in_internal(replyptr,&flag); + +// } +// PQfinish(connToPushStandby); +// connToPushStandby = NULL; +// PQclear(pgres); +// return replylsn; + +// } + +XLogRecPtr QueryPushChkpointLsn(void) { ControlFileData *ControlFile; int fd; - char ControlFilePath[MAXPGPATH]; - pg_crc32c crc; int r; + XLogRecPtr checkPoint; 
ControlFile = palloc(sizeof(ControlFileData)); @@ -179,8 +171,10 @@ XLogRecPtr QueryPushChkpointLsn() XLOG_CONTROL_FILE, r, sizeof(ControlFileData)))); } close(fd); - - return ControlFile->checkPoint; + checkPoint = ControlFile->checkPoint; + pfree(ControlFile); + + return checkPoint; } XLogRecPtr QueryMinLsn(XLogRecPtr lsn) @@ -188,9 +182,6 @@ XLogRecPtr QueryMinLsn(XLogRecPtr lsn) StringInfoData cmd; XLogRecPtr replylsn; PGresult *pgres = NULL; - char *appname; - char *state; - char *syncstate; char *replyptr; replylsn = InvalidXLogRecPtr; if (pushconn == NULL) { @@ -200,90 +191,89 @@ XLogRecPtr QueryMinLsn(XLogRecPtr lsn) } initStringInfo(&cmd); - appendStringInfoString(&cmd, "SELECT t.application_name, t.replay_lsn, t.state, t.sync_state FROM pg_catalog.pg_stat_replication t WHERE t.application_name <> \'"); - appendStringInfoString(&cmd, "pushstandby"); - appendStringInfoString(&cmd, "\' order by t.replay_lsn limit 1"); + appendStringInfoString(&cmd, "SELECT t.application_name, t.replay_lsn, t.state, t.sync_state FROM pg_catalog.pg_stat_replication t WHERE t.application_name not like \'"); + appendStringInfoString(&cmd, "push%"); + appendStringInfoString(&cmd, "\' and t.application_name not like \'priv%\' order by t.replay_lsn limit 1"); pgres = PQexec(pushconn, cmd.data); - if (PQresultStatus(pgres) == PGRES_TUPLES_OK && PQntuples(pgres) == 1) { - appname = PQgetvalue(pgres, 0, 0); - replyptr = PQgetvalue(pgres, 0, 1); - bool flag; - replylsn = pg_lsn_in_internal(replyptr,&flag); - //replylsn = atol(replyptr); - state = PQgetvalue(pgres, 0, 2); - syncstate = PQgetvalue(pgres, 0, 3); + if (PQresultStatus(pgres) == PGRES_TUPLES_OK) { + if (PQntuples(pgres) == 1) { + replyptr = PQgetvalue(pgres, 0, 1); + bool flag; + replylsn = pg_lsn_in_internal(replyptr,&flag); + if (replylsn == InvalidXLogRecPtr) { + elog(ERROR,"query pg_stat_replication replylsn failed"); + PQclear(pgres); + return 1; + } + } + //no slave,pushstandby no need wait + } + else if 
(PQresultStatus(pgres) == PGRES_FATAL_ERROR) + { + //master crash,pushstandby need replay to master crash point for private + PQclear(pgres); + return InvalidXLogRecPtr; } - else if (PQresultStatus(pgres) == PGRES_BAD_RESPONSE || - PQresultStatus(pgres) == PGRES_NONFATAL_ERROR || - PQresultStatus(pgres) == PGRES_FATAL_ERROR) + else { PQfinish(pushconn); pushconn = NULL; PQclear(pgres); - return InvalidXLogRecPtr; - } + return 1; + } //elog(LOG,"appnamelsn: %x: replylsn %x",lsn,replylsn); - if (lsn !=InvalidXLogRecPtr && lsn < replylsn||replylsn == InvalidXLogRecPtr) { + if ((lsn !=InvalidXLogRecPtr && lsn < replylsn)||(replylsn == InvalidXLogRecPtr)) { replylsn = lsn; } PQclear(pgres); return replylsn; } -Queue DirtyPq = { - NULL, - NULL -}; - -void QueuePush(QDataType x) -{ - Queue* pq = &DirtyPq; - QNode* newnode = (QNode*)malloc(sizeof(QNode)); - newnode->next = NULL; - memcpy(&newnode->data,&x,sizeof(x)); - if (pq->tail == NULL) - { - pq->head = pq->tail = newnode; - } - else - { - pq->tail->next = newnode; - pq->tail = newnode; - } -} - -//出队列 -QDataType QueuePop() -{ - Queue* pq = &DirtyPq; - QDataType data; - memcpy(&data,&pq->head->data,sizeof(QDataType)); - if (pq->head->next == NULL) - { - free(pq->head); - pq->head = pq->tail = NULL; - } - else - { - QNode* next = pq->head->next; - free(pq->head); - pq->head = next; - } - return data; -} - -bool QueueEmpty() -{ - Queue* pq = &DirtyPq; - return pq->head == NULL; -} - -XLogRecPtr QueueHeadEndLsn() -{ - Queue* pq = &DirtyPq; - return pq->head->data.endlsn; -} - +// XLogRecPtr QueryReplyLsn(XLogRecPtr lsn) +// { +// StringInfoData cmd; +// XLogRecPtr replylsn; +// PGresult *pgres = NULL; +// char *appname; +// char *state; +// char *syncstate; +// char *replyptr; +// replylsn = InvalidXLogRecPtr; +// if (pushconn == NULL) { +// if (ConnectPrimaryDB4ReplyLSN() == false) { +// return InvalidXLogRecPtr; +// } +// } +// initStringInfo(&cmd); +// appendStringInfoString(&cmd, "SELECT t.application_name, 
t.replay_lsn, t.state, t.sync_state FROM pg_catalog.pg_stat_replication t WHERE t.application_name <> \'"); +// appendStringInfoString(&cmd, "pushstandby"); +// appendStringInfoString(&cmd, "\' order by t.replay_lsn limit 1"); +// pgres = PQexec(pushconn, cmd.data); +// if (PQresultStatus(pgres) == PGRES_TUPLES_OK && PQntuples(pgres) == 1) { +// appname = PQgetvalue(pgres, 0, 0); +// replyptr = PQgetvalue(pgres, 0, 1); +// bool flag; +// replylsn = pg_lsn_in_internal(replyptr,&flag); +// //replylsn = atol(replyptr); +// state = PQgetvalue(pgres, 0, 2); +// syncstate = PQgetvalue(pgres, 0, 3); +// } +// else if (PQresultStatus(pgres) == PGRES_BAD_RESPONSE || +// PQresultStatus(pgres) == PGRES_NONFATAL_ERROR || +// PQresultStatus(pgres) == PGRES_FATAL_ERROR) +// { +// PQfinish(pushconn); +// pushconn = NULL; +// PQclear(pgres); +// return InvalidXLogRecPtr; +// } +// //elog(LOG,"appnamelsn: %x: replylsn %x",lsn,replylsn); +// if (lsn !=InvalidXLogRecPtr && lsn < replylsn||replylsn == InvalidXLogRecPtr) { +// replylsn = lsn; +// } +// PQclear(pgres); +// return replylsn; +// } diff --git a/src/backend/access/transam/ringbuffer.c b/src/backend/access/transam/ringbuffer.c new file mode 100644 index 0000000..1c32c9e --- /dev/null +++ b/src/backend/access/transam/ringbuffer.c @@ -0,0 +1,193 @@ +#include "access/ringbuffer.h" +#include +#include "access/xlogrecord.h" +/** + * @file + * Implementation of ring buffer functions. + */ + +void ring_buffer_init(ring_buffer_t *buffer, wal_batch_t *buf, size_t buf_size) { + RING_BUFFER_ASSERT(RING_BUFFER_IS_POWER_OF_TWO(buf_size) == 1); + SpinLockInit(&buffer->mutex); + buffer->buffer = buf; + buffer->buffer_mask = buf_size - 1; + buffer->tail_index = 0; + buffer->head_index = 0; +} + +wal_batch_t *ring_buffer_queue(ring_buffer_t *buffer, wal_batch_t data) { + wal_batch_t* curWal = NULL; + SpinLockAcquire(&buffer->mutex); + /* Is buffer full? 
*/ + if(ring_buffer_is_full(buffer)) { + SpinLockRelease(&buffer->mutex); + return NULL; + } + buffer->buffer[buffer->head_index].startLsn = data.startLsn; + buffer->buffer[buffer->head_index].endLsn = data.endLsn; + buffer->buffer[buffer->head_index].checkPointLsn = data.checkPointLsn; + pg_atomic_exchange_u32(&buffer->buffer[buffer->head_index].status,(uint32_t)UNKOWNSTATUS); + curWal = &buffer->buffer[buffer->head_index]; + buffer->head_index = ((buffer->head_index + 1) & RING_BUFFER_MASK(buffer)); + SpinLockRelease(&buffer->mutex); + return curWal; +} + +uint8_t ring_buffer_dequeue(ring_buffer_t *buffer, wal_batch_t *data) { + SpinLockAcquire(&buffer->mutex); + if(ring_buffer_is_empty(buffer)) { + /* No items */ + SpinLockRelease(&buffer->mutex); + return 0; + } + if (data != NULL) { + *data = buffer->buffer[buffer->tail_index]; + } + buffer->tail_index = ((buffer->tail_index + 1) & RING_BUFFER_MASK(buffer)); + SpinLockRelease(&buffer->mutex); + return 1; +} + +uint8_t ring_buffer_dequeue_arr(ring_buffer_t *buffer, uint32 size) { + SpinLockAcquire(&buffer->mutex); + if(ring_buffer_is_empty(buffer)) { + /* No items */ + SpinLockRelease(&buffer->mutex); + return 0; + } + ring_buffer_size_t pos = buffer->tail_index; + for(uint32 i = 0;ibuffer[pos].status,(uint32_t)UNKOWNSTATUS); + pos = ((pos+1) & RING_BUFFER_MASK(buffer)); + } + buffer->tail_index = ((buffer->tail_index + size) & RING_BUFFER_MASK(buffer)); + SpinLockRelease(&buffer->mutex); + return 1; +} + + +uint8_t ring_buffer_peek(ring_buffer_t *buffer, wal_batch_t **data, ring_buffer_size_t index) { + SpinLockAcquire(&buffer->mutex); + if(index >= ring_buffer_num_items(buffer)) { + /* No items at index */ + SpinLockRelease(&buffer->mutex); + return 0; + } + /* Add index to pointer */ + ring_buffer_size_t data_index = ((buffer->tail_index + index) & RING_BUFFER_MASK(buffer)); + *data = &buffer->buffer[data_index]; + SpinLockRelease(&buffer->mutex); + return 1; +} + +uint8_t ring_buffer_will_full(ring_buffer_t 
*buffer) { + ring_buffer_size_t num = ((buffer->head_index - buffer->tail_index) & RING_BUFFER_MASK(buffer)); + return num > 0.9 * RING_BUFFER_MASK(buffer); +} + +ring_buffer_t* gRingBufferManger; +Size WalReadBufferShmemSize(void) { + Size size; + size = 0; + size = add_size(size,sizeof(ring_buffer_t)); + //spaceNum of numbers wal batchs to manage wal buffer + size = add_size(size, spaceNum * sizeof(wal_batch_t)); + //256MB cache for wal parallel read + size = add_size(size, spaceNum * 4 * XLOG_BLCKSZ); + return size; +} + +// one elem max receive no more than 32k,64MB = spaceNum * 4 * XLOG_BLCKSZ +const int spaceNum = 8192; +void InitRingBufferSpace(void) { + //default 256MB for cache + char* gFreeSpace = NULL; + bool found; + gFreeSpace = (char *) + ShmemInitStruct("walreadbuffer", + sizeof(ring_buffer_t) + spaceNum * sizeof(wal_batch_t) + spaceNum * 4 * XLOG_BLCKSZ, + &found); + + if (gFreeSpace == NULL) { + elog(FATAL,"gFreeSpace malloc failed"); + } + gRingBufferManger = (ring_buffer_t*)gFreeSpace; + gFreeSpace += sizeof(ring_buffer_t); + wal_batch_t* gManageFreeList; + gManageFreeList = (wal_batch_t*)gFreeSpace ; + gFreeSpace += spaceNum * sizeof(wal_batch_t); + int i = 0; + for(;imaxIdx; + if (maxIdx == 0) { + return -1; + } + ring_buffer_size_t tailIdx = gRingBufferManger->tail_index; + int low = tailIdx,high = ((tailIdx+maxIdx) & RING_BUFFER_MASK(gRingBufferManger)), mid = 0; + if (low > high) { + if (gRingBufferManger->buffer[gRingBufferManger->buffer_mask].startLsn + gRingBufferManger->buffer[gRingBufferManger->buffer_mask].dataLen > lsn) { + high = gRingBufferManger->buffer_mask+1; + } else { + low = 0; + } + } + if (gRingBufferManger->buffer[high-1].startLsn == 0) { + high -= 2; + } else { + high -=1; + } + bool find = false; + while(low <= high) { + mid = (low + high) / 2; + if (gRingBufferManger->buffer[mid].startLsn > lsn) { + high = mid - 1; + } else if (gRingBufferManger->buffer[mid].startLsn < lsn) { + low = mid + 1; + } else { + find = true; + 
break; + } + } + XLogRecord* record = NULL; + int xllen = -1; + bool extandFlag = false; + if (find == true) { + record = (XLogRecord*)gRingBufferManger->buffer[mid].data; + xllen = record->xl_tot_len; + } else { + record = (XLogRecord*)gRingBufferManger->buffer[high].data; + if (gRingBufferManger->buffer[high].startLsn + gRingBufferManger->buffer[high].dataLen <= lsn) { + return -1; + } else { + record = (XLogRecord*)(gRingBufferManger->buffer[high].data + (lsn-gRingBufferManger->buffer[high].startLsn)); + xllen = record->xl_tot_len; + } + } + if (xllen != -1) { + while (*curpos + xllen > *maxspace) { + *maxspace += *maxspace; + extandFlag = true; + } + if (extandFlag == true) { + char* ptr = malloc(*maxspace); + memcpy(ptr,*buffer,*curpos); + free(*buffer); + *buffer = ptr; + } + memcpy(*buffer+*curpos,record,xllen); + *curpos += xllen; + } + return xllen; +} + + +extern inline uint8_t ring_buffer_is_empty(ring_buffer_t *buffer); +extern inline uint8_t ring_buffer_is_full(ring_buffer_t *buffer); +extern inline ring_buffer_size_t ring_buffer_num_items(ring_buffer_t *buffer); + diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 7cc76c1..5e5b8a4 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1387,9 +1387,8 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) TimeLineID save_currtli = ThisTimeLineID; xlogreader = XLogReaderAllocate(wal_segment_size, NULL, - XL_ROUTINE(.page_read = &read_local_xlog_page, - .segment_open = &wal_segment_open, - .segment_close = &wal_segment_close), + XL_ROUTINE(.batch_read = &read_local_xlog_batch, + ), NULL); if (!xlogreader) ereport(ERROR, @@ -1398,8 +1397,8 @@ XlogReadTwoPhaseData(XLogRecPtr lsn, char **buf, int *len) errdetail("Failed while allocating a WAL reading processor."))); XLogBeginRead(xlogreader, lsn); - record = XLogReadRecord(xlogreader, &errormsg); - + // record = XLogReadRecord(xlogreader, &errormsg); + record = 
He3DBXLogReadRecord(xlogreader, &errormsg); /* * Restore immediately the timeline where it was previously, as * read_local_xlog_page() could have changed it if the record was read diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index ca6f6d5..3fd0fa6 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -43,6 +43,7 @@ #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" +#include "postmaster/bgwriter.h" #include "replication/logical.h" #include "replication/logicallauncher.h" #include "replication/origin.h" @@ -1318,6 +1319,14 @@ RecordTransactionCommit(void) /* Tell bufmgr and smgr to prepare for commit */ BufmgrCommit(); + /* + * He3DB: do checkpoint ahead when existing pendingDelete relations. Avoid pushstandby shutdown before checkpoint and + * after redo commit, which cause redo failed when primary restart. + */ + if (nrels > 0 && IsBootstrapProcessingMode() != true && InitdbSingle != true) + RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT + | CHECKPOINT_FLUSH_ALL); + /* * Mark ourselves as within our "commit critical section". This * forces any concurrent checkpoint to wait until we've updated @@ -1682,6 +1691,15 @@ RecordTransactionAbort(bool isSubXact) nrels = smgrGetPendingDeletes(false, &rels); nchildren = xactGetCommittedChildren(&children); + /* + * He3DB: do checkpoint ahead when existing pendingDelete relations. Avoid pushstandby shutdown before checkpoint and + * after redo abort, which cause redo failed when primary restart. + */ + if (nrels > 0 && IsBootstrapProcessingMode() != true && InitdbSingle != true) + RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT + | CHECKPOINT_FLUSH_ALL); + + /* XXX do we really need a critical section here? 
*/ START_CRIT_SECTION(); @@ -5808,7 +5826,7 @@ static void xact_redo_commit(xl_xact_parsed_commit *parsed, TransactionId xid, XLogRecPtr lsn, - RepOriginId origin_id) + RepOriginId origin_id, XLogRecPtr startlsn) { TransactionId max_xid; TimestampTz commit_time; @@ -5913,6 +5931,20 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, */ XLogFlush(lsn); + if (push_standby || !EnableHotStandby) + { + pushTikv(0, hashMapSize(), true); + } else { + XLogRecPtr consistPtr; + consistPtr = GetXLogPushToDisk(); + while (consistPtr < startlsn) + { + pg_usleep(100000L); + elog(LOG, "standby consist lsn %ld, commit lsn %ld", consistPtr, startlsn); + consistPtr = GetXLogPushToDisk(); + } + } + /* Make sure files supposed to be dropped are dropped */ DropRelationFiles(parsed->xnodes, parsed->nrels, true); } @@ -5952,7 +5984,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, */ static void xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, - XLogRecPtr lsn, RepOriginId origin_id) + XLogRecPtr lsn, RepOriginId origin_id, XLogRecPtr startlsn) { TransactionId max_xid; @@ -6017,6 +6049,20 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, */ XLogFlush(lsn); + if (push_standby || !EnableHotStandby) + { + pushTikv(0, hashMapSize(), true); + } else { + XLogRecPtr consistPtr; + consistPtr = GetXLogPushToDisk(); + while (consistPtr < startlsn) + { + pg_usleep(100000L); + elog(LOG, "standby consist lsn %ld, abort lsn %ld", consistPtr, startlsn); + consistPtr = GetXLogPushToDisk(); + } + } + DropRelationFiles(parsed->xnodes, parsed->nrels, true); } } @@ -6036,7 +6082,7 @@ xact_redo(XLogReaderState *record) ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); xact_redo_commit(&parsed, XLogRecGetXid(record), - record->EndRecPtr, XLogRecGetOrigin(record)); + record->EndRecPtr, XLogRecGetOrigin(record), record->ReadRecPtr); } else if (info == XLOG_XACT_COMMIT_PREPARED) { @@ -6045,7 +6091,7 @@ xact_redo(XLogReaderState *record) 
ParseCommitRecord(XLogRecGetInfo(record), xlrec, &parsed); xact_redo_commit(&parsed, parsed.twophase_xid, - record->EndRecPtr, XLogRecGetOrigin(record)); + record->EndRecPtr, XLogRecGetOrigin(record), record->ReadRecPtr); /* Delete TwoPhaseState gxact entry and/or 2PC file. */ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); @@ -6059,7 +6105,7 @@ xact_redo(XLogReaderState *record) ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed); xact_redo_abort(&parsed, XLogRecGetXid(record), - record->EndRecPtr, XLogRecGetOrigin(record)); + record->EndRecPtr, XLogRecGetOrigin(record), record->ReadRecPtr); } else if (info == XLOG_XACT_ABORT_PREPARED) { @@ -6068,7 +6114,7 @@ xact_redo(XLogReaderState *record) ParseAbortRecord(XLogRecGetInfo(record), xlrec, &parsed); xact_redo_abort(&parsed, parsed.twophase_xid, - record->EndRecPtr, XLogRecGetOrigin(record)); + record->EndRecPtr, XLogRecGetOrigin(record), record->ReadRecPtr); /* Delete TwoPhaseState gxact entry and/or 2PC file. */ LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a18e261..6637bde 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -20,8 +20,12 @@ #include #include #include +#include #include - +#include +#include +#include +#include #include "access/clog.h" #include "access/commit_ts.h" #include "access/heaptoast.h" @@ -32,6 +36,7 @@ #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogarchive.h" #include "access/xloginsert.h" @@ -40,7 +45,6 @@ #include "catalog/catversion.h" #include "catalog/pg_control.h" #include "catalog/pg_database.h" -#include "catalog/pg_hot_data.h" #include "commands/progress.h" #include "commands/tablespace.h" #include "common/controldata_utils.h" @@ -53,6 +57,7 @@ #include "postmaster/bgwriter.h" #include "postmaster/startup.h" #include 
"postmaster/walwriter.h" +#include "postmaster/secondbuffer.h" #include "replication/basebackup.h" #include "replication/logical.h" #include "replication/origin.h" @@ -83,59 +88,83 @@ #include "utils/timestamp.h" #include "port/pg_crc32c.h" #include "storage/buf_internals.h" -#ifndef PG_NOREPLAY +#ifndef PG_NOREPLAY #include "access/pushpage.h" +#include "access/pagehashqueue.h" #endif +#include "catalog/pg_tablespace_d.h" +#include "access/ringbuffer.h" +#include "access/pthreadpool.h" +#include "storage/he3db_logindex.h" +/* precache table */ +#include "libpq-fe.h" +#include "lib/stringinfo.h" +#include "utils/timestamp.h" +#include "access/xlog.h" +#include "postmaster/postmaster.h" +#include +#include "catalog/storage_xlog.h" + extern uint32 bootstrap_data_checksum_version; + /* Unsupported old recovery command file names (relative to $PGDATA) */ -#define RECOVERY_COMMAND_FILE "recovery.conf" -#define RECOVERY_COMMAND_DONE "recovery.done" +#define RECOVERY_COMMAND_FILE "recovery.conf" +#define RECOVERY_COMMAND_DONE "recovery.done" /* User-settable parameters */ -int max_wal_size_mb = 1024; /* 1 GB */ -int min_wal_size_mb = 80; /* 80 MB */ -int wal_keep_size_mb = 0; -int XLOGbuffers = -1; -int XLogArchiveTimeout = 0; -int XLogArchiveMode = ARCHIVE_MODE_OFF; -char *XLogArchiveCommand = NULL; -bool EnableHotStandby = false; -bool fullPageWrites = true; -bool wal_log_hints = false; -bool wal_compression = false; -char *wal_consistency_checking_string = NULL; -bool *wal_consistency_checking = NULL; -bool wal_init_zero = true; -bool wal_recycle = true; -bool log_checkpoints = false; -int sync_method = DEFAULT_SYNC_METHOD; -int wal_level = WAL_LEVEL_MINIMAL; -int CommitDelay = 0; /* precommit delay in microseconds */ -int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ -int wal_retrieve_retry_interval = 5000; -int max_slot_wal_keep_size_mb = -1; -bool track_wal_io_timing = false; +int max_wal_size_mb = 1024; /* 1 GB */ +int min_wal_size_mb = 80; /* 80 MB 
*/ +int wal_keep_size_mb = 0; +int XLOGbuffers = -1; +int XLogArchiveTimeout = 0; +int XLogArchiveMode = ARCHIVE_MODE_OFF; +char *XLogArchiveCommand = NULL; +bool EnableHotStandby = false; +bool fullPageWrites = true; +bool wal_log_hints = false; +bool wal_compression = false; +char *wal_consistency_checking_string = NULL; +bool *wal_consistency_checking = NULL; +bool wal_init_zero = true; +bool wal_recycle = true; +bool log_checkpoints = false; +int sync_method = DEFAULT_SYNC_METHOD; +int wal_level = WAL_LEVEL_MINIMAL; +int CommitDelay = 0; /* precommit delay in microseconds */ +int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ +int wal_retrieve_retry_interval = 5000; +int max_slot_wal_keep_size_mb = -1; +bool track_wal_io_timing = false; + +int flushFlag = 8 * 1024 * 1024; /*TODO it should be set by GUC*/ #ifdef WAL_DEBUG -bool XLOG_DEBUG = false; +bool XLOG_DEBUG = false; #endif int wal_segment_size = DEFAULT_XLOG_SEG_SIZE; +//timeOut Check 1s +const time_t timeOut = 1; +//all Page modify Num +const int pageMaxLen = 8192; + +static time_t start; +static bool startInit = false; /* * Number of WAL insertion locks to use. A higher value allows more insertions * to happen concurrently, but adds some CPU overhead to flushing the WAL, * which needs to iterate all the locks. */ -#define NUM_XLOGINSERT_LOCKS 8 +#define NUM_XLOGINSERT_LOCKS 8 /* * Max distance from last checkpoint, before triggering a new xlog-based * checkpoint. 
*/ -int CheckPointSegments; +int CheckPointSegments; /* Estimated distance between checkpoints, in bytes */ static double CheckPointDistanceEstimate = 0; @@ -158,9 +187,7 @@ const struct config_enum_entry sync_method_options[] = { #ifdef OPEN_DATASYNC_FLAG {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false}, #endif - {NULL, 0, false} -}; - + {NULL, 0, false}}; /* * Although only "on", "off", and "always" are documented, @@ -176,15 +203,14 @@ const struct config_enum_entry archive_mode_options[] = { {"no", ARCHIVE_MODE_OFF, true}, {"1", ARCHIVE_MODE_ON, true}, {"0", ARCHIVE_MODE_OFF, true}, - {NULL, 0, false} -}; + {NULL, 0, false}}; const struct config_enum_entry recovery_target_action_options[] = { {"pause", RECOVERY_TARGET_ACTION_PAUSE, false}, {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false}, {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false}, - {NULL, 0, false} -}; + {NULL, 0, false}}; + /* * Statistics for current checkpoint are collected in this global struct. @@ -197,7 +223,8 @@ CheckpointStatsData CheckpointStats; * ThisTimeLineID will be same in all backends --- it identifies current * WAL timeline for the database system. */ -TimeLineID ThisTimeLineID = 0; +TimeLineID ThisTimeLineID = 0; +TimeLineID ThisTimeLineID2 = 0; /* * Are we doing recovery from XLOG? @@ -210,7 +237,7 @@ TimeLineID ThisTimeLineID = 0; * process you're running in, use RecoveryInProgress() but only after shared * memory startup and lock initialization. */ -bool InRecovery = false; +bool InRecovery = false; /* Are we in Hot Standby mode? 
Only valid in startup process, see xlog.h */ HotStandbyState standbyState = STANDBY_DISABLED; @@ -220,6 +247,7 @@ static XLogRecPtr LastRec; /* Local copy of WalRcv->flushedUpto */ static XLogRecPtr flushedUpto = 0; static TimeLineID receiveTLI = 0; +static XLogRecPtr readedUpto = 0; /* * abortedRecPtr is the start pointer of a broken record at end of WAL when @@ -267,7 +295,7 @@ static bool LocalPromoteIsTriggered = false; * The coding in XLogInsertAllowed() depends on the first two of these states * being numerically the same as bool true and false. */ -static int LocalXLogInsertAllowed = -1; +static int LocalXLogInsertAllowed = -1; /* * When ArchiveRecoveryRequested is set, archive recovery was requested, @@ -279,9 +307,10 @@ static int LocalXLogInsertAllowed = -1; * currently performing crash recovery using only XLOG files in pg_wal, but * will switch to using offline XLOG archives as soon as we reach the end of * WAL in pg_wal. -*/ -bool ArchiveRecoveryRequested = false; -bool InArchiveRecovery = false; + */ +bool ArchiveRecoveryRequested = false; +bool InArchiveRecovery = false; +bool IsPrivatePgControl = false; static bool standby_signal_file_found = false; static bool recovery_signal_file_found = false; @@ -294,28 +323,38 @@ static char *replay_image_masked = NULL; static char *primary_image_masked = NULL; /* options formerly taken from recovery.conf for archive recovery */ -char *recoveryRestoreCommand = NULL; -char *recoveryEndCommand = NULL; -char *archiveCleanupCommand = NULL; +char *recoveryRestoreCommand = NULL; +char *recoveryEndCommand = NULL; +char *archiveCleanupCommand = NULL; RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; -bool recoveryTargetInclusive = true; -int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; +bool recoveryTargetInclusive = true; +int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; TransactionId recoveryTargetXid; -char *recovery_target_time_string; +char *recovery_target_time_string; static TimestampTz 
recoveryTargetTime; const char *recoveryTargetName; -XLogRecPtr recoveryTargetLSN; -int recovery_min_apply_delay = 0; +XLogRecPtr recoveryTargetLSN; +XLogRecPtr walsenderLsn; + +int recovery_min_apply_delay = 0; /* options formerly taken from recovery.conf for XLOG streaming */ -bool StandbyModeRequested = false; -char *PrimaryConnInfo = NULL; -char *PrimarySlotName = NULL; -char *PromoteTriggerFile = NULL; -bool wal_receiver_create_temp_slot = false; +bool StandbyModeRequested = false; +char *PrimaryConnInfo = NULL; +char *PrimarySlotName = NULL; +char *PromoteTriggerFile = NULL; +bool wal_receiver_create_temp_slot = false; + +char *he3_meta_conninfo; /* are we currently in standby mode? */ -bool StandbyMode = false; +bool StandbyMode = false; + + +int32 globalInode = 0; +int globalOpenLogSegNo = -1; +bool isInitDB = false; +XLogRecPtr localApplyLSN = 0; /* * if recoveryStopsBefore/After returns true, it saves information of the stop @@ -354,8 +393,8 @@ static bool recoveryStopAfter; * to decrease. */ RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST; -TimeLineID recoveryTargetTLIRequested = 0; -TimeLineID recoveryTargetTLI = 0; +TimeLineID recoveryTargetTLIRequested = 0; +TimeLineID recoveryTargetTLI = 0; static List *expectedTLEs; static TimeLineID curFileTLI; @@ -374,9 +413,9 @@ static TimeLineID curFileTLI; * stored here. The parallel leader advances its own copy, when necessary, * in WaitForParallelWorkersToFinish. 
*/ -XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr; -XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr; -XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr; +XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr; +XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr; +XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr; /* * RedoRecPtr is this backend's local copy of the REDO record pointer @@ -451,16 +490,29 @@ static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr; typedef struct XLogwrtRqst { - XLogRecPtr Write; /* last byte + 1 to write out */ - XLogRecPtr Flush; /* last byte + 1 to flush */ + XLogRecPtr Write; /* last byte + 1 to write out */ + XLogRecPtr Flush; /* last byte + 1 to flush */ } XLogwrtRqst; typedef struct XLogwrtResult { - XLogRecPtr Write; /* last byte + 1 written out */ - XLogRecPtr Flush; /* last byte + 1 flushed */ + XLogRecPtr Write; /* last byte + 1 written out */ + pg_atomic_uint64 Flush; /* last byte + 1 flushed */ } XLogwrtResult; + + + +typedef struct XLogItemList +{ + XLogItem *head; + XLogItem *tail; +} XLogItemList; + +XLogItemList *xlogItemList = NULL; + + + /* * Inserting to WAL is protected by a small fixed number of WAL insertion * locks. To insert to the WAL, you must hold one of the locks - it doesn't @@ -499,9 +551,9 @@ typedef struct XLogwrtResult */ typedef struct { - LWLock lock; - XLogRecPtr insertingAt; - XLogRecPtr lastImportantAt; + LWLock lock; + XLogRecPtr insertingAt; + XLogRecPtr lastImportantAt; } WALInsertLock; /* @@ -514,7 +566,7 @@ typedef struct typedef union WALInsertLockPadded { WALInsertLock l; - char pad[PG_CACHE_LINE_SIZE]; + char pad[PG_CACHE_LINE_SIZE]; } WALInsertLockPadded; /* @@ -551,7 +603,7 @@ static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE; */ typedef struct XLogCtlInsert { - slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */ + slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */ /* * CurrBytePos is the end of reserved WAL. 
The next record will be @@ -560,8 +612,8 @@ typedef struct XLogCtlInsert * prev-link of the next record. These are stored as "usable byte * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()). */ - uint64 CurrBytePos; - uint64 PrevBytePos; + uint64 CurrBytePos; + uint64 PrevBytePos; /* * Make sure the above heavily-contended spinlock and byte positions are @@ -570,7 +622,7 @@ typedef struct XLogCtlInsert * read on every WAL insertion, but updated rarely, and we don't want * those reads to steal the cache line containing Curr/PrevBytePos. */ - char pad[PG_CACHE_LINE_SIZE]; + char pad[PG_CACHE_LINE_SIZE]; /* * fullPageWrites is the authoritative value used by all backends to @@ -583,9 +635,9 @@ typedef struct XLogCtlInsert * To read these fields, you must hold an insertion lock. To modify them, * you must hold ALL the locks. */ - XLogRecPtr RedoRecPtr; /* current redo point for insertions */ - bool forcePageWrites; /* forcing full-page writes for PITR? */ - bool fullPageWrites; + XLogRecPtr RedoRecPtr; /* current redo point for insertions */ + bool forcePageWrites; /* forcing full-page writes for PITR? */ + bool fullPageWrites; /* * exclusiveBackupState indicates the state of an exclusive backup (see @@ -596,8 +648,8 @@ typedef struct XLogCtlInsert * as a starting point for an online backup. */ ExclusiveBackupState exclusiveBackupState; - int nonExclusiveBackups; - XLogRecPtr lastBackupStart; + int nonExclusiveBackups; + XLogRecPtr lastBackupStart; /* * WAL insertion locks. @@ -605,6 +657,29 @@ typedef struct XLogCtlInsert WALInsertLockPadded *WALInsertLocks; } XLogCtlInsert; +typedef struct TimeVal +{ + long sec; + long usec; +} TimeVal; + +TimeVal timeVal; + +typedef struct FlushInfo +{ + XLogRecPtr wrtLsn; + bool written; +}FlushInfo; + +typedef struct XLogParralFlush +{ + pg_atomic_uint64 begin; + uint64 last; +// uint32 count; +// uint32 diff; + // FlushInfo wrtResult[128]; +}XLogParralFlush; + /* * Total shared-memory state for XLOG. 
*/ @@ -614,20 +689,20 @@ typedef struct XLogCtlData /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; - XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ - FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ - XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ - XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ + XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ + FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ + XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ + XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ - XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */ + XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */ /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */ - XLogRecPtr unloggedLSN; - slock_t ulsn_lck; + XLogRecPtr unloggedLSN; + slock_t ulsn_lck; /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */ - pg_time_t lastSegSwitchTime; - XLogRecPtr lastSegSwitchLSN; + pg_time_t lastSegSwitchTime; + XLogRecPtr lastSegSwitchLSN; /* * Protected by info_lck and WALWriteLock (you must hold either lock to @@ -635,6 +710,8 @@ typedef struct XLogCtlData */ XLogwrtResult LogwrtResult; + XLogParralFlush LogFlush; + /* * Latest initialized page in the cache (last byte position + 1). * @@ -645,16 +722,16 @@ typedef struct XLogCtlData * in-progress insertions to the page by calling * WaitXLogInsertionsToFinish(). */ - XLogRecPtr InitializedUpTo; + XLogRecPtr InitializedUpTo; /* * These values do not change after startup, although the pointed-to pages * and xlblocks values certainly do. xlblocks values are protected by * WALBufMappingLock. 
*/ - char *pages; /* buffers for unwritten XLOG pages */ - XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */ - int XLogCacheBlck; /* highest allocated xlog buffer index */ + char *pages; /* buffers for unwritten XLOG pages */ + XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */ + int XLogCacheBlck; /* highest allocated xlog buffer index */ /* * Shared copy of ThisTimeLineID. Does not change after end-of-recovery. @@ -662,8 +739,8 @@ typedef struct XLogCtlData * PrevTimeLineID is the old timeline's ID that we forked off from. * Otherwise it's equal to ThisTimeLineID. */ - TimeLineID ThisTimeLineID; - TimeLineID PrevTimeLineID; + TimeLineID ThisTimeLineID; + TimeLineID PrevTimeLineID; /* * SharedRecoveryState indicates if we're still in crash or archive @@ -675,20 +752,20 @@ typedef struct XLogCtlData * SharedHotStandbyActive indicates if we allow hot standby queries to be * run. Protected by info_lck. */ - bool SharedHotStandbyActive; + bool SharedHotStandbyActive; /* * SharedPromoteIsTriggered indicates if a standby promotion has been * triggered. Protected by info_lck. */ - bool SharedPromoteIsTriggered; + bool SharedPromoteIsTriggered; /* * WalWriterSleeping indicates whether the WAL writer is currently in * low-power mode (and hence should be nudged if an async commit occurs). * Protected by info_lck. */ - bool WalWriterSleeping; + bool WalWriterSleeping; /* * recoveryWakeupLatch is used to wake up the startup process to continue @@ -705,7 +782,7 @@ typedef struct XLogCtlData * recoveryWakeupLatch and procLatch, should be used for inter-process * communication for WAL replay and recovery conflict, respectively. */ - Latch recoveryWakeupLatch; + Latch recoveryWakeupLatch; /* * During recovery, we keep a copy of the latest checkpoint record here. @@ -715,9 +792,9 @@ typedef struct XLogCtlData * * Protected by info_lck. 
*/ - XLogRecPtr lastCheckPointRecPtr; - XLogRecPtr lastCheckPointEndPtr; - CheckPoint lastCheckPoint; + XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; + CheckPoint lastCheckPoint; /* * lastReplayedEndRecPtr points to end+1 of the last record successfully @@ -725,10 +802,10 @@ typedef struct XLogCtlData * function, replayEndRecPtr points to the end+1 of the record being * replayed, otherwise it's equal to lastReplayedEndRecPtr. */ - XLogRecPtr lastReplayedEndRecPtr; - TimeLineID lastReplayedTLI; - XLogRecPtr replayEndRecPtr; - TimeLineID replayEndTLI; + XLogRecPtr lastReplayedEndRecPtr; + TimeLineID lastReplayedTLI; + XLogRecPtr replayEndRecPtr; + TimeLineID replayEndTLI; /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ TimestampTz recoveryLastXTime; @@ -745,9 +822,18 @@ typedef struct XLogCtlData * lastFpwDisableRecPtr points to the start of the last replayed * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled. */ - XLogRecPtr lastFpwDisableRecPtr; + XLogRecPtr lastFpwDisableRecPtr; + + long timestamp; + // long timestp; + long oldflush; + // long ol; + + XLogRecPtr globalUpto; slock_t info_lck; /* locks shared variables shown above */ + XLogRecPtr pushToDisk; + XLogRecPtr fileLsn; } XLogCtlData; static XLogCtlData *XLogCtl = NULL; @@ -764,18 +850,18 @@ static ControlFileData *ControlFile = NULL; * Calculate the amount of space left on the page after 'endptr'. Beware * multiple evaluation! */ -#define INSERT_FREESPACE(endptr) \ +#define INSERT_FREESPACE(endptr) \ (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ)) /* Macro to advance to next buffer index. */ -#define NextBufIdx(idx) \ - (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1)) +#define NextBufIdx(idx) \ + (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1)) /* * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or * would hold if it was in cache, the page containing 'recptr'. 
*/ -#define XLogRecPtrToBufIdx(recptr) \ +#define XLogRecPtrToBufIdx(recptr) \ (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1)) /* @@ -787,16 +873,16 @@ static ControlFileData *ControlFile = NULL; * Convert values of GUCs measured in megabytes to equiv. segment count. * Rounds down. */ -#define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize)) +#define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize)) /* The number of bytes in a WAL segment usable for WAL data. */ -static int UsableBytesInSegment; +static int UsableBytesInSegment; /* * Private, possibly out-of-date copy of shared LogwrtResult. * See discussion above. */ -static XLogwrtResult LogwrtResult = {0, 0}; +static XLogwrtResult LogwrtResult; /* * Codes indicating where we got a WAL file from during recovery, or where @@ -804,10 +890,10 @@ static XLogwrtResult LogwrtResult = {0, 0}; */ typedef enum { - XLOG_FROM_ANY = 0, /* request to read WAL from any source */ - XLOG_FROM_ARCHIVE, /* restored using restore_command */ - XLOG_FROM_PG_WAL, /* existing file in pg_wal */ - XLOG_FROM_STREAM /* streamed from primary */ + XLOG_FROM_ANY = 0, /* request to read WAL from any source */ + XLOG_FROM_ARCHIVE, /* restored using restore_command */ + XLOG_FROM_PG_WAL, /* existing file in pg_wal */ + XLOG_FROM_STREAM /* streamed from primary */ } XLogSource; /* human-readable names for XLogSources, for debugging output */ @@ -819,7 +905,7 @@ static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "strea * write the XLOG, and so will normally refer to the active segment. * Note: call Reserve/ReleaseExternalFD to track consumption of this FD. */ -static int openLogFile = -1; +static int64_t openLogFile = -1; static XLogSegNo openLogSegNo = 0; /* @@ -831,7 +917,7 @@ static XLogSegNo openLogSegNo = 0; * this FD too; but it doesn't currently seem worthwhile, since the XLOG is * not read by general-purpose sessions. 
*/ -static int readFile = -1; +static int64_t readFile = -1; static XLogSegNo readSegNo = 0; static uint32 readOff = 0; static uint32 readLen = 0; @@ -853,9 +939,9 @@ static bool pendingWalRcvRestart = false; typedef struct XLogPageReadPrivate { - int emode; - bool fetching_ckpt; /* are we fetching a checkpoint record? */ - bool randAccess; + int emode; + bool fetching_ckpt; /* are we fetching a checkpoint record? */ + bool randAccess; } XLogPageReadPrivate; /* @@ -870,8 +956,8 @@ static TimestampTz XLogReceiptTime = 0; static XLogSource XLogReceiptSource = XLOG_FROM_ANY; /* State information for XLOG reading */ -static XLogRecPtr ReadRecPtr; /* start of last record read */ -static XLogRecPtr EndRecPtr; /* end+1 of last record read */ +static XLogRecPtr ReadRecPtr; /* start of last record read */ +static XLogRecPtr EndRecPtr; /* end+1 of last record read */ /* * Local copies of equivalent fields in the control file. When running @@ -889,7 +975,7 @@ static bool updateMinRecoveryPoint = true; * to replay all the WAL, so reachedConsistency is never set. During archive * recovery, the database is consistent once minRecoveryPoint is reached. 
*/ -bool reachedConsistency = false; +bool reachedConsistency = false; static bool InRedo = false; @@ -897,7 +983,7 @@ static bool InRedo = false; static bool bgwriterLaunched = false; /* For WALInsertLockAcquire/Release functions */ -static int MyLockNo = 0; +static int MyLockNo = 0; static bool holdingAllLocks = false; #ifdef WAL_DEBUG @@ -907,7 +993,9 @@ static MemoryContext walDebugCxt = NULL; static void readRecoverySignalFile(void); static void validateRecoveryParameters(void); static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog); +static void exitHe3ArchiveRecovery(TimeLineID endTLI); static bool recoveryStopsBefore(XLogReaderState *record); +static bool he3recoveryStopsAfter(XLogReaderState *record); static bool recoveryStopsAfter(XLogReaderState *record); static void ConfirmRecoveryPaused(void); static void recoveryPausesHere(bool endOfRecovery); @@ -928,19 +1016,25 @@ static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo); static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void); static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic); +static void He3DBAdvanceXLInsertBuffer(int xlogLength,XLogRecPtr upto, bool opportunistic); static bool XLogCheckpointNeeded(XLogSegNo new_segno); -static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible); +static void He3DBXLogWrite(XLogwrtRqst WriteRqst, bool flexible); +static void He3DBXLogFakeWrite(XLogwrtRqst WriteRqst); static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, bool find_free, XLogSegNo max_segno, bool use_lock); -static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, - XLogSource source, bool notfoundOk); -static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); -static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, - int reqLen, XLogRecPtr targetRecPtr, char *readBuf); + +static int XLogBatchRead(XLogReaderState *xlogreader, XLogRecPtr startPtr, + int reqLen, char *readBuf); +static 
int64_t XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, + XLogSource source, bool notfoundOk); +static int64_t XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); +static void XLogPageReadAnyTLI(void); +static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, + int reqLen, XLogRecPtr targetRecPtr, char *readBuf); static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr); -static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); +static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); static void XLogFileClose(void); static void PreallocXlogFiles(XLogRecPtr endptr); static void RemoveTempXlogFiles(void); @@ -976,17 +1070,23 @@ static bool read_backup_label(XLogRecPtr *checkPointLoc, static bool read_tablespace_map(List **tablespaces); static void rm_redo_error_callback(void *arg); -static int get_sync_bit(int method); +static int get_sync_bit(int method); static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos); +static void He3DBCopyXLogRecordToWAL(int write_len, XLogRecPtr StartPos, XLogRecPtr EndPos); static void ReserveXLogInsertLocation(int size, int firstsize, XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr, XLogRecPtr *startbytepos); +static void He3DBReserveXLogInsertLocation(int size, int firstsize, XLogRecPtr *StartPos, + XLogRecPtr *EndPos, XLogRecPtr *PrevPtr, XLogRecPtr *startbytepos); + static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr); static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto); +static XLogRecPtr He3DBWaitXLogInsertionsToFinish(XLogRecPtr upto); static char *GetXLogBuffer(XLogRecPtr ptr); +static char *He3DBGetXLogBuffer(int xlogLength,XLogRecPtr ptr); static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 
XLogRecPtrToBytePos(XLogRecPtr ptr); @@ -996,7 +1096,15 @@ static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); static void WALInsertLockRelease(void); static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); - +static int AllXLogBatchRead(XLogReaderState *xlogreader, XLogRecPtr startPtr, int reqLen, + char *readBuf); +static int +consumerXLogBatchRead(XLogReaderState *xlogreader, XLogRecPtr startPtr, int reqLen, + char *readBuf); +static int +producerXLogParallelBatchRead(XLogReaderState *xlogreader, XLogRecPtr startPtr, int reqLen); +static void PrecacheHotDataByRules(); +static bool GetShutDownStatus(void); /* * Insert an XLOG record represented by an already-constructed chain of data * chunks. This is a low-level routine; to construct the WAL record header @@ -1023,21 +1131,376 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); * before the data page can be written out. This implements the basic * WAL rule "write the log before the data".) 
*/ +// XLogRecPtr +// XLogInsertRecord(XLogRecData *rdata, +// XLogRecPtr fpw_lsn, +// uint8 flags, +// int num_fpi, char **links, RelFileNode *rel_fnode, BlockNumber *blkno) +// { +// XLogCtlInsert *Insert = &XLogCtl->Insert; +// pg_crc32c rdata_crc; +// bool inserted; +// XLogRecord *rechdr = (XLogRecord *) rdata->data; +// uint8 info = rechdr->xl_info & ~XLR_INFO_MASK; +// bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && +// info == XLOG_SWITCH); +// XLogRecPtr StartPos; +// XLogRecPtr EndPos; +// Prefix *prefix = NULL; +// WalLoc *walLoc = NULL; +// bool prevDoPageWrites = doPageWrites; + +// /* we assume that all of the record header is in the first chunk */ +// Assert(rdata->len >= SizeOfXLogRecord); + +// /* cross-check on whether we should be here or not */ +// if (!XLogInsertAllowed()) +// elog(ERROR, "cannot make new WAL entries during recovery"); + +// /*---------- +// * +// * We have now done all the preparatory work we can without holding a +// * lock or modifying shared state. From here on, inserting the new WAL +// * record to the shared WAL buffer cache is a two-step process: +// * +// * 1. Reserve the right amount of space from the WAL. The current head of +// * reserved space is kept in Insert->CurrBytePos, and is protected by +// * insertpos_lck. +// * +// * 2. Copy the record to the reserved WAL space. This involves finding the +// * correct WAL buffer containing the reserved space, and copying the +// * record in place. This can be done concurrently in multiple processes. +// * +// * To keep track of which insertions are still in-progress, each concurrent +// * inserter acquires an insertion lock. In addition to just indicating that +// * an insertion is in progress, the lock tells others how far the inserter +// * has progressed. There is a small fixed number of insertion locks, +// * determined by NUM_XLOGINSERT_LOCKS. 
When an inserter crosses a page +// * boundary, it updates the value stored in the lock to the how far it has +// * inserted, to allow the previous buffer to be flushed. +// * +// * Holding onto an insertion lock also protects RedoRecPtr and +// * fullPageWrites from changing until the insertion is finished. +// * +// * Step 2 can usually be done completely in parallel. If the required WAL +// * page is not initialized yet, you have to grab WALBufMappingLock to +// * initialize it, but the WAL writer tries to do that ahead of insertions +// * to avoid that from happening in the critical path. +// * +// *---------- +// */ +// START_CRIT_SECTION(); +// if (isLogSwitch) +// WALInsertLockAcquireExclusive(); +// else +// WALInsertLockAcquire(); + +// /* +// * Check to see if my copy of RedoRecPtr is out of date. If so, may have +// * to go back and have the caller recompute everything. This can only +// * happen just after a checkpoint, so it's better to be slow in this case +// * and fast otherwise. +// * +// * Also check to see if fullPageWrites or forcePageWrites was just turned +// * on; if we weren't already doing full-page writes then go back and +// * recompute. +// * +// * If we aren't doing full-page writes then RedoRecPtr doesn't actually +// * affect the contents of the XLOG record, so we'll update our local copy +// * but not force a recomputation. (If doPageWrites was just turned off, +// * we could recompute the record without full pages, but we choose not to +// * bother.) +// */ +// if (RedoRecPtr != Insert->RedoRecPtr) +// { +// Assert(RedoRecPtr < Insert->RedoRecPtr); +// RedoRecPtr = Insert->RedoRecPtr; +// } +// doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); + +// if (doPageWrites && +// (!prevDoPageWrites || +// (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr))) +// { +// /* +// * Oops, some buffer now needs to be backed up that the caller didn't +// * back up. Start over. 
+// */ +// WALInsertLockRelease(); +// END_CRIT_SECTION(); +// return InvalidXLogRecPtr; +// } + +// /* +// * Reserve space for the record in the WAL. This also sets the xl_prev +// * pointer. +// */ +// XLogRecPtr startbytepos; +// if (isLogSwitch) +// { +// inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev); +// rechdr->xl_end = EndPos; +// } +// else +// { +// ReserveXLogInsertLocation(group_total_len, rechdr->xl_tot_len, &StartPos, &EndPos, +// &rechdr->xl_prev,&startbytepos); +// inserted = true; +// } + +// uint32 xlog_write_bytes = 0; +// if (inserted) +// { + +// XLogRecPtr tmpStartPos; +// XLogRecPtr tmpEndPos; +// for (int i = 0; i < grouo_rec_count; i++) +// { + +// rechdr = (XLogRecord *)grouphead[i]; + +// // create WalLoc +// if (!isInitDB) +// { +// walLoc = (WalLoc *) malloc(sizeof(WalLoc)); +// walLoc->Lsn = rechdr->xl_end - rechdr->xl_tot_len; +// walLoc->tl = XLogCtl->ThisTimeLineID; + + +// } + +// /* +// * Now that xl_prev has been filled in, calculate CRC of the record +// * header. +// */ +// rdata_crc = rechdr->xl_crc; +// COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc)); +// FIN_CRC32C(rdata_crc); +// rechdr->xl_crc = rdata_crc; +// /* +// * All the record data, including the header, is now ready to be +// * inserted. Copy the record in the space reserved. 
+// */ +// rdata = (XLogRecData *)&groupRecData[i]; + +// // create prefix + + +// if (isLogSwitch != true) +// { +// tmpStartPos = XLogBytePosToRecPtr(startbytepos); +// startbytepos += grouplens[i]; +// tmpEndPos = XLogBytePosToEndRecPtr(startbytepos); +// } +// else +// { +// tmpStartPos = StartPos; +// tmpEndPos = EndPos; +// } +// CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, +// tmpStartPos, tmpEndPos); +// xlog_write_bytes += rechdr->xl_tot_len; + +// if (!isInitDB) +// { +// uint8 blok_id = ((XLogRecordBlockHeader *)(rdata->data + SizeOfXLogRecord))->id; + +// if (blok_id <= XLR_MAX_BLOCK_ID) +// { +// prefix = (Prefix *)malloc(sizeof(Prefix)); +// prefix->forkno = ((XLogRecordBlockHeader *)(rdata->data + SizeOfXLogRecord))->fork_flags & BKPBLOCK_FORK_MASK; +// prefix->dbNode = rel_fnode[i].dbNode; +// prefix->relNode = rel_fnode[i].relNode; +// prefix->blockno = blkno[i]; +// // prefix done +// WalList *wl = FindWalLOGInTable(prefix); +// if (wl == NULL) +// { +// wl = SetupWalLOGInTable(prefix); +// SpinLockInit(&(wl->append_lck)); +// SpinLockAcquire(&(wl->append_lck)); +// memcpy(wl->wals, (char *)walLoc, sizeof(WalLoc)); +// wl->len += 1;//sizeof(WalLoc); +// SpinLockRelease(&(wl->append_lck)); +// } +// else +// { + +// SpinLockAcquire(&(wl->append_lck)); + +// memcpy(wl->wals + (wl->len * sizeof(WalLoc)), (char *)walLoc, sizeof(WalLoc)); +// wl->len += 1;//sizeof(WalLoc); +// SpinLockRelease(&(wl->append_lck)); +// } +// // printf("wal len %d, rel %d, fork %d, blk %d, lsn %ld\n", wl->len, wl->px.relNode, +// // wl->px.forkno, wl->px.blockno, walLoc->Lsn); +// free(prefix); +// prefix = NULL; +// } +// } +// if (walLoc != NULL) +// { +// free(walLoc); +// walLoc = NULL; +// } +// } + +// /* +// * Unless record is flagged as not important, update LSN of last +// * important record in the current slot. When holding all locks, just +// * update the first one. 
+// */ +// if ((flags & XLOG_MARK_UNIMPORTANT) == 0) +// { +// int lockno = holdingAllLocks ? 0 : MyLockNo; + +// WALInsertLocks[lockno].l.lastImportantAt = StartPos; +// } +// } +// else +// { +// /* +// * This was an xlog-switch record, but the current insert location was +// * already exactly at the beginning of a segment, so there was no need +// * to do anything. +// */ +// } + +// /* +// * Done! Let others know that we're finished. +// */ +// WALInsertLockRelease(); + +// MarkCurrentTransactionIdLoggedIfAny(); + +// END_CRIT_SECTION(); + +// /* +// * Update shared LogwrtRqst.Write, if we crossed page boundary. +// */ +// if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) +// { +// SpinLockAcquire(&XLogCtl->info_lck); +// /* advance global request to include new block(s) */ +// if (XLogCtl->LogwrtRqst.Write < EndPos) +// XLogCtl->LogwrtRqst.Write = EndPos; +// /* update local result copy while I have the chance */ +// LogwrtResult = XLogCtl->LogwrtResult; +// SpinLockRelease(&XLogCtl->info_lck); +// } + +// /* +// * If this was an XLOG_SWITCH record, flush the record and the empty +// * padding space that fills the rest of the segment, and perform +// * end-of-segment actions (eg, notifying archiver). +// */ +// if (isLogSwitch) +// { +// TRACE_POSTGRESQL_WAL_SWITCH(); +// XLogFlush(EndPos); + +// /* +// * Even though we reserved the rest of the segment for us, which is +// * reflected in EndPos, we return a pointer to just the end of the +// * xlog-switch record. 
+// */ +// if (inserted) +// { +// EndPos = StartPos + SizeOfXLogRecord; +// if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) +// { +// uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size); + +// if (offset == EndPos % XLOG_BLCKSZ) +// EndPos += SizeOfXLogLongPHD; +// else +// EndPos += SizeOfXLogShortPHD; +// } +// } +// } + +// #ifdef WAL_DEBUG1 +// if (XLOG_DEBUG) +// { +// static XLogReaderState *debug_reader = NULL; +// StringInfoData buf; +// StringInfoData recordBuf; +// char *errormsg = NULL; +// MemoryContext oldCxt; + +// oldCxt = MemoryContextSwitchTo(walDebugCxt); + +// initStringInfo(&buf); +// appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos)); + +// /* +// * We have to piece together the WAL record data from the XLogRecData +// * entries, so that we can pass it to the rm_desc function as one +// * contiguous chunk. +// */ +// initStringInfo(&recordBuf); +// for (; rdata != NULL; rdata = rdata->next) +// appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len); + +// if (!debug_reader) +// debug_reader = XLogReaderAllocate(wal_segment_size, NULL, +// XL_ROUTINE(), NULL); + +// if (!debug_reader) +// { +// appendStringInfoString(&buf, "error decoding record: out of memory"); +// } +// else if (!DecodeXLogRecord(debug_reader, (XLogRecord *)recordBuf.data, +// &errormsg)) +// { +// appendStringInfo(&buf, "error decoding record: %s", +// errormsg ? errormsg : "no error message"); +// } +// else +// { +// appendStringInfoString(&buf, " - "); +// xlog_outdesc(&buf, debug_reader); +// } +// elog(LOG, "%s", buf.data); + +// pfree(buf.data); +// pfree(recordBuf.data); +// MemoryContextSwitchTo(oldCxt); +// } +// #endif + +// /* +// * Update our global variables +// */ +// ProcLastRecPtr = StartPos; +// XactLastRecEnd = EndPos; + +// /* Report WAL traffic to the instrumentation. 
*/ +// if (inserted) +// { +// pgWalUsage.wal_bytes += xlog_write_bytes; +// pgWalUsage.wal_records += grouo_rec_count; +// pgWalUsage.wal_fpi += num_fpi; +// } + +// return EndPos; +// } + +/* + * Store XLOG records in KV storage using a key-value layout. + */ XLogRecPtr -XLogInsertRecord(XLogRecData *rdata, +He3DBXLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn, uint8 flags, int num_fpi) { XLogCtlInsert *Insert = &XLogCtl->Insert; pg_crc32c rdata_crc; - bool inserted; XLogRecord *rechdr = (XLogRecord *) rdata->data; uint8 info = rechdr->xl_info & ~XLR_INFO_MASK; - bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && - info == XLOG_SWITCH); - XLogRecPtr StartPos; - XLogRecPtr EndPos; + XLogRecPtr StartPos; // start of the space reserved for this insertion + XLogRecPtr EndPos; // end of the space reserved for this insertion bool prevDoPageWrites = doPageWrites; /* we assume that all of the record header is in the first chunk */ @@ -1080,10 +1543,10 @@ XLogInsertRecord(XLogRecData *rdata, *---------- */ START_CRIT_SECTION(); - if (isLogSwitch) - WALInsertLockAcquireExclusive(); - else - WALInsertLockAcquire(); + // if (isLogSwitch) + // WALInsertLockAcquireExclusive(); + // else + // WALInsertLockAcquire(); /* * Check to see if my copy of RedoRecPtr is out of date. If so, may have @@ -1116,7 +1579,7 @@ XLogInsertRecord(XLogRecData *rdata, * Oops, some buffer now needs to be backed up that the caller didn't * back up. Start over. */ - WALInsertLockRelease(); +// WALInsertLockRelease(); END_CRIT_SECTION(); return InvalidXLogRecPtr; } @@ -1126,126 +1589,44 @@ XLogInsertRecord(XLogRecData *rdata, * pointer. 
*/ XLogRecPtr startbytepos; - if (isLogSwitch) { - inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev); - rechdr->xl_end = EndPos; - } - else - { - ReserveXLogInsertLocation(group_total_len, rechdr->xl_tot_len, &StartPos, &EndPos, + He3DBReserveXLogInsertLocation(group_total_len, rechdr->xl_tot_len, &StartPos, &EndPos, &rechdr->xl_prev,&startbytepos); - inserted = true; - } - - uint32 xlog_write_bytes = 0; - if (inserted) + for (int i = 0; i < grouo_rec_count; i++) { - - XLogRecPtr tmpStartPos; - XLogRecPtr tmpEndPos; - for (int i = 0; i < grouo_rec_count; i++) - { - rechdr = (XLogRecord *)grouphead[i]; - /* - * Now that xl_prev has been filled in, calculate CRC of the record - * header. - */ - rdata_crc = rechdr->xl_crc; - COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc)); - FIN_CRC32C(rdata_crc); - rechdr->xl_crc = rdata_crc; - /* - * All the record data, including the header, is now ready to be - * inserted. Copy the record in the space reserved. - */ - rdata = (XLogRecData *)&groupRecData[i]; - if (isLogSwitch != true) { - tmpStartPos = XLogBytePosToRecPtr(startbytepos); - startbytepos += grouplens[i]; - tmpEndPos = XLogBytePosToEndRecPtr(startbytepos); - } else { - tmpStartPos = StartPos; - tmpEndPos = EndPos; - } - CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, - tmpStartPos, tmpEndPos); - xlog_write_bytes += rechdr->xl_tot_len; - - } + rechdr = (XLogRecord *)grouphead[i]; /* - * Unless record is flagged as not important, update LSN of last - * important record in the current slot. When holding all locks, just - * update the first one. - */ - if ((flags & XLOG_MARK_UNIMPORTANT) == 0) - { - int lockno = holdingAllLocks ? 0 : MyLockNo; - - WALInsertLocks[lockno].l.lastImportantAt = StartPos; - } - } - else - { - /* - * This was an xlog-switch record, but the current insert location was - * already exactly at the beginning of a segment, so there was no need - * to do anything. 
+ * Now that xl_prev has been filled in, calculate CRC of the record + * header. */ + rdata_crc = rechdr->xl_crc; + COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(rdata_crc); + rechdr->xl_crc = rdata_crc; } /* - * Done! Let others know that we're finished. + * All the record data, including the header, is now ready to be + * inserted. Copy the record in the space reserved. */ - WALInsertLockRelease(); + He3DBCopyXLogRecordToWAL(group_total_len, StartPos, EndPos); + /* + * Done! Let others know that we're finished. + */ +// WALInsertLockRelease(); MarkCurrentTransactionIdLoggedIfAny(); END_CRIT_SECTION(); - /* - * Update shared LogwrtRqst.Write, if we crossed page boundary. - */ - if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) - { - SpinLockAcquire(&XLogCtl->info_lck); - /* advance global request to include new block(s) */ - if (XLogCtl->LogwrtRqst.Write < EndPos) - XLogCtl->LogwrtRqst.Write = EndPos; - /* update local result copy while I have the chance */ - LogwrtResult = XLogCtl->LogwrtResult; - SpinLockRelease(&XLogCtl->info_lck); - } + SpinLockAcquire(&XLogCtl->info_lck); + /* advance global request to include new block(s) */ + if (XLogCtl->LogwrtRqst.Write < EndPos) + XLogCtl->LogwrtRqst.Write = EndPos; + /* update local result copy while I have the chance */ + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); - /* - * If this was an XLOG_SWITCH record, flush the record and the empty - * padding space that fills the rest of the segment, and perform - * end-of-segment actions (eg, notifying archiver). - */ - if (isLogSwitch) - { - TRACE_POSTGRESQL_WAL_SWITCH(); - XLogFlush(EndPos); - - /* - * Even though we reserved the rest of the segment for us, which is - * reflected in EndPos, we return a pointer to just the end of the - * xlog-switch record. 
- */ - if (inserted) - { - EndPos = StartPos + SizeOfXLogRecord; - if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) - { - uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size); - - if (offset == EndPos % XLOG_BLCKSZ) - EndPos += SizeOfXLogLongPHD; - else - EndPos += SizeOfXLogShortPHD; - } - } - } #ifdef WAL_DEBUG1 if (XLOG_DEBUG) @@ -1304,16 +1685,13 @@ XLogInsertRecord(XLogRecData *rdata, XactLastRecEnd = EndPos; /* Report WAL traffic to the instrumentation. */ - if (inserted) - { - pgWalUsage.wal_bytes += xlog_write_bytes; - pgWalUsage.wal_records+=grouo_rec_count; - pgWalUsage.wal_fpi += num_fpi; - } + pgWalUsage.wal_bytes += group_total_len; + pgWalUsage.wal_records+=grouo_rec_count; + pgWalUsage.wal_fpi += num_fpi; + return EndPos; } - /* * Reserves the right amount of space for a record of given size from the WAL. * *StartPos is set to the beginning of the reserved section, *EndPos to @@ -1336,10 +1714,10 @@ ReserveXLogInsertLocation(int size, int firstsize, XLogRecPtr *StartPos, XLogRec uint64 groupstartbytepos; uint64 endbytepos; uint64 prevbytepos; - size = MAXALIGN(size); - firstsize=MAXALIGN(firstsize); + // size = MAXALIGN(size); + // firstsize=MAXALIGN(firstsize); /* All (non xlog-switch) records should contain data. 
*/ - Assert(size > SizeOfXLogRecord); + // Assert(size > SizeOfXLogRecord); /* * The duration the spinlock needs to be held is minimized by minimizing @@ -1361,10 +1739,73 @@ ReserveXLogInsertLocation(int size, int firstsize, XLogRecPtr *StartPos, XLogRec Insert->PrevBytePos = endbytepos - grouplens[grouo_rec_count-1]; SpinLockRelease(&Insert->insertpos_lck); for (int i = 1; i < grouo_rec_count; i++) + { + // grouphead[i]->xl_prev = XLogBytePosToRecPtr(groupstartbytepos); + grouphead[i]->xl_prev = groupstartbytepos; + groupstartbytepos += grouplens[i-1]; + // grouphead[i-1]->xl_end = XLogBytePosToEndRecPtr(groupstartbytepos); + grouphead[i-1]->xl_end = groupstartbytepos; + //todo: put page links into kv storage.key is links[i-1], value is groupstartbytepos. + + } + // grouphead[grouo_rec_count-1]->xl_end = XLogBytePosToEndRecPtr(endbytepos); + grouphead[grouo_rec_count-1]->xl_end = endbytepos; + + // *StartPos = XLogBytePosToRecPtr(*startbytepos); + // *EndPos = XLogBytePosToEndRecPtr(endbytepos); + // *PrevPtr = XLogBytePosToRecPtr(prevbytepos); + *StartPos = *startbytepos; + *EndPos = endbytepos; + *PrevPtr = prevbytepos; + + /* + * Check that the conversions between "usable byte positions" and + * XLogRecPtrs work consistently in both directions. + */ + // Assert(XLogRecPtrToBytePos(*StartPos) == *startbytepos); + // Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos); + // Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos); +} + +static void +He3DBReserveXLogInsertLocation(int size, int firstsize, XLogRecPtr *StartPos, XLogRecPtr *EndPos, + XLogRecPtr *PrevPtr, XLogRecPtr *startbytepos) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint64 groupstartbytepos; + uint64 endbytepos; + uint64 prevbytepos; +// size = MAXALIGN(size); +// firstsize=MAXALIGN(firstsize); + /* All (non xlog-switch) records should contain data. 
*/ + Assert(size >= SizeOfXLogRecord); + + /* + * The duration the spinlock needs to be held is minimized by minimizing + * the calculations that have to be done while holding the lock. The + * current tip of reserved WAL is kept in CurrBytePos, as a byte position + * that only counts "usable" bytes in WAL, that is, it excludes all WAL + * page headers. The mapping between "usable" byte positions and physical + * positions (XLogRecPtrs) can be done outside the locked region, and + * because the usable byte position doesn't include any headers, reserving + * X bytes from WAL is almost as simple as "CurrBytePos += X". + */ + SpinLockAcquire(&Insert->insertpos_lck); + //本次预留空间的起始地址 + *startbytepos = Insert->CurrBytePos; + groupstartbytepos = Insert->CurrBytePos; + //本次预留空间的结束地址 + endbytepos = *startbytepos + size; + prevbytepos = Insert->PrevBytePos; + Insert->CurrBytePos = endbytepos; + Insert->PrevBytePos = endbytepos - grouplens[grouo_rec_count-1]; + SpinLockRelease(&Insert->insertpos_lck); + for (int i = 1; i < grouo_rec_count; i++) { grouphead[i]->xl_prev = XLogBytePosToRecPtr(groupstartbytepos); groupstartbytepos += grouplens[i-1]; grouphead[i-1]->xl_end = XLogBytePosToEndRecPtr(groupstartbytepos); + //todo: put page links into kv storage.key is links[i-1], value is groupstartbytepos. } grouphead[grouo_rec_count-1]->xl_end = XLogBytePosToEndRecPtr(endbytepos); @@ -1380,8 +1821,6 @@ ReserveXLogInsertLocation(int size, int firstsize, XLogRecPtr *StartPos, XLogRec Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos); Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos); } - - /* * Like ReserveXLogInsertLocation(), but for an xlog-switch record. 
* @@ -1564,10 +2003,10 @@ static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos) { - char *currpos; - int freespace; - int written; - XLogRecPtr CurrPos; + char *currpos; + int freespace; + int written; + XLogRecPtr CurrPos; XLogPageHeader pagehdr; /* @@ -1586,8 +2025,8 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, written = 0; while (rdata != NULL) { - char *rdata_data = rdata->data; - int rdata_len = rdata->len; + char *rdata_data = rdata->data; + int rdata_len = rdata->len; while (rdata_len > freespace) { @@ -1600,7 +2039,6 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, rdata_len -= freespace; written += freespace; CurrPos += freespace; - /* * Get pointer to beginning of next page, and set the xlp_rem_len @@ -1612,7 +2050,7 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, * only backend that needs to set the contrecord flag. */ currpos = GetXLogBuffer(CurrPos); - pagehdr = (XLogPageHeader) currpos; + pagehdr = (XLogPageHeader)currpos; pagehdr->xlp_rem_len = write_len - written; pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD; @@ -1635,10 +2073,10 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, CurrPos += rdata_len; freespace -= rdata_len; written += rdata_len; - rdata = rdata->next; + rdata = rdata->next; } Assert(written == write_len); - + /* * If this was an xlog-switch, it's not enough to write the switch record, * we also have to consume all the remaining space in the WAL segment. 
We @@ -1697,13 +2135,86 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, elog(PANIC, "space reserved for WAL record does not match what was written"); } +static void +He3DBCopyXLogRecordToWAL(int write_len, XLogRecPtr StartPos, XLogRecPtr EndPos) +{ + char *currpos; +// int freespace; + int written; + XLogRecPtr CurrPos; + XLogRecData *rdata; + + /* + * (1)计算xlog buffer的大小 + * (2)根据预取空间的起始地址,计算xlog buffer已经使用的空间 + * (3)根据前两项计算xlog buffer还剩余的空间 + * + * 如果xlog的长度超过剩余空间,就从xlog buffer的起始地址开始复制 + */ + int XlogBufferLength = (XLOGbuffers-1) * XLOG_BLCKSZ; + int UsedXlogBufferLength = StartPos % ((XLOGbuffers-1) * XLOG_BLCKSZ); + int remaindXlogBufferLength = XlogBufferLength - UsedXlogBufferLength; + /* + * Get a pointer to the right place in the right WAL buffer to start + * inserting to. + */ + CurrPos = StartPos; + + /*currpos是xlog在xlog buffer的物理地址*/ + currpos = He3DBGetXLogBuffer(write_len,CurrPos); + + /* Copy record data */ + written = 0; +// if (rdata != NULL) +// { + for (int i = 0; i < grouo_rec_count; i++) + { + rdata = (XLogRecData *)&groupRecData[i]; + if (remaindXlogBufferLength <= 0) + currpos = XLogCtl->pages - remaindXlogBufferLength; + while (rdata != NULL) + { + char *rdata_data = rdata->data; + int rdata_len = rdata->len; + + // if (rdata_len > remaindXlogBufferLength) + // { + // memcpy(currpos, rdata_data, rdata_len); + // currpos = XLogCtl->pages; + // memcpy(currpos, rdata_data + remaindXlogBufferLength, rdata_len - remaindXlogBufferLength); + // currpos = currpos + rdata_len - remaindXlogBufferLength; + // } + // else + // { + memcpy(currpos, rdata_data, rdata_len); + currpos += rdata_len; + // } + + + CurrPos += rdata_len; + written += rdata_len; + remaindXlogBufferLength -= rdata_len; + rdata = rdata->next; + } + } +// memcpy(currpos,1,1); +// CurrPos ++; +// written ++; +// } + + Assert(written == write_len); + + if (CurrPos != EndPos) + elog(PANIC, "space reserved for WAL record does not match what was written"); +} 
+ /* * Acquire a WAL insertion lock, for inserting to WAL. */ static void WALInsertLockAcquire(void) { - bool immed; + bool immed; /* * It doesn't matter which of the WAL insertion locks we acquire, so try @@ -1716,7 +2227,7 @@ WALInsertLockAcquire(void) * (semi-)randomly. This allows the locks to be used evenly if you have a * lot of very short connections. */ - static int lockToTry = -1; + static int lockToTry = -1; if (lockToTry == -1) lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS; @@ -1748,7 +2259,7 @@ WALInsertLockAcquire(void) static void WALInsertLockAcquireExclusive(void) { - int i; + int i; /* * When holding all the locks, all but the last lock's insertingAt @@ -1779,7 +2290,7 @@ WALInsertLockRelease(void) { if (holdingAllLocks) { - int i; + int i; for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) LWLockReleaseClearVar(&WALInsertLocks[i].l.lock, @@ -1835,6 +2346,91 @@ WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt) */ static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto) +{ + uint64 bytepos; + XLogRecPtr reservedUpto; + XLogRecPtr finishedUpto; + XLogCtlInsert *Insert = &XLogCtl->Insert; + int i; + + if (MyProc == NULL) + elog(PANIC, "cannot wait without a PGPROC structure"); + + /* Read the current insert position */ + SpinLockAcquire(&Insert->insertpos_lck); + bytepos = Insert->CurrBytePos; + SpinLockRelease(&Insert->insertpos_lck); + reservedUpto = XLogBytePosToEndRecPtr(bytepos); + + /* + * No-one should request to flush a piece of WAL that hasn't even been + * reserved yet. However, it can happen if there is a block with a bogus + * LSN on disk, for example. XLogFlush checks for that situation and + * complains, but only after the flush. Here we just assume that to mean + * that all WAL that has been reserved needs to be finished. In this + * corner-case, the return value can be smaller than 'upto' argument. 
+ */ + if (upto > reservedUpto) + { + ereport(LOG, + (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X", + LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto)))); + upto = reservedUpto; + } + + /* + * Loop through all the locks, sleeping on any in-progress insert older + * than 'upto'. + * + * finishedUpto is our return value, indicating the point upto which all + * the WAL insertions have been finished. Initialize it to the head of + * reserved WAL, and as we iterate through the insertion locks, back it + * out for any insertion that's still in progress. + */ + finishedUpto = reservedUpto; + +// for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) +// { +// XLogRecPtr insertingat = InvalidXLogRecPtr; +// +// do +// { +// /* +// * See if this insertion is in progress. LWLockWaitForVar will +// * wait for the lock to be released, or for the 'value' to be set +// * by a LWLockUpdateVar call. When a lock is initially acquired, +// * its value is 0 (InvalidXLogRecPtr), which means that we don't +// * know where it's inserting yet. We will have to wait for it. If +// * it's a small insertion, the record will most likely fit on the +// * same page and the inserter will release the lock without ever +// * calling LWLockUpdateVar. But if it has to sleep, it will +// * advertise the insertion point with LWLockUpdateVar before +// * sleeping. +// */ +// if (LWLockWaitForVar(&WALInsertLocks[i].l.lock, +// &WALInsertLocks[i].l.insertingAt, +// insertingat, &insertingat)) +// { +// /* the lock was free, so no insertion in progress */ +// insertingat = InvalidXLogRecPtr; +// break; +// } +// +// /* +// * This insertion is still in progress. Have to wait, unless the +// * inserter has proceeded past 'upto'. 
+// */ +// } while (insertingat < upto); +// +// if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto) +// finishedUpto = insertingat; +// } + + return finishedUpto; +} + +static XLogRecPtr +He3DBWaitXLogInsertionsToFinish(XLogRecPtr upto) { uint64 bytepos; XLogRecPtr reservedUpto; @@ -1877,12 +2473,12 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto) * out for any insertion that's still in progress. */ finishedUpto = reservedUpto; - for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) - { - XLogRecPtr insertingat = InvalidXLogRecPtr; - - do - { +// for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) +// { +// XLogRecPtr insertingat = InvalidXLogRecPtr; +// +// do +// { /* * See if this insertion is in progress. LWLockWaitForVar will * wait for the lock to be released, or for the 'value' to be set @@ -1895,24 +2491,24 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto) * advertise the insertion point with LWLockUpdateVar before * sleeping. */ - if (LWLockWaitForVar(&WALInsertLocks[i].l.lock, - &WALInsertLocks[i].l.insertingAt, - insertingat, &insertingat)) - { +// if (LWLockWaitForVar(&WALInsertLocks[i].l.lock, +// &WALInsertLocks[i].l.insertingAt, +// insertingat, &insertingat)) +// { /* the lock was free, so no insertion in progress */ - insertingat = InvalidXLogRecPtr; - break; - } +// insertingat = InvalidXLogRecPtr; +// break; +// } /* * This insertion is still in progress. Have to wait, unless the * inserter has proceeded past 'upto'. 
*/ - } while (insertingat < upto); +// } while (insertingat < upto); - if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto) - finishedUpto = insertingat; - } +// if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto) +// finishedUpto = insertingat; +// } return finishedUpto; } @@ -1935,11 +2531,11 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto) static char * GetXLogBuffer(XLogRecPtr ptr) { - int idx; - XLogRecPtr endptr; + int idx; + XLogRecPtr endptr; static uint64 cachedPage = 0; static char *cachedPos = NULL; - XLogRecPtr expectedEndPtr; + XLogRecPtr expectedEndPtr; /* * Fast path for the common case that we need to access again the same @@ -1947,8 +2543,8 @@ GetXLogBuffer(XLogRecPtr ptr) */ if (ptr / XLOG_BLCKSZ == cachedPage) { - Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); - Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); + Assert(((XLogPageHeader)cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); + Assert(((XLogPageHeader)cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); return cachedPos + ptr % XLOG_BLCKSZ; } @@ -1982,7 +2578,7 @@ GetXLogBuffer(XLogRecPtr ptr) endptr = XLogCtl->xlblocks[idx]; if (expectedEndPtr != endptr) { - XLogRecPtr initializedUpto; + XLogRecPtr initializedUpto; /* * Before calling AdvanceXLInsertBuffer(), which can block, let others @@ -2030,14 +2626,76 @@ GetXLogBuffer(XLogRecPtr ptr) * offset within the page. 
*/ cachedPage = ptr / XLOG_BLCKSZ; - cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; + cachedPos = XLogCtl->pages + idx * (Size)XLOG_BLCKSZ; - Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); - Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); + Assert(((XLogPageHeader)cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); + Assert(((XLogPageHeader)cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); return cachedPos + ptr % XLOG_BLCKSZ; } +static char * +He3DBGetXLogBuffer(int xlogLength,XLogRecPtr ptr) +{ + uint64 offset = 0; + XLogRecPtr localUpto; + + /*xlog在xlog buffer中的偏移*/ + offset = ptr % ((XLOGbuffers-1) * XLOG_BLCKSZ); + // SpinLockAcquire(&XLogCtl->info_lck); + // LogwrtResult = XLogCtl-> LogwrtResult; + // SpinLockRelease(&XLogCtl->info_lck); + /* + * 如果xlog的起始位置已经开始覆盖xlog buffer上尚未写出的数据了,则等待, + * 暂时不执行初始化空间操作 + */ +// WALInsertLockUpdateInsertingAt(ptr); + XLogRecPtr flushed = (XLogRecPtr) pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush)); + while (ptr - flushed > ((XLOGbuffers-1) * XLOG_BLCKSZ)) + { + usleep(500); + // SpinLockAcquire(&XLogCtl->info_lck); + // LogwrtResult = XLogCtl -> LogwrtResult; + // SpinLockRelease(&XLogCtl->info_lck); + flushed = (XLogRecPtr) pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush)); + } + + /* + * 如果xlog buffer上剩余的空间能够写下此xlog并且也没有超时,则直接初始化空间,并开始拷贝数据 + */ + if (!(ptr + xlogLength - flushed >= ((XLOGbuffers-2) * XLOG_BLCKSZ) /* || isWriteTimeOut()*/)) + { + // WALInsertLockUpdateInsertingAt(ptr); + He3DBAdvanceXLInsertBuffer(xlogLength, ptr, true); + return XLogCtl -> pages + offset; + } + + + /* + *整个系统中只会有一个进程的xlog会处在buffer上有空间,但是又不够的状态, + *其他进程的xlog要么有足够的空间,要么没有空间 + * + * 如果是因为超时和超空间同时触发两个进程的写操作,需要做处理 + * + * + * 第一个触发写操作的ptr作为upto, + * 其他的需要判断剩余的空间是否足够,来决定继续拷贝还是等待 + */ + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl -> globalUpto == 0) + XLogCtl -> globalUpto = ptr; + SpinLockRelease(&XLogCtl->info_lck); + // WALInsertLockUpdateInsertingAt(ptr); + + if (XLogCtl 
-> globalUpto == ptr) + He3DBAdvanceXLInsertBuffer(xlogLength, ptr, false); + else + He3DBAdvanceXLInsertBuffer(xlogLength, ptr, true); + + return XLogCtl->pages + offset; + +} + /* * Converts a "usable byte position" to XLogRecPtr. A usable byte position * is the position starting from the beginning of WAL, excluding all WAL @@ -2046,35 +2704,43 @@ GetXLogBuffer(XLogRecPtr ptr) static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos) { - uint64 fullsegs; - uint64 fullpages; - uint64 bytesleft; - uint32 seg_offset; - XLogRecPtr result; + /* + * original logic, we abandon it. + */ + if(0) { + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; - fullsegs = bytepos / UsableBytesInSegment; - bytesleft = bytepos % UsableBytesInSegment; + fullsegs = bytepos / UsableBytesInSegment; + bytesleft = bytepos % UsableBytesInSegment; - if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) - { - /* fits on first page of segment */ - seg_offset = bytesleft + SizeOfXLogLongPHD; - } - else - { - /* account for the first page on segment with long header */ - seg_offset = XLOG_BLCKSZ; - bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; - fullpages = bytesleft / UsableBytesInPage; - bytesleft = bytesleft % UsableBytesInPage; + fullpages = bytesleft / UsableBytesInPage; + bytesleft = bytesleft % UsableBytesInPage; - seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; } - XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + return bytepos; - return 
result; } /* @@ -2084,43 +2750,50 @@ XLogBytePosToRecPtr(uint64 bytepos) * when converting a pointer to the end of a record. */ static XLogRecPtr -XLogBytePosToEndRecPtr(uint64 bytepos) +XLogBytePosToEndRecPtr(uint64 bytepos) { - uint64 fullsegs; - uint64 fullpages; - uint64 bytesleft; - uint32 seg_offset; - XLogRecPtr result; + /* + * original logic, we abandon it. + */ + if(0){ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; - fullsegs = bytepos / UsableBytesInSegment; - bytesleft = bytepos % UsableBytesInSegment; + fullsegs = bytepos / UsableBytesInSegment; + bytesleft = bytepos % UsableBytesInSegment; - if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) - { - /* fits on first page of segment */ - if (bytesleft == 0) - seg_offset = 0; + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + if (bytesleft == 0) + seg_offset = 0; + else + seg_offset = bytesleft + SizeOfXLogLongPHD; + } else - seg_offset = bytesleft + SizeOfXLogLongPHD; - } - else - { - /* account for the first page on segment with long header */ - seg_offset = XLOG_BLCKSZ; - bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; - fullpages = bytesleft / UsableBytesInPage; - bytesleft = bytesleft % UsableBytesInPage; + fullpages = bytesleft / UsableBytesInPage; + bytesleft = bytesleft % UsableBytesInPage; - if (bytesleft == 0) - seg_offset += fullpages * XLOG_BLCKSZ + bytesleft; - else - seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + if (bytesleft == 0) + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft; + else + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; } - XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, 
result); - - return result; + return bytepos; } /* @@ -2129,38 +2802,45 @@ XLogBytePosToEndRecPtr(uint64 bytepos) static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr) { - uint64 fullsegs; - uint32 fullpages; - uint32 offset; - uint64 result; + /* + * original logic, we abandon it. + */ + if(0){ + uint64 fullsegs; + uint32 fullpages; + uint32 offset; + uint64 result; - XLByteToSeg(ptr, fullsegs, wal_segment_size); + XLByteToSeg(ptr, fullsegs, wal_segment_size); - fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ; - offset = ptr % XLOG_BLCKSZ; + fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ; + offset = ptr % XLOG_BLCKSZ; - if (fullpages == 0) - { - result = fullsegs * UsableBytesInSegment; - if (offset > 0) + if (fullpages == 0) { - Assert(offset >= SizeOfXLogLongPHD); - result += offset - SizeOfXLogLongPHD; + result = fullsegs * UsableBytesInSegment; + if (offset > 0) + { + Assert(offset >= SizeOfXLogLongPHD); + result += offset - SizeOfXLogLongPHD; + } } - } - else - { - result = fullsegs * UsableBytesInSegment + - (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */ - (fullpages - 1) * UsableBytesInPage; /* full pages */ - if (offset > 0) + else { - Assert(offset >= SizeOfXLogShortPHD); - result += offset - SizeOfXLogShortPHD; + result = fullsegs * UsableBytesInSegment + + (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */ + (fullpages - 1) * UsableBytesInPage; /* full pages */ + if (offset > 0) + { + Assert(offset >= SizeOfXLogShortPHD); + result += offset - SizeOfXLogShortPHD; + } } + + return result; } - return result; + return ptr; } /* @@ -2174,13 +2854,13 @@ static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) { XLogCtlInsert *Insert = &XLogCtl->Insert; - int nextidx; - XLogRecPtr OldPageRqstPtr; + int nextidx; + XLogRecPtr OldPageRqstPtr; XLogwrtRqst WriteRqst; - XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr; - XLogRecPtr NewPageBeginPtr; + XLogRecPtr NewPageEndPtr = 
InvalidXLogRecPtr; + XLogRecPtr NewPageBeginPtr; XLogPageHeader NewPage; - int npages = 0; + int npages = 0; LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); @@ -2244,7 +2924,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START(); WriteRqst.Write = OldPageRqstPtr; WriteRqst.Flush = 0; - XLogWrite(WriteRqst, false); + // XLogWrite(WriteRqst, false); LWLockRelease(WALWriteLock); WalStats.m_wal_buffers_full++; TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); @@ -2264,24 +2944,24 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx); - NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ); + NewPage = (XLogPageHeader)(XLogCtl->pages + nextidx * (Size)XLOG_BLCKSZ); /* * Be sure to re-zero the buffer so that bytes beyond what we've * written will look like zeroes and not valid XLOG records... */ - MemSet((char *) NewPage, 0, XLOG_BLCKSZ); + MemSet((char *)NewPage, 0, XLOG_BLCKSZ); /* * Fill the new page's header */ NewPage->xlp_magic = XLOG_PAGE_MAGIC; - /* NewPage->xlp_info = 0; */ /* done by memset */ + /* NewPage->xlp_info = 0; */ /* done by memset */ NewPage->xlp_tli = ThisTimeLineID; NewPage->xlp_pageaddr = NewPageBeginPtr; - /* NewPage->xlp_rem_len = 0; */ /* done by memset */ + /* NewPage->xlp_rem_len = 0; */ /* done by memset */ /* * If online backup is not in progress, mark the header to indicate @@ -2316,7 +2996,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) */ if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0) { - XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage; + XLogLongPageHeader NewLongPage = (XLogLongPageHeader)NewPage; NewLongPage->xlp_sysid = ControlFile->system_identifier; NewLongPage->xlp_seg_size = wal_segment_size; @@ -2331,7 +3011,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) */ pg_write_barrier(); - *((volatile XLogRecPtr *) 
&XLogCtl->xlblocks[nextidx]) = NewPageEndPtr; + *((volatile XLogRecPtr *)&XLogCtl->xlblocks[nextidx]) = NewPageEndPtr; XLogCtl->InitializedUpTo = NewPageEndPtr; @@ -2348,6 +3028,134 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) #endif } +long getTimeStamp() +{ + gettimeofday(&timeVal,NULL); + return timeVal.sec * 1000 + timeVal.usec /1000; +} + +int isWriteTimeOut() +{ + gettimeofday(&timeVal,NULL); + return timeVal.sec * 1000 + timeVal.usec /1000 - XLogCtl->timestamp > 300; +} + +static void +He3DBAdvanceXLInsertBuffer(int xlogLength, XLogRecPtr upto, bool opportunistic) +{ + XLogCtlInsert *Insert = &XLogCtl->Insert; + + XLogwrtRqst WriteRqst; + int XlogBufferLength = (XLOGbuffers-1) * XLOG_BLCKSZ; + int UsedXlogBufferLength = upto % ((XLOGbuffers-1) * XLOG_BLCKSZ); + int remaindXlogBufferLength = XlogBufferLength - UsedXlogBufferLength; + +// LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + + /* + * 以LogwrtResult.Write为写入起点,upto为写入终点 + * LogwrtResult是本地私有变量,不一定和共享存储中的XLogCtl->LogwrtResult的值一致, + * 因此后面需要进一步的判断 + */ + + XLogRecPtr flushed = (XLogRecPtr) pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush)); + + if (!opportunistic) + { + + /* Before waiting, get info_lck and update LogwrtResult */ + SpinLockAcquire(&XLogCtl->info_lck); + if (XLogCtl->LogwrtRqst.Write < upto) + XLogCtl->LogwrtRqst.Write = upto; + LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); + + /* + * TODO + * 还要添加一些写入的条件,比如超时、超出空间等等 + */ + if (upto + xlogLength - flushed >= ((XLOGbuffers-2) * XLOG_BLCKSZ) /* || isWriteTimeOut()*/) + { + /* + * Must acquire write lock. Release WALBufMappingLock first, + * to make sure that all insertions that we need to wait for + * can finish (up to this same position). Otherwise we risk + * deadlock. 
+ */ +// LWLockRelease(WALBufMappingLock); + +// He3DBWaitXLogInsertionsToFinish(upto); + + // LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + // SpinLockAcquire(&XLogCtl->info_lck); + // LogwrtResult = XLogCtl->LogwrtResult; + // SpinLockRelease(&XLogCtl->info_lck); + /* + * 系统中同时会有多个进程竞争一把写锁,在本进程获得写锁后,还要判断一下, + * 是不是达到了要写入的条件,如果没达到就释放锁 + */ + // if (!(upto + xlogLength - LogwrtResult.Write >= ((XLOGbuffers-2) * XLOG_BLCKSZ) /* || isWriteTimeOut()*/)) + // { + /* OK, someone wrote it already */ + // LWLockRelease(WALWriteLock); + // } + // else + if (upto + xlogLength - flushed >= ((XLOGbuffers-2) * XLOG_BLCKSZ)) + { + /* Have to write it ourselves */ + TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START(); + WriteRqst.Write = upto; + WriteRqst.Flush = 0; + He3DBXLogFakeWrite(WriteRqst); + XLogCtl->timestamp = getTimeStamp(); + // LWLockRelease(WALWriteLock); + WalStats.m_wal_buffers_full++; + TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE(); + } + /* Re-acquire WALBufMappingLock and retry */ +// LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE); + } + } + + // SpinLockAcquire(&XLogCtl->info_lck); + // LogwrtResult = XLogCtl->LogwrtResult; + // SpinLockRelease(&XLogCtl->info_lck); + while(upto + xlogLength - flushed >= ((XLOGbuffers-1) * XLOG_BLCKSZ)) + { + usleep(500); + // SpinLockAcquire(&XLogCtl->info_lck); + // LogwrtResult = XLogCtl->LogwrtResult; + // SpinLockRelease(&XLogCtl->info_lck); + flushed = (XLogRecPtr) pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush)); + } +/* + if (remaindXlogBufferLength >= xlogLength) + { + MemSet((char *)(XLogCtl->pages + UsedXlogBufferLength), 0, xlogLength); + } + else + { + MemSet((char *)(XLogCtl->pages + UsedXlogBufferLength), 0, xlogLength); + MemSet((char *)(XLogCtl->pages + 0), 0, xlogLength - remaindXlogBufferLength); + } +*/ + /* + * Make sure the initialization of the page becomes visible to others + * before the xlblocks update. GetXLogBuffer() reads xlblocks without + * holding a lock. 
+ */ + pg_write_barrier(); + +// LWLockRelease(WALBufMappingLock); + +#ifdef WAL_DEBUG + if (XLOG_DEBUG && npages > 0) + { + elog(DEBUG1, "initialized %d pages, up to %X/%X", + npages, LSN_FORMAT_ARGS(NewPageEndPtr)); + } +#endif +} /* * Calculate CheckPointSegments based on max_wal_size_mb and * checkpoint_completion_target. @@ -2355,7 +3163,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) static void CalculateCheckpointSegments(void) { - double target; + double target; /*------- * Calculate the distance at which to trigger a checkpoint, to avoid @@ -2371,25 +3179,23 @@ CalculateCheckpointSegments(void) * number of segments consumed between checkpoints. *------- */ - target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) / - (1.0 + CheckPointCompletionTarget); + target = (double)ConvertToXSegs(max_wal_size_mb, wal_segment_size) / + (1.0 + CheckPointCompletionTarget); /* round down */ - CheckPointSegments = (int) target; + CheckPointSegments = (int)target; if (CheckPointSegments < 1) CheckPointSegments = 1; } -void -assign_max_wal_size(int newval, void *extra) +void assign_max_wal_size(int newval, void *extra) { max_wal_size_mb = newval; CalculateCheckpointSegments(); } -void -assign_checkpoint_completion_target(double newval, void *extra) +void assign_checkpoint_completion_target(double newval, void *extra) { CheckPointCompletionTarget = newval; CalculateCheckpointSegments(); @@ -2402,10 +3208,10 @@ assign_checkpoint_completion_target(double newval, void *extra) static XLogSegNo XLOGfileslop(XLogRecPtr lastredoptr) { - XLogSegNo minSegNo; - XLogSegNo maxSegNo; - double distance; - XLogSegNo recycleSegNo; + XLogSegNo minSegNo; + XLogSegNo maxSegNo; + double distance; + XLogSegNo recycleSegNo; /* * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb @@ -2413,9 +3219,9 @@ XLOGfileslop(XLogRecPtr lastredoptr) * remove enough segments to stay below the maximum. 
*/ minSegNo = lastredoptr / wal_segment_size + - ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1; + ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1; maxSegNo = lastredoptr / wal_segment_size + - ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1; + ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1; /* * Between those limits, recycle enough segments to get us through to the @@ -2429,8 +3235,8 @@ XLOGfileslop(XLogRecPtr lastredoptr) /* add 10% for good measure. */ distance *= 1.10; - recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) / - wal_segment_size); + recycleSegNo = (XLogSegNo)ceil(((double)lastredoptr + distance) / + wal_segment_size); if (recycleSegNo < minSegNo) recycleSegNo = minSegNo; @@ -2452,309 +3258,510 @@ XLOGfileslop(XLogRecPtr lastredoptr) static bool XLogCheckpointNeeded(XLogSegNo new_segno) { - XLogSegNo old_segno; + XLogSegNo old_segno; XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size); - if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1)) + if (new_segno >= old_segno + (uint64)(CheckPointSegments - 1)) return true; return false; } -/* - * Write and/or fsync the log at least as far as WriteRqst indicates. - * - * If flexible == true, we don't have to write as far as WriteRqst, but - * may stop at any convenient boundary (such as a cache or logfile boundary). - * This option allows us to avoid uselessly issuing multiple writes when a - * single one would do. - * - * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst) - * must be called before grabbing the lock, to make sure the data is ready to - * write. 
- */ -static void -XLogWrite(XLogwrtRqst WriteRqst, bool flexible) + +int GetXlogLength(uint8 part[],int n) { - bool ispartialpage; - bool last_iteration; - bool finishing_seg; - bool use_existent; - int curridx; - int npages; - int startidx; - uint32 startoffset; - - /* We should always be inside a critical section here */ - Assert(CritSectionCount > 0); - - /* - * Update local LogwrtResult (caller probably did this already, but...) - */ - LogwrtResult = XLogCtl->LogwrtResult; - - /* - * Since successive pages in the xlog cache are consecutively allocated, - * we can usually gather multiple pages together and issue just one - * write() call. npages is the number of pages we have determined can be - * written together; startidx is the cache block index of the first one, - * and startoffset is the file offset at which it should go. The latter - * two variables are only valid when npages > 0, but we must initialize - * all of them to keep the compiler quiet. - */ - npages = 0; - startidx = 0; - startoffset = 0; - - /* - * Within the loop, curridx is the cache block index of the page to - * consider writing. Begin at the buffer containing the next unwritten - * page, or last partially written page. - */ - curridx = XLogRecPtrToBufIdx(LogwrtResult.Write); - - while (LogwrtResult.Write < WriteRqst.Write) + int stp; + int length = 0; + for (stp = 0; stp < n; stp ++) { - /* - * Make sure we're not ahead of the insert process. This could happen - * if we're passed a bogus WriteRqst.Write that is past the end of the - * last page that's been initialized by AdvanceXLInsertBuffer. 
- */ - XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx]; + length += part[stp] << (stp * 8); + } + return length; +} - if (LogwrtResult.Write >= EndPtr) - elog(PANIC, "xlog write request %X/%X is past end of log %X/%X", - LSN_FORMAT_ARGS(LogwrtResult.Write), - LSN_FORMAT_ARGS(EndPtr)); +static void reCompleteCrc(XLogRecord* record) { + if (record->xl_rmid == RM_XLOG_ID) { + if (((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN) || + ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_ONLINE)) { + CheckPoint* xlrec = (CheckPoint*)(((char*)record)+record->xl_tot_len-sizeof(CheckPoint)); + xlrec->redo = record->xl_end - record->xl_tot_len; + } + } + pg_crc32c rdata_crc; + INIT_CRC32C(rdata_crc); + COMP_CRC32C(rdata_crc, ((char*)record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + COMP_CRC32C(rdata_crc, record, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(rdata_crc); + record->xl_crc = rdata_crc; +} - /* Advance LogwrtResult.Write to end of current buffer page */ - LogwrtResult.Write = EndPtr; - ispartialpage = WriteRqst.Write < LogwrtResult.Write; - - if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size)) - { - /* - * Switch to new logfile segment. We cannot have any pending - * pages here (since we dump what we have at segment end). 
- */ - Assert(npages == 0); - if (openLogFile >= 0) - XLogFileClose(); - XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size); - - /* create/use new log file */ - use_existent = true; - openLogFile = XLogFileInit(openLogSegNo, &use_existent, true); - ReserveExternalFD(); +void pushXlogToTikv(char*data,int len) { + XLogItemList xlogItemList; + xlogItemList.head = NULL; + xlogItemList.tail = NULL; + int pos = 0; + while(pos < len) { + XLogRecord* record = ((XLogRecord*)(data + pos)); + reCompleteCrc(record); + if (record->xl_end < ControlFile->minRecoveryPoint) { + pos += record->xl_tot_len; + continue; } - - /* Make sure we have the current logfile open */ - if (openLogFile < 0) + XLogItem *xlogItem = (XLogItem *)malloc(sizeof(XLogItem)); + xlogItem->xlogKey.lsn = record->xl_end - record->xl_tot_len; + xlogItem->begin = record; + xlogItem->length = record->xl_tot_len; + xlogItem->next = NULL; + if (xlogItemList.head == NULL) { - XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size); - openLogFile = XLogFileOpen(openLogSegNo); - ReserveExternalFD(); + xlogItemList.head = xlogItem; + xlogItemList.tail = xlogItemList.head; + } else { + xlogItemList.tail->next = xlogItem; + xlogItemList.tail = xlogItemList.tail->next; } + pos += record->xl_tot_len; + } + if (xlogItemList.head != NULL) + { + while (flushwals(xlogItemList.head, ThisTimeLineID) == 0) { + printf("flushwals failed, retry!"); + pg_usleep(1000L); + } + freeItemList(&xlogItemList); + } +} - /* Add current page to the set of pending pages-to-dump */ - if (npages == 0) +void freeItemList(XLogItemList *xlogItemList) +{ + if (xlogItemList->head == NULL){ + return; + } + else + { + XLogItem *xlogItem; + xlogItem = xlogItemList->head; + while (xlogItemList->head->next) { - /* first of group */ - startidx = curridx; - startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ, - wal_segment_size); + xlogItemList->head = xlogItemList->head->next; + free(xlogItem); + xlogItem = 
xlogItemList->head; } - npages++; - - /* - * Dump the set if this will be the last loop iteration, or if we are - * at the last page of the cache area (since the next page won't be - * contiguous in memory), or if we are at the end of the logfile - * segment. - */ - last_iteration = WriteRqst.Write <= LogwrtResult.Write; - - finishing_seg = !ispartialpage && - (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size; - - if (last_iteration || - curridx == XLogCtl->XLogCacheBlck || - finishing_seg) - { - char *from; - Size nbytes; - Size nleft; - int written; - instr_time start; - - /* OK to write the page(s) */ - from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; - nbytes = npages * (Size) XLOG_BLCKSZ; - nleft = nbytes; - do - { - errno = 0; - - /* Measure I/O timing to write WAL data */ - if (track_wal_io_timing) - INSTR_TIME_SET_CURRENT(start); - - pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); - written = pg_pwrite(openLogFile, from, nleft, startoffset); - pgstat_report_wait_end(); - - /* - * Increment the I/O timing and the number of times WAL data - * were written out to disk. - */ - if (track_wal_io_timing) - { - instr_time duration; - - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration); - } - - WalStats.m_wal_write++; - - if (written <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - if (errno == EINTR) - continue; - - save_errno = errno; - XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, - wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log file %s " - "at offset %u, length %zu: %m", - xlogfname, startoffset, nleft))); - } - nleft -= written; - from += written; - startoffset += written; - } while (nleft > 0); - - npages = 0; - - /* - * If we just wrote the whole last page of a logfile segment, - * fsync the segment immediately. 
This avoids having to go back - * and re-open prior segments when an fsync request comes along - * later. Doing it here ensures that one and only one backend will - * perform this fsync. - * - * This is also the right place to notify the Archiver that the - * segment is ready to copy to archival storage, and to update the - * timer for archive_timeout, and to signal for a checkpoint if - * too many logfile segments have been used since the last - * checkpoint. - */ - if (finishing_seg) - { - issue_xlog_fsync(openLogFile, openLogSegNo); - - /* signal that we need to wakeup walsenders later */ - WalSndWakeupRequest(); - - LogwrtResult.Flush = LogwrtResult.Write; /* end of page */ - - if (XLogArchivingActive()) - XLogArchiveNotifySeg(openLogSegNo); - - XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); - XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush; - - /* - * Request a checkpoint if we've consumed too much xlog since - * the last one. For speed, we first check using the local - * copy of RedoRecPtr, which might be out of date; if it looks - * like a checkpoint is needed, forcibly update RedoRecPtr and - * recheck. 
- */ - if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo)) - { - (void) GetRedoRecPtr(); - if (XLogCheckpointNeeded(openLogSegNo)) - RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); - } - } - } - - if (ispartialpage) - { - /* Only asked to write a partial page */ - LogwrtResult.Write = WriteRqst.Write; - break; - } - curridx = NextBufIdx(curridx); - - /* If flexible, break out of loop as soon as we wrote something */ - if (flexible && npages == 0) - break; + free(xlogItem); } - Assert(npages == 0); + xlogItemList->tail = xlogItemList->head = NULL; + +} +// static void +// He3DBXLogWrite(XLogwrtRqst WriteRqst, bool flexible) +// { +// LogwrtResult = XLogCtl->LogwrtResult; +// char *from; +// uint8 part[4]; +// int count; +// int stp; +// int xlogLength; + +// from = XLogCtl->pages + LogwrtResult.Write % ((XLOGbuffers-1) * XLOG_BLCKSZ); +// count = LogwrtResult.Write; +// // printf("invoke xlog write, upto %ld\n", WriteRqst.Write); +// while (count < WriteRqst.Write) +// { +// //TODO splite the xlog_buffer and construct k-v pairs +// if (count < WriteRqst.Write) +// { +// if (xlogItemList == NULL) +// { +// xlogItemList = (XLogItemList *)malloc(sizeof(XLogItemList)); +// xlogItemList->head = NULL; +// xlogItemList->tail = NULL; +// } +// for(stp = 0; stp < 4; stp ++) +// { +// part[stp] = *(from + stp); +// } +// xlogLength = GetXlogLength(part,4); +// XLogItem *xlogItem = (XLogItem *)malloc(sizeof(XLogItem)); + +// (xlogItem->xlogKey).lsn = count; +// xlogItem->begin = from; + +// if ((count + xlogLength)/((XLOGbuffers-1) * XLOG_BLCKSZ) > count / ((XLOGbuffers-1) * XLOG_BLCKSZ)) +// { +// from = XLogCtl->pages + ((count + xlogLength)%((XLOGbuffers-1) * XLOG_BLCKSZ)); +// } +// else +// { +// from = from + xlogLength; +// } - /* - * If asked to flush, do so - */ - if (LogwrtResult.Flush < WriteRqst.Flush && - LogwrtResult.Flush < LogwrtResult.Write) +// xlogItem->length = xlogLength; +// xlogItem->next = NULL; +// if (xlogItemList->head == NULL) +// { +// 
xlogItemList->head = xlogItem; +// xlogItemList->tail = xlogItemList->head; +// } +// else +// { +// xlogItemList->tail->next = xlogItem; +// xlogItemList->tail = xlogItemList->tail->next; +// } +// count += xlogLength; +// } +// } - { - /* - * Could get here without iterating above loop, in which case we might - * have no open file or the wrong one. However, we do not need to - * fsync more than one file. - */ - if (sync_method != SYNC_METHOD_OPEN && - sync_method != SYNC_METHOD_OPEN_DSYNC) - { - if (openLogFile >= 0 && - !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size)) - XLogFileClose(); - if (openLogFile < 0) - { - XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size); - openLogFile = XLogFileOpen(openLogSegNo); - ReserveExternalFD(); - } +// //TODO flush into queue +// if (xlogItemList != NULL) +// { +// if (xlogItemList->head != NULL) +// { +// kvwrite(xlogItemList->head); +// freeItemList(xlogItemList); +// WalStats.m_wal_write++; +// LogwrtResult.Write = WriteRqst.Write; +// } +// free(xlogItemList); +// xlogItemList = NULL; +// } + - issue_xlog_fsync(openLogFile, openLogSegNo); - } +// if (LogwrtResult.Flush < WriteRqst.Flush && +// LogwrtResult.Flush < LogwrtResult.Write) +// { +// //TODO signal flush +// kvflush(LogwrtResult.Write); +// LogwrtResult.Flush = LogwrtResult.Write; +// } - /* signal that we need to wakeup walsenders later */ - WalSndWakeupRequest(); +// /* +// * Update shared-memory status +// */ +// { +// SpinLockAcquire(&XLogCtl->info_lck); +// XLogCtl->LogwrtResult = LogwrtResult; +// if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write) +// XLogCtl->LogwrtRqst.Write = LogwrtResult.Write; +// if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush) +// XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush; +// XLogCtl->globalUpto = 0; +// SpinLockRelease(&XLogCtl->info_lck); +// } +// } - LogwrtResult.Flush = LogwrtResult.Write; - } +static void +FlushWal(XLogwrtRqst WriteRqst) +{ + char *from = NULL; + // uint8 
part[4]; + uint64 count; + // int stp; + int xlogLength; + uint64 curLoc = 0; + bool mustDo = false; + bool nowrite = false; + + /* + * Update shared-memory status + */ +mustflush: + SpinLockAcquire(&XLogCtl->info_lck); + if(WriteRqst.Write - XLogCtl->LogwrtResult.Write < 8192 && (!mustDo)) + { + // elog(LOG,"=+= first time,goto while{},WriteRqst.Write is %llu,XLogCtl->LogwrtResult.Write is %llu",WriteRqst.Write,XLogCtl->LogwrtResult.Write); + SpinLockRelease(&XLogCtl->info_lck); + } + else + { + LogwrtResult = XLogCtl->LogwrtResult; + if (XLogCtl->LogwrtResult.Write < WriteRqst.Write) + { + XLogCtl->LogwrtResult.Write = WriteRqst.Write; + curLoc = XLogCtl->LogFlush.last; + XLogCtl->LogFlush.last += 1; + // printf("curLoc %d, WriteRqst.Write %ld, LogwrtResult.Write %ld\n", curLoc, WriteRqst.Write, LogwrtResult.Write); + } + // elog(LOG,"=+= second time,need to push,WriteRqst.Write is %llu,XLogCtl->LogwrtResult.Write is %llu",WriteRqst.Write,XLogCtl->LogwrtResult.Write); + + SpinLockRelease(&XLogCtl->info_lck); + + + from = XLogCtl->pages + LogwrtResult.Write % ((XLOGbuffers-1) * XLOG_BLCKSZ); + count = LogwrtResult.Write; + // printf("write request %ld, result %ld; flush request %ld, result %ld\n", WriteRqst.Write, count, + // WriteRqst.Flush, LogwrtResult.Flush); + + XLogRecord *record; + while (count < WriteRqst.Write) + { + if (count < WriteRqst.Write) + { + if (xlogItemList == NULL) + { + xlogItemList = (XLogItemList *)malloc(sizeof(XLogItemList)); + xlogItemList->head = NULL; + xlogItemList->tail = NULL; + } + + // while ((*(from+0)|*(from+1)|*(from+2)|*(from+3)) == 0) +// while(*(from+25) == 0 ) + // { + // usleep(1); + // continue; + // } + // for(stp = 0; stp < 4; stp ++) + // { + // part[stp] = *(from + stp); + // } + // xlogLength = GetXlogLength(part,4); + + record = (XLogRecord *) from; + while (record->xl_crc == 0 || !He3DBValidXLogRecord(record)) + { + if (record->xl_crc != 0) + printf("crc not match: xl_crc %ld, pid %d\n ", record->xl_crc, 
(int)getpid()); + usleep(1); + } + xlogLength = record->xl_tot_len; + +// if (*(from + 25) == 2) +// { +// while(*(from + xlogLength - 1) == 0) +// { +// usleep(1); +// continue; +// } +// } + XLogItem *xlogItem = (XLogItem *)malloc(sizeof(XLogItem)); + + (xlogItem->xlogKey).lsn = count; + xlogItem->begin = from; + + if ((count + xlogLength)/((XLOGbuffers-1) * XLOG_BLCKSZ) > count / ((XLOGbuffers-1) * XLOG_BLCKSZ)) + { + from = XLogCtl->pages + ((count + xlogLength)%((XLOGbuffers-1) * XLOG_BLCKSZ)); + } + else + { + from = from + xlogLength; + } + + xlogItem->length = xlogLength; + xlogItem->next = NULL; + if (xlogItemList->head == NULL) + { + xlogItemList->head = xlogItem; + xlogItemList->tail = xlogItemList->head; + } + else + { + xlogItemList->tail->next = xlogItem; + xlogItemList->tail = xlogItemList->tail->next; + } + count += xlogLength; + } + } + + if (xlogItemList != NULL) + { + // printf("xlogItemList not null, WriteRqst.Write %ld, curLoc %d\n", WriteRqst.Write, curLoc); + if (xlogItemList->head != NULL) + { + // struct timeval tv; + // long timestp,timenow; + // gettimeofday(&tv,NULL); + // timestp =tv.tv_sec*1000 + tv.tv_usec/1000; + + while (flushwals(xlogItemList->head, XLogCtl->ThisTimeLineID) == 0) + { + printf("flush wal failed, retry!"); + pg_usleep(1000L); + } + // gettimeofday(&tv,NULL); + // timenow = tv.tv_sec*1000 + tv.tv_usec/1000; + // printf("flushwals time is %ld\n",timenow - timestp); + freeItemList(xlogItemList); + WalStats.m_wal_write++; + // LogwrtResult.Write = WriteRqst.Write; + } + free(xlogItemList); + xlogItemList = NULL; + + /* + * Update shared-memory status + */ + { + // struct timeval tv; + // long timestp; + // uint64 oldflush; + // long unitflush; + int bRelativeOffset = 0; + int eRelativeOffset = 0; + // SpinLockAcquire(&XLogCtl->info_lck); + // XLogParralFlush flushInfo = XLogCtl->LogFlush; + // SpinLockRelease(&XLogCtl->info_lck); + + // printf("end flush wals, begin %d, curLoc %d, WriteRqst.Write %ld\n", flushInfo.begin, 
curLoc, WriteRqst.Write); + while (pg_atomic_read_u64(&XLogCtl->LogFlush.begin) < curLoc) + { + pg_usleep(20L); + + } + + /* + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->LogFlush.count++; + XLogParralFlush flushInfo = XLogCtl->LogFlush; + uint32 diff = curLoc - flushInfo.begin + 1 - flushInfo.count + flushInfo.diff; + if (diff == 0) + { + XLogCtl->LogFlush.begin += flushInfo.count; + XLogCtl->oldflush = XLogCtl->LogwrtResult.Flush; + XLogCtl->LogwrtResult.Flush = WriteRqst.Write; + XLogCtl->LogFlush.count = XLogCtl->LogFlush.diff = 0; + bRelativeOffset = XLogCtl->oldflush % ((XLOGbuffers-1) * XLOG_BLCKSZ); + eRelativeOffset = XLogCtl->LogwrtResult.Flush % ((XLOGbuffers-1) * XLOG_BLCKSZ); + if (bRelativeOffset <= eRelativeOffset) + { + MemSet((char *)(XLogCtl->pages + bRelativeOffset), 0, eRelativeOffset - bRelativeOffset); } + else + { + MemSet((char *)(XLogCtl->pages + bRelativeOffset), 0, (XLOGbuffers * XLOG_BLCKSZ) - bRelativeOffset); + MemSet((char *)XLogCtl->pages, 0, eRelativeOffset); + } + SpinLockRelease(&XLogCtl->info_lck); + return; + } else + XLogCtl->LogFlush.diff = diff; + */ + pg_atomic_write_u64(&XLogCtl->LogFlush.begin, curLoc+1); + // SpinLockAcquire(&XLogCtl->info_lck); + + // XLogCtl->LogFlush.begin = curLoc+1; + // XLogCtl->oldflush = XLogCtl->LogwrtResult.Flush; + // SpinLockRelease(&XLogCtl->info_lck); + bRelativeOffset = LogwrtResult.Write % ((XLOGbuffers-1) * XLOG_BLCKSZ); + eRelativeOffset = WriteRqst.Write % ((XLOGbuffers-1) * XLOG_BLCKSZ); + + if (bRelativeOffset <= eRelativeOffset) + { + MemSet((char *)(XLogCtl->pages + bRelativeOffset), 0, eRelativeOffset - bRelativeOffset); } + else + { + MemSet((char *)(XLogCtl->pages + bRelativeOffset), 0, (XLOGbuffers * XLOG_BLCKSZ) - bRelativeOffset); + MemSet((char *)XLogCtl->pages, 0, eRelativeOffset); + } + + // gettimeofday(&tv,NULL); + // timestp =tv.tv_sec*1000 + tv.tv_usec/1000; + // if(XLogCtl->timestp == 0) + // { + // XLogCtl->timestp=timestp; + // XLogCtl->ol = 0; + // }else 
if(timestp - XLogCtl->timestp>=1000){ + // unitflush=(XLogCtl->LogwrtResult.Flush - XLogCtl->ol)*1000/(timestp-XLogCtl->timestp); + // printf("---unitflush:%llu---\n---now:%lld---\n---last time:%lld---\n---flush:%llu---\n---last flush:%llu---\n",unitflush,timestp/1000,XLogCtl->timestp/1000,XLogCtl->LogwrtResult.Flush,XLogCtl->ol); + // XLogCtl->timestp = timestp; + // XLogCtl->ol = XLogCtl->LogwrtResult.Flush; + + // } + SpinLockAcquire(&XLogCtl->info_lck); + XLogRecPtr flush = pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush)); + if (flush < WriteRqst.Write) + pg_atomic_write_u64(&(XLogCtl->LogwrtResult.Flush), WriteRqst.Write); + SpinLockRelease(&XLogCtl->info_lck); + } + //return; + } + } + // SpinLockAcquire(&XLogCtl->info_lck); + // XLogRecPtr flush = XLogCtl->LogwrtResult.Flush; + // SpinLockRelease(&XLogCtl->info_lck); + XLogRecPtr flush = (XLogRecPtr) pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush)); + long waitTime = 0; + while (flush < WriteRqst.Flush) + { + pg_usleep(200L); + waitTime += 200; + /* + SpinLockAcquire(&XLogCtl->info_lck); + if (!nowrite && curLoc == (XLogCtl->LogFlush.begin -1)) + { + elog(LOG,"=+= never write,curLoc is %u,XLogCtl->LogFlush.begin is %u\n",curLoc,XLogCtl->LogFlush.begin); + XLogCtl->oldflush = XLogCtl->LogwrtResult.Flush; + XLogCtl->LogwrtResult.Flush = WriteRqst.Write; + + int bRelativeOffset = 0; + int eRelativeOffset = 0; + bRelativeOffset = XLogCtl->oldflush % ((XLOGbuffers-1) * XLOG_BLCKSZ); + eRelativeOffset = XLogCtl->LogwrtResult.Flush % ((XLOGbuffers-1) * XLOG_BLCKSZ); + if (bRelativeOffset <= eRelativeOffset) + { + MemSet((char *)(XLogCtl->pages + bRelativeOffset), 0, eRelativeOffset - bRelativeOffset); } + else + { + MemSet((char *)(XLogCtl->pages + bRelativeOffset), 0, (XLOGbuffers * XLOG_BLCKSZ) - bRelativeOffset); + MemSet((char *)XLogCtl->pages, 0, eRelativeOffset); + } + SpinLockRelease(&XLogCtl->info_lck); + break; + } + flush = XLogCtl->LogwrtResult.Flush; + ipinLockqst.FlushXLogCtl->info_lck); + */ + + 
if(waitTime > 2000L){ + if(flush >= WriteRqst.Flush) + { + break; + } + mustDo = true; + goto mustflush; + } + + // SpinLockAcquire(&XLogCtl->info_lck); +// if (curLoc == (XLogCtl->LogFlush.begin)) +// elog(PANIC, "should be in here, WriteRqst.Write %ld", WriteRqst.Write); + // flush = XLogCtl->LogwrtResult.Flush; + // SpinLockRelease(&XLogCtl->info_lck); + flush = (XLogRecPtr) pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush)); + } + + +} + +static void +He3DBXLogFakeWrite(XLogwrtRqst WriteRqst) +{ + // SpinLockAcquire(&XLogCtl->info_lck); + // LogwrtResult = XLogCtl->LogwrtResult; + // SpinLockRelease(&XLogCtl->info_lck); + + // printf("invoke xlog write, upto %ld\n", WriteRqst.Write); + if ((LogwrtResult.Write <= WriteRqst.Write - (XLOGbuffers-3) * XLOG_BLCKSZ) || + ((XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush) < WriteRqst.Flush)) + { + // printf("invoke xlogwrite result write %ld, flush %ld, request write %d, flush %ld\n", + // LogwrtResult.Write, LogwrtResult.Flush, WriteRqst.Write, WriteRqst.Flush); + FlushWal(WriteRqst); + LogwrtResult.Write = WriteRqst.Write; + pg_atomic_write_u64(&LogwrtResult.Flush, WriteRqst.Write); + } + /* * Update shared-memory status - * - * We make sure that the shared 'request' values do not fall behind the - * 'result' values. This is not absolutely essential, but it saves some - * code in a couple of places. 
*/ { + XLogRecPtr flushed = (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush); SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->LogwrtResult = LogwrtResult; + // XLogCtl->LogwrtResult = LogwrtResult; if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write) XLogCtl->LogwrtRqst.Write = LogwrtResult.Write; - if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush) - XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush; + if (XLogCtl->LogwrtRqst.Flush < flushed) + XLogCtl->LogwrtRqst.Flush = flushed; + XLogCtl->globalUpto = 0; SpinLockRelease(&XLogCtl->info_lck); } } @@ -2764,11 +3771,10 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) * and nudge the WALWriter if there is work for it to do. * (This should not be called for synchronous commits.) */ -void -XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) +void XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) { - XLogRecPtr WriteRqstPtr = asyncXactLSN; - bool sleeping; + XLogRecPtr WriteRqstPtr = asyncXactLSN; + bool sleeping; SpinLockAcquire(&XLogCtl->info_lck); LogwrtResult = XLogCtl->LogwrtResult; @@ -2785,10 +3791,10 @@ XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) if (!sleeping) { /* back off to last completed page boundary */ - WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ; +// WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ; /* if we have already flushed that far, we're done */ - if (WriteRqstPtr <= LogwrtResult.Flush) + if (WriteRqstPtr <= (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush)) return; } @@ -2805,15 +3811,13 @@ XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) * Record the LSN up to which we can remove WAL because it's not required by * any replication slot. */ -void -XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn) +void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn) { SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->replicationSlotMinLSN = lsn; SpinLockRelease(&XLogCtl->info_lck); } - /* * Return the oldest LSN we must retain to satisfy the needs of some * replication slot. 
@@ -2821,7 +3825,7 @@ XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn) static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void) { - XLogRecPtr retval; + XLogRecPtr retval; SpinLockAcquire(&XLogCtl->info_lck); retval = XLogCtl->replicationSlotMinLSN; @@ -2833,8 +3837,8 @@ XLogGetReplicationSlotMinimumLSN(void) void FlushNewRecoveryPoint(XLogRecPtr lsn) { LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - memcpy(&ControlFile->checkPointCopy,&GlobalCheckPoint,sizeof(CheckPoint)); - ControlFile->time = (pg_time_t) time(NULL); + memcpy(&ControlFile->checkPointCopy, &GlobalCheckPoint, sizeof(CheckPoint)); + ControlFile->time = (pg_time_t)time(NULL); ControlFile->state = GlobalState; ControlFile->checkPoint = lsn; PushUpdateControlFile(); @@ -2884,8 +3888,8 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) updateMinRecoveryPoint = false; else if (force || minRecoveryPoint < lsn) { - XLogRecPtr newMinRecoveryPoint; - TimeLineID newMinRecoveryPointTLI; + XLogRecPtr newMinRecoveryPoint; + TimeLineID newMinRecoveryPointTLI; /* * To avoid having to update the control file too often, we update it @@ -2903,13 +3907,14 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) SpinLockAcquire(&XLogCtl->info_lck); newMinRecoveryPoint = XLogCtl->replayEndRecPtr; newMinRecoveryPointTLI = XLogCtl->replayEndTLI; + SpinLockRelease(&XLogCtl->info_lck); if (!force && newMinRecoveryPoint < lsn) elog(WARNING, "xlog min recovery request %X/%X is past current point %X/%X", LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint)); - + /* update control file */ if (ControlFile->minRecoveryPoint < newMinRecoveryPoint) { @@ -2928,16 +3933,187 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force) LWLockRelease(ControlFileLock); } + +void +XLogFlush(XLogRecPtr record) +{ + He3DBXLogFlush(record); +} /* * Ensure that all XLOG data through the given position is flushed to disk. 
* * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not * already held, and we try to avoid acquiring it if possible. */ +// void +// XLogFlush(XLogRecPtr record) +// { +// XLogRecPtr WriteRqstPtr; +// XLogwrtRqst WriteRqst; + +// /* +// * During REDO, we are reading not writing WAL. Therefore, instead of +// * trying to flush the WAL, we should update minRecoveryPoint instead. We +// * test XLogInsertAllowed(), not InRecovery, because we need checkpointer +// * to act this way too, and because when it tries to write the +// * end-of-recovery checkpoint, it should indeed flush. +// */ +// if (!XLogInsertAllowed()) +// { +// UpdateMinRecoveryPoint(record, false); +// return; +// } + +// /* Quick exit if already known flushed */ +// if (record <= LogwrtResult.Flush) +// return; + +// #ifdef WAL_DEBUG +// if (XLOG_DEBUG) +// elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X", +// LSN_FORMAT_ARGS(record), +// LSN_FORMAT_ARGS(LogwrtResult.Write), +// LSN_FORMAT_ARGS(LogwrtResult.Flush)); +// #endif + +// START_CRIT_SECTION(); + +// /* +// * Since fsync is usually a horribly expensive operation, we try to +// * piggyback as much data as we can on each fsync: if we see any more data +// * entered into the xlog buffer, we'll write and fsync that too, so that +// * the final value of LogwrtResult.Flush is as large as possible. This +// * gives us some chance of avoiding another fsync immediately after. +// */ + +// /* initialize to given target; may increase below */ +// WriteRqstPtr = record; + +// /* +// * Now wait until we get the write lock, or someone else does the flush +// * for us. +// */ +// for (;;) +// { +// XLogRecPtr insertpos; + +// /* read LogwrtResult and update local state */ +// SpinLockAcquire(&XLogCtl->info_lck); +// if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write) +// WriteRqstPtr = XLogCtl->LogwrtRqst.Write; +// LogwrtResult = XLogCtl->LogwrtResult; +// SpinLockRelease(&XLogCtl->info_lck); + +// /* done already? 
*/ +// if (record <= LogwrtResult.Flush) +// break; + +// /* +// * Before actually performing the write, wait for all in-flight +// * insertions to the pages we're about to write to finish. +// */ +// insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr); + +// /* +// * Try to get the write lock. If we can't get it immediately, wait +// * until it's released, and recheck if we still need to do the flush +// * or if the backend that held the lock did it for us already. This +// * helps to maintain a good rate of group committing when the system +// * is bottlenecked by the speed of fsyncing. +// */ +// if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) +// { +// /* +// * The lock is now free, but we didn't acquire it yet. Before we +// * do, loop back to check if someone else flushed the record for +// * us already. +// */ +// continue; +// } + +// /* Got the lock; recheck whether request is satisfied */ +// LogwrtResult = XLogCtl->LogwrtResult; +// if (record <= LogwrtResult.Flush) +// { +// LWLockRelease(WALWriteLock); +// break; +// } + +// /* +// * Sleep before flush! By adding a delay here, we may give further +// * backends the opportunity to join the backlog of group commit +// * followers; this can significantly improve transaction throughput, +// * at the risk of increasing transaction latency. +// * +// * We do not sleep if enableFsync is not turned on, nor if there are +// * fewer than CommitSiblings other backends with active transactions. +// */ +// if (CommitDelay > 0 && enableFsync && +// MinimumActiveBackends(CommitSiblings)) +// { +// pg_usleep(CommitDelay); + +// /* +// * Re-check how far we can now flush the WAL. It's generally not +// * safe to call WaitXLogInsertionsToFinish while holding +// * WALWriteLock, because an in-progress insertion might need to +// * also grab WALWriteLock to make progress. 
But we know that all +// * the insertions up to insertpos have already finished, because +// * that's what the earlier WaitXLogInsertionsToFinish() returned. +// * We're only calling it again to allow insertpos to be moved +// * further forward, not to actually wait for anyone. +// */ +// insertpos = WaitXLogInsertionsToFinish(insertpos); +// } + +// /* try to write/flush later additions to XLOG as well */ +// WriteRqst.Write = insertpos; +// WriteRqst.Flush = insertpos; + +// XLogWrite(WriteRqst, false); + +// LWLockRelease(WALWriteLock); +// /* done */ +// break; +// } + +// END_CRIT_SECTION(); + +// /* wake up walsenders now that we've released heavily contended locks */ +// WalSndWakeupProcessRequests(); + +// /* +// * If we still haven't flushed to the request point then we have a +// * problem; most likely, the requested flush point is past end of XLOG. +// * This has been seen to occur when a disk page has a corrupted LSN. +// * +// * Formerly we treated this as a PANIC condition, but that hurts the +// * system's robustness rather than helping it: we do not want to take down +// * the whole system due to corruption on one data page. In particular, if +// * the bad page is encountered again during recovery then we would be +// * unable to restart the database at all! (This scenario actually +// * happened in the field several times with 7.1 releases.) As of 8.4, bad +// * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem; +// * the only time we can reach here during recovery is while flushing the +// * end-of-recovery checkpoint record, and we don't expect that to have a +// * bad LSN. +// * +// * Note that for calls from xact.c, the ERROR will be promoted to PANIC +// * since xact.c calls this routine inside a critical section. However, +// * calls from bufmgr.c are not within critical sections and so we will not +// * force a restart for a bad LSN on a data page. 
+// */ +// if (LogwrtResult.Flush < record) +// elog(ERROR, +// "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", +// LSN_FORMAT_ARGS(record), +// LSN_FORMAT_ARGS(LogwrtResult.Flush)); +// } + void -XLogFlush(XLogRecPtr record) +He3DBXLogFlush(XLogRecPtr record) { - XLogRecPtr WriteRqstPtr; + XLogRecPtr WriteRqstPtr; XLogwrtRqst WriteRqst; /* @@ -2954,7 +4130,7 @@ XLogFlush(XLogRecPtr record) } /* Quick exit if already known flushed */ - if (record <= LogwrtResult.Flush) + if (record <= (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush)) return; #ifdef WAL_DEBUG @@ -2984,7 +4160,7 @@ XLogFlush(XLogRecPtr record) */ for (;;) { - XLogRecPtr insertpos; + XLogRecPtr insertpos; /* read LogwrtResult and update local state */ SpinLockAcquire(&XLogCtl->info_lck); @@ -2994,14 +4170,14 @@ XLogFlush(XLogRecPtr record) SpinLockRelease(&XLogCtl->info_lck); /* done already? */ - if (record <= LogwrtResult.Flush) + if (record <= (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush)) break; /* * Before actually performing the write, wait for all in-flight * insertions to the pages we're about to write to finish. */ - insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr); + insertpos = He3DBWaitXLogInsertionsToFinish(WriteRqstPtr); /* * Try to get the write lock. If we can't get it immediately, wait @@ -3010,21 +4186,23 @@ XLogFlush(XLogRecPtr record) * helps to maintain a good rate of group committing when the system * is bottlenecked by the speed of fsyncing. */ - if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) - { - /* - * The lock is now free, but we didn't acquire it yet. Before we - * do, loop back to check if someone else flushed the record for - * us already. - */ - continue; - } + // if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) + // { + // /* + // * The lock is now free, but we didn't acquire it yet. Before we + // * do, loop back to check if someone else flushed the record for + // * us already. 
+ // */ + // continue; + // } /* Got the lock; recheck whether request is satisfied */ - LogwrtResult = XLogCtl->LogwrtResult; - if (record <= LogwrtResult.Flush) + // SpinLockAcquire(&XLogCtl->info_lck); + // LogwrtResult = XLogCtl->LogwrtResult; + // SpinLockRelease(&XLogCtl->info_lck); + if (record <= (XLogRecPtr) pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush))) { - LWLockRelease(WALWriteLock); + // LWLockRelease(WALWriteLock); break; } @@ -3059,9 +4237,9 @@ XLogFlush(XLogRecPtr record) WriteRqst.Write = insertpos; WriteRqst.Flush = insertpos; - XLogWrite(WriteRqst, false); + He3DBXLogFakeWrite(WriteRqst); - LWLockRelease(WALWriteLock); + // LWLockRelease(WALWriteLock); /* done */ break; } @@ -3092,13 +4270,19 @@ XLogFlush(XLogRecPtr record) * calls from bufmgr.c are not within critical sections and so we will not * force a restart for a bad LSN on a data page. */ - if (LogwrtResult.Flush < record) + XLogRecPtr flushed = (XLogRecPtr) pg_atomic_read_u64(&(XLogCtl->LogwrtResult.Flush)); + if ( flushed < record) elog(ERROR, "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X", LSN_FORMAT_ARGS(record), - LSN_FORMAT_ARGS(LogwrtResult.Flush)); + LSN_FORMAT_ARGS(flushed)); } +bool +XLogBackgroundFlush(void) +{ + return He3DBXLogBackgroundFlush(); +} /* * Write & flush xlog, but without specifying exactly where to. * @@ -3123,14 +4307,140 @@ XLogFlush(XLogRecPtr record) * Returns true if there was any work to do, even if we skipped flushing due * to wal_writer_delay/wal_writer_flush_after. 
*/ +// bool +// XLogBackgroundFlush(void) +// { +// XLogwrtRqst WriteRqst; +// bool flexible = true; +// static TimestampTz lastflush; +// TimestampTz now; +// int flushbytes; + +// /* XLOG doesn't need flushing during recovery */ +// if (RecoveryInProgress()) +// return false; + +// /* read LogwrtResult and update local state */ +// SpinLockAcquire(&XLogCtl->info_lck); +// LogwrtResult = XLogCtl->LogwrtResult; +// WriteRqst = XLogCtl->LogwrtRqst; +// SpinLockRelease(&XLogCtl->info_lck); + +// /* back off to last completed page boundary */ +// WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ; + +// /* if we have already flushed that far, consider async commit records */ +// if (WriteRqst.Write <= LogwrtResult.Flush) +// { +// SpinLockAcquire(&XLogCtl->info_lck); +// WriteRqst.Write = XLogCtl->asyncXactLSN; +// SpinLockRelease(&XLogCtl->info_lck); +// flexible = false; /* ensure it all gets written */ +// } + +// /* +// * If already known flushed, we're done. Just need to check if we are +// * holding an open file handle to a logfile that's no longer in use, +// * preventing the file from being deleted. +// */ +// if (WriteRqst.Write <= LogwrtResult.Flush) +// { +// if (openLogFile >= 0) +// { +// if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, +// wal_segment_size)) +// { +// XLogFileClose(); +// } +// } +// return false; +// } + +// /* +// * Determine how far to flush WAL, based on the wal_writer_delay and +// * wal_writer_flush_after GUCs. +// */ +// now = GetCurrentTimestamp(); +// flushbytes = +// WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ; + +// if (WalWriterFlushAfter == 0 || lastflush == 0) +// { +// /* first call, or block based limits disabled */ +// WriteRqst.Flush = WriteRqst.Write; +// lastflush = now; +// } +// else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay)) +// { +// /* +// * Flush the writes at least every WalWriterDelay ms. 
This is +// * important to bound the amount of time it takes for an asynchronous +// * commit to hit disk. +// */ +// WriteRqst.Flush = WriteRqst.Write; +// lastflush = now; +// } +// else if (flushbytes >= WalWriterFlushAfter) +// { +// /* exceeded wal_writer_flush_after blocks, flush */ +// WriteRqst.Flush = WriteRqst.Write; +// lastflush = now; +// } +// else +// { +// /* no flushing, this time round */ +// WriteRqst.Flush = 0; +// } + +// #ifdef WAL_DEBUG +// if (XLOG_DEBUG) +// elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X", +// LSN_FORMAT_ARGS(WriteRqst.Write), +// LSN_FORMAT_ARGS(WriteRqst.Flush), +// LSN_FORMAT_ARGS(LogwrtResult.Write), +// LSN_FORMAT_ARGS(LogwrtResult.Flush)); +// #endif + +// START_CRIT_SECTION(); + +// /* now wait for any in-progress insertions to finish and get write lock */ +// WaitXLogInsertionsToFinish(WriteRqst.Write); +// LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); +// LogwrtResult = XLogCtl->LogwrtResult; +// if (WriteRqst.Write > LogwrtResult.Write || +// WriteRqst.Flush > LogwrtResult.Flush) +// { +// XLogWrite(WriteRqst, flexible); +// } +// LWLockRelease(WALWriteLock); + +// END_CRIT_SECTION(); + +// /* wake up walsenders now that we've released heavily contended locks */ +// WalSndWakeupProcessRequests(); + +// /* +// * Great, done. To take some work off the critical path, try to initialize +// * as many of the no-longer-needed WAL buffers for future use as we can. +// */ +// AdvanceXLInsertBuffer(InvalidXLogRecPtr, true); + +// /* +// * If we determined that we need to write data, but somebody else +// * wrote/flushed already, it should be considered as being active, to +// * avoid hibernating too early. 
+// */ +// return true; +// } + bool -XLogBackgroundFlush(void) +He3DBXLogBackgroundFlush(void) { XLogwrtRqst WriteRqst; - bool flexible = true; + bool flexible = true; static TimestampTz lastflush; TimestampTz now; - int flushbytes; + int flushbytes; /* XLOG doesn't need flushing during recovery */ if (RecoveryInProgress()) @@ -3142,33 +4452,18 @@ XLogBackgroundFlush(void) WriteRqst = XLogCtl->LogwrtRqst; SpinLockRelease(&XLogCtl->info_lck); - /* back off to last completed page boundary */ - WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ; - - /* if we have already flushed that far, consider async commit records */ - if (WriteRqst.Write <= LogwrtResult.Flush) - { - SpinLockAcquire(&XLogCtl->info_lck); - WriteRqst.Write = XLogCtl->asyncXactLSN; - SpinLockRelease(&XLogCtl->info_lck); - flexible = false; /* ensure it all gets written */ - } - /* * If already known flushed, we're done. Just need to check if we are * holding an open file handle to a logfile that's no longer in use, * preventing the file from being deleted. */ - if (WriteRqst.Write <= LogwrtResult.Flush) + + SpinLockAcquire(&XLogCtl->info_lck); + WriteRqst.Write = WriteRqst.Write >= XLogCtl->asyncXactLSN ? WriteRqst.Write:XLogCtl->asyncXactLSN; + SpinLockRelease(&XLogCtl->info_lck); + + if (WriteRqst.Write <= (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush)) { - if (openLogFile >= 0) - { - if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size)) - { - XLogFileClose(); - } - } return false; } @@ -3177,8 +4472,8 @@ XLogBackgroundFlush(void) * wal_writer_flush_after GUCs. 
*/ now = GetCurrentTimestamp(); - flushbytes = - WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ; + + flushbytes = WriteRqst.Write - (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush); if (WalWriterFlushAfter == 0 || lastflush == 0) { @@ -3196,7 +4491,7 @@ XLogBackgroundFlush(void) WriteRqst.Flush = WriteRqst.Write; lastflush = now; } - else if (flushbytes >= WalWriterFlushAfter) + else if (flushbytes >= flushFlag) { /* exceeded wal_writer_flush_after blocks, flush */ WriteRqst.Flush = WriteRqst.Write; @@ -3220,15 +4515,17 @@ XLogBackgroundFlush(void) START_CRIT_SECTION(); /* now wait for any in-progress insertions to finish and get write lock */ - WaitXLogInsertionsToFinish(WriteRqst.Write); - LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + He3DBWaitXLogInsertionsToFinish(WriteRqst.Write); + // LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); + SpinLockAcquire(&XLogCtl->info_lck); LogwrtResult = XLogCtl->LogwrtResult; + SpinLockRelease(&XLogCtl->info_lck); if (WriteRqst.Write > LogwrtResult.Write || - WriteRqst.Flush > LogwrtResult.Flush) + WriteRqst.Flush > (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush)) { - XLogWrite(WriteRqst, flexible); + He3DBXLogFakeWrite(WriteRqst); } - LWLockRelease(WALWriteLock); + // LWLockRelease(WALWriteLock); END_CRIT_SECTION(); @@ -3239,7 +4536,7 @@ XLogBackgroundFlush(void) * Great, done. To take some work off the critical path, try to initialize * as many of the no-longer-needed WAL buffers for future use as we can. */ - AdvanceXLInsertBuffer(InvalidXLogRecPtr, true); +// AdvanceXLInsertBuffer(InvalidXLogRecPtr, true); /* * If we determined that we need to write data, but somebody else @@ -3248,15 +4545,13 @@ XLogBackgroundFlush(void) */ return true; } - /* * Test whether XLOG data has been flushed up to (at least) the given position. * * Returns true if a flush is still needed. (It may be that someone else * is already in process of flushing that far, however.) 
*/ -bool -XLogNeedsFlush(XLogRecPtr record) +bool XLogNeedsFlush(XLogRecPtr record) { /* * During recovery, we don't flush WAL but update minRecoveryPoint @@ -3306,7 +4601,7 @@ XLogNeedsFlush(XLogRecPtr record) } /* Quick exit if already known flushed */ - if (record <= LogwrtResult.Flush) + if (record <= (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush)) return false; /* read LogwrtResult and update local state */ @@ -3315,7 +4610,7 @@ XLogNeedsFlush(XLogRecPtr record) SpinLockRelease(&XLogCtl->info_lck); /* check again */ - if (record <= LogwrtResult.Flush) + if (record <= (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush)) return false; return true; @@ -3341,16 +4636,16 @@ XLogNeedsFlush(XLogRecPtr record) * take down the system on failure). They will promote to PANIC if we are * in a critical section. */ -int +int64_t XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) { - char path[MAXPGPATH]; - char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; PGAlignedXLogBlock zbuffer; - XLogSegNo installed_segno; - XLogSegNo max_segno; - int fd; - int save_errno; + XLogSegNo installed_segno; + XLogSegNo max_segno; + int64_t fd; + int save_errno; XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size); @@ -3379,7 +4674,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) */ elog(DEBUG2, "creating and filling new WAL file"); - snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int)getpid()); unlink(tmppath); @@ -3397,7 +4692,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) if (wal_init_zero) { struct iovec iov[PG_IOV_MAX]; - int blocks; + int blocks; /* * Zero-fill the file. 
With this setting, we do this the hard way to @@ -3420,8 +4715,8 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) blocks = wal_segment_size / XLOG_BLCKSZ; for (int i = 0; i < blocks;) { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * XLOG_BLCKSZ; + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * XLOG_BLCKSZ; if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) { @@ -3466,7 +4761,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); if (pg_fsync(fd) != 0) { - int save_errno = errno; + int save_errno = errno; close(fd); errno = save_errno; @@ -3547,12 +4842,12 @@ static void XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, int upto) { - char path[MAXPGPATH]; - char tmppath[MAXPGPATH]; + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; PGAlignedXLogBlock buffer; - int srcfd; - int fd; - int nbytes; + int srcfd; + int fd; + int nbytes; /* * Open the source file @@ -3567,7 +4862,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, /* * Copy into a temp file name. 
*/ - snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int)getpid()); unlink(tmppath); @@ -3583,7 +4878,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, */ for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer)) { - int nread; + int nread; nread = upto - nbytes; @@ -3596,7 +4891,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, if (nread > 0) { - int r; + int r; if (nread > sizeof(buffer)) nread = sizeof(buffer); @@ -3613,15 +4908,15 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read file \"%s\": read %d of %zu", - path, r, (Size) nread))); + path, r, (Size)nread))); } pgstat_report_wait_end(); } errno = 0; pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE); - if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer)) + if ((int)write(fd, buffer.data, sizeof(buffer)) != (int)sizeof(buffer)) { - int save_errno = errno; + int save_errno = errno; /* * If we fail to make the file, delete it to release disk space @@ -3694,7 +4989,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, bool find_free, XLogSegNo max_segno, bool use_lock) { - char path[MAXPGPATH]; + char path[MAXPGPATH]; struct stat stat_buf; XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size); @@ -3748,11 +5043,11 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, /* * Open a pre-existing logfile segment for writing. */ -int +int64_t XLogFileOpen(XLogSegNo segno) { - char path[MAXPGPATH]; - int fd; + char path[MAXPGPATH]; + int64_t fd; XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size); @@ -3771,41 +5066,41 @@ XLogFileOpen(XLogSegNo segno) * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. * Otherwise, it's assumed to be already available in pg_wal. 
*/ -static int +static int64_t XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, XLogSource source, bool notfoundOk) { - char xlogfname[MAXFNAMELEN]; - char activitymsg[MAXFNAMELEN + 16]; - char path[MAXPGPATH]; - int fd; + char xlogfname[MAXFNAMELEN]; + char activitymsg[MAXFNAMELEN + 16]; + char path[MAXPGPATH]; + int64_t fd; XLogFileName(xlogfname, tli, segno, wal_segment_size); switch (source) { - case XLOG_FROM_ARCHIVE: - /* Report recovery progress in PS display */ - snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", - xlogfname); - set_ps_display(activitymsg); + case XLOG_FROM_ARCHIVE: + /* Report recovery progress in PS display */ + snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", + xlogfname); + set_ps_display(activitymsg); - restoredFromArchive = RestoreArchivedFile(path, xlogfname, - "RECOVERYXLOG", - wal_segment_size, - InRedo); - if (!restoredFromArchive) - return -1; - break; + restoredFromArchive = RestoreArchivedFile(path, xlogfname, + "RECOVERYXLOG", + wal_segment_size, + InRedo); + if (!restoredFromArchive) + return -1; + break; - case XLOG_FROM_PG_WAL: - case XLOG_FROM_STREAM: - XLogFilePath(path, tli, segno, wal_segment_size); - restoredFromArchive = false; - break; + case XLOG_FROM_PG_WAL: + case XLOG_FROM_STREAM: + XLogFilePath(path, tli, segno, wal_segment_size); + restoredFromArchive = false; + break; - default: - elog(ERROR, "invalid XLogFileRead source %d", source); + default: + elog(ERROR, "invalid XLogFileRead source %d", source); } /* @@ -3854,13 +5149,13 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, * * This version searches for the segment with any TLI listed in expectedTLEs. 
*/ -static int +static int64_t XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) { - char path[MAXPGPATH]; - ListCell *cell; - int fd; - List *tles; + char path[MAXPGPATH]; + ListCell *cell; + int64_t fd; + List *tles; /* * Loop looking for a suitable timeline ID: we might need to read any of @@ -3885,13 +5180,13 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) else tles = readTimeLineHistory(recoveryTargetTLI); - foreach(cell, tles) + foreach (cell, tles) { - TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell); - TimeLineID tli = hent->tli; + TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *)lfirst(cell); + TimeLineID tli = hent->tli; if (tli < curFileTLI) - break; /* don't bother looking at too-old TLIs */ + break; /* don't bother looking at too-old TLIs */ /* * Skip scanning the timeline ID that the logfile segment to read @@ -3899,7 +5194,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) */ if (hent->begin != InvalidXLogRecPtr) { - XLogSegNo beginseg = 0; + XLogSegNo beginseg = 0; XLByteToSeg(hent->begin, beginseg, wal_segment_size); @@ -3952,6 +5247,34 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source) return -1; } +static void +XLogPageReadAnyTLI(void) +{ + ListCell *cell; + List *tles; + + /* + * Loop looking for a suitable timeline ID: we might need to read any of + * the timelines listed in expectedTLEs. + * + * We expect curFileTLI on entry to be the TLI of the preceding file in + * sequence, or 0 if there was no predecessor. We do not allow curFileTLI + * to go backwards; this prevents us from picking up the wrong file when a + * parent timeline extends to higher segment numbers than the child we + * want to read. + * + * If we haven't read the timeline history file yet, read it now, so that + * we know which TLIs to scan. We don't save the list in expectedTLEs, + * however, unless we actually find a valid segment. 
That way if there is + * neither a timeline history file nor a WAL segment in the archive, and + * streaming replication is set up, we'll read the timeline history file + * streamed from the primary when we start streaming, instead of + * recovering with a dummy history generated here. + */ + if (!expectedTLEs) + expectedTLEs = readTimeLineHistory(recoveryTargetTLI); +} + /* * Close the current logfile segment for writing. */ @@ -3968,13 +5291,13 @@ XLogFileClose(void) */ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) if (!XLogIsNeeded()) - (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); + (void)posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); #endif if (close(openLogFile) != 0) { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size); errno = save_errno; @@ -4000,14 +5323,14 @@ XLogFileClose(void) static void PreallocXlogFiles(XLogRecPtr endptr) { - XLogSegNo _logSegNo; - int lf; - bool use_existent; - uint64 offset; + XLogSegNo _logSegNo; + int64_t lf; + bool use_existent; + uint64 offset; XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size); offset = XLogSegmentOffset(endptr - 1, wal_segment_size); - if (offset >= (uint32) (0.75 * wal_segment_size)) + if (offset >= (uint32)(0.75 * wal_segment_size)) { _logSegNo++; use_existent = true; @@ -4030,11 +5353,10 @@ PreallocXlogFiles(XLogRecPtr endptr) * error message about a missing file, while still being able to throw * a normal file-access error afterwards, if this does return. 
*/ -void -CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) +void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) { - int save_errno = errno; - XLogSegNo lastRemovedSegNo; + int save_errno = errno; + XLogSegNo lastRemovedSegNo; SpinLockAcquire(&XLogCtl->info_lck); lastRemovedSegNo = XLogCtl->lastRemovedSegNo; @@ -4042,7 +5364,7 @@ CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) if (segno <= lastRemovedSegNo) { - char filename[MAXFNAMELEN]; + char filename[MAXFNAMELEN]; XLogFileName(filename, tli, segno, wal_segment_size); errno = save_errno; @@ -4064,7 +5386,7 @@ CheckXLogRemoved(XLogSegNo segno, TimeLineID tli) XLogSegNo XLogGetLastRemovedSegno(void) { - XLogSegNo lastRemovedSegNo; + XLogSegNo lastRemovedSegNo; SpinLockAcquire(&XLogCtl->info_lck); lastRemovedSegNo = XLogCtl->lastRemovedSegNo; @@ -4073,7 +5395,6 @@ XLogGetLastRemovedSegno(void) return lastRemovedSegNo; } - /* * Update the last removed segno pointer in shared memory, to reflect that the * given XLOG file has been removed. 
@@ -4081,8 +5402,8 @@ XLogGetLastRemovedSegno(void) static void UpdateLastRemovedPtr(char *filename) { - uint32 tli; - XLogSegNo segno; + uint32 tli; + XLogSegNo segno; XLogFromFileName(filename, &tli, &segno, wal_segment_size); @@ -4101,7 +5422,7 @@ UpdateLastRemovedPtr(char *filename) static void RemoveTempXlogFiles(void) { - DIR *xldir; + DIR *xldir; struct dirent *xlde; elog(DEBUG2, "removing all temporary WAL segments"); @@ -4109,7 +5430,7 @@ RemoveTempXlogFiles(void) xldir = AllocateDir(XLOGDIR); while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) { - char path[MAXPGPATH]; + char path[MAXPGPATH]; if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0) continue; @@ -4131,11 +5452,11 @@ RemoveTempXlogFiles(void) static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr) { - DIR *xldir; + DIR *xldir; struct dirent *xlde; - char lastoff[MAXFNAMELEN]; - XLogSegNo endlogSegNo; - XLogSegNo recycleSegNo; + char lastoff[MAXFNAMELEN]; + XLogSegNo endlogSegNo; + XLogSegNo recycleSegNo; /* Initialize info about where to try to recycle to */ XLByteToSeg(endptr, endlogSegNo, wal_segment_size); @@ -4204,12 +5525,12 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr) static void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI) { - DIR *xldir; + DIR *xldir; struct dirent *xlde; - char switchseg[MAXFNAMELEN]; - XLogSegNo endLogSegNo; - XLogSegNo switchLogSegNo; - XLogSegNo recycleSegNo; + char switchseg[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + XLogSegNo switchLogSegNo; + XLogSegNo recycleSegNo; /* * Initialize info about where to begin the work. 
This will recycle, @@ -4271,9 +5592,9 @@ static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo) { - char path[MAXPGPATH]; + char path[MAXPGPATH]; #ifdef WIN32 - char newpath[MAXPGPATH]; + char newpath[MAXPGPATH]; #endif struct stat statbuf; @@ -4300,7 +5621,7 @@ RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, else { /* No need for any more future segments... */ - int rc; + int rc; ereport(DEBUG2, (errmsg_internal("removing write-ahead log file \"%s\"", @@ -4358,7 +5679,7 @@ RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo, static void ValidateXLOGDirectoryStructure(void) { - char path[MAXPGPATH]; + char path[MAXPGPATH]; struct stat stat_buf; /* Check for pg_wal; if it doesn't exist, error out */ @@ -4397,9 +5718,9 @@ ValidateXLOGDirectoryStructure(void) static void CleanupBackupHistory(void) { - DIR *xldir; + DIR *xldir; struct dirent *xlde; - char path[MAXPGPATH + sizeof(XLOGDIR)]; + char path[MAXPGPATH + sizeof(XLOGDIR)]; xldir = AllocateDir(XLOGDIR); @@ -4436,7 +5757,7 @@ ReadRecord(XLogReaderState *xlogreader, int emode, bool fetching_ckpt) { XLogRecord *record; - XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; + XLogPageReadPrivate *private = (XLogPageReadPrivate *)xlogreader->private_data; /* Pass through parameters to XLogPageRead */ private->fetching_ckpt = fetching_ckpt; @@ -4445,17 +5766,24 @@ ReadRecord(XLogReaderState *xlogreader, int emode, /* This is the first attempt to read this page. 
*/ lastSourceFailed = false; -#ifndef PG_NOREPLAY - if (push_standby == true && PushPtr == InvalidXLogRecPtr) { +#ifndef PG_NOREPLAY + if ((push_standby == true || EnableHotStandby == false || *isPromoteIsTriggered) && + xlogreader->streamStart == true && + PushPtr == InvalidXLogRecPtr) + { PushPtr = xlogreader->EndRecPtr; PrePushPtr = xlogreader->EndRecPtr; ApplyLsn = PrePushPtr; } #endif - for (;;) + while (GetShutDownStatus() == false) { char *errormsg; - record = XLogReadRecord(xlogreader, &errormsg); + if (he3mirror && xlogreader->insertTikv == false) { + record = StartupXLogReadRecord(xlogreader, &errormsg); + } else { + record = He3DBXLogReadRecord(xlogreader, &errormsg); + } ReadRecPtr = xlogreader->ReadRecPtr; EndRecPtr = xlogreader->EndRecPtr; if (record == NULL) @@ -4473,11 +5801,14 @@ ReadRecord(XLogReaderState *xlogreader, int emode, missingContrecPtr = xlogreader->missingContrecPtr; } - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } + if (he3mirror) { + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + } + /* * We only end up here without a message when XLogPageRead() @@ -4487,31 +5818,21 @@ ReadRecord(XLogReaderState *xlogreader, int emode, */ if (errormsg) ereport(emode_for_corrupt_record(emode, EndRecPtr), - (errmsg_internal("%s", errormsg) /* already translated */ )); + (errmsg_internal("%s", errormsg) /* already translated */)); + } else { + if (xlogreader->ReadRecPtr > xlogreader->currRecPtr) { + elog(LOG, "read record ReadRecPtr %X gt currRecPtr %X, need clean wals which ge ReadRecPtr.", + xlogreader->ReadRecPtr, xlogreader->currRecPtr); + DelRangeWals(ThisTimeLineID, xlogreader->ReadRecPtr, PG_UINT64_MAX); + WalTaskImmediateFree(); + return NULL; + } } + /* * Check page TLI is one of the expected values. 
*/ - else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs)) - { - char fname[MAXFNAMELEN]; - XLogSegNo segno; - int32 offset; - - XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size); - offset = XLogSegmentOffset(xlogreader->latestPagePtr, - wal_segment_size); - XLogFileName(fname, xlogreader->seg.ws_tli, segno, - wal_segment_size); - ereport(emode_for_corrupt_record(emode, EndRecPtr), - (errmsg("unexpected timeline ID %u in log segment %s, offset %u", - xlogreader->latestPageTLI, - fname, - offset))); - record = NULL; - } - if (record) { /* Great, got a record */ @@ -4520,7 +5841,9 @@ ReadRecord(XLogReaderState *xlogreader, int emode, else { /* No valid record available from this source */ - lastSourceFailed = true; + if (xlogreader->streamStart != true) { + lastSourceFailed = true; + } /* * If archive recovery was requested, but we were still doing @@ -4605,11 +5928,11 @@ ReadRecord(XLogReaderState *xlogreader, int emode, static bool rescanLatestTimeLine(void) { - List *newExpectedTLEs; - bool found; - ListCell *cell; - TimeLineID newtarget; - TimeLineID oldtarget = recoveryTargetTLI; + List *newExpectedTLEs; + bool found; + ListCell *cell; + TimeLineID newtarget; + TimeLineID oldtarget = recoveryTargetTLI; TimeLineHistoryEntry *currentTle = NULL; newtarget = findNewestTimeLine(recoveryTargetTLI); @@ -4630,9 +5953,9 @@ rescanLatestTimeLine(void) * we cannot proceed to it. */ found = false; - foreach(cell, newExpectedTLEs) + foreach (cell, newExpectedTLEs) { - currentTle = (TimeLineHistoryEntry *) lfirst(cell); + currentTle = (TimeLineHistoryEntry *)lfirst(cell); if (currentTle->tli == recoveryTargetTLI) { @@ -4701,7 +6024,7 @@ rescanLatestTimeLine(void) static void InitControlFile(uint64 sysidentifier) { - char mock_auth_nonce[MOCK_AUTH_NONCE_LEN]; + char mock_auth_nonce[MOCK_AUTH_NONCE_LEN]; /* * Generate a random nonce. 
This is used for authentication requests that @@ -4736,8 +6059,8 @@ InitControlFile(uint64 sysidentifier) static void WriteControlFile(void) { - int fd; - char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */ + int fd; + char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */ /* * Ensure that the size of the pg_control data structure is sane. See the @@ -4773,7 +6096,7 @@ WriteControlFile(void) /* Contents are protected with a CRC */ INIT_CRC32C(ControlFile->crc); COMP_CRC32C(ControlFile->crc, - (char *) ControlFile, + (char *)ControlFile, offsetof(ControlFileData, crc)); FIN_CRC32C(ControlFile->crc); @@ -4827,10 +6150,10 @@ WriteControlFile(void) static void ReadControlFile(void) { - pg_crc32c crc; - int fd; + pg_crc32c crc; + int fd; static char wal_segsz_str[20]; - int r; + int r; /* * Read data... @@ -4889,7 +6212,7 @@ ReadControlFile(void) /* Now check the CRC. */ INIT_CRC32C(crc); COMP_CRC32C(crc, - (char *) ControlFile, + (char *)ControlFile, offsetof(ControlFileData, crc)); FIN_CRC32C(crc); @@ -4961,14 +6284,14 @@ ReadControlFile(void) (errmsg("database files are incompatible with server"), errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d," " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.", - ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE), + ControlFile->toast_max_chunk_size, (int)TOAST_MAX_CHUNK_SIZE), errhint("It looks like you need to recompile or initdb."))); if (ControlFile->loblksize != LOBLKSIZE) ereport(FATAL, (errmsg("database files are incompatible with server"), errdetail("The database cluster was initialized with LOBLKSIZE %d," " but the server was compiled with LOBLKSIZE %d.", - ControlFile->loblksize, (int) LOBLKSIZE), + ControlFile->loblksize, (int)LOBLKSIZE), errhint("It looks like you need to recompile or initdb."))); #ifdef USE_FLOAT8_BYVAL @@ -5024,23 +6347,20 @@ ReadControlFile(void) * Utility wrapper to update the control file. Note that the control * file gets flushed. 
*/ -void -UpdateControlFile() +void UpdateControlFile() { - if (push_standby != true) { + if (push_standby != true) + { return; } update_controlfile(DataDir, ControlFile, true); } - -void -PushUpdateControlFile() +void PushUpdateControlFile() { update_controlfile(DataDir, ControlFile, true); } - /* * Returns the unique system identifier from control file. */ @@ -5064,8 +6384,7 @@ GetMockAuthenticationNonce(void) /* * Are checksums enabled for data pages? */ -bool -DataChecksumsEnabled(void) +bool DataChecksumsEnabled(void) { Assert(ControlFile != NULL); return (ControlFile->data_checksum_version > 0); @@ -5083,7 +6402,7 @@ DataChecksumsEnabled(void) XLogRecPtr GetFakeLSNForUnloggedRel(void) { - XLogRecPtr nextUnloggedLSN; + XLogRecPtr nextUnloggedLSN; /* increment the unloggedLSN counter, need SpinLock */ SpinLockAcquire(&XLogCtl->ulsn_lck); @@ -5107,7 +6426,7 @@ GetFakeLSNForUnloggedRel(void) static int XLOGChooseNumBuffers(void) { - int xbuffers; + int xbuffers; xbuffers = NBuffers / 32; if (xbuffers > (wal_segment_size / XLOG_BLCKSZ)) @@ -5120,8 +6439,7 @@ XLOGChooseNumBuffers(void) /* * GUC check_hook for wal_buffers */ -bool -check_wal_buffers(int *newval, void **extra, GucSource source) +bool check_wal_buffers(int *newval, void **extra, GucSource source) { /* * -1 indicates a request for auto-tune. @@ -5164,8 +6482,7 @@ check_wal_buffers(int *newval, void **extra, GucSource source) * reset just controls whether previous contents are to be expected (in the * reset case, there's a dangling pointer into old shared memory), or not. */ -void -LocalProcessControlFile(bool reset) +void LocalProcessControlFile(bool reset) { Assert(reset || ControlFile == NULL); ControlFile = palloc(sizeof(ControlFileData)); @@ -5175,10 +6492,9 @@ LocalProcessControlFile(bool reset) /* * Initialization of shared memory for XLOG */ -Size -XLOGShmemSize(void) +Size XLOGShmemSize(void) { - Size size; + Size size; /* * If the value of wal_buffers is -1, use the preferred auto-tune value. 
@@ -5188,7 +6504,7 @@ XLOGShmemSize(void) */ if (XLOGbuffers == -1) { - char buf[32]; + char buf[32]; snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers()); SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE); @@ -5216,13 +6532,12 @@ XLOGShmemSize(void) return size; } -void -XLOGShmemInit(void) +void XLOGShmemInit(void) { - bool foundCFile, - foundXLog; - char *allocptr; - int i; + bool foundCFile, + foundXLog; + char *allocptr; + int i; ControlFileData *localControlFile; #ifdef WAL_DEBUG @@ -5241,7 +6556,6 @@ XLOGShmemInit(void) } #endif - XLogCtl = (XLogCtlData *) ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog); @@ -5278,17 +6592,16 @@ XLOGShmemInit(void) * multiple of the alignment for same, so no extra alignment padding is * needed here. */ - allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData); - XLogCtl->xlblocks = (XLogRecPtr *) allocptr; + allocptr = ((char *)XLogCtl) + sizeof(XLogCtlData); + XLogCtl->xlblocks = (XLogRecPtr *)allocptr; memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers); allocptr += sizeof(XLogRecPtr) * XLOGbuffers; - /* WAL insertion locks. Ensure they're aligned to the full padded size */ allocptr += sizeof(WALInsertLockPadded) - - ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded); + ((uintptr_t)allocptr) % sizeof(WALInsertLockPadded); WALInsertLocks = XLogCtl->Insert.WALInsertLocks = - (WALInsertLockPadded *) allocptr; + (WALInsertLockPadded *)allocptr; allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS; for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) @@ -5303,9 +6616,9 @@ XLOGShmemInit(void) * This simplifies some calculations in XLOG insertion. It is also * required for O_DIRECT. 
*/ - allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); + allocptr = (char *)TYPEALIGN(XLOG_BLCKSZ, allocptr); XLogCtl->pages = allocptr; - memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); + memset(XLogCtl->pages, 0, (Size)XLOG_BLCKSZ * XLOGbuffers); /* * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill @@ -5328,19 +6641,18 @@ XLOGShmemInit(void) * This func must be called ONCE on system install. It creates pg_control * and the initial XLOG segment. */ -void -BootStrapXLOG(void) +void BootStrapXLOG(void) { CheckPoint checkPoint; char *buffer; - XLogPageHeader page; - XLogLongPageHeader longpage; + // XLogPageHeader page; + // XLogLongPageHeader longpage; XLogRecord *record; char *recptr; - bool use_existent; + // bool use_existent; uint64 sysidentifier; struct timeval tv; - pg_crc32c crc; + pg_crc32c crc; /* * Select a hopefully-unique system identifier code for this installation. @@ -5355,8 +6667,8 @@ BootStrapXLOG(void) * perhaps be useful sometimes. */ gettimeofday(&tv, NULL); - sysidentifier = ((uint64) tv.tv_sec) << 32; - sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier = ((uint64)tv.tv_sec) << 32; + sysidentifier |= ((uint64)tv.tv_usec) << 12; sysidentifier |= getpid() & 0xFFF; /* First timeline ID is always 1 */ @@ -5364,8 +6676,9 @@ BootStrapXLOG(void) /* page buffer must be aligned suitably for O_DIRECT */ buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ); - page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer); - memset(page, 0, XLOG_BLCKSZ); + //page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer); + record = (XLogRecord *) TYPEALIGN(XLOG_BLCKSZ, buffer); + //memset(page, 0, XLOG_BLCKSZ); /* * Set up information for the initial checkpoint record @@ -5374,7 +6687,7 @@ BootStrapXLOG(void) * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not * used, so that we can use 0/0 to mean "before any valid WAL segment". 
*/ - checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD; + checkPoint.redo = 0; checkPoint.ThisTimeLineID = ThisTimeLineID; checkPoint.PrevTimeLineID = ThisTimeLineID; checkPoint.fullPageWrites = fullPageWrites; @@ -5389,7 +6702,7 @@ BootStrapXLOG(void) checkPoint.oldestMultiDB = TemplateDbOid; checkPoint.oldestCommitTsXid = InvalidTransactionId; checkPoint.newestCommitTsXid = InvalidTransactionId; - checkPoint.time = (pg_time_t) time(NULL); + checkPoint.time = (pg_time_t)time(NULL); checkPoint.oldestActiveXid = InvalidTransactionId; ShmemVariableCache->nextXid = checkPoint.nextXid; @@ -5402,74 +6715,50 @@ BootStrapXLOG(void) SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId); /* Set up the XLOG page header */ - page->xlp_magic = XLOG_PAGE_MAGIC; - page->xlp_info = XLP_LONG_HEADER; - page->xlp_tli = ThisTimeLineID; - page->xlp_pageaddr = wal_segment_size; - longpage = (XLogLongPageHeader) page; - longpage->xlp_sysid = sysidentifier; - longpage->xlp_seg_size = wal_segment_size; - longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; + // page->xlp_magic = XLOG_PAGE_MAGIC; + // page->xlp_info = XLP_LONG_HEADER; + // page->xlp_tli = ThisTimeLineID; + // page->xlp_pageaddr = wal_segment_size; + // longpage = (XLogLongPageHeader) page; + // longpage->xlp_sysid = sysidentifier; + // longpage->xlp_seg_size = wal_segment_size; + // longpage->xlp_xlog_blcksz = XLOG_BLCKSZ; /* Insert the initial checkpoint record */ - recptr = ((char *) page + SizeOfXLogLongPHD); - record = (XLogRecord *) recptr; + //recptr = ((char *) page + SizeOfXLogLongPHD); + recptr = (char *) record; + //record = (XLogRecord *) recptr; record->xl_prev = 0; record->xl_xid = InvalidTransactionId; record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint); + record->xl_end = record->xl_tot_len; record->xl_info = XLOG_CHECKPOINT_SHUTDOWN; record->xl_rmid = RM_XLOG_ID; recptr += SizeOfXLogRecord; /* fill the XLogRecordDataHeaderShort struct */ - *(recptr++) = (char) 
XLR_BLOCK_ID_DATA_SHORT; + *(recptr++) = (char)XLR_BLOCK_ID_DATA_SHORT; *(recptr++) = sizeof(checkPoint); memcpy(recptr, &checkPoint, sizeof(checkPoint)); recptr += sizeof(checkPoint); - Assert(recptr - (char *) record == record->xl_tot_len); + Assert(recptr - (char *)record == record->xl_tot_len); INIT_CRC32C(crc); - COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); - COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); + COMP_CRC32C(crc, ((char *)record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + COMP_CRC32C(crc, (char *)record, offsetof(XLogRecord, xl_crc)); FIN_CRC32C(crc); record->xl_crc = crc; - /* Create first XLOG segment file */ - use_existent = false; - openLogFile = XLogFileInit(1, &use_existent, false); - - /* - * We needn't bother with Reserve/ReleaseExternalFD here, since we'll - * close the file again in a moment. - */ - - /* Write the first page with the initial record */ - errno = 0; - pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE); - if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ) - { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write bootstrap write-ahead log file: %m"))); - } - pgstat_report_wait_end(); - - pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC); - if (pg_fsync(openLogFile) != 0) - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not fsync bootstrap write-ahead log file: %m"))); - pgstat_report_wait_end(); - - if (close(openLogFile) != 0) - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close bootstrap write-ahead log file: %m"))); - - openLogFile = -1; + isInitDB = true; + XLogItem *xlogItem = (XLogItem *)malloc(sizeof(XLogItem)); + + (xlogItem->xlogKey).lsn = record->xl_end - record->xl_tot_len; + xlogItem->begin = record; + xlogItem->length = record->xl_tot_len; + xlogItem->next 
= NULL; + flushwals(xlogItem, ThisTimeLineID); + free(xlogItem); + /* Now create pg_control */ InitControlFile(sysidentifier); ControlFile->time = checkPoint.time; @@ -5545,26 +6834,26 @@ readRecoverySignalFile(void) */ if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) { - int fd; + int fd; fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method), S_IRUSR | S_IWUSR); if (fd >= 0) { - (void) pg_fsync(fd); + (void)pg_fsync(fd); close(fd); } standby_signal_file_found = true; } else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) { - int fd; + int fd; fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method), S_IRUSR | S_IWUSR); if (fd >= 0) { - (void) pg_fsync(fd); + (void)pg_fsync(fd); close(fd); } recovery_signal_file_found = true; @@ -5598,6 +6887,13 @@ readRecoverySignalFile(void) static void validateRecoveryParameters(void) { + if (he3_point_in_time_recovery == true && recoveryTarget == RECOVERY_TARGET_TIME) + { + recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, + CStringGetDatum(recovery_target_time_string), + ObjectIdGetDatum(InvalidOid), + Int32GetDatum(-1))); + } if (!ArchiveRecoveryRequested) return; @@ -5650,7 +6946,7 @@ validateRecoveryParameters(void) */ if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC) { - TimeLineID rtli = recoveryTargetTLIRequested; + TimeLineID rtli = recoveryTargetTLIRequested; /* Timeline 1 does not have a history file, all else should */ if (rtli != 1 && !existsTimeLineHistory(rtli)) @@ -5681,9 +6977,9 @@ validateRecoveryParameters(void) static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) { - char xlogfname[MAXFNAMELEN]; - XLogSegNo endLogSegNo; - XLogSegNo startLogSegNo; + char xlogfname[MAXFNAMELEN]; + XLogSegNo endLogSegNo; + XLogSegNo startLogSegNo; /* we always switch to a new timeline after archive recovery */ Assert(endTLI != ThisTimeLineID); @@ -5702,11 +6998,6 @@ exitArchiveRecovery(TimeLineID 
endTLI, XLogRecPtr endOfLog) * If the ending log segment is still open, close it (to avoid problems on * Windows with trying to rename or delete an open file). */ - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } /* * Calculate the last segment on the old timeline, and the first segment @@ -5741,15 +7032,15 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) * The switch happened at a segment boundary, so just create the next * segment on the new timeline. */ - bool use_existent = true; - int fd; + bool use_existent = true; + int64_t fd; fd = XLogFileInit(startLogSegNo, &use_existent, true); if (close(fd) != 0) { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size); @@ -5781,6 +7072,39 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) (errmsg("archive recovery complete"))); } +/* + * Exit archive-recovery state + */ +static void +exitHe3ArchiveRecovery(TimeLineID endTLI) +{ + /* we always switch to a new timeline after archive recovery */ + Assert(endTLI != ThisTimeLineID); + + /* + * We are no longer in archive recovery state. + */ + InArchiveRecovery = false; + + /* + * Update min recovery point one last time. + */ + UpdateMinRecoveryPoint(InvalidXLogRecPtr, true); + + /* + * Remove the signal files out of the way, so that we don't accidentally + * re-enter archive recovery mode in a subsequent crash. + */ + if (standby_signal_file_found) + durable_unlink(STANDBY_SIGNAL_FILE, FATAL); + + if (recovery_signal_file_found) + durable_unlink(RECOVERY_SIGNAL_FILE, FATAL); + + ereport(LOG, + (errmsg("archive recovery complete"))); +} + /* * Extract timestamp from WAL record. 
* @@ -5792,25 +7116,25 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog) static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) { - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - uint8 xact_info = info & XLOG_XACT_OPMASK; - uint8 rmid = XLogRecGetRmid(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 xact_info = info & XLOG_XACT_OPMASK; + uint8 rmid = XLogRecGetRmid(record); if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) { - *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time; + *recordXtime = ((xl_restore_point *)XLogRecGetData(record))->rp_time; return true; } if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT || xact_info == XLOG_XACT_COMMIT_PREPARED)) { - *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time; + *recordXtime = ((xl_xact_commit *)XLogRecGetData(record))->xact_time; return true; } if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT || xact_info == XLOG_XACT_ABORT_PREPARED)) { - *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time; + *recordXtime = ((xl_xact_abort *)XLogRecGetData(record))->xact_time; return true; } return false; @@ -5827,9 +7151,9 @@ getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) static bool recoveryStopsBefore(XLogReaderState *record) { - bool stopsHere = false; - uint8 xact_info; - bool isCommit; + bool stopsHere = false; + uint8 xact_info; + bool isCommit; TimestampTz recordXtime = 0; TransactionId recordXid; @@ -5883,7 +7207,7 @@ recoveryStopsBefore(XLogReaderState *record) } else if (xact_info == XLOG_XACT_COMMIT_PREPARED) { - xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_commit *xlrec = (xl_xact_commit *)XLogRecGetData(record); xl_xact_parsed_commit parsed; isCommit = true; @@ -5899,7 +7223,7 @@ recoveryStopsBefore(XLogReaderState *record) } else if (xact_info == XLOG_XACT_ABORT_PREPARED) { - xl_xact_abort *xlrec = (xl_xact_abort *) 
XLogRecGetData(record); + xl_xact_abort *xlrec = (xl_xact_abort *)XLogRecGetData(record); xl_xact_parsed_abort parsed; isCommit = false; @@ -5966,6 +7290,89 @@ recoveryStopsBefore(XLogReaderState *record) return stopsHere; } +static bool +he3recoveryStopsAfter(XLogReaderState *record) +{ + bool stopsHere = false; + uint8 xact_info; + bool isCommit; + TimestampTz recordXtime = 0; + TransactionId recordXid; + + /* Otherwise we only consider stopping before COMMIT or ABORT records. */ + if (XLogRecGetRmid(record) != RM_XACT_ID) + return false; + + xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; + + if (xact_info == XLOG_XACT_COMMIT) + { + isCommit = true; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *)XLogRecGetData(record); + xl_xact_parsed_commit parsed; + + isCommit = true; + ParseCommitRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else if (xact_info == XLOG_XACT_ABORT) + { + isCommit = false; + recordXid = XLogRecGetXid(record); + } + else if (xact_info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *)XLogRecGetData(record); + xl_xact_parsed_abort parsed; + + isCommit = false; + ParseAbortRecord(XLogRecGetInfo(record), + xlrec, + &parsed); + recordXid = parsed.twophase_xid; + } + else + return false; + + if (recoveryTarget == RECOVERY_TARGET_TIME && + getRecordTimestamp(record, &recordXtime)) + { + stopsHere = (recordXtime >= recoveryTargetTime); + } + + if (stopsHere) + { + recoveryStopAfter = false; + recoveryStopXid = recordXid; + recoveryStopTime = recordXtime; + recoveryStopLSN = InvalidXLogRecPtr; + recoveryStopName[0] = '\0'; + + if (isCommit) + { + ereport(LOG, + (errmsg("recovery stopping before commit of transaction %u, time %s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + else + { + ereport(LOG, + (errmsg("recovery stopping before abort of transaction %u, time 
%s", + recoveryStopXid, + timestamptz_to_str(recoveryStopTime)))); + } + } + + return stopsHere; +} + /* * Same as recoveryStopsBefore, but called after applying the record. * @@ -5975,9 +7382,9 @@ recoveryStopsBefore(XLogReaderState *record) static bool recoveryStopsAfter(XLogReaderState *record) { - uint8 info; - uint8 xact_info; - uint8 rmid; + uint8 info; + uint8 xact_info; + uint8 rmid; TimestampTz recordXtime; /* @@ -5999,14 +7406,14 @@ recoveryStopsAfter(XLogReaderState *record) { xl_restore_point *recordRestorePointData; - recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); + recordRestorePointData = (xl_restore_point *)XLogRecGetData(record); if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) { recoveryStopAfter = true; recoveryStopXid = InvalidTransactionId; recoveryStopLSN = InvalidXLogRecPtr; - (void) getRecordTimestamp(record, &recoveryStopTime); + (void)getRecordTimestamp(record, &recoveryStopTime); strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN); ereport(LOG, @@ -6052,7 +7459,7 @@ recoveryStopsAfter(XLogReaderState *record) /* Extract the XID of the committed/aborted transaction */ if (xact_info == XLOG_XACT_COMMIT_PREPARED) { - xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); + xl_xact_commit *xlrec = (xl_xact_commit *)XLogRecGetData(record); xl_xact_parsed_commit parsed; ParseCommitRecord(XLogRecGetInfo(record), @@ -6062,7 +7469,7 @@ recoveryStopsAfter(XLogReaderState *record) } else if (xact_info == XLOG_XACT_ABORT_PREPARED) { - xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); + xl_xact_abort *xlrec = (xl_xact_abort *)XLogRecGetData(record); xl_xact_parsed_abort parsed; ParseAbortRecord(XLogRecGetInfo(record), @@ -6202,8 +7609,7 @@ GetRecoveryPauseState(void) * to 'not paused' to resume the recovery. The recovery pause will be * confirmed by the ConfirmRecoveryPaused. 
*/ -void -SetRecoveryPause(bool recoveryPause) +void SetRecoveryPause(bool recoveryPause) { SpinLockAcquire(&XLogCtl->info_lck); @@ -6248,10 +7654,10 @@ ConfirmRecoveryPaused(void) static bool recoveryApplyDelay(XLogReaderState *record) { - uint8 xact_info; + uint8 xact_info; TimestampTz xtime; TimestampTz delayUntil; - long msecs; + long msecs; /* nothing to do if no delay configured */ if (recovery_min_apply_delay <= 0) @@ -6325,10 +7731,10 @@ recoveryApplyDelay(XLogReaderState *record) elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs); - (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - msecs, - WAIT_EVENT_RECOVERY_APPLY_DELAY); + (void)WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + msecs, + WAIT_EVENT_RECOVERY_APPLY_DELAY); } return true; } @@ -6397,8 +7803,7 @@ GetCurrentChunkReplayStartTime(void) * Returns time of receipt of current chunk of XLOG data, as well as * whether it was received from streaming replication or from archives. */ -void -GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) +void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) { /* * This must be executed in the startup process, since we don't export the @@ -6421,7 +7826,7 @@ RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue { if (LocalHotStandbyActive) { - bool warned_for_promote = false; + bool warned_for_promote = false; ereport(WARNING, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), @@ -6449,10 +7854,10 @@ RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("promotion is not possible because of insufficient parameter settings"), - /* - * Repeat the detail from above so it's easy to find - * in the log. - */ + /* + * Repeat the detail from above so it's easy to find + * in the log. 
+ */ errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", param_name, currValue, @@ -6482,7 +7887,7 @@ RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("recovery aborted because of insufficient parameter settings"), - /* Repeat the detail from above so it's easy to find in the log. */ + /* Repeat the detail from above so it's easy to find in the log. */ errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", param_name, currValue, @@ -6539,27 +7944,94 @@ CheckRequiredParameterValues(void) } } -static void updateLastReplayLsn() { - - /* - * Update lastReplayedEndRecPtr after this record has been - * successfully replayed. - */ - SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->lastReplayedEndRecPtr = EndRecPtr; - XLogCtl->lastReplayedTLI = ThisTimeLineID; - SpinLockRelease(&XLogCtl->info_lck); +static void updateLastReplayLsn(XLogReaderState *state) +{ + + /* + * Update lastReplayedEndRecPtr after this record has been + * successfully replayed. 
+ */ + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->lastReplayedEndRecPtr = state->EndRecPtr; + XLogCtl->lastReplayedTLI = ThisTimeLineID; + SpinLockRelease(&XLogCtl->info_lck); } - -bool -data_buffer_for_replay(XLogReaderState *record) +static bool +he3db_xlog_donot_to_replay(XLogRecord *record) { - //no startupxlog pid need to replay - record->isreplay = true; - if (startupPid != getpid() || push_standby == true) { - return true; + RmgrId rmid = record->xl_rmid; + uint8 info; + + if (rmid == RM_HEAP_ID || rmid == RM_HEAP2_ID || rmid == RM_BTREE_ID || rmid == RM_HASH_ID + || rmid == RM_GIN_ID || rmid == RM_GIST_ID || rmid == RM_SEQ_ID + || rmid == RM_SPGIST_ID || rmid == RM_BRIN_ID || rmid == RM_GENERIC_ID) + return true; + + if (rmid != RM_XLOG_ID) + return false; + + info = record->xl_info & ~XLR_INFO_MASK; + + if (info != XLOG_FPI && + info != XLOG_FPI_FOR_HINT) + return false; + + return true; +} + +void pushTikv(int onePageListLen,int pageNum,bool flag) +{ + if (push_standby == true || EnableHotStandby == false || *isPromoteIsTriggered) { + if (startInit == false) { + start = time(NULL); + startInit = true; + } + //modify page more than 1000 || page modify more than pageMaxLen || more than 1000ms + time_t end = time(NULL); + if (onePageListLen > pageMaxLen || pageNum >= G_QUEUE_LEN || (pageNum > 0 && (flag == true || end - start > timeOut))) { + SortPageQueue(); + //wait shared queue data handler + while(pageNum > CompletedTaskNum()) { + pg_usleep(1000L); + } + cleanMap(); + start = end; + PushPtr = GetXLogReplayRecPtr(NULL); + PrePushPtr = PushPtr; + if (push_standby == true && XLogCtl->pushToDisk != PushPtr && !mpush) { + InsertConsistToKV(PushPtr); + } + XLogCtl->pushToDisk = PushPtr; + } else { + if (flag == true) { + PushPtr = GetXLogReplayRecPtr(NULL); + PrePushPtr = PushPtr; + if (push_standby == true && XLogCtl->pushToDisk != PushPtr && !mpush) { + InsertConsistToKV(PushPtr); + } + XLogCtl->pushToDisk = PushPtr; + } + } + } +} + +static void 
pageInMemoryFlushBufferToDisk(BufferTag*tag) { + Buffer buffer = XLogReadBufferExtended(tag->rnode, tag->forkNum, tag->blockNum, + RBM_NORMAL); + if (!BufferIsValid(buffer)) + { + elog(ERROR,"pageInMemoryFlushBufferToDisk is invalid rel %d,flk %d,blk %d",tag->rnode.relNode,tag->forkNum,tag->blockNum); + return; + } + //slave no need to flush disk + ReleaseBuffer(buffer); +} + +static bool +data_buffer_for_replay(XLogReaderState *record,XLogRecPtr startLsn,XLogRecPtr endLsn) +{ if (record->max_block_id != 0) { //one wal maximum has one block if (record->max_block_id > 0) { @@ -6572,59 +8044,127 @@ data_buffer_for_replay(XLogReaderState *record) BlockNumber blkno; ForkNumber forknum; XLogRecGetBlockTag(record, 0, &rnode, &forknum, &blkno); - memcpy(&(tag.rnode),&rnode,sizeof(rnode)); + memcpy(&(tag.rnode), &rnode, sizeof(rnode)); tag.forkNum = forknum; tag.blockNum = blkno; uint32 hash; + //uint32 hashcode; LWLock *partition_lock; int buf_id; - + //hashcode = PageLogindexHashCode(&tag); + //PageLogindexInsert(&tag,hashcode,startLsn,endLsn); + InsertLogIndexByPage(&tag,startLsn); hash = BufTableHashCode(&tag); partition_lock = BufMappingPartitionLock(hash); - - /* See if the block is in the buffer pool already */ + uint32 buf_state; + bool valid; + BufferDesc *buf; + //for pg master he3db slave or backup restore + SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + smgrcreate(smgr, forknum, true); + BlockNumber blockNum = startupsmgrnblocks(smgr, forknum); + static char blkspace[BLCKSZ] = {0}; + if (blockNum != P_NEW) { + for (int i = blockNum;i<=blkno;i++) { + smgrextend(smgr,forknum,i,blkspace,false); + } + // if (blockNum!=blkno) { + // elog(LOG,"smgr rnode %d frk %d blk0 %d,blk1 %d blkk %d",smgr->smgr_rnode.node.relNode,forknum,blockNum,blkno,smgrnblocks(smgr, forknum)); + // } + } else { + elog(PANIC,"data_buffer_for_replay blockNum is P_NEW"); + } + /* See if the block is in the buffer pool already */ LWLockAcquire(partition_lock, LW_SHARED); buf_id = 
BufTableLookup(&tag, hash); /* If page is in buffer, we can apply record, otherwise we do nothing */ - if (buf_id < 0) + if (buf_id >= 0) { - record->isreplay = false; - updateLastReplayLsn(); + buf = GetBufferDescriptor(buf_id); + Buffer buffer = BufferDescriptorGetBuffer(buf); + valid = PinBufferForPush(buf, NULL); LWLockRelease(partition_lock); - return false; - } - LWLockRelease(partition_lock); - return true; + if (valid) + { + buf_state = LockBufHdr(buf); + buf_state &= ~BM_VALID; + UnlockBufHdr(buf, buf_state); + } + updateLastReplayLsn(record); + ReleaseBuffer(buffer); + } else { + updateLastReplayLsn(record); + LWLockRelease(partition_lock); + } + if (push_standby == true || EnableHotStandby == false || *isPromoteIsTriggered) { + //tbspace now donot care for test + uint32_t count = addFileKey(&tag); + pushTikv(count,hashMapSize(),false); + } + + return false; } +static void*thr_fn(void* arg) { + if(!he3mirror){ + XLogReaderState *xlogreader = (XLogReaderState *)arg; + producerXLogParallelBatchRead(xlogreader,xlogreader->EndRecPtr,SizeOfXLogRecord); + } else { + XLogReaderState *xlogreader = (XLogReaderState *)arg; + char*errormsg = NULL; + while(GetShutDownStatus() == false) { + StartupXLogReadRecord(xlogreader, &errormsg); + if (xlogreader->localWalComplete == true) { + while(readFile >= 0) { + pg_usleep(50000); + } + pg_usleep(50000); + xlogreader->localWalComplete = false; + } + } + } + return NULL; +} /* * This must be called ONCE during postmaster or standalone-backend startup */ -void -StartupXLOG(void) +void StartupXLOG(void) { XLogCtlInsert *Insert; - CheckPoint checkPoint; - bool wasShutdown; - bool reachedRecoveryTarget = false; - bool haveBackupLabel = false; - bool haveTblspcMap = false; - XLogRecPtr RecPtr, - checkPointLoc, - EndOfLog; - TimeLineID EndOfLogTLI; - TimeLineID PrevTimeLineID; + CheckPoint checkPoint; + bool wasShutdown; + bool reachedRecoveryTarget = false; + bool haveBackupLabel = false; + bool haveTblspcMap = false; + 
XLogRecPtr RecPtr, + checkPointLoc, + EndOfLog; + TimeLineID EndOfLogTLI; + TimeLineID PrevTimeLineID; XLogRecord *record; TransactionId oldestActiveXID; - bool backupEndRequired = false; - bool backupFromStandby = false; - DBState dbstate_at_startup; - XLogReaderState *xlogreader; + bool backupEndRequired = false; + bool backupFromStandby = false; + DBState dbstate_at_startup; + XLogReaderState *xlogreader = NULL; + XLogReaderState *he3xlogreader = NULL; XLogPageReadPrivate private; - bool promoted = false; + XLogPageReadPrivate he3private; + bool promoted = false; struct stat st; startupPid = getpid(); + if(IsBootstrapProcessingMode() != true && InitdbSingle != true) { + initPthreadPool(); + if ((EnableHotStandby && *isPromoteIsTriggered == false && !push_standby)) + { + pthread_t ntid; + int err; + err = pthread_create(&ntid, NULL, CleanWalsInLmdb, NULL); + if (err != 0) + elog(PANIC,"pthread_create CleanWalsInLmdb failed %s",strerror(err)); + } + } /* * We should have an aux process resource owner to use, and we should not @@ -6638,60 +8178,60 @@ StartupXLOG(void) /* * Check that contents look valid. */ - if (!XRecOffIsValid(ControlFile->checkPoint)) - ereport(FATAL, - (errmsg("control file contains invalid checkpoint location"))); + // if (!XRecOffIsValid(ControlFile->checkPoint)) + // ereport(FATAL, + // (errmsg("control file contains invalid checkpoint location"))); switch (ControlFile->state) { - case DB_SHUTDOWNED: + case DB_SHUTDOWNED: - /* - * This is the expected case, so don't be chatty in standalone - * mode - */ - ereport(IsPostmasterEnvironment ? LOG : NOTICE, - (errmsg("database system was shut down at %s", - str_time(ControlFile->time)))); - break; + /* + * This is the expected case, so don't be chatty in standalone + * mode + */ + ereport(IsPostmasterEnvironment ? 
LOG : NOTICE, + (errmsg("database system was shut down at %s", + str_time(ControlFile->time)))); + break; - case DB_SHUTDOWNED_IN_RECOVERY: - ereport(LOG, - (errmsg("database system was shut down in recovery at %s", - str_time(ControlFile->time)))); - break; + case DB_SHUTDOWNED_IN_RECOVERY: + ereport(LOG, + (errmsg("database system was shut down in recovery at %s", + str_time(ControlFile->time)))); + break; - case DB_SHUTDOWNING: - ereport(LOG, - (errmsg("database system shutdown was interrupted; last known up at %s", - str_time(ControlFile->time)))); - break; + case DB_SHUTDOWNING: + ereport(LOG, + (errmsg("database system shutdown was interrupted; last known up at %s", + str_time(ControlFile->time)))); + break; - case DB_IN_CRASH_RECOVERY: - ereport(LOG, - (errmsg("database system was interrupted while in recovery at %s", - str_time(ControlFile->time)), - errhint("This probably means that some data is corrupted and" - " you will have to use the last backup for recovery."))); - break; + case DB_IN_CRASH_RECOVERY: + ereport(LOG, + (errmsg("database system was interrupted while in recovery at %s", + str_time(ControlFile->time)), + errhint("This probably means that some data is corrupted and" + " you will have to use the last backup for recovery."))); + break; - case DB_IN_ARCHIVE_RECOVERY: - ereport(LOG, - (errmsg("database system was interrupted while in recovery at log time %s", - str_time(ControlFile->checkPointCopy.time)), - errhint("If this has occurred more than once some data might be corrupted" - " and you might need to choose an earlier recovery target."))); - break; + case DB_IN_ARCHIVE_RECOVERY: + ereport(LOG, + (errmsg("database system was interrupted while in recovery at log time %s", + str_time(ControlFile->checkPointCopy.time)), + errhint("If this has occurred more than once some data might be corrupted" + " and you might need to choose an earlier recovery target."))); + break; - case DB_IN_PRODUCTION: - ereport(LOG, - (errmsg("database system was 
interrupted; last known up at %s", - str_time(ControlFile->time)))); - break; + case DB_IN_PRODUCTION: + ereport(LOG, + (errmsg("database system was interrupted; last known up at %s", + str_time(ControlFile->time)))); + break; - default: - ereport(FATAL, - (errmsg("control file contains invalid database cluster state"))); + default: + ereport(FATAL, + (errmsg("control file contains invalid database cluster state"))); } /* This is just to allow attaching to startup process with a debugger */ @@ -6724,7 +8264,7 @@ StartupXLOG(void) if (ControlFile->state != DB_SHUTDOWNED && ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) { - RemoveTempXlogFiles(); + // RemoveTempXlogFiles(); SyncDataDirectory(); } @@ -6782,19 +8322,37 @@ StartupXLOG(void) /* Set up XLOG reader facility */ MemSet(&private, 0, sizeof(XLogPageReadPrivate)); - xlogreader = - XLogReaderAllocate(wal_segment_size, NULL, - XL_ROUTINE(.page_read = &XLogPageRead, - .segment_open = NULL, - .segment_close = wal_segment_close), - &private); + MemSet(&he3private, 0, sizeof(XLogPageReadPrivate)); + XLogReaderState *tmpxlogreader = NULL; + if (he3mirror) { + InitXLogInsert(); + he3xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.page_read = &XLogPageRead, + .segment_open = NULL, + .segment_close = wal_segment_close), + &he3private); + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.batch_read = &AllXLogBatchRead, + ), + &private); + tmpxlogreader = he3xlogreader; + he3xlogreader->system_identifier = ControlFile->system_identifier; + } else { + xlogreader = + XLogReaderAllocate(wal_segment_size, NULL, + XL_ROUTINE(.batch_read = &AllXLogBatchRead, + ), + &private); + tmpxlogreader = xlogreader; + } if (!xlogreader) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"), errdetail("Failed while allocating a WAL reading processor."))); xlogreader->system_identifier = ControlFile->system_identifier; - /* * Allocate two page buffers dedicated to WAL 
consistency checks. We do * it this way, rather than just making static arrays, for two reasons: @@ -6802,13 +8360,13 @@ StartupXLOG(void) * (2) a static char array isn't guaranteed to have any particular * alignment, whereas palloc() will provide MAXALIGN'd storage. */ - replay_image_masked = (char *) palloc(BLCKSZ); - primary_image_masked = (char *) palloc(BLCKSZ); - + replay_image_masked = (char *)palloc(BLCKSZ); + primary_image_masked = (char *)palloc(BLCKSZ); + bool firstStartup = true; if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)) { - List *tablespaces = NIL; + List *tablespaces = NIL; /* * Archive recovery was requested, and thanks to the backup label @@ -6823,15 +8381,15 @@ StartupXLOG(void) * When a backup_label file is present, we want to roll forward from * the checkpoint it identifies, rather than using pg_control. */ - record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true); + record = ReadCheckpointRecord(tmpxlogreader, checkPointLoc, 0, true); if (record != NULL) { - memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + memcpy(&checkPoint, XLogRecGetData(tmpxlogreader), sizeof(CheckPoint)); wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); ereport(DEBUG1, (errmsg_internal("checkpoint record is at %X/%X", LSN_FORMAT_ARGS(checkPointLoc)))); - InRecovery = true; /* force recovery even if SHUTDOWNED */ + InRecovery = true; /* force recovery even if SHUTDOWNED */ /* * Make sure that REDO location exists. 
This may not be the case @@ -6841,8 +8399,8 @@ StartupXLOG(void) */ if (checkPoint.redo < checkPointLoc) { - XLogBeginRead(xlogreader, checkPoint.redo); - if (!ReadRecord(xlogreader, LOG, false)) + XLogBeginRead(tmpxlogreader, checkPoint.redo); + if (!ReadRecord(tmpxlogreader, LOG, false)) ereport(FATAL, (errmsg("could not find redo location referenced by checkpoint record"), errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" @@ -6859,18 +8417,18 @@ StartupXLOG(void) "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", DataDir, DataDir, DataDir))); - wasShutdown = false; /* keep compiler quiet */ + wasShutdown = false; /* keep compiler quiet */ } /* read the tablespace_map file if present and create symlinks. */ if (read_tablespace_map(&tablespaces)) { - ListCell *lc; + ListCell *lc; - foreach(lc, tablespaces) + foreach (lc, tablespaces) { tablespaceinfo *ti = lfirst(lc); - char *linkloc; + char *linkloc; linkloc = psprintf("pg_tblspc/%s", ti->oid); @@ -6956,9 +8514,20 @@ StartupXLOG(void) } /* Get the last valid checkpoint record. 
*/ - checkPointLoc = ControlFile->checkPoint; - RedoStartLSN = ControlFile->checkPointCopy.redo; - record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true); + if (he3mirror) { + if (ControlFile->checkPointFile == 0) { + ControlFile->checkPointFile = ControlFile->checkPoint; + RedoStartLSN = ControlFile->checkPointCopy.redo; + } else { + firstStartup = false; + RedoStartLSN = ControlFile->checkPointCopy.redo; + } + checkPointLoc = ControlFile->checkPointFile; + } else{ + checkPointLoc = ControlFile->checkPoint; + RedoStartLSN = ControlFile->checkPointCopy.redo; + } + record = ReadCheckpointRecord(tmpxlogreader, checkPointLoc, 1, true); if (record != NULL) { ereport(DEBUG1, @@ -6976,7 +8545,10 @@ StartupXLOG(void) ereport(PANIC, (errmsg("could not locate a valid checkpoint record"))); } - memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); + memcpy(&checkPoint, XLogRecGetData(tmpxlogreader), sizeof(CheckPoint)); + if (he3mirror) { + checkPoint.redo = ControlFile->checkPointCopy.redo; + } wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); } @@ -6999,12 +8571,12 @@ StartupXLOG(void) * timeline in the history of the requested timeline, we cannot proceed: * the backup is not part of the history of the requested timeline. */ - Assert(expectedTLEs); /* was initialized by reading checkpoint - * record */ + Assert(expectedTLEs); /* was initialized by reading checkpoint + * record */ if (tliOfPointInHistory(checkPointLoc, expectedTLEs) != checkPoint.ThisTimeLineID) { - XLogRecPtr switchpoint; + XLogRecPtr switchpoint; /* * tliSwitchPoint will throw an error if the checkpoint's timeline is @@ -7024,17 +8596,21 @@ StartupXLOG(void) * The min recovery point should be part of the requested timeline's * history, too. 
*/ - if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && + if (!he3mirror && !XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != - ControlFile->minRecoveryPointTLI) + ControlFile->minRecoveryPointTLI) ereport(FATAL, (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u", recoveryTargetTLI, LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), ControlFile->minRecoveryPointTLI))); - LastRec = RecPtr = checkPointLoc; - + if (firstStartup) { + LastRec = RecPtr = checkPointLoc; + } else { + LastRec = RecPtr = checkPoint.redo; + } + ereport(DEBUG1, (errmsg_internal("redo record is at %X/%X; shutdown %s", LSN_FORMAT_ARGS(checkPoint.redo), @@ -7152,8 +8728,11 @@ StartupXLOG(void) restoreTwoPhaseData(); lastFullPageWrites = checkPoint.fullPageWrites; - - RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + if (firstStartup) { + RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; + } else { + RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = ControlFile->minRecoveryPoint; + } doPageWrites = lastFullPageWrites; if (RecPtr < checkPoint.redo) @@ -7178,7 +8757,13 @@ StartupXLOG(void) { /* force recovery due to presence of recovery signal file */ InRecovery = true; - } else if (push_standby == false) { + } + else if (push_standby == false) + { + InRecovery = true; + } + else if (he3_point_in_time_recovery) + { InRecovery = true; } @@ -7187,11 +8772,11 @@ StartupXLOG(void) */ abortedRecPtr = InvalidXLogRecPtr; missingContrecPtr = InvalidXLogRecPtr; - + pthread_t ntid = 0; /* REDO */ if (InRecovery) { - int rmid; + int rmid; /* * Update pg_control to show that we are recovering and to show the @@ -7225,7 +8810,11 @@ StartupXLOG(void) XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH; SpinLockRelease(&XLogCtl->info_lck); } - ControlFile->checkPoint = checkPointLoc; + if (firstStartup) { + 
ControlFile->checkPoint = checkPointLoc; + } else { + ControlFile->checkPoint = checkPoint.redo; + } ControlFile->checkPointCopy = checkPoint; if (InArchiveRecovery) { @@ -7259,16 +8848,19 @@ StartupXLOG(void) if (backupFromStandby) { + /* if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY && dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY) ereport(FATAL, (errmsg("backup_label contains data inconsistent with control file"), errhint("This means that the backup is corrupted and you will " "have to use another backup for recovery."))); + */ + //备机备份因为日志拆分,minRecoveryPoint结束位置无法对应kv的wal位置 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint; } } - ControlFile->time = (pg_time_t) time(NULL); + ControlFile->time = (pg_time_t)time(NULL); /* No need to hold ControlFileLock yet, we aren't up far enough */ UpdateControlFile(); @@ -7351,7 +8943,7 @@ StartupXLOG(void) if (ArchiveRecoveryRequested && EnableHotStandby) { TransactionId *xids; - int nxids; + int nxids; ereport(DEBUG1, (errmsg_internal("initializing for hot standby"))); @@ -7423,8 +9015,13 @@ StartupXLOG(void) SpinLockAcquire(&XLogCtl->info_lck); if (checkPoint.redo < RecPtr) XLogCtl->replayEndRecPtr = checkPoint.redo; - else - XLogCtl->replayEndRecPtr = EndRecPtr; + else { + if (!firstStartup) { + XLogCtl->replayEndRecPtr = checkPoint.redo + record->xl_tot_len; + } else { + XLogCtl->replayEndRecPtr = EndRecPtr; + } + } XLogCtl->replayEndTLI = ThisTimeLineID; XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr; XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI; @@ -7452,11 +9049,14 @@ StartupXLOG(void) { PublishStartupProcessInformation(); #ifndef PG_NOREPLAY - if (push_standby == false) { + if (push_standby == false) + { #endif EnableSyncRequestForwarding(); #ifndef PG_NOREPLAY - } else { + } + else + { ProcessSyncRequests(); } #endif @@ -7470,27 +9070,102 @@ StartupXLOG(void) */ CheckRecoveryConsistency(); + if (he3mirror) { + he3xlogreader->insertTikv = true; + Insert = &XLogCtl->Insert; + 
xlogreader->insertTikv = true; + /* + * He3DB as replica, when restart he3db, because wal log lsn become bigger than pg, same as minRecoveryPoint. + * if we still use pg lsn as start location for insert kv, we may can not reached consistent recovery state. + * So set start location for insert kv with minRecoveryPoint when minRecoveryPoint not 0. + */ + if (!firstStartup){ + XLogBeginRead(xlogreader, checkPoint.redo); + record = ReadRecord(xlogreader, PANIC, false); + Insert->PrevBytePos = record->xl_prev; + Insert->CurrBytePos = checkPoint.redo; + XLogCtl->InitializedUpTo = ControlFile->minRecoveryPoint; + LogwrtResult.Write = ControlFile->minRecoveryPoint; + pg_atomic_write_u64(&LogwrtResult.Flush, ControlFile->minRecoveryPoint); + XLogCtl->LogwrtResult = LogwrtResult; + XLogCtl->LogwrtRqst.Write = ControlFile->minRecoveryPoint; + // XLogCtl->LogwrtRqst.Flush = checkPoint.redo; + pg_atomic_write_u64(&(XLogCtl->LogwrtRqst.Flush), ControlFile->minRecoveryPoint); + } else{ + Insert->PrevBytePos = checkPoint.redo; + Insert->CurrBytePos = checkPoint.redo; + XLogCtl->InitializedUpTo = checkPoint.redo; + LogwrtResult.Write = checkPoint.redo; + pg_atomic_write_u64(&LogwrtResult.Flush, checkPoint.redo); + XLogCtl->LogwrtResult = LogwrtResult; + XLogCtl->LogwrtRqst.Write = checkPoint.redo; + // XLogCtl->LogwrtRqst.Flush = checkPoint.redo; + pg_atomic_write_u64(&(XLogCtl->LogwrtRqst.Flush), checkPoint.redo); + } + XLogCtl->ThisTimeLineID = ThisTimeLineID; + + } + /* * Find the first record that logically follows the checkpoint --- it * might physically precede it, though. 
*/ + if(IsBootstrapProcessingMode() != true && InitdbSingle != true) { + xlogreader->streamStart = true; + pfree(xlogreader->readBuf); + xlogreader->readBuf = NULL; + xlogreader->readLen = 0; + } + int err; if (checkPoint.redo < RecPtr) { /* back up to find the record */ XLogBeginRead(xlogreader, checkPoint.redo); + if(IsBootstrapProcessingMode() != true && InitdbSingle != true) { + *g_redoStartLsn = xlogreader->EndRecPtr; + if (he3mirror) { + XLogBeginRead(he3xlogreader, checkPoint.redo); + XLogPageReadPrivate *private = he3xlogreader->private_data; + private->fetching_ckpt = false; + err = pthread_create(&ntid,NULL,thr_fn,(void*)he3xlogreader); + } else { + err = pthread_create(&ntid,NULL,thr_fn,(void*)xlogreader); + } + if (err != 0) { + elog(FATAL,"pthread_create redo failed %s",strerror(err)); + } + } record = ReadRecord(xlogreader, PANIC, false); } else - { + { + + if (!firstStartup) { + XLogBeginRead(xlogreader, checkPoint.redo); + } /* just have to read next record after CheckPoint */ + if(IsBootstrapProcessingMode() != true && InitdbSingle != true) { + *g_redoStartLsn = xlogreader->EndRecPtr; + if (he3mirror) { + XLogBeginRead(he3xlogreader, checkPointLoc); + XLogPageReadPrivate *private = he3xlogreader->private_data; + private->fetching_ckpt = false; + err = pthread_create(&ntid,NULL,thr_fn,(void*)he3xlogreader); + } else { + err = pthread_create(&ntid,NULL,thr_fn,(void*)xlogreader); + } + if (err != 0) { + elog(FATAL,"pthread_create failed %s",strerror(err)); + } + } record = ReadRecord(xlogreader, LOG, false); } - + if (record != NULL) { ErrorContextCallback errcallback; TimestampTz xtime; - PGRUsage ru0; + PGRUsage ru0; pg_rusage_init(&ru0); @@ -7505,7 +9180,7 @@ StartupXLOG(void) */ do { - bool switchedTLI = false; + bool switchedTLI = false; #ifdef WAL_DEBUG if (XLOG_DEBUG || @@ -7516,8 +9191,8 @@ StartupXLOG(void) initStringInfo(&buf); appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", - LSN_FORMAT_ARGS(ReadRecPtr), - LSN_FORMAT_ARGS(EndRecPtr)); + 
LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), + LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); xlog_outrec(&buf, xlogreader); appendStringInfoString(&buf, " - "); xlog_outdesc(&buf, xlogreader); @@ -7542,7 +9217,7 @@ StartupXLOG(void) * otherwise would is a minor issue, so it doesn't seem worth * adding another spinlock cycle to prevent that. */ - if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != + if (((volatile XLogCtlData *)XLogCtl)->recoveryPauseState != RECOVERY_NOT_PAUSED) recoveryPausesHere(false); @@ -7568,14 +9243,14 @@ StartupXLOG(void) * here otherwise pausing during the delay-wait wouldn't * work. */ - if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != + if (((volatile XLogCtlData *)XLogCtl)->recoveryPauseState != RECOVERY_NOT_PAUSED) recoveryPausesHere(false); } /* Setup error traceback support for ereport() */ errcallback.callback = rm_redo_error_callback; - errcallback.arg = (void *) xlogreader; + errcallback.arg = (void *)xlogreader; errcallback.previous = error_context_stack; error_context_stack = &errcallback; @@ -7595,13 +9270,13 @@ StartupXLOG(void) */ if (record->xl_rmid == RM_XLOG_ID) { - TimeLineID newTLI = ThisTimeLineID; - TimeLineID prevTLI = ThisTimeLineID; - uint8 info = record->xl_info & ~XLR_INFO_MASK; + TimeLineID newTLI = ThisTimeLineID; + TimeLineID prevTLI = ThisTimeLineID; + uint8 info = record->xl_info & ~XLR_INFO_MASK; if (info == XLOG_CHECKPOINT_SHUTDOWN) { - CheckPoint checkPoint; + CheckPoint checkPoint; memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); newTLI = checkPoint.ThisTimeLineID; @@ -7619,7 +9294,7 @@ StartupXLOG(void) if (newTLI != ThisTimeLineID) { /* Check that it's OK to switch to this TLI */ - checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI); + checkTimeLineSwitch(xlogreader->EndRecPtr, newTLI, prevTLI); /* Following WAL records should be run with new TLI */ ThisTimeLineID = newTLI; @@ -7632,8 +9307,9 @@ StartupXLOG(void) * so that XLogFlush will update minRecoveryPoint correctly. 
*/ SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->replayEndRecPtr = EndRecPtr; + XLogCtl->replayEndRecPtr = xlogreader->EndRecPtr; XLogCtl->replayEndTLI = ThisTimeLineID; + XLogCtl->ThisTimeLineID = ThisTimeLineID; SpinLockRelease(&XLogCtl->info_lck); /* @@ -7643,11 +9319,54 @@ StartupXLOG(void) if (standbyState >= STANDBY_INITIALIZED && TransactionIdIsValid(record->xl_xid)) RecordKnownAssignedTransactionIds(record->xl_xid); - + RelFileNode rnode; + BlockNumber blkno; + ForkNumber forknum; + if (xlogreader->max_block_id == 0) { + XLogRecGetBlockTag(xlogreader, 0, &rnode, &forknum, &blkno); + // elog(LOG,"======to_replay========%d_%d_%d_%d",rnode.dbNode,rnode.relNode,forknum,blkno); + } /* Now apply the WAL record itself */ - xlogreader->isreplay = true; - RmgrTable[record->xl_rmid].rm_redo(xlogreader); - + bool hasReplay = false; + if (!he3db_xlog_donot_to_replay(record)) { + if (push_standby == true || EnableHotStandby == false || *isPromoteIsTriggered) { + RmgrId rmgrId = XLogRecGetRmid(xlogreader); + uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; + if (rmgrId == RM_SMGR_ID && info == XLOG_SMGR_TRUNCATE) { + pushTikv(0,hashMapSize(),true); + } + } else { + RmgrId rmgrId = XLogRecGetRmid(xlogreader); + uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; + if (rmgrId == RM_SMGR_ID && info == XLOG_SMGR_TRUNCATE) { + XLogRecPtr consistPtr, startlsn; + consistPtr = GetXLogPushToDisk(); + startlsn = record->xl_end - record->xl_tot_len; + while (consistPtr < startlsn) + { + pg_usleep(100000L); + elog(LOG, "standby consist lsn %ld, truncate lsn %ld", consistPtr, startlsn); + consistPtr = GetXLogPushToDisk(); + } + } + } + RmgrTable[record->xl_rmid].rm_redo(xlogreader); + hasReplay = true; + } else { +#ifndef PG_NOREPLAY + if(IsBootstrapProcessingMode() != true && InitdbSingle != true) { + if (true == data_buffer_for_replay(xlogreader,record->xl_end-record->xl_tot_len,record->xl_end)) { + RmgrTable[record->xl_rmid].rm_redo(xlogreader); + hasReplay = 
true; + } + } else { +#endif + RmgrTable[record->xl_rmid].rm_redo(xlogreader); + hasReplay = true; +#ifndef PG_NOREPLAY + } +#endif + } /* * After redo, check whether the backup pages associated with * the WAL record are consistent with the existing pages. This @@ -7660,8 +9379,17 @@ StartupXLOG(void) /* Pop the error context stack */ error_context_stack = errcallback.previous; - if (xlogreader->isreplay == true) { - updateLastReplayLsn(); + if (hasReplay == true) + { + updateLastReplayLsn(xlogreader); + //initdb no need to replay +#ifndef PG_NOREPLAY + if(IsBootstrapProcessingMode() != true && InitdbSingle!=true) { + if (push_standby == true || EnableHotStandby == false || *isPromoteIsTriggered) { + pushTikv(0, hashMapSize(),false); + } + } +#endif } /* @@ -7689,7 +9417,7 @@ StartupXLOG(void) * (possibly bogus) future WAL segments on the old * timeline. */ - RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID); + // RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID); /* * Wake up any walsenders to notice that we are on a new @@ -7706,10 +9434,20 @@ StartupXLOG(void) break; } + if (he3_point_in_time_recovery && he3recoveryStopsAfter(xlogreader)) + { + pushTikv(0,hashMapSize(),true); + reachedRecoveryTarget = true; + WalTaskImmediateFree(); + break; + } /* Else, try to fetch the next WAL record */ record = ReadRecord(xlogreader, LOG, false); - } while (record != NULL); - + } while (record != NULL); + pthread_join(ntid,NULL); + if (ProcHasReleaseFlag()) { + exit(1); + } /* * end of main redo apply loop */ @@ -7728,23 +9466,23 @@ StartupXLOG(void) */ switch (recoveryTargetAction) { - case RECOVERY_TARGET_ACTION_SHUTDOWN: + case RECOVERY_TARGET_ACTION_SHUTDOWN: - /* - * exit with special return code to request shutdown - * of postmaster. Log messages issued from - * postmaster. - */ - proc_exit(3); + /* + * exit with special return code to request shutdown + * of postmaster. Log messages issued from + * postmaster. 
+ */ + proc_exit(3); - case RECOVERY_TARGET_ACTION_PAUSE: - SetRecoveryPause(true); - recoveryPausesHere(true); + case RECOVERY_TARGET_ACTION_PAUSE: + SetRecoveryPause(true); + recoveryPausesHere(true); - /* drop into promote */ + /* drop into promote */ - case RECOVERY_TARGET_ACTION_PROMOTE: - break; + case RECOVERY_TARGET_ACTION_PROMOTE: + break; } } @@ -7772,7 +9510,6 @@ StartupXLOG(void) /* there are no WAL records following the checkpoint */ ereport(LOG, (errmsg("redo is not required"))); - } /* @@ -7830,6 +9567,13 @@ StartupXLOG(void) * valid or last applied record, so we can identify the exact endpoint of * what we consider the valid portion of WAL. */ + + if (xlogreader->streamStart == true) { + xlogreader->readBuf = (char *) palloc_extended(4 * XLOG_BLCKSZ, + MCXT_ALLOC_NO_OOM); + xlogreader->readLen = 0; + xlogreader->streamStart = false; + } XLogBeginRead(xlogreader, LastRec); record = ReadRecord(xlogreader, PANIC, false); EndOfLog = EndRecPtr; @@ -7841,7 +9585,7 @@ StartupXLOG(void) * and we were reading the old WAL from a segment belonging to a higher * timeline. */ - EndOfLogTLI = xlogreader->seg.ws_tli; + EndOfLogTLI = ThisTimeLineID; /* * Complain if we did not roll forward far enough to render the backup @@ -7901,12 +9645,12 @@ StartupXLOG(void) * In a normal crash recovery, we can just extend the timeline we were in. */ PrevTimeLineID = ThisTimeLineID; - if (ArchiveRecoveryRequested) + if (ArchiveRecoveryRequested || he3_point_in_time_recovery) { - char reason[200]; - char recoveryPath[MAXPGPATH]; + char reason[200]; + char recoveryPath[MAXPGPATH]; - Assert(InArchiveRecovery); + Assert(InArchiveRecovery || he3_point_in_time_recovery); ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1; ereport(LOG, @@ -7946,7 +9690,8 @@ StartupXLOG(void) * (Note that we also have a copy of the last block of the old WAL in * readBuf; we will use that below.) 
*/ - exitArchiveRecovery(EndOfLogTLI, EndOfLog); + if (!he3_point_in_time_recovery) + exitHe3ArchiveRecovery(EndOfLogTLI); /* * Write the timeline history file, and have it archived. After this @@ -7966,11 +9711,11 @@ StartupXLOG(void) * rid of it. */ snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG"); - unlink(recoveryPath); /* ignore any error */ + unlink(recoveryPath); /* ignore any error */ /* Get rid of any remaining recovered timeline-history file, too */ snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY"); - unlink(recoveryPath); /* ignore any error */ + unlink(recoveryPath); /* ignore any error */ } /* Save the selected TimeLineID in shared memory, too */ @@ -8003,43 +9748,44 @@ StartupXLOG(void) * record spans, not the one it starts in. The last block is indeed the * one we want to use. */ - if (EndOfLog % XLOG_BLCKSZ != 0) - { - char *page; - int len; - int firstIdx; - XLogRecPtr pageBeginPtr; + // if (EndOfLog % XLOG_BLCKSZ != 0) + // { + // char *page; + // int len; + // int firstIdx; + // XLogRecPtr pageBeginPtr; - pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ); - Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); + // pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ); + // Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); - firstIdx = XLogRecPtrToBufIdx(EndOfLog); + // firstIdx = XLogRecPtrToBufIdx(EndOfLog); - /* Copy the valid part of the last block, and zero the rest */ - page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; - len = EndOfLog % XLOG_BLCKSZ; - memcpy(page, xlogreader->readBuf, len); - memset(page + len, 0, XLOG_BLCKSZ - len); + // /* Copy the valid part of the last block, and zero the rest */ + // page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; + // len = EndOfLog % XLOG_BLCKSZ; + // memcpy(page, xlogreader->readBuf, len); + // memset(page + len, 0, XLOG_BLCKSZ - len); - XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; - XLogCtl->InitializedUpTo = pageBeginPtr + 
XLOG_BLCKSZ; - } - else - { + // XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; + // XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ; + // } + // else + // { /* * There is no partial block to copy. Just set InitializedUpTo, and * let the first attempt to insert a log record to initialize the next * buffer. */ - XLogCtl->InitializedUpTo = EndOfLog; - } + XLogCtl->InitializedUpTo = EndOfLog; + // } - LogwrtResult.Write = LogwrtResult.Flush = EndOfLog; + LogwrtResult.Write = EndOfLog; + pg_atomic_write_u64(&LogwrtResult.Flush, EndOfLog); XLogCtl->LogwrtResult = LogwrtResult; XLogCtl->LogwrtRqst.Write = EndOfLog; - XLogCtl->LogwrtRqst.Flush = EndOfLog; + pg_atomic_write_u64(&(XLogCtl->LogwrtRqst.Flush), EndOfLog); LocalSetXLogInsertAllowed(); @@ -8135,7 +9881,7 @@ StartupXLOG(void) * pre-allocated files containing garbage. In any case, they are not * part of the new timeline's history so we don't need them. */ - RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID); + // RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID); /* * If the switch happened in the middle of a segment, what to do with @@ -8166,41 +9912,42 @@ StartupXLOG(void) * restored from the archive to begin with, it's expected to have a * .done file). 
*/ - if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 && - XLogArchivingActive()) - { - char origfname[MAXFNAMELEN]; - XLogSegNo endLogSegNo; - XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size); - XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size); + // if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 && + // XLogArchivingActive()) + // { + // char origfname[MAXFNAMELEN]; + // XLogSegNo endLogSegNo; - if (!XLogArchiveIsReadyOrDone(origfname)) - { - char origpath[MAXPGPATH]; - char partialfname[MAXFNAMELEN]; - char partialpath[MAXPGPATH]; + // XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size); + // XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size); - XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size); - snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname); - snprintf(partialpath, MAXPGPATH, "%s.partial", origpath); + // if (!XLogArchiveIsReadyOrDone(origfname)) + // { + // char origpath[MAXPGPATH]; + // char partialfname[MAXFNAMELEN]; + // char partialpath[MAXPGPATH]; - /* - * Make sure there's no .done or .ready file for the .partial - * file. - */ - XLogArchiveCleanup(partialfname); + // XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size); + // snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname); + // snprintf(partialpath, MAXPGPATH, "%s.partial", origpath); - durable_rename(origpath, partialpath, ERROR); - XLogArchiveNotify(partialfname); - } - } + // /* + // * Make sure there's no .done or .ready file for the .partial + // * file. + // */ + // XLogArchiveCleanup(partialfname); + + // durable_rename(origpath, partialpath, ERROR); + // XLogArchiveNotify(partialfname); + // } + // } } /* * Preallocate additional log files, if wanted. */ - PreallocXlogFiles(EndOfLog); + // PreallocXlogFiles(EndOfLog); /* * Okay, we're officially UP. 
@@ -8208,7 +9955,7 @@ StartupXLOG(void) InRecovery = false; /* start the archive_timeout timer and LSN running */ - XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); + XLogCtl->lastSegSwitchTime = (pg_time_t)time(NULL); XLogCtl->lastSegSwitchLSN = EndOfLog; /* also initialize latestCompletedXid, to nextXid - 1 */ @@ -8234,11 +9981,13 @@ StartupXLOG(void) RecoverPreparedTransactions(); /* Shut down xlogreader */ - if (readFile >= 0) - { - close(readFile); - readFile = -1; - } + if (he3mirror) { + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + } XLogReaderFree(xlogreader); /* @@ -8271,7 +10020,7 @@ StartupXLOG(void) */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_IN_PRODUCTION; - ControlFile->time = (pg_time_t) time(NULL); + ControlFile->time = (pg_time_t)time(NULL); SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE; @@ -8305,7 +10054,10 @@ StartupXLOG(void) * appropriate now that we're not in standby mode anymore. */ if (promoted) + { + *isPromoteIsTriggered = true; RequestCheckpoint(CHECKPOINT_FORCE); + } } /* @@ -8316,16 +10068,17 @@ StartupXLOG(void) static void CheckRecoveryConsistency(void) { - XLogRecPtr lastReplayedEndRecPtr; + XLogRecPtr lastReplayedEndRecPtr; /* * During crash recovery, we don't reach a consistent state until we've * replayed all the WAL. 
*/ - if (XLogRecPtrIsInvalid(minRecoveryPoint)) + if (XLogRecPtrIsInvalid(minRecoveryPoint) && !he3_point_in_time_recovery) return; - Assert(InArchiveRecovery); + if (!he3_point_in_time_recovery) + Assert(InArchiveRecovery); /* * assume that we are called in the startup process, and hence don't need @@ -8350,8 +10103,8 @@ CheckRecoveryConsistency(void) LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr) - ControlFile->minRecoveryPoint = lastReplayedEndRecPtr; + /*if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr) + ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;*/ ControlFile->backupStartPoint = InvalidXLogRecPtr; ControlFile->backupEndPoint = InvalidXLogRecPtr; @@ -8401,6 +10154,8 @@ CheckRecoveryConsistency(void) LocalHotStandbyActive = true; SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY); + //sleep 200 ms for conflict between PMSIGNAL_START_WALRECEIVER with PMSIGNAL_BEGIN_HOT_STANDBY + usleep(200000); } } @@ -8413,8 +10168,7 @@ CheckRecoveryConsistency(void) * As a side-effect, we initialize the local TimeLineID and RedoRecPtr * variables the first time we see that recovery is finished. */ -bool -RecoveryInProgress(void) +bool RecoveryInProgress(void) { /* * We check shared state each time only until we leave recovery mode. We @@ -8487,8 +10241,7 @@ GetRecoveryState(void) * shared memory. (And note that standbyState alone doesn't tell the truth * anyway.) */ -bool -HotStandbyActive(void) +bool HotStandbyActive(void) { /* * We check shared state each time only until Hot Standby is active. We @@ -8512,8 +10265,7 @@ HotStandbyActive(void) * Like HotStandbyActive(), but to be used only in WAL replay code, * where we don't need to ask any other process what the state is. 
*/ -bool -HotStandbyActiveInReplay(void) +bool HotStandbyActiveInReplay(void) { Assert(AmStartupProcess() || !IsPostmasterEnvironment); return LocalHotStandbyActive; @@ -8526,8 +10278,7 @@ HotStandbyActiveInReplay(void) * But we also have provisions for forcing the result "true" or "false" * within specific processes regardless of the global state. */ -bool -XLogInsertAllowed(void) +bool XLogInsertAllowed(void) { /* * If value is "unconditionally true" or "unconditionally false", just @@ -8535,7 +10286,7 @@ XLogInsertAllowed(void) * done. */ if (LocalXLogInsertAllowed >= 0) - return (bool) LocalXLogInsertAllowed; + return (bool)LocalXLogInsertAllowed; /* * Else, must check to see if we're still in recovery. @@ -8578,26 +10329,46 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int whichChkpt, bool report) { XLogRecord *record; - uint8 info; + uint8 info; + if (he3mirror) { + if (!XRecOffIsValid(RecPtr)) + { + if (!report) + return NULL; - if (!XRecOffIsValid(RecPtr)) - { - if (!report) - return NULL; + switch (whichChkpt) + { + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint link in control file"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint link in backup_label file"))); + break; + } + return NULL; + } + } - switch (whichChkpt) - { - case 1: - ereport(LOG, - (errmsg("invalid primary checkpoint link in control file"))); - break; - default: - ereport(LOG, - (errmsg("invalid checkpoint link in backup_label file"))); - break; - } - return NULL; - } + // if (!XRecOffIsValid(RecPtr)) + // { + // if (!report) + // return NULL; + + // switch (whichChkpt) + // { + // case 1: + // ereport(LOG, + // (errmsg("invalid primary checkpoint link in control file"))); + // break; + // default: + // ereport(LOG, + // (errmsg("invalid checkpoint link in backup_label file"))); + // break; + // } + // return NULL; + // } XLogBeginRead(xlogreader, RecPtr); record = ReadRecord(xlogreader, LOG, true); @@ -8609,14 +10380,14 @@ 
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, switch (whichChkpt) { - case 1: - ereport(LOG, - (errmsg("invalid primary checkpoint record"))); - break; - default: - ereport(LOG, - (errmsg("invalid checkpoint record"))); - break; + case 1: + ereport(LOG, + (errmsg("invalid primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid checkpoint record"))); + break; } return NULL; } @@ -8624,14 +10395,14 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, { switch (whichChkpt) { - case 1: - ereport(LOG, - (errmsg("invalid resource manager ID in primary checkpoint record"))); - break; - default: - ereport(LOG, - (errmsg("invalid resource manager ID in checkpoint record"))); - break; + case 1: + ereport(LOG, + (errmsg("invalid resource manager ID in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid resource manager ID in checkpoint record"))); + break; } return NULL; } @@ -8641,14 +10412,14 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, { switch (whichChkpt) { - case 1: - ereport(LOG, - (errmsg("invalid xl_info in primary checkpoint record"))); - break; - default: - ereport(LOG, - (errmsg("invalid xl_info in checkpoint record"))); - break; + case 1: + ereport(LOG, + (errmsg("invalid xl_info in primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid xl_info in checkpoint record"))); + break; } return NULL; } @@ -8656,14 +10427,14 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, { switch (whichChkpt) { - case 1: - ereport(LOG, - (errmsg("invalid length of primary checkpoint record"))); - break; - default: - ereport(LOG, - (errmsg("invalid length of checkpoint record"))); - break; + case 1: + ereport(LOG, + (errmsg("invalid length of primary checkpoint record"))); + break; + default: + ereport(LOG, + (errmsg("invalid length of checkpoint record"))); + break; } return NULL; } @@ -8679,8 +10450,7 @@ 
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, * process's copies of ThisTimeLineID and RedoRecPtr valid too. This was * unnecessary however, since the postmaster itself never touches XLOG anyway. */ -void -InitXLOGAccess(void) +void InitXLOGAccess(void) { XLogCtlInsert *Insert = &XLogCtl->Insert; @@ -8692,7 +10462,7 @@ InitXLOGAccess(void) wal_segment_size = ControlFile->xlog_seg_size; /* Use GetRedoRecPtr to copy the RedoRecPtr safely */ - (void) GetRedoRecPtr(); + (void)GetRedoRecPtr(); /* Also update our copy of doPageWrites. */ doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites); @@ -8708,7 +10478,7 @@ InitXLOGAccess(void) XLogRecPtr GetRedoRecPtr(void) { - XLogRecPtr ptr; + XLogRecPtr ptr; /* * The possibly not up-to-date copy in XlogCtl is enough. Even if we @@ -8734,8 +10504,7 @@ GetRedoRecPtr(void) * possibly out-of-date. XLogInsertRecord will re-check them against * up-to-date values, while holding the WAL insert lock. */ -void -GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) +void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) { *RedoRecPtr_p = RedoRecPtr; *doPageWrites_p = doPageWrites; @@ -8752,7 +10521,7 @@ GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) XLogRecPtr GetInsertRecPtr(void) { - XLogRecPtr recptr; + XLogRecPtr recptr; SpinLockAcquire(&XLogCtl->info_lck); recptr = XLogCtl->LogwrtRqst.Write; @@ -8771,8 +10540,7 @@ GetFlushRecPtr(void) SpinLockAcquire(&XLogCtl->info_lck); LogwrtResult = XLogCtl->LogwrtResult; SpinLockRelease(&XLogCtl->info_lck); - - return LogwrtResult.Flush; + return (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush); } /* @@ -8786,12 +10554,12 @@ GetFlushRecPtr(void) XLogRecPtr GetLastImportantRecPtr(void) { - XLogRecPtr res = InvalidXLogRecPtr; - int i; + XLogRecPtr res = InvalidXLogRecPtr; + int i; for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) { - XLogRecPtr last_important; + XLogRecPtr last_important; /* * Need to take a 
lock to prevent torn reads of the LSN, which are @@ -8815,7 +10583,7 @@ GetLastImportantRecPtr(void) pg_time_t GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN) { - pg_time_t result; + pg_time_t result; /* Need WALWriteLock, but shared lock is sufficient */ LWLockAcquire(WALWriteLock, LW_SHARED); @@ -8829,8 +10597,7 @@ GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN) /* * This must be called ONCE during postmaster or standalone-backend shutdown */ -void -ShutdownXLOG(int code, Datum arg) +void ShutdownXLOG(int code, Datum arg) { /* * We should have an aux process resource owner to use, and we should not @@ -8881,7 +10648,7 @@ LogCheckpointStart(int flags, bool restartpoint) { if (restartpoint) ereport(LOG, - /* translator: the placeholders show checkpoint options */ + /* translator: the placeholders show checkpoint options */ (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", @@ -8893,7 +10660,7 @@ LogCheckpointStart(int flags, bool restartpoint) (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : ""))); else ereport(LOG, - /* translator: the placeholders show checkpoint options */ + /* translator: the placeholders show checkpoint options */ (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s", (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "", (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "", @@ -8911,12 +10678,12 @@ LogCheckpointStart(int flags, bool restartpoint) static void LogCheckpointEnd(bool restartpoint) { - long write_msecs, - sync_msecs, - total_msecs, - longest_msecs, - average_msecs; - uint64 average_sync_time; + long write_msecs, + sync_msecs, + total_msecs, + longest_msecs, + average_msecs; + uint64 average_sync_time; CheckpointStats.ckpt_end_t = GetCurrentTimestamp(); @@ -8944,13 +10711,13 @@ LogCheckpointEnd(bool restartpoint) * Timing values returned from CheckpointStats are in microseconds. 
* Convert to milliseconds for consistent printing. */ - longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000); + longest_msecs = (long)((CheckpointStats.ckpt_longest_sync + 999) / 1000); average_sync_time = 0; if (CheckpointStats.ckpt_sync_rels > 0) average_sync_time = CheckpointStats.ckpt_agg_sync_time / - CheckpointStats.ckpt_sync_rels; - average_msecs = (long) ((average_sync_time + 999) / 1000); + CheckpointStats.ckpt_sync_rels; + average_msecs = (long)((average_sync_time + 999) / 1000); if (restartpoint) ereport(LOG, @@ -8960,18 +10727,18 @@ LogCheckpointEnd(bool restartpoint) "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " "distance=%d kB, estimate=%d kB", CheckpointStats.ckpt_bufs_written, - (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, + (double)CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, CheckpointStats.ckpt_segs_removed, CheckpointStats.ckpt_segs_recycled, - write_msecs / 1000, (int) (write_msecs % 1000), - sync_msecs / 1000, (int) (sync_msecs % 1000), - total_msecs / 1000, (int) (total_msecs % 1000), + write_msecs / 1000, (int)(write_msecs % 1000), + sync_msecs / 1000, (int)(sync_msecs % 1000), + total_msecs / 1000, (int)(total_msecs % 1000), CheckpointStats.ckpt_sync_rels, - longest_msecs / 1000, (int) (longest_msecs % 1000), - average_msecs / 1000, (int) (average_msecs % 1000), - (int) (PrevCheckPointDistance / 1024.0), - (int) (CheckPointDistanceEstimate / 1024.0)))); + longest_msecs / 1000, (int)(longest_msecs % 1000), + average_msecs / 1000, (int)(average_msecs % 1000), + (int)(PrevCheckPointDistance / 1024.0), + (int)(CheckPointDistanceEstimate / 1024.0)))); else ereport(LOG, (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); " @@ -8980,18 +10747,18 @@ LogCheckpointEnd(bool restartpoint) "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " "distance=%d kB, estimate=%d kB", CheckpointStats.ckpt_bufs_written, - (double) CheckpointStats.ckpt_bufs_written * 
100 / NBuffers, + (double)CheckpointStats.ckpt_bufs_written * 100 / NBuffers, CheckpointStats.ckpt_segs_added, CheckpointStats.ckpt_segs_removed, CheckpointStats.ckpt_segs_recycled, - write_msecs / 1000, (int) (write_msecs % 1000), - sync_msecs / 1000, (int) (sync_msecs % 1000), - total_msecs / 1000, (int) (total_msecs % 1000), + write_msecs / 1000, (int)(write_msecs % 1000), + sync_msecs / 1000, (int)(sync_msecs % 1000), + total_msecs / 1000, (int)(total_msecs % 1000), CheckpointStats.ckpt_sync_rels, - longest_msecs / 1000, (int) (longest_msecs % 1000), - average_msecs / 1000, (int) (average_msecs % 1000), - (int) (PrevCheckPointDistance / 1024.0), - (int) (CheckPointDistanceEstimate / 1024.0)))); + longest_msecs / 1000, (int)(longest_msecs % 1000), + average_msecs / 1000, (int)(average_msecs % 1000), + (int)(PrevCheckPointDistance / 1024.0), + (int)(CheckPointDistanceEstimate / 1024.0)))); } /* @@ -9030,7 +10797,7 @@ UpdateCheckPointDistanceEstimate(uint64 nbytes) CheckPointDistanceEstimate = nbytes; else CheckPointDistanceEstimate = - (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes); + (0.90 * CheckPointDistanceEstimate + 0.10 * (double)nbytes); } /* @@ -9055,7 +10822,7 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset) set_ps_display(""); else { - char activitymsg[128]; + char activitymsg[128]; snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s", (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "", @@ -9065,7 +10832,6 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset) } } - /* * Perform a checkpoint --- either during shutdown, or on-the-fly * @@ -9095,20 +10861,19 @@ update_checkpoint_display(int flags, bool restartpoint, bool reset) * As a result, timing of actions is critical here and be careful to note that * this function will likely take minutes to execute on a busy system. 
*/ -void -CreateCheckPoint(int flags) +void CreateCheckPoint(int flags) { - bool shutdown; - CheckPoint checkPoint; - XLogRecPtr recptr; - XLogSegNo _logSegNo; + bool shutdown; + CheckPoint checkPoint; + XLogRecPtr recptr; + XLogSegNo _logSegNo; XLogCtlInsert *Insert = &XLogCtl->Insert; - uint32 freespace; - XLogRecPtr PriorRedoPtr; - XLogRecPtr curInsert; - XLogRecPtr last_important_lsn; + uint32 freespace; + XLogRecPtr PriorRedoPtr; + XLogRecPtr curInsert; + XLogRecPtr last_important_lsn; VirtualTransactionId *vxids; - int nvxids; + int nvxids; /* * An end-of-recovery checkpoint is really a shutdown checkpoint, just @@ -9152,7 +10917,7 @@ CreateCheckPoint(int flags) { LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_SHUTDOWNING; - ControlFile->time = (pg_time_t) time(NULL); + ControlFile->time = (pg_time_t)time(NULL); UpdateControlFile(); LWLockRelease(ControlFileLock); } @@ -9166,7 +10931,7 @@ CreateCheckPoint(int flags) /* Begin filling in the checkpoint WAL record */ MemSet(&checkPoint, 0, sizeof(checkPoint)); - checkPoint.time = (pg_time_t) time(NULL); + checkPoint.time = (pg_time_t)time(NULL); /* * For Hot Standby, derive the oldestActiveXid before we fix the redo @@ -9234,14 +10999,14 @@ CreateCheckPoint(int flags) * the buffer flush work. Those XLOG records are logically after the * checkpoint, even though physically before it. Got that? 
*/ - freespace = INSERT_FREESPACE(curInsert); - if (freespace == 0) - { - if (XLogSegmentOffset(curInsert, wal_segment_size) == 0) - curInsert += SizeOfXLogLongPHD; - else - curInsert += SizeOfXLogShortPHD; - } + // freespace = INSERT_FREESPACE(curInsert); + // if (freespace == 0) + // { + // if (XLogSegmentOffset(curInsert, wal_segment_size) == 0) + // curInsert += SizeOfXLogLongPHD; + // else + // curInsert += SizeOfXLogShortPHD; + // } checkPoint.redo = curInsert; /* @@ -9355,7 +11120,7 @@ CreateCheckPoint(int flags) { do { - pg_usleep(10000L); /* wait for 10 msec */ + pg_usleep(10000L); /* wait for 10 msec */ } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids)); } pfree(vxids); @@ -9379,10 +11144,9 @@ CreateCheckPoint(int flags) * Now insert the checkpoint record into XLOG. */ XLogBeginInsert(); - XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint)); + XLogRegisterData((char *)(&checkPoint), sizeof(checkPoint)); recptr = XLogInsert(RM_XLOG_ID, - shutdown ? XLOG_CHECKPOINT_SHUTDOWN : - XLOG_CHECKPOINT_ONLINE); + shutdown ? 
XLOG_CHECKPOINT_SHUTDOWN : XLOG_CHECKPOINT_ONLINE); XLogFlush(recptr); @@ -9396,7 +11160,7 @@ CreateCheckPoint(int flags) if (shutdown) { if (flags & CHECKPOINT_END_OF_RECOVERY) - LocalXLogInsertAllowed = -1; /* return to "check" state */ + LocalXLogInsertAllowed = -1; /* return to "check" state */ else LocalXLogInsertAllowed = 0; /* never again write WAL */ } @@ -9423,7 +11187,7 @@ CreateCheckPoint(int flags) ControlFile->state = DB_SHUTDOWNED; ControlFile->checkPoint = ProcLastRecPtr; ControlFile->checkPointCopy = checkPoint; - ControlFile->time = (pg_time_t) time(NULL); + ControlFile->time = (pg_time_t)time(NULL); /* crash recovery should always recover to the end of WAL */ ControlFile->minRecoveryPoint = InvalidXLogRecPtr; ControlFile->minRecoveryPointTLI = 0; @@ -9439,10 +11203,10 @@ CreateCheckPoint(int flags) UpdateControlFile(); LWLockRelease(ControlFileLock); - //update checkpoint lsn - //if (!RecoveryInProgress()) { + // update checkpoint lsn + // if (!RecoveryInProgress()) { // UpdateStatShareStorage(checkPoint.redo); - //} + // } /* Update shared-memory copy of checkpoint XID/epoch */ SpinLockAcquire(&XLogCtl->info_lck); @@ -9471,26 +11235,26 @@ CreateCheckPoint(int flags) * Delete old log files, those no longer needed for last checkpoint to * prevent the disk holding the xlog from growing full. */ - XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); - KeepLogSeg(recptr, &_logSegNo); - if (InvalidateObsoleteReplicationSlots(_logSegNo)) - { + // XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + // KeepLogSeg(recptr, &_logSegNo); + // if (InvalidateObsoleteReplicationSlots(_logSegNo)) + // { /* * Some slots have been invalidated; recalculate the old-segment * horizon, starting again from RedoRecPtr. 
*/ - XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); - KeepLogSeg(recptr, &_logSegNo); - } - _logSegNo--; - RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr); + // XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + // KeepLogSeg(recptr, &_logSegNo); + // } + // _logSegNo--; + // RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr); /* * Make more log segments if needed. (Do this after recycling old log * segments, since that may supply some of the needed files.) */ - if (!shutdown) - PreallocXlogFiles(recptr); + // if (!shutdown) + // PreallocXlogFiles(recptr); /* * Truncate pg_subtrans if possible. We can throw away all data before @@ -9527,7 +11291,7 @@ static void CreateEndOfRecoveryRecord(void) { xl_end_of_recovery xlrec; - XLogRecPtr recptr; + XLogRecPtr recptr; /* sanity check */ if (!RecoveryInProgress()) @@ -9545,7 +11309,7 @@ CreateEndOfRecoveryRecord(void) START_CRIT_SECTION(); XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery)); + XLogRegisterData((char *)&xlrec, sizeof(xl_end_of_recovery)); recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY); XLogFlush(recptr); @@ -9555,7 +11319,7 @@ CreateEndOfRecoveryRecord(void) * changes to this point. 
*/ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->time = (pg_time_t) time(NULL); + ControlFile->time = (pg_time_t)time(NULL); ControlFile->minRecoveryPoint = recptr; ControlFile->minRecoveryPointTLI = ThisTimeLineID; UpdateControlFile(); @@ -9563,7 +11327,7 @@ CreateEndOfRecoveryRecord(void) END_CRIT_SECTION(); - LocalXLogInsertAllowed = -1; /* return to "check" state */ + LocalXLogInsertAllowed = -1; /* return to "check" state */ } /* @@ -9590,7 +11354,7 @@ static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn) { xl_overwrite_contrecord xlrec; - XLogRecPtr recptr; + XLogRecPtr recptr; /* sanity check */ if (!RecoveryInProgress()) @@ -9602,7 +11366,7 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn) START_CRIT_SECTION(); XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord)); + XLogRegisterData((char *)&xlrec, sizeof(xl_overwrite_contrecord)); recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD); @@ -9613,8 +11377,9 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn) return recptr; } -void PushCheckPointGuts(XLogRecPtr checkPointRedo, int flags) { - CheckPointGuts(checkPointRedo,flags); +void PushCheckPointGuts(XLogRecPtr checkPointRedo, int flags) +{ + CheckPointGuts(checkPointRedo, flags); } /* @@ -9628,7 +11393,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { if ((ControlFile->state == DB_IN_PRODUCTION || ControlFile->state == DB_IN_ARCHIVE_RECOVERY) && !push_standby) { - PrecacheHotData(); + PrecacheHotDataByRules(); } CheckPointRelationMap(); CheckPointReplicationSlots(); @@ -9644,10 +11409,13 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointSUBTRANS(); CheckPointMultiXact(); CheckPointPredicate(); - if (push_standby == true) { + //only CHECKPOINT_IS_SHUTDOWN flush page,master only bufferpool full need to evict page + if ((flags & CHECKPOINT_IS_SHUTDOWN) || + IsBootstrapProcessingMode() == true || InitdbSingle == true) + { 
CheckPointBuffers(flags); } - + /* Perform all queued up fsyncs */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START(); CheckpointStats.ckpt_sync_t = GetCurrentTimestamp(); @@ -9710,18 +11478,17 @@ RecoveryRestartPoint(const CheckPoint *checkPoint) * a restartpoint if we have replayed a safe checkpoint record since last * restartpoint. */ -bool -CreateRestartPoint(int flags) +bool CreateRestartPoint(int flags) { - XLogRecPtr lastCheckPointRecPtr; - XLogRecPtr lastCheckPointEndPtr; - CheckPoint lastCheckPoint; - XLogRecPtr PriorRedoPtr; - XLogRecPtr receivePtr; - XLogRecPtr replayPtr; - TimeLineID replayTLI; - XLogRecPtr endptr; - XLogSegNo _logSegNo; + XLogRecPtr lastCheckPointRecPtr; + XLogRecPtr lastCheckPointEndPtr; + CheckPoint lastCheckPoint; + XLogRecPtr PriorRedoPtr; + XLogRecPtr receivePtr; + XLogRecPtr replayPtr; + TimeLineID replayTLI; + XLogRecPtr endptr; + XLogSegNo _logSegNo; TimestampTz xtime; /* Get a local copy of the last safe checkpoint record. */ @@ -9768,7 +11535,7 @@ CreateRestartPoint(int flags) { LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; - ControlFile->time = (pg_time_t) time(NULL); + ControlFile->time = (pg_time_t)time(NULL); UpdateControlFile(); LWLockRelease(ControlFileLock); } @@ -9829,7 +11596,7 @@ CreateRestartPoint(int flags) { ControlFile->checkPoint = lastCheckPointRecPtr; ControlFile->checkPointCopy = lastCheckPoint; - ControlFile->time = (pg_time_t) time(NULL); + ControlFile->time = (pg_time_t)time(NULL); /* * Ensure minRecoveryPoint is past the checkpoint record. Normally, @@ -9854,7 +11621,6 @@ CreateRestartPoint(int flags) if (flags & CHECKPOINT_IS_SHUTDOWN) ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY; UpdateControlFile(); - } LWLockRelease(ControlFileLock); @@ -9869,26 +11635,26 @@ CreateRestartPoint(int flags) * Delete old log files, those no longer needed for last restartpoint to * prevent the disk holding the xlog from growing full. 
*/ - XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + // XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); /* * Retreat _logSegNo using the current end of xlog replayed or received, * whichever is later. */ - receivePtr = GetWalRcvFlushRecPtr(NULL, NULL); - replayPtr = GetXLogReplayRecPtr(&replayTLI); - endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr; - KeepLogSeg(endptr, &_logSegNo); - if (InvalidateObsoleteReplicationSlots(_logSegNo)) - { + // receivePtr = GetWalRcvFlushRecPtr(NULL, NULL); + // replayPtr = GetXLogReplayRecPtr(&replayTLI); + // endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr; + // KeepLogSeg(endptr, &_logSegNo); + // if (InvalidateObsoleteReplicationSlots(_logSegNo)) + // { /* * Some slots have been invalidated; recalculate the old-segment * horizon, starting again from RedoRecPtr. */ - XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); - KeepLogSeg(endptr, &_logSegNo); - } - _logSegNo--; + // XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size); + // KeepLogSeg(endptr, &_logSegNo); + // } + // _logSegNo--; /* * Try to recycle segments on a useful timeline. If we've been promoted @@ -9903,16 +11669,16 @@ CreateRestartPoint(int flags) * and will go wasted until recycled on the next restartpoint. We'll live * with that. */ - if (RecoveryInProgress()) - ThisTimeLineID = replayTLI; + // if (RecoveryInProgress()) + // ThisTimeLineID = replayTLI; - RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr); + // RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr); /* * Make more log segments if needed. (Do this after recycling old log * segments, since that may supply some of the needed files.) */ - PreallocXlogFiles(endptr); + // PreallocXlogFiles(endptr); /* * ThisTimeLineID is normally not set when we're still in recovery. @@ -9944,7 +11710,8 @@ CreateRestartPoint(int flags) (errmsg("recovery restart point at %X/%X", LSN_FORMAT_ARGS(lastCheckPoint.redo)), xtime ? 
errdetail("Last completed transaction was at log time %s.", - timestamptz_to_str(xtime)) : 0)); + timestamptz_to_str(xtime)) + : 0)); /* * Finally, execute archive_cleanup_command, if any. @@ -9982,13 +11749,13 @@ CreateRestartPoint(int flags) WALAvailability GetWALAvailability(XLogRecPtr targetLSN) { - XLogRecPtr currpos; /* current write LSN */ - XLogSegNo currSeg; /* segid of currpos */ - XLogSegNo targetSeg; /* segid of targetLSN */ - XLogSegNo oldestSeg; /* actual oldest segid */ - XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */ - XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */ - uint64 keepSegs; + XLogRecPtr currpos; /* current write LSN */ + XLogSegNo currSeg; /* segid of currpos */ + XLogSegNo targetSeg; /* segid of targetLSN */ + XLogSegNo oldestSeg; /* actual oldest segid */ + XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */ + XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */ + uint64 keepSegs; /* * slot does not reserve WAL. Either deactivated, or has never been active @@ -9996,57 +11763,60 @@ GetWALAvailability(XLogRecPtr targetLSN) if (XLogRecPtrIsInvalid(targetLSN)) return WALAVAIL_INVALID_LSN; - /* - * Calculate the oldest segment currently reserved by all slots, - * considering wal_keep_size and max_slot_wal_keep_size. Initialize - * oldestSlotSeg to the current segment. - */ - currpos = GetXLogWriteRecPtr(); - XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size); - KeepLogSeg(currpos, &oldestSlotSeg); - - /* - * Find the oldest extant segment file. We get 1 until checkpoint removes - * the first WAL segment file since startup, which causes the status being - * wrong under certain abnormal conditions but that doesn't actually harm. 
- */ - oldestSeg = XLogGetLastRemovedSegno() + 1; - - /* calculate oldest segment by max_wal_size */ - XLByteToSeg(currpos, currSeg, wal_segment_size); - keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1; - - if (currSeg > keepSegs) - oldestSegMaxWalSize = currSeg - keepSegs; - else - oldestSegMaxWalSize = 1; - - /* the segment we care about */ - XLByteToSeg(targetLSN, targetSeg, wal_segment_size); - - /* - * No point in returning reserved or extended status values if the - * targetSeg is known to be lost. - */ - if (targetSeg >= oldestSlotSeg) + if (0) { - /* show "reserved" when targetSeg is within max_wal_size */ - if (targetSeg >= oldestSegMaxWalSize) - return WALAVAIL_RESERVED; + /* + * Calculate the oldest segment currently reserved by all slots, + * considering wal_keep_size and max_slot_wal_keep_size. Initialize + * oldestSlotSeg to the current segment. + */ + currpos = GetXLogWriteRecPtr(); + XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size); + KeepLogSeg(currpos, &oldestSlotSeg); - /* being retained by slots exceeding max_wal_size */ - return WALAVAIL_EXTENDED; + /* + * Find the oldest extant segment file. We get 1 until checkpoint removes + * the first WAL segment file since startup, which causes the status being + * wrong under certain abnormal conditions but that doesn't actually harm. + */ + oldestSeg = XLogGetLastRemovedSegno() + 1; + + /* calculate oldest segment by max_wal_size */ + XLByteToSeg(currpos, currSeg, wal_segment_size); + keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1; + + if (currSeg > keepSegs) + oldestSegMaxWalSize = currSeg - keepSegs; + else + oldestSegMaxWalSize = 1; + + /* the segment we care about */ + XLByteToSeg(targetLSN, targetSeg, wal_segment_size); + + /* + * No point in returning reserved or extended status values if the + * targetSeg is known to be lost. 
+ */ + if (targetSeg >= oldestSlotSeg) + { + /* show "reserved" when targetSeg is within max_wal_size */ + if (targetSeg >= oldestSegMaxWalSize) + return WALAVAIL_RESERVED; + + /* being retained by slots exceeding max_wal_size */ + return WALAVAIL_EXTENDED; + } + + /* WAL segments are no longer retained but haven't been removed yet */ + if (targetSeg >= oldestSeg) + return WALAVAIL_UNRESERVED; + + /* Definitely lost */ + return WALAVAIL_REMOVED; } - - /* WAL segments are no longer retained but haven't been removed yet */ - if (targetSeg >= oldestSeg) - return WALAVAIL_UNRESERVED; - - /* Definitely lost */ - return WALAVAIL_REMOVED; + return WALAVAIL_RESERVED; } - /* * Retreat *logSegNo to the last segment that we need to retain because of * either wal_keep_size or replication slots. @@ -10066,9 +11836,9 @@ GetWALAvailability(XLogRecPtr targetLSN) static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) { - XLogSegNo currSegNo; - XLogSegNo segno; - XLogRecPtr keep; + XLogSegNo currSegNo; + XLogSegNo segno; + XLogRecPtr keep; XLByteToSeg(recptr, currSegNo, wal_segment_size); segno = currSegNo; @@ -10085,7 +11855,7 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) /* Cap by max_slot_wal_keep_size ... 
*/ if (max_slot_wal_keep_size_mb >= 0) { - uint64 slot_keep_segs; + uint64 slot_keep_segs; slot_keep_segs = ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size); @@ -10098,7 +11868,7 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) /* but, keep at least wal_keep_size if that's set */ if (wal_keep_size_mb > 0) { - uint64 keep_segs; + uint64 keep_segs; keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size); if (currSegNo - segno < keep_segs) @@ -10119,12 +11889,11 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo) /* * Write a NEXTOID log record */ -void -XLogPutNextOid(Oid nextOid) +void XLogPutNextOid(Oid nextOid) { XLogBeginInsert(); - XLogRegisterData((char *) (&nextOid), sizeof(Oid)); - (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID); + XLogRegisterData((char *)(&nextOid), sizeof(Oid)); + (void)XLogInsert(RM_XLOG_ID, XLOG_NEXTOID); /* * We need not flush the NEXTOID record immediately, because any of the @@ -10159,7 +11928,7 @@ XLogPutNextOid(Oid nextOid) XLogRecPtr RequestXLogSwitch(bool mark_unimportant) { - XLogRecPtr RecPtr; + XLogRecPtr RecPtr; /* XLOG SWITCH has no data */ XLogBeginInsert(); @@ -10177,14 +11946,14 @@ RequestXLogSwitch(bool mark_unimportant) XLogRecPtr XLogRestorePoint(const char *rpName) { - XLogRecPtr RecPtr; + XLogRecPtr RecPtr; xl_restore_point xlrec; xlrec.rp_time = GetCurrentTimestamp(); strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN); XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point)); + XLogRegisterData((char *)&xlrec, sizeof(xl_restore_point)); RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT); @@ -10221,7 +11990,7 @@ XLogReportParameters(void) if (wal_level != ControlFile->wal_level || XLogIsNeeded()) { xl_parameter_change xlrec; - XLogRecPtr recptr; + XLogRecPtr recptr; xlrec.MaxConnections = MaxConnections; xlrec.max_worker_processes = max_worker_processes; @@ -10233,7 +12002,7 @@ XLogReportParameters(void) xlrec.track_commit_timestamp = track_commit_timestamp; XLogBeginInsert(); - 
XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogRegisterData((char *)&xlrec, sizeof(xlrec)); recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE); XLogFlush(recptr); @@ -10262,11 +12031,10 @@ XLogReportParameters(void) * Note: this function assumes there is no other process running * concurrently that could update it. */ -void -UpdateFullPageWrites(void) +void UpdateFullPageWrites(void) { XLogCtlInsert *Insert = &XLogCtl->Insert; - bool recoveryInProgress; + bool recoveryInProgress; /* * Do nothing if full_page_writes has not been changed. @@ -10308,7 +12076,7 @@ UpdateFullPageWrites(void) if (XLogStandbyInfoActive() && !recoveryInProgress) { XLogBeginInsert(); - XLogRegisterData((char *) (&fullPageWrites), sizeof(bool)); + XLogRegisterData((char *)(&fullPageWrites), sizeof(bool)); XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE); } @@ -10373,19 +12141,18 @@ checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI) * Definitions of info values are in include/catalog/pg_control.h, though * not all record types are related to control file updates. 
*/ -void -xlog_redo(XLogReaderState *record) +void xlog_redo(XLogReaderState *record) { - uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; - XLogRecPtr lsn = record->EndRecPtr; - + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + XLogRecPtr lsn = record->EndRecPtr; + /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */ Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || !XLogRecHasAnyBlockRefs(record)); if (info == XLOG_NEXTOID) { - Oid nextOid; + Oid nextOid; /* * We used to try to take the maximum of ShmemVariableCache->nextOid @@ -10403,13 +12170,18 @@ xlog_redo(XLogReaderState *record) else if (info == XLOG_CHECKPOINT_SHUTDOWN) { #ifndef PG_NOREPLAY - if (startupPid == getpid() && PushPtr != record->currRecPtr) { - PushPtr = record->currRecPtr; - memcpy(&GlobalCheckPoint,XLogRecGetData(record), sizeof(CheckPoint)); + if (startupPid == getpid()) + { + if (record->insertTikv == false) { + CheckPointPtr = record->currRecPtr; + } else { + CheckPointPtr = record->ReadRecPtr; + } + memcpy(&GlobalCheckPoint, XLogRecGetData(record), sizeof(CheckPoint)); GlobalState = DB_SHUTDOWNED; } #endif - CheckPoint checkPoint; + CheckPoint checkPoint; memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In a SHUTDOWN checkpoint, believe the counters exactly */ @@ -10452,7 +12224,7 @@ xlog_redo(XLogReaderState *record) if (standbyState >= STANDBY_INITIALIZED) { TransactionId *xids; - int nxids; + int nxids; TransactionId oldestActiveXID; TransactionId latestCompletedXid; RunningTransactionsData running; @@ -10484,6 +12256,9 @@ xlog_redo(XLogReaderState *record) /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + if (he3mirror == true && record->streamStart == true) { + ControlFile->checkPointFile = FileCheckPointPtr; + } LWLockRelease(ControlFileLock); /* Update shared-memory copy of checkpoint XID/epoch */ @@ 
-10505,13 +12280,20 @@ xlog_redo(XLogReaderState *record) else if (info == XLOG_CHECKPOINT_ONLINE) { #ifndef PG_NOREPLAY - if (startupPid == getpid() && PushPtr != record->currRecPtr) { - PushPtr = record->currRecPtr; - memcpy(&GlobalCheckPoint,XLogRecGetData(record), sizeof(CheckPoint)); + if (startupPid == getpid()) + { + if (record->insertTikv == false) { + CheckPointPtr = record->currRecPtr; + } else { + CheckPointPtr = record->ReadRecPtr; + } + memcpy(&GlobalCheckPoint, XLogRecGetData(record), sizeof(CheckPoint)); + // for recover checkpoint must be flush disk,so set GlobalCheckPoint.redo = CheckPointPtr is ok + GlobalCheckPoint.redo = CheckPointPtr; GlobalState = DB_IN_ARCHIVE_RECOVERY; } #endif - CheckPoint checkPoint; + CheckPoint checkPoint; memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In an ONLINE checkpoint, treat the XID counter as a minimum */ @@ -10549,7 +12331,10 @@ xlog_redo(XLogReaderState *record) checkPoint.oldestXidDB); /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; + if (he3mirror == true && record->streamStart == true) { + ControlFile->checkPointFile = FileCheckPointPtr; + } LWLockRelease(ControlFileLock); /* Update shared-memory copy of checkpoint XID/epoch */ @@ -10623,19 +12408,15 @@ xlog_redo(XLogReaderState *record) * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info * code just to distinguish them for statistics purposes. 
*/ - if (data_buffer_for_replay(record) == false) { - return; - } - Buffer buffer; - + Buffer buffer; + if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); UnlockReleaseBuffer(buffer); - } else if (info == XLOG_BACKUP_END) { - XLogRecPtr startpoint; + XLogRecPtr startpoint; memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint)); @@ -10711,7 +12492,7 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_FPW_CHANGE) { - bool fpw; + bool fpw; memcpy(&fpw, XLogRecGetData(record), sizeof(bool)); @@ -10767,7 +12548,7 @@ xlog_outrec(StringInfo buf, XLogReaderState *record) xlog_block_info(buf, record); } -#endif /* WAL_DEBUG */ +#endif /* WAL_DEBUG */ /* * Returns a string giving information about all the blocks in an @@ -10776,13 +12557,13 @@ xlog_outrec(StringInfo buf, XLogReaderState *record) static void xlog_block_info(StringInfo buf, XLogReaderState *record) { - int block_id; + int block_id; /* decode block references */ for (block_id = 0; block_id <= record->max_block_id; block_id++) { RelFileNode rnode; - ForkNumber forknum; + ForkNumber forknum; BlockNumber blk; if (!XLogRecHasBlockRef(record, block_id)) @@ -10812,8 +12593,8 @@ xlog_block_info(StringInfo buf, XLogReaderState *record) static void xlog_outdesc(StringInfo buf, XLogReaderState *record) { - RmgrId rmid = XLogRecGetRmid(record); - uint8 info = XLogRecGetInfo(record); + RmgrId rmid = XLogRecGetRmid(record); + uint8 info = XLogRecGetInfo(record); const char *id; appendStringInfoString(buf, RmgrTable[rmid].rm_name); @@ -10828,7 +12609,6 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record) RmgrTable[rmid].rm_desc(buf, record); } - /* * Return the (possible) sync flag used for opening a file, depending on the * value of the GUC wal_sync_method. 
@@ -10836,7 +12616,7 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record) static int get_sync_bit(int method) { - int o_direct_flag = 0; + int o_direct_flag = 0; /* If fsync is disabled, never open in sync mode */ if (!enableFsync) @@ -10861,36 +12641,35 @@ get_sync_bit(int method) switch (method) { - /* - * enum values for all sync options are defined even if they are - * not supported on the current platform. But if not, they are - * not included in the enum option array, and therefore will never - * be seen here. - */ - case SYNC_METHOD_FSYNC: - case SYNC_METHOD_FSYNC_WRITETHROUGH: - case SYNC_METHOD_FDATASYNC: - return 0; + /* + * enum values for all sync options are defined even if they are + * not supported on the current platform. But if not, they are + * not included in the enum option array, and therefore will never + * be seen here. + */ + case SYNC_METHOD_FSYNC: + case SYNC_METHOD_FSYNC_WRITETHROUGH: + case SYNC_METHOD_FDATASYNC: + return 0; #ifdef OPEN_SYNC_FLAG - case SYNC_METHOD_OPEN: - return OPEN_SYNC_FLAG | o_direct_flag; + case SYNC_METHOD_OPEN: + return OPEN_SYNC_FLAG | o_direct_flag; #endif #ifdef OPEN_DATASYNC_FLAG - case SYNC_METHOD_OPEN_DSYNC: - return OPEN_DATASYNC_FLAG | o_direct_flag; + case SYNC_METHOD_OPEN_DSYNC: + return OPEN_DATASYNC_FLAG | o_direct_flag; #endif - default: - /* can't happen (unless we are out of sync with option array) */ - elog(ERROR, "unrecognized wal_sync_method: %d", method); - return 0; /* silence warning */ + default: + /* can't happen (unless we are out of sync with option array) */ + elog(ERROR, "unrecognized wal_sync_method: %d", method); + return 0; /* silence warning */ } } /* * GUC support */ -void -assign_xlog_sync_method(int new_sync_method, void *extra) +void assign_xlog_sync_method(int new_sync_method, void *extra) { if (sync_method != new_sync_method) { @@ -10900,42 +12679,41 @@ assign_xlog_sync_method(int new_sync_method, void *extra) * changing, close the log file so it will be reopened (with new 
flag * bit) at next use. */ - if (openLogFile >= 0) - { - pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN); - if (pg_fsync(openLogFile) != 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; + //todo: sync wal kv + // if (openLogFile >= 0) + // { + // pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN); + // if (pg_fsync(openLogFile) != 0) + // { + // char xlogfname[MAXFNAMELEN]; + // int save_errno; - save_errno = errno; - XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, - wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", xlogfname))); - } + // save_errno = errno; + // XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, + // wal_segment_size); + // errno = save_errno; + // ereport(PANIC, + // (errcode_for_file_access(), + // errmsg("could not fsync file \"%s\": %m", xlogfname))); + // } - pgstat_report_wait_end(); - if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method)) - XLogFileClose(); - } + // pgstat_report_wait_end(); + // if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method)) + // XLogFileClose(); + // } } } - /* * Issue appropriate kind of fsync (if any) for an XLOG output file. * * 'fd' is a file descriptor for the XLOG file to be fsync'd. * 'segno' is for error reporting purposes. 
*/ -void -issue_xlog_fsync(int fd, XLogSegNo segno) +void issue_xlog_fsync(int fd, XLogSegNo segno) { - char *msg = NULL; - instr_time start; + char *msg = NULL; + instr_time start; /* * Quick exit if fsync is disabled or write() has already synced the WAL @@ -10953,37 +12731,37 @@ issue_xlog_fsync(int fd, XLogSegNo segno) pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); switch (sync_method) { - case SYNC_METHOD_FSYNC: - if (pg_fsync_no_writethrough(fd) != 0) - msg = _("could not fsync file \"%s\": %m"); - break; + case SYNC_METHOD_FSYNC: + if (pg_fsync_no_writethrough(fd) != 0) + msg = _("could not fsync file \"%s\": %m"); + break; #ifdef HAVE_FSYNC_WRITETHROUGH - case SYNC_METHOD_FSYNC_WRITETHROUGH: - if (pg_fsync_writethrough(fd) != 0) - msg = _("could not fsync write-through file \"%s\": %m"); - break; + case SYNC_METHOD_FSYNC_WRITETHROUGH: + if (pg_fsync_writethrough(fd) != 0) + msg = _("could not fsync write-through file \"%s\": %m"); + break; #endif #ifdef HAVE_FDATASYNC - case SYNC_METHOD_FDATASYNC: - if (pg_fdatasync(fd) != 0) - msg = _("could not fdatasync file \"%s\": %m"); - break; + case SYNC_METHOD_FDATASYNC: + if (pg_fdatasync(fd) != 0) + msg = _("could not fdatasync file \"%s\": %m"); + break; #endif - case SYNC_METHOD_OPEN: - case SYNC_METHOD_OPEN_DSYNC: - /* not reachable */ - Assert(false); - break; - default: - elog(PANIC, "unrecognized wal_sync_method: %d", sync_method); - break; + case SYNC_METHOD_OPEN: + case SYNC_METHOD_OPEN_DSYNC: + /* not reachable */ + Assert(false); + break; + default: + elog(PANIC, "unrecognized wal_sync_method: %d", sync_method); + break; } /* PANIC if failed to fsync */ if (msg) { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; XLogFileName(xlogfname, ThisTimeLineID, segno, wal_segment_size); @@ -11000,7 +12778,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno) */ if (track_wal_io_timing) { - instr_time duration; + instr_time duration; 
INSTR_TIME_SET_CURRENT(duration); INSTR_TIME_SUBTRACT(duration, start); @@ -11056,17 +12834,17 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, StringInfo labelfile, List **tablespaces, StringInfo tblspcmapfile) { - bool exclusive = (labelfile == NULL); - bool backup_started_in_recovery = false; - XLogRecPtr checkpointloc; - XLogRecPtr startpoint; - TimeLineID starttli; - pg_time_t stamp_time; - char strfbuf[128]; - char xlogfilename[MAXFNAMELEN]; - XLogSegNo _logSegNo; + bool exclusive = (labelfile == NULL); + bool backup_started_in_recovery = false; + XLogRecPtr checkpointloc; + XLogRecPtr startpoint; + TimeLineID starttli; + pg_time_t stamp_time; + char strfbuf[128]; + char xlogfilename[MAXFNAMELEN]; + XLogSegNo _logSegNo; struct stat stat_buf; - FILE *fp; + FILE *fp; backup_started_in_recovery = RecoveryInProgress(); @@ -11140,13 +12918,13 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, WALInsertLockRelease(); /* Ensure we release forcePageWrites if fail below */ - PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive)); + PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum)BoolGetDatum(exclusive)); { - bool gotUniqueStartpoint = false; - DIR *tblspcdir; + bool gotUniqueStartpoint = false; + DIR *tblspcdir; struct dirent *de; tablespaceinfo *ti; - int datadirpathlen; + int datadirpathlen; /* * Force an XLOG file switch before the checkpoint, to ensure that the @@ -11174,7 +12952,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, do { - bool checkpointfpw; + bool checkpointfpw; /* * Force a CHECKPOINT. 
Aside from being necessary to prevent torn @@ -11212,7 +12990,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, if (backup_started_in_recovery) { - XLogRecPtr recptr; + XLogRecPtr recptr; /* * Check to see if all WAL replayed during online backup @@ -11223,6 +13001,8 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, recptr = XLogCtl->lastFpwDisableRecPtr; SpinLockRelease(&XLogCtl->info_lck); + /* + *for he3db pg_basebackup if (!checkpointfpw || startpoint <= recptr) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -11232,6 +13012,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, "is corrupt and should not be used. " "Enable full_page_writes and run CHECKPOINT on the primary, " "and then try an online backup again."))); + */ /* * During recovery, since we don't use the end-of-backup WAL @@ -11279,12 +13060,12 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, tblspcdir = AllocateDir("pg_tblspc"); while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) { - char fullpath[MAXPGPATH + 10]; - char linkpath[MAXPGPATH]; - char *relpath = NULL; - int rllen; + char fullpath[MAXPGPATH + 10]; + char linkpath[MAXPGPATH]; + char *relpath = NULL; + int rllen; StringInfoData escapedpath; - char *s; + char *s; /* Skip anything that doesn't look like a tablespace */ if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) @@ -11367,7 +13148,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, labelfile = makeStringInfo(); /* Use the log timezone here, not the session timezone */ - stamp_time = (pg_time_t) time(NULL); + stamp_time = (pg_time_t)time(NULL); pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone)); @@ -11470,7 +13251,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, pfree(tblspcmapfile); } } - 
PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive)); + PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum)BoolGetDatum(exclusive)); /* * Mark that start phase has correctly finished for an exclusive backup. @@ -11504,7 +13285,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p, static void pg_start_backup_callback(int code, Datum arg) { - bool exclusive = DatumGetBool(arg); + bool exclusive = DatumGetBool(arg); /* Update backup counters and forcePageWrites on failure */ WALInsertLockAcquireExclusive(); @@ -11533,7 +13314,7 @@ pg_start_backup_callback(int code, Datum arg) static void pg_stop_backup_callback(int code, Datum arg) { - bool exclusive = DatumGetBool(arg); + bool exclusive = DatumGetBool(arg); /* Update backup status on failure */ WALInsertLockAcquireExclusive(); @@ -11572,30 +13353,30 @@ get_backup_status(void) XLogRecPtr do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) { - bool exclusive = (labelfile == NULL); - bool backup_started_in_recovery = false; - XLogRecPtr startpoint; - XLogRecPtr stoppoint; - TimeLineID stoptli; - pg_time_t stamp_time; - char strfbuf[128]; - char histfilepath[MAXPGPATH]; - char startxlogfilename[MAXFNAMELEN]; - char stopxlogfilename[MAXFNAMELEN]; - char lastxlogfilename[MAXFNAMELEN]; - char histfilename[MAXFNAMELEN]; - char backupfrom[20]; - XLogSegNo _logSegNo; - FILE *lfp; - FILE *fp; - char ch; - int seconds_before_warning; - int waits = 0; - bool reported_waiting = false; - char *remaining; - char *ptr; - uint32 hi, - lo; + bool exclusive = (labelfile == NULL); + bool backup_started_in_recovery = false; + XLogRecPtr startpoint; + XLogRecPtr stoppoint; + TimeLineID stoptli; + pg_time_t stamp_time; + char strfbuf[128]; + char histfilepath[MAXPGPATH]; + char startxlogfilename[MAXFNAMELEN]; + char stopxlogfilename[MAXFNAMELEN]; + char lastxlogfilename[MAXFNAMELEN]; + char histfilename[MAXFNAMELEN]; + char backupfrom[20]; + 
XLogSegNo _logSegNo; + FILE *lfp; + FILE *fp; + char ch; + int seconds_before_warning; + int waits = 0; + bool reported_waiting = false; + char *remaining; + char *ptr; + uint32 hi, + lo; backup_started_in_recovery = RecoveryInProgress(); @@ -11640,13 +13421,13 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) * Remove backup_label. In case of failure, the state for an exclusive * backup is switched back to in-progress. */ - PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive)); + PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum)BoolGetDatum(exclusive)); { /* * Read the existing label file into memory. */ struct stat statbuf; - int r; + int r; if (stat(BACKUP_LABEL_FILE, &statbuf)) { @@ -11689,7 +13470,7 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) */ durable_unlink(TABLESPACE_MAP, DEBUG1); } - PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive)); + PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum)BoolGetDatum(exclusive)); } /* @@ -11741,12 +13522,13 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) */ if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", &hi, &lo, startxlogfilename, - &ch) != 4 || ch != '\n') + &ch) != 4 || + ch != '\n') ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); - startpoint = ((uint64) hi) << 32 | lo; - remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */ + startpoint = ((uint64)hi) << 32 | lo; + remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */ /* * Parse the BACKUP FROM line. 
If we are taking an online backup from the @@ -11796,7 +13578,7 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) */ if (backup_started_in_recovery) { - XLogRecPtr recptr; + XLogRecPtr recptr; /* * Check to see if all WAL replayed during online backup contain @@ -11806,6 +13588,8 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) recptr = XLogCtl->lastFpwDisableRecPtr; SpinLockRelease(&XLogCtl->info_lck); + /* + *for he3db pg_basebackup if (startpoint <= recptr) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -11815,7 +13599,7 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) "is corrupt and should not be used. " "Enable full_page_writes and run CHECKPOINT on the primary, " "and then try an online backup again."))); - + */ LWLockAcquire(ControlFileLock, LW_SHARED); stoppoint = ControlFile->minRecoveryPoint; @@ -11828,7 +13612,7 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) * Write the backup-end xlog record */ XLogBeginInsert(); - XLogRegisterData((char *) (&startpoint), sizeof(startpoint)); + XLogRegisterData((char *)(&startpoint), sizeof(startpoint)); stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END); stoptli = ThisTimeLineID; @@ -11842,7 +13626,7 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size); /* Use the log timezone here, not the session timezone */ - stamp_time = (pg_time_t) time(NULL); + stamp_time = (pg_time_t)time(NULL); pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone)); @@ -11939,7 +13723,7 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) if (++waits >= seconds_before_warning) { - seconds_before_warning *= 2; /* This wraps in >10 years... */ + seconds_before_warning *= 2; /* This wraps in >10 years... 
*/ ereport(WARNING, (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)", waits), @@ -11964,7 +13748,6 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) return stoppoint; } - /* * do_pg_abort_backup: abort a running backup * @@ -11982,10 +13765,9 @@ do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p) * NB: This gets used as a before_shmem_exit handler, hence the odd-looking * signature. */ -void -do_pg_abort_backup(int code, Datum arg) +void do_pg_abort_backup(int code, Datum arg) { - bool emit_warning = DatumGetBool(arg); + bool emit_warning = DatumGetBool(arg); /* * Quick exit if session is not keeping around a non-exclusive backup @@ -12014,8 +13796,7 @@ do_pg_abort_backup(int code, Datum arg) * Register a handler that will warn about unterminated backups at end of * session, unless this has already been done. */ -void -register_persistent_abort_backup_handler(void) +void register_persistent_abort_backup_handler(void) { static bool already_done = false; @@ -12033,8 +13814,8 @@ register_persistent_abort_backup_handler(void) XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI) { - XLogRecPtr recptr; - TimeLineID tli; + XLogRecPtr recptr; + TimeLineID tli; SpinLockAcquire(&XLogCtl->info_lck); recptr = XLogCtl->lastReplayedEndRecPtr; @@ -12046,6 +13827,34 @@ GetXLogReplayRecPtr(TimeLineID *replayTLI) return recptr; } +XLogRecPtr GetXLogPushToDisk(void) { + XLogRecPtr pushToDiskLsn; + SpinLockAcquire(&XLogCtl->info_lck); + pushToDiskLsn = XLogCtl->pushToDisk; + SpinLockRelease(&XLogCtl->info_lck); + return pushToDiskLsn; +} + +void SetXLogPushToDisk(XLogRecPtr pushToDiskLsn) { + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->pushToDisk = pushToDiskLsn; + SpinLockRelease(&XLogCtl->info_lck); +} + +XLogRecPtr GetFileReplayLsn(void) { + XLogRecPtr fileLsn; + SpinLockAcquire(&XLogCtl->info_lck); + fileLsn = XLogCtl->fileLsn; + SpinLockRelease(&XLogCtl->info_lck); + return 
fileLsn; +} + +void SetFileReplayLsn(XLogRecPtr filelsn) { + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->fileLsn = filelsn; + SpinLockRelease(&XLogCtl->info_lck); +} + /* * Get latest WAL insert pointer */ @@ -12053,7 +13862,7 @@ XLogRecPtr GetXLogInsertRecPtr(void) { XLogCtlInsert *Insert = &XLogCtl->Insert; - uint64 current_bytepos; + uint64 current_bytepos; SpinLockAcquire(&Insert->insertpos_lck); current_bytepos = Insert->CurrBytePos; @@ -12079,8 +13888,7 @@ GetXLogWriteRecPtr(void) * Returns the redo pointer of the last checkpoint or restartpoint. This is * the oldest point in WAL that we still need, if we have to restart recovery. */ -void -GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) +void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli) { LWLockAcquire(ControlFileLock, LW_SHARED); *oldrecptr = ControlFile->checkPointCopy.redo; @@ -12108,17 +13916,17 @@ static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, bool *backupFromStandby) { - char startxlogfilename[MAXFNAMELEN]; - TimeLineID tli_from_walseg, - tli_from_file; - FILE *lfp; - char ch; - char backuptype[20]; - char backupfrom[20]; - char backuplabel[MAXPGPATH]; - char backuptime[128]; - uint32 hi, - lo; + char startxlogfilename[MAXFNAMELEN]; + TimeLineID tli_from_walseg, + tli_from_file; + FILE *lfp; + char ch; + char backuptype[20]; + char backupfrom[20]; + char backuplabel[MAXPGPATH]; + char backuptime[128]; + uint32 hi, + lo; *backupEndRequired = false; *backupFromStandby = false; @@ -12134,7 +13942,7 @@ read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE))); - return false; /* it's not there, all is fine */ + return false; /* it's not there, all is fine */ } /* @@ -12143,17 +13951,19 @@ read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, * format). 
*/ if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", - &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') + &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || + ch != '\n') ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); - RedoStartLSN = ((uint64) hi) << 32 | lo; + RedoStartLSN = ((uint64)hi) << 32 | lo; if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c", - &hi, &lo, &ch) != 3 || ch != '\n') + &hi, &lo, &ch) != 3 || + ch != '\n') ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); - *checkPointLoc = ((uint64) hi) << 32 | lo; + *checkPointLoc = ((uint64)hi) << 32 | lo; /* * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore @@ -12233,12 +14043,12 @@ static bool read_tablespace_map(List **tablespaces) { tablespaceinfo *ti; - FILE *lfp; - char str[MAXPGPATH]; - int ch, - i, - n; - bool was_backslash; + FILE *lfp; + char str[MAXPGPATH]; + int ch, + i, + n; + bool was_backslash; /* * See if tablespace_map file is present @@ -12251,7 +14061,7 @@ read_tablespace_map(List **tablespaces) (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", TABLESPACE_MAP))); - return false; /* it's not there, all is fine */ + return false; /* it's not there, all is fine */ } /* @@ -12266,7 +14076,7 @@ read_tablespace_map(List **tablespaces) if (!was_backslash && (ch == '\n' || ch == '\r')) { if (i == 0) - continue; /* \r immediately followed by \n */ + continue; /* \r immediately followed by \n */ /* * The de-escaped line should contain an OID followed by exactly @@ -12301,7 +14111,7 @@ read_tablespace_map(List **tablespaces) } } - if (i != 0 || was_backslash) /* last line not terminated? */ + if (i != 0 || was_backslash) /* last line not terminated? 
*/ ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); @@ -12321,7 +14131,7 @@ read_tablespace_map(List **tablespaces) static void rm_redo_error_callback(void *arg) { - XLogReaderState *record = (XLogReaderState *) arg; + XLogReaderState *record = (XLogReaderState *)arg; StringInfoData buf; initStringInfo(&buf); @@ -12329,9 +14139,9 @@ rm_redo_error_callback(void *arg) xlog_block_info(&buf, record); /* translator: %s is a WAL record description */ - errcontext("WAL redo at %X/%X for %s", - LSN_FORMAT_ARGS(record->ReadRecPtr), - buf.data); + // errcontext("WAL redo at %X/%X for %s", + // LSN_FORMAT_ARGS(record->ReadRecPtr), + // buf.data); pfree(buf.data); } @@ -12341,8 +14151,7 @@ rm_redo_error_callback(void *arg) * * This is done by checking for existence of the "backup_label" file. */ -bool -BackupInProgress(void) +bool BackupInProgress(void) { struct stat stat_buf; @@ -12361,8 +14170,7 @@ BackupInProgress(void) * useless. To correctly finish an online backup, pg_stop_backup must be * called. */ -void -CancelBackup(void) +void CancelBackup(void) { struct stat stat_buf; @@ -12417,6 +14225,8 @@ CancelBackup(void) } } + + /* * Read the XLOG page containing RecPtr into readBuf (if not read already). * Returns number of bytes read, if the page is read successfully, or -1 @@ -12440,128 +14250,239 @@ CancelBackup(void) * XLogPageRead() to try fetching the record from another source, or to * sleep and retry. 
*/ + static int + XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *readBuf) + { + XLogPageReadPrivate *private = + (XLogPageReadPrivate *) xlogreader->private_data; + int emode = private->emode; + uint32 targetPageOff; + XLogSegNo targetSegNo; + int r; + + XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); + targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); + + /* + * See if we need to switch to a new segment because the requested record + * is not in the currently open one. + */ + if (readFile >= 0 && + !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) + { + /* + * Request a restartpoint if we've replayed too much xlog since the + * last one. + */ + if (bgwriterLaunched) + { + if (XLogCheckpointNeeded(readSegNo)) + { + (void) GetRedoRecPtr(); + if (XLogCheckpointNeeded(readSegNo)) + RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); + } + } + + close(readFile); + readFile = -1; + readSource = XLOG_FROM_ANY; + } + + XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); + + retry: + /* See if we need to retrieve more data */ + if (readFile < 0 || + (readSource == XLOG_FROM_STREAM && + flushedUpto < targetPagePtr + reqLen)) + { + if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen, + private->randAccess, + private->fetching_ckpt, + targetRecPtr)) + { + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + return -1; + } + } + + /* + * At this point, we have the right segment open and if we're streaming we + * know the requested record is in it. + */ + Assert(readFile != -1); + + /* + * If the current segment is being streamed from the primary, calculate + * how much of the current page we have received already. We know the + * requested record has been received, but this is for the benefit of + * future calls, to allow quick exit at the top of this function. 
+ */ + if (readSource == XLOG_FROM_STREAM) + { + if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ)) + readLen = XLOG_BLCKSZ; + else + readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) - + targetPageOff; + } + else + readLen = XLOG_BLCKSZ; + + /* Read the requested page */ + readOff = targetPageOff; + + pgstat_report_wait_start(WAIT_EVENT_WAL_READ); + //r = he3fs_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff, WalRestoreRead); + r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); + if (r != XLOG_BLCKSZ) + { + char fname[MAXFNAMELEN]; + int save_errno = errno; + + pgstat_report_wait_end(); + XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); + if (r < 0) + { + errno = save_errno; + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode_for_file_access(), + errmsg("could not read from log segment %s, offset %u: %m", + fname, readOff))); + } + else + ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read from log segment %s, offset %u: read %d of %zu", + fname, readOff, r, (Size) XLOG_BLCKSZ))); + goto next_record_is_invalid; + } + pgstat_report_wait_end(); + + Assert(targetSegNo == readSegNo); + Assert(targetPageOff == readOff); + Assert(reqLen <= readLen); + + xlogreader->seg.ws_tli = curFileTLI; + + /* + * Check the page header immediately, so that we can retry immediately if + * it's not valid. This may seem unnecessary, because XLogReadRecord() + * validates the page header anyway, and would propagate the failure up to + * ReadRecord(), which would retry. However, there's a corner case with + * continuation records, if a record is split across two pages such that + * we would need to read the two pages from different sources. For + * example, imagine a scenario where a streaming replica is started up, + * and replay reaches a record that's split across two WAL segments. 
The + * first page is only available locally, in pg_wal, because it's already + * been recycled on the primary. The second page, however, is not present + * in pg_wal, and we should stream it from the primary. There is a + * recycled WAL segment present in pg_wal, with garbage contents, however. + * We would read the first page from the local WAL segment, but when + * reading the second page, we would read the bogus, recycled, WAL + * segment. If we didn't catch that case here, we would never recover, + * because ReadRecord() would retry reading the whole record from the + * beginning. + * + * Of course, this only catches errors in the page header, which is what + * happens in the case of a recycled WAL segment. Other kinds of errors or + * corruption still has the same problem. But this at least fixes the + * common case, which can happen as part of normal operation. + * + * Validating the page header is cheap enough that doing it twice + * shouldn't be a big deal from a performance point of view. + */ + if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) + { + /* reset any error XLogReaderValidatePageHeader() might have set */ + xlogreader->errormsg_buf[0] = '\0'; + goto next_record_is_invalid; + } + + return readLen; + + next_record_is_invalid: + lastSourceFailed = true; + + if (readFile >= 0) + close(readFile); + readFile = -1; + readLen = 0; + readSource = XLOG_FROM_ANY; + + /* In standby-mode, keep trying */ + if (StandbyMode) + goto retry; + else + return -1; + } + +static int AllXLogBatchRead(XLogReaderState *xlogreader, XLogRecPtr startPtr, int reqLen, + char *readBuf) { + int ret = 0; + if (xlogreader->streamStart == false) { + return XLogBatchRead(xlogreader,startPtr,reqLen,readBuf); + } else { + ret = consumerXLogBatchRead(xlogreader,startPtr,reqLen,readBuf); + return ret; + } +} + +/* +* reqLen is SizeOfXLogRecord, at least find one wal record. 
+*/ static int -XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, - XLogRecPtr targetRecPtr, char *readBuf) +XLogBatchRead(XLogReaderState *xlogreader, XLogRecPtr startPtr, int reqLen, + char *readBuf) { XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; int emode = private->emode; - uint32 targetPageOff; + XLogRecPtr targetOff; XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; int r; - XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); - targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); + targetOff = xlogreader->EndRecPtr; + /* * See if we need to switch to a new segment because the requested record * is not in the currently open one. */ - if (readFile >= 0 && - !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) - { - /* - * Request a restartpoint if we've replayed too much xlog since the - * last one. - */ - if (bgwriterLaunched) - { - if (XLogCheckpointNeeded(readSegNo)) - { - (void) GetRedoRecPtr(); - if (XLogCheckpointNeeded(readSegNo)) - RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); - } - } - - close(readFile); - readFile = -1; - readSource = XLOG_FROM_ANY; - } - - XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); - retry: /* See if we need to retrieve more data */ - if (readFile < 0 || - (readSource == XLOG_FROM_STREAM && - flushedUpto < targetPagePtr + reqLen)) - { - if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen, + // if ((readSource == XLOG_FROM_STREAM && + // flushedUpto < targetOff + reqLen)) + // { + if (!WaitForWALToBecomeAvailable(startPtr + reqLen, private->randAccess, private->fetching_ckpt, - targetRecPtr)) + startPtr)) { - if (readFile >= 0) - close(readFile); - readFile = -1; readLen = 0; readSource = XLOG_FROM_ANY; return -1; } - } + // } - /* - * At this point, we have the right segment open and if we're streaming we - * know the requested record is in it. 
- */ - Assert(readFile != -1); - - /* - * If the current segment is being streamed from the primary, calculate - * how much of the current page we have received already. We know the - * requested record has been received, but this is for the benefit of - * future calls, to allow quick exit at the top of this function. - */ - if (readSource == XLOG_FROM_STREAM) - { - if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ)) - readLen = XLOG_BLCKSZ; - else - readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) - - targetPageOff; - } - else - readLen = XLOG_BLCKSZ; /* Read the requested page */ - readOff = targetPageOff; pgstat_report_wait_start(WAIT_EVENT_WAL_READ); - //r = he3fs_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff, WalRestoreRead); - r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); - if (r != XLOG_BLCKSZ) - { - char fname[MAXFNAMELEN]; - int save_errno = errno; - - pgstat_report_wait_end(); - XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); - if (r < 0) - { - errno = save_errno; - ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), - (errcode_for_file_access(), - errmsg("could not read from log segment %s, offset %u: %m", - fname, readOff))); - } - else - ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("could not read from log segment %s, offset %u: read %d of %zu", - fname, readOff, r, (Size) XLOG_BLCKSZ))); - goto next_record_is_invalid; - } + bool walStoreToLocal = false; + if (EnableHotStandby && *isPromoteIsTriggered == false && !push_standby) + walStoreToLocal = true; + + r = batchRead((uint8_t *) readBuf, ControlFile->checkPointCopy.ThisTimeLineID, targetOff,targetOff+16384,walStoreToLocal); pgstat_report_wait_end(); - Assert(targetSegNo == readSegNo); - Assert(targetPageOff == readOff); - Assert(reqLen <= readLen); - - xlogreader->seg.ws_tli = curFileTLI; - /* * Check the page header immediately, so that we can retry 
immediately if * it's not valid. This may seem unnecessary, because XLogReadRecord() @@ -12589,21 +14510,17 @@ retry: * Validating the page header is cheap enough that doing it twice * shouldn't be a big deal from a performance point of view. */ - if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) + if (r < reqLen) { /* reset any error XLogReaderValidatePageHeader() might have set */ xlogreader->errormsg_buf[0] = '\0'; goto next_record_is_invalid; } - return readLen; + return r; next_record_is_invalid: lastSourceFailed = true; - - if (readFile >= 0) - close(readFile); - readFile = -1; readLen = 0; readSource = XLOG_FROM_ANY; @@ -12613,6 +14530,271 @@ next_record_is_invalid: else return -1; } + +bool startup_shutdown_requested = false; +static bool GetShutDownStatus(void) { + if (startup_shutdown_requested) { + return true; + } + return false; +} + +static int consumerFailedNum = 0; +static XLogRecPtr reStartPtr = 0; +consumerXLogBatchRead(XLogReaderState *xlogreader, XLogRecPtr startPtr, int reqLen, + char *readBuf) +{ + wal_batch_t *FirstData = NULL; + int timeCount = 0; + bool pushFlag = false; + reStartPtr = startPtr; + do { + if (gRingBufferManger->maxIdx > spaceNum/16 || timeCount > 1000 || pushFlag == true) { + if (push_standby == true || EnableHotStandby == false) { + pushTikv(0, hashMapSize(), true); + } + if (gRingBufferManger->maxIdx != 0 ) { + ring_buffer_dequeue_arr(gRingBufferManger,gRingBufferManger->maxIdx); + gRingBufferManger->maxIdx = 0; + if (pushFlag == true) { + consumerFailedNum++; + return -1; + } + } + } + if (1 == ring_buffer_peek(gRingBufferManger,&FirstData,gRingBufferManger->maxIdx)) { + if (pg_atomic_read_u32(&FirstData->status) == (uint32_t) COMPLETEDSTATUS) { + if (FirstData->startLsn == startPtr) { + xlogreader->readBuf = FirstData->data; + xlogreader->readLen = FirstData->dataLen; + gRingBufferManger->maxIdx++; + FileCheckPointPtr = FirstData->checkPointLsn; + consumerFailedNum = 0; + return FirstData->dataLen; + 
} else if (startPtr > FirstData->startLsn) { + if (FirstData->startLsn == 0) { + if (he3mirror) { + if (FirstData->checkPointLsn != 0) { + SetFileReplayLsn(FirstData->checkPointLsn); + } + } + gRingBufferManger->maxIdx++; + pushFlag = true; + } + // for master startup donot know wal endLsn,default failed to get 8 times of lsn is wal file end + if (!EnableHotStandby || LocalPromoteIsTriggered) { + WalTaskImmediateFree(); + } + continue; + } else { + elog(LOG, "stream FirstData.startLsn %X gt startPtr %X, need clean wals which ge startPtr.", + FirstData->startLsn, startPtr); + DelRangeWals(ThisTimeLineID, FirstData->startLsn, PG_UINT64_MAX); + WalTaskImmediateFree(); + return -1; + } + } else { + pg_usleep(1000); + continue; + } + } else { + timeCount++; + pg_usleep(500); + } + } while(GetShutDownStatus() == false && (FirstData == NULL || startPtr >= FirstData->startLsn)); + + return -1; +} + +XLogRecPtr producerHe3dbXLog(XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags, + int num_fpi,XLogRecPtr fileLsn) { + XLogCtlInsert *Insert = &XLogCtl->Insert; + pg_crc32c rdata_crc; + XLogRecPtr StartPos; //本次预留空间的起始位置 + XLogRecPtr EndPos; //本次预留空间的结束位置 + bool prevDoPageWrites = doPageWrites; + + /* we assume that all of the record header is in the first chunk */ + if (rdata != NULL) { + Assert(rdata->len >= SizeOfXLogRecord); + } + static char *he3Xlog = NULL; + static int datalen = 0; + if (he3Xlog == NULL) { + he3Xlog = (char *) palloc_extended(4 * XLOG_BLCKSZ, + MCXT_ALLOC_NO_OOM); + } + bool checkPointFlag = false; + if (group_total_len != 0) { + XLogRecord *rechdr = (XLogRecord *) rdata->data; + XLogRecPtr startbytepos; + He3DBReserveXLogInsertLocation(group_total_len, rechdr->xl_tot_len, &StartPos, &EndPos, + &rechdr->xl_prev,&startbytepos); + + char *currpos; + int written; + XLogRecPtr CurrPos; + XLogRecData *rdata; + CurrPos = StartPos; + /*currpos是xlog在xlog buffer的物理地址*/ + currpos = he3Xlog + datalen; + + /* Copy record data */ + written = 0; + for (int i = 0; 
i < grouo_rec_count; i++) + { + rdata = (XLogRecData *)&groupRecData[i]; + while (rdata != NULL) + { + char *rdata_data = rdata->data; + int rdata_len = rdata->len; + memcpy(currpos, rdata_data, rdata_len); + currpos += rdata_len; + CurrPos += rdata_len; + written += rdata_len; + rdata = rdata->next; + } + } + + Assert(written == group_total_len); + datalen += written; + if (CurrPos != EndPos) + elog(PANIC, "space reserved for WAL record does not match what was written"); + if (grouo_rec_count == 1 && rechdr->xl_rmid == RM_XLOG_ID) { + if (((rechdr->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN) || + ((rechdr->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_ONLINE)) { + checkPointFlag = true; + } + } + } else { + EndPos = XLogCtl->Insert.CurrBytePos; + } + + if (datalen > 2 * XLOG_BLCKSZ || checkPointFlag == true || group_total_len == 0 && (datalen != 0 || fileLsn != 0)) { + wal_batch_t walElem; + walElem.startLsn = EndPos - datalen; + walElem.endLsn = EndPos; + walElem.checkPointLsn = InvalidXLogRecPtr; + bool retry = false; + if (checkPointFlag == true) { + walElem.checkPointLsn = fileLsn; + } else { + if (group_total_len == 0 && fileLsn != 0) { + if (datalen == 0) { + walElem.startLsn = 0; + walElem.checkPointLsn = fileLsn; + } else { + // send data and fileLsn with twice + retry = true; + } + } + } + wal_batch_t *curElem = NULL; +again: + while((curElem = ring_buffer_queue(gRingBufferManger,walElem)) == NULL) { + usleep(10); + } + if (datalen != 0) { + memcpy(curElem->data,he3Xlog,datalen); + } + WalTaskPool(curElem); + datalen = 0; + if (retry == true) { + walElem.startLsn = 0; + walElem.endLsn = 0; + walElem.checkPointLsn = fileLsn; + retry = false; + goto again; + } + } + return EndPos; +} + +static int +producerXLogParallelBatchRead(XLogReaderState *xlogreader, XLogRecPtr startPtr, int reqLen) { + XLogPageReadPrivate *private = + (XLogPageReadPrivate *) xlogreader->private_data; + int emode = private->emode; + if (readedUpto == 0) { + readedUpto = 
startPtr; + } + //failed to connect master more than 8 + int failedCount = 0; + while(GetShutDownStatus() == false) { + if (!WaitForWALToBecomeAvailable(readedUpto + reqLen, + private->randAccess, + private->fetching_ckpt, + readedUpto)) + { + readLen = 0; + readSource = XLOG_FROM_ANY; + } else { + failedCount = 0; + } + + XLogRecPtr maxFlushedUpto = 0; + //storage batch is 16k batch,we has 8 of pthreads + if (!EnableHotStandby || LocalPromoteIsTriggered || (failedCount > 8 && consumerFailedNum < 8)) { + maxFlushedUpto = readedUpto + 8 * 16 * 1024; + } else { + //reConnect to master + if (consumerFailedNum >= 8) { + if (ring_buffer_is_empty(gRingBufferManger)) { + if (!XLogRecPtrIsInvalid(reStartPtr)) { + readedUpto = reStartPtr; + maxFlushedUpto = reStartPtr; + consumerFailedNum = 0; + failedCount = 0; + ReConnectPrimaryDB(); + } + } + } else { + if (readedUpto < flushedUpto) { + maxFlushedUpto = flushedUpto; + } else if (flushedUpto == 0 || !WalRcvStreaming()) { + failedCount++; + lastSourceFailed = true; + } + } + } + + /* Read the requested page */ + //readOff = targetOff; one elem is 16kb,one wal max 16kb,16kb + 16kb < 32kb + //pgstat_report_wait_start(WAIT_EVENT_WAL_READ); + XLogRecPtr start = readedUpto; + XLogRecPtr end = maxFlushedUpto; + int pos = 0; + int tlen = (16 * 1024); + for(;start < end;start += tlen) { + wal_batch_t walElem; + walElem.startLsn = start; + walElem.checkPointLsn = 0; + // walElem.endLsn = start + tlen; + if (start + tlen <= end) + walElem.endLsn = start + tlen; + else + walElem.endLsn = end; + wal_batch_t *curElem = NULL; + while((curElem = ring_buffer_queue(gRingBufferManger,walElem)) == NULL) { + pg_usleep(100000); + if (IsFreePthreadPool()){ + return 0; + } + } + WalTaskPool(curElem); + } + if (readedUpto < end) { + readedUpto = end; + } else { + pg_usleep(100000); + } + } + return 0; + //pgstat_report_wait_end(); +} + /* * Open the WAL segment containing WAL location 'RecPtr'. 
@@ -12646,7 +14828,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, { static TimestampTz last_fail_time = 0; TimestampTz now; - bool streaming_reply_sent = false; + bool streaming_reply_sent = false; /*------- * Standby mode is implemented by a state machine: @@ -12685,8 +14867,8 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, for (;;) { - XLogSource oldSource = currentSource; - bool startWalReceiver = false; + XLogSource oldSource = currentSource; + bool startWalReceiver = false; /* * First check if we failed to read from the current source, and @@ -12698,114 +14880,118 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, { switch (currentSource) { - case XLOG_FROM_ARCHIVE: - case XLOG_FROM_PG_WAL: + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: - /* - * Check to see if the trigger file exists. Note that we - * do this only after failure, so when you create the - * trigger file, we still finish replaying as much as we - * can from archive and pg_wal before failover. - */ - if (StandbyMode && CheckForStandbyTrigger()) + /* + * Check to see if the trigger file exists. Note that we + * do this only after failure, so when you create the + * trigger file, we still finish replaying as much as we + * can from archive and pg_wal before failover. + */ + if (StandbyMode && CheckForStandbyTrigger()) + { + ShutdownWalRcv(); + return false; + } + + /* + * Not in standby mode, and we've now tried the archive + * and pg_wal. + */ + if (!StandbyMode) + return false; + + /* + * Move to XLOG_FROM_STREAM state, and set to start a + * walreceiver if necessary. + */ + currentSource = XLOG_FROM_STREAM; + startWalReceiver = true; + break; + + case XLOG_FROM_STREAM: + + /* + * Failure while streaming. Most likely, we got here + * because streaming replication was terminated, or + * promotion was triggered. 
But we also get here if we + * find an invalid record in the WAL streamed from the + * primary, in which case something is seriously wrong. + * There's little chance that the problem will just go + * away, but PANIC is not good for availability either, + * especially in hot standby mode. So, we treat that the + * same as disconnection, and retry from archive/pg_wal + * again. The WAL in the archive should be identical to + * what was streamed, so it's unlikely that it helps, but + * one can hope... + */ + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + /* + * Before we leave XLOG_FROM_STREAM state, make sure that + * walreceiver is not active, so that it won't overwrite + * WAL that we restore from archive. + */ + if (WalRcvStreaming()) { + elog(LOG,"=======WalRcvStreaming========="); + ShutdownWalRcv(); + } + + /* + * Before we sleep, re-scan for possible new timelines if + * we were requested to recover to the latest timeline. + */ + if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) + { + if (rescanLatestTimeLine()) { - ShutdownWalRcv(); - return false; + currentSource = XLOG_FROM_ARCHIVE; + break; } + } - /* - * Not in standby mode, and we've now tried the archive - * and pg_wal. - */ - if (!StandbyMode) - return false; + /* + * XLOG_FROM_STREAM is the last state in our state + * machine, so we've exhausted all the options for + * obtaining the requested WAL. We're going to loop back + * and retry from the archive, but if it hasn't been long + * since last attempt, sleep wal_retrieve_retry_interval + * milliseconds to avoid busy-waiting. + */ + now = GetCurrentTimestamp(); + if (!TimestampDifferenceExceeds(last_fail_time, now, + wal_retrieve_retry_interval)) + { + long wait_time; - /* - * Move to XLOG_FROM_STREAM state, and set to start a - * walreceiver if necessary. 
- */ - currentSource = XLOG_FROM_STREAM; - startWalReceiver = true; - break; - - case XLOG_FROM_STREAM: - - /* - * Failure while streaming. Most likely, we got here - * because streaming replication was terminated, or - * promotion was triggered. But we also get here if we - * find an invalid record in the WAL streamed from the - * primary, in which case something is seriously wrong. - * There's little chance that the problem will just go - * away, but PANIC is not good for availability either, - * especially in hot standby mode. So, we treat that the - * same as disconnection, and retry from archive/pg_wal - * again. The WAL in the archive should be identical to - * what was streamed, so it's unlikely that it helps, but - * one can hope... - */ - - /* - * We should be able to move to XLOG_FROM_STREAM only in - * standby mode. - */ - Assert(StandbyMode); - - /* - * Before we leave XLOG_FROM_STREAM state, make sure that - * walreceiver is not active, so that it won't overwrite - * WAL that we restore from archive. - */ - if (WalRcvStreaming()) - ShutdownWalRcv(); - - /* - * Before we sleep, re-scan for possible new timelines if - * we were requested to recover to the latest timeline. - */ - if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) - { - if (rescanLatestTimeLine()) - { - currentSource = XLOG_FROM_ARCHIVE; - break; - } + wait_time = wal_retrieve_retry_interval - + TimestampDifferenceMilliseconds(last_fail_time, now); + //this for mutipthread use WaitLatch=>WaitEventSetWait=>global variables LatchWaitSet + while(!ring_buffer_is_empty(gRingBufferManger) && !IsFreePthreadPool() && !startup_shutdown_requested) { + pg_usleep(1000L); } - - /* - * XLOG_FROM_STREAM is the last state in our state - * machine, so we've exhausted all the options for - * obtaining the requested WAL. 
We're going to loop back - * and retry from the archive, but if it hasn't been long - * since last attempt, sleep wal_retrieve_retry_interval - * milliseconds to avoid busy-waiting. - */ + (void)WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + wait_time, + WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL); + ResetLatch(&XLogCtl->recoveryWakeupLatch); now = GetCurrentTimestamp(); - if (!TimestampDifferenceExceeds(last_fail_time, now, - wal_retrieve_retry_interval)) - { - long wait_time; - wait_time = wal_retrieve_retry_interval - - TimestampDifferenceMilliseconds(last_fail_time, now); + /* Handle interrupt signals of startup process */ + HandleStartupProcInterrupts(); + } + last_fail_time = now; + currentSource = XLOG_FROM_ARCHIVE; + break; - (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT | - WL_EXIT_ON_PM_DEATH, - wait_time, - WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL); - ResetLatch(&XLogCtl->recoveryWakeupLatch); - now = GetCurrentTimestamp(); - - /* Handle interrupt signals of startup process */ - HandleStartupProcInterrupts(); - } - last_fail_time = now; - currentSource = XLOG_FROM_ARCHIVE; - break; - - default: - elog(ERROR, "unexpected WAL source %d", currentSource); + default: + elog(ERROR, "unexpected WAL source %d", currentSource); } } else if (currentSource == XLOG_FROM_PG_WAL) @@ -12832,250 +15018,280 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, switch (currentSource) { - case XLOG_FROM_ARCHIVE: - case XLOG_FROM_PG_WAL: + case XLOG_FROM_ARCHIVE: + case XLOG_FROM_PG_WAL: + /* + * WAL receiver must not be running when reading WAL from + * archive or pg_wal. + */ + Assert(!WalRcvStreaming()); + if (he3mirror) { + /* Close any old file we might have open. */ + if (readFile >= 0) + { + close(readFile); + readFile = -1; + } + /* Reset curFileTLI if random fetch. 
*/ + if (randAccess) + curFileTLI = 0; + + /* + * Try to restore the file from archive, or read an existing + * file from pg_wal. + */ + readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, + currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : + currentSource); + if (readFile >= 0) + return true; /* success! */ + + /* + * Nope, not found in archive or pg_wal. + */ + lastSourceFailed = true; + break; + } else{ + /* Reset curFileTLI if random fetch. */ + if (randAccess) + curFileTLI = 0; + + XLogPageReadAnyTLI(); + return true; /* success! */ + } + + case XLOG_FROM_STREAM: + { + bool havedata; + + /* + * We should be able to move to XLOG_FROM_STREAM only in + * standby mode. + */ + Assert(StandbyMode); + + /* + * First, shutdown walreceiver if its restart has been + * requested -- but no point if we're already slated for + * starting it. + */ + if (pendingWalRcvRestart && !startWalReceiver) + { + ShutdownWalRcv(); /* - * WAL receiver must not be running when reading WAL from - * archive or pg_wal. + * Re-scan for possible new timelines if we were + * requested to recover to the latest timeline. */ - Assert(!WalRcvStreaming()); + if (recoveryTargetTimeLineGoal == + RECOVERY_TARGET_TIMELINE_LATEST) + rescanLatestTimeLine(); - /* Close any old file we might have open. */ - if (readFile >= 0) + startWalReceiver = true; + } + pendingWalRcvRestart = false; + + /* + * Launch walreceiver if needed. + * + * If fetching_ckpt is true, RecPtr points to the initial + * checkpoint location. In that case, we use RedoStartLSN + * as the streaming start position instead of RecPtr, so + * that when we later jump backwards to start redo at + * RedoStartLSN, we will have the logs streamed already. 
+ */ + if (startWalReceiver && + PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) + { + XLogRecPtr ptr; + TimeLineID tli; + + if (fetching_ckpt) { - close(readFile); - readFile = -1; + ptr = RedoStartLSN; + tli = ControlFile->checkPointCopy.ThisTimeLineID; } - /* Reset curFileTLI if random fetch. */ - if (randAccess) - curFileTLI = 0; + else + { + ptr = RecPtr; + /* + * Use the record begin position to determine the + * TLI, rather than the position we're reading. + */ + + // if (startWalReceiver && + // PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) + // { + // XLogRecPtr ptr; + // TimeLineID tli; + + tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); + + + if (curFileTLI > 0 && tli < curFileTLI) + elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", + LSN_FORMAT_ARGS(tliRecPtr), + tli, curFileTLI); + } + curFileTLI = tli; + ThisTimeLineID2 = tli; + RequestXLogStreaming(tli, ptr, PrimaryConnInfo, + PrimarySlotName, + wal_receiver_create_temp_slot); + flushedUpto = 0; + } + + + /*if (push_standby == true) { + PushPtr = tliRecPtr; + }*/ + + /* + * Check if WAL receiver is active or wait to start up. + */ + if (!WalRcvStreaming()) + { + lastSourceFailed = true; + break; + } + + /* + * Walreceiver is active, so see if new data has arrived. + * + * We only advance XLogReceiptTime when we obtain fresh + * WAL from walreceiver and observe that we had already + * processed everything before the most recent "chunk" + * that it flushed to disk. In steady state where we are + * keeping up with the incoming data, XLogReceiptTime will + * be updated on each cycle. When we are behind, + * XLogReceiptTime will not advance, so the grace time + * allotted to conflicting queries will decrease. 
+ */ + if (RecPtr < flushedUpto) + havedata = true; + else + { + XLogRecPtr latestChunkStart; + + flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI); + if (RecPtr < flushedUpto && receiveTLI == curFileTLI) + { + havedata = true; + if (latestChunkStart <= RecPtr) + { + XLogReceiptTime = GetCurrentTimestamp(); + SetCurrentChunkStartTime(XLogReceiptTime); + } + } + else + havedata = false; + } + + if (havedata) + { /* - * Try to restore the file from archive, or read an existing - * file from pg_wal. + * Great, streamed far enough. Open the file if it's + * not open already. Also read the timeline history + * file if we haven't initialized timeline history + * yet; it should be streamed over and present in + * pg_wal by now. Use XLOG_FROM_STREAM so that source + * info is set correctly and XLogReceiptTime isn't + * changed. + * + * NB: We must set readTimeLineHistory based on + * recoveryTargetTLI, not receiveTLI. Normally they'll + * be the same, but if recovery_target_timeline is + * 'latest' and archiving is configured, then it's + * possible that we managed to retrieve one or more + * new timeline history files from the archive, + * updating recoveryTargetTLI. */ - readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, - currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : - currentSource); - if (readFile >= 0) - return true; /* success! */ + if(he3mirror){ + if (readFile < 0) { + if (!expectedTLEs) + expectedTLEs = readTimeLineHistory(recoveryTargetTLI); + readFile = XLogFileRead(readSegNo, PANIC, + receiveTLI, + XLOG_FROM_STREAM, false); + Assert(readFile >= 0); + } else { + /* just make sure source info is correct... */ + readSource = XLOG_FROM_STREAM; + XLogReceiptSource = XLOG_FROM_STREAM; + return true; + } + } else{ + /* just make sure source info is correct... */ + readSource = XLOG_FROM_STREAM; + XLogReceiptSource = XLOG_FROM_STREAM; + return true; + } + break; + } + /* + * Data not here yet. 
Check for trigger, then wait for + * walreceiver to wake us up when new WAL arrives. + */ + if (CheckForStandbyTrigger()) + { /* - * Nope, not found in archive or pg_wal. + * Note that we don't "return false" immediately here. + * After being triggered, we still want to replay all + * the WAL that was already streamed. It's in pg_wal + * now, so we just treat this as a failure, and the + * state machine will move on to replay the streamed + * WAL from pg_wal, and then recheck the trigger and + * exit replay. */ lastSourceFailed = true; break; + } - case XLOG_FROM_STREAM: - { - bool havedata; + /* + * Since we have replayed everything we have received so + * far and are about to start waiting for more WAL, let's + * tell the upstream server our replay location now so + * that pg_stat_replication doesn't show stale + * information. + */ + if (!streaming_reply_sent) + { + WalRcvForceReply(); + streaming_reply_sent = true; + } - /* - * We should be able to move to XLOG_FROM_STREAM only in - * standby mode. - */ - Assert(StandbyMode); + /* + * Wait for more WAL to arrive. Time out after 5 seconds + * to react to a trigger file promptly and to check if the + * WAL receiver is still active. + */ + //this for mutipthread use WaitLatch=>WaitEventSetWait=>global variables LatchWaitSet + while(!ring_buffer_is_empty(gRingBufferManger) && !IsFreePthreadPool() && !startup_shutdown_requested) { + pg_usleep(1000L); + } + (void)WaitLatch(&XLogCtl->recoveryWakeupLatch, + WL_LATCH_SET | WL_TIMEOUT | + WL_EXIT_ON_PM_DEATH, + 1000L, WAIT_EVENT_RECOVERY_WAL_STREAM); + if (he3mirror) { + group_total_len = 0; + producerHe3dbXLog(NULL,0,0,0,0); + } + ResetLatch(&XLogCtl->recoveryWakeupLatch); + break; + } - /* - * First, shutdown walreceiver if its restart has been - * requested -- but no point if we're already slated for - * starting it. 
- */ - if (pendingWalRcvRestart && !startWalReceiver) - { - ShutdownWalRcv(); - - /* - * Re-scan for possible new timelines if we were - * requested to recover to the latest timeline. - */ - if (recoveryTargetTimeLineGoal == - RECOVERY_TARGET_TIMELINE_LATEST) - rescanLatestTimeLine(); - - startWalReceiver = true; - } - pendingWalRcvRestart = false; - - /* - * Launch walreceiver if needed. - * - * If fetching_ckpt is true, RecPtr points to the initial - * checkpoint location. In that case, we use RedoStartLSN - * as the streaming start position instead of RecPtr, so - * that when we later jump backwards to start redo at - * RedoStartLSN, we will have the logs streamed already. - */ - if (push_standby == false && startWalReceiver && - PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) - { - XLogRecPtr ptr; - TimeLineID tli; - - if (fetching_ckpt) - { - ptr = RedoStartLSN; - tli = ControlFile->checkPointCopy.ThisTimeLineID; - } - else - { - ptr = RecPtr; - - /* - * Use the record begin position to determine the - * TLI, rather than the position we're reading. - */ - tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); - - if (curFileTLI > 0 && tli < curFileTLI) - elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u", - LSN_FORMAT_ARGS(tliRecPtr), - tli, curFileTLI); - } - curFileTLI = tli; - RequestXLogStreaming(tli, ptr, PrimaryConnInfo, - PrimarySlotName, - wal_receiver_create_temp_slot); - flushedUpto = 0; - } - - /*if (push_standby == true) { - PushPtr = tliRecPtr; - }*/ - - /* - * Check if WAL receiver is active or wait to start up. - */ - if (!WalRcvStreaming()) - { - lastSourceFailed = true; - break; - } - - /* - * Walreceiver is active, so see if new data has arrived. - * - * We only advance XLogReceiptTime when we obtain fresh - * WAL from walreceiver and observe that we had already - * processed everything before the most recent "chunk" - * that it flushed to disk. 
In steady state where we are - * keeping up with the incoming data, XLogReceiptTime will - * be updated on each cycle. When we are behind, - * XLogReceiptTime will not advance, so the grace time - * allotted to conflicting queries will decrease. - */ - if (RecPtr < flushedUpto) - havedata = true; - else - { - XLogRecPtr latestChunkStart; - - flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI); - if (RecPtr < flushedUpto && receiveTLI == curFileTLI) - { - havedata = true; - if (latestChunkStart <= RecPtr) - { - XLogReceiptTime = GetCurrentTimestamp(); - SetCurrentChunkStartTime(XLogReceiptTime); - } - } - else - havedata = false; - } - if (havedata) - { - /* - * Great, streamed far enough. Open the file if it's - * not open already. Also read the timeline history - * file if we haven't initialized timeline history - * yet; it should be streamed over and present in - * pg_wal by now. Use XLOG_FROM_STREAM so that source - * info is set correctly and XLogReceiptTime isn't - * changed. - * - * NB: We must set readTimeLineHistory based on - * recoveryTargetTLI, not receiveTLI. Normally they'll - * be the same, but if recovery_target_timeline is - * 'latest' and archiving is configured, then it's - * possible that we managed to retrieve one or more - * new timeline history files from the archive, - * updating recoveryTargetTLI. - */ - if (readFile < 0) - { - if (!expectedTLEs) - expectedTLEs = readTimeLineHistory(recoveryTargetTLI); - readFile = XLogFileRead(readSegNo, PANIC, - receiveTLI, - XLOG_FROM_STREAM, false); - Assert(readFile >= 0); - } - else - { - /* just make sure source info is correct... */ - readSource = XLOG_FROM_STREAM; - XLogReceiptSource = XLOG_FROM_STREAM; - return true; - } - break; - } - - /* - * Data not here yet. Check for trigger, then wait for - * walreceiver to wake us up when new WAL arrives. - */ - if (CheckForStandbyTrigger()) - { - /* - * Note that we don't "return false" immediately here. 
- * After being triggered, we still want to replay all - * the WAL that was already streamed. It's in pg_wal - * now, so we just treat this as a failure, and the - * state machine will move on to replay the streamed - * WAL from pg_wal, and then recheck the trigger and - * exit replay. - */ - lastSourceFailed = true; - break; - } - - /* - * Since we have replayed everything we have received so - * far and are about to start waiting for more WAL, let's - * tell the upstream server our replay location now so - * that pg_stat_replication doesn't show stale - * information. - */ - if (!streaming_reply_sent) - { - WalRcvForceReply(); - streaming_reply_sent = true; - } - - /* - * Wait for more WAL to arrive. Time out after 5 seconds - * to react to a trigger file promptly and to check if the - * WAL receiver is still active. - */ - (void) WaitLatch(&XLogCtl->recoveryWakeupLatch, - WL_LATCH_SET | WL_TIMEOUT | - WL_EXIT_ON_PM_DEATH, - 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM); - ResetLatch(&XLogCtl->recoveryWakeupLatch); - break; - } - - default: - elog(ERROR, "unexpected WAL source %d", currentSource); + default: + elog(ERROR, "unexpected WAL source %d", currentSource); } /* * Check for recovery pause here so that we can confirm more quickly * that a requested pause has actually taken effect. */ - if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState != + if (((volatile XLogCtlData *)XLogCtl)->recoveryPauseState != RECOVERY_NOT_PAUSED) recoveryPausesHere(false); @@ -13086,15 +15302,14 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, HandleStartupProcInterrupts(); } - return false; /* not reached */ + return false; /* not reached */ } /* * Set flag to signal the walreceiver to restart. (The startup process calls * this on noticing a relevant configuration change.) 
*/ -void -StartupRequestWalReceiverRestart(void) +void StartupRequestWalReceiverRestart(void) { if (currentSource == XLOG_FROM_STREAM && WalRcvRunning()) { @@ -13144,8 +15359,7 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) * Unlike CheckForStandbyTrigger(), this works in any process * that's connected to shared memory. */ -bool -PromoteIsTriggered(void) +bool PromoteIsTriggered(void) { /* * We check shared state each time only until a standby promotion is @@ -13224,8 +15438,7 @@ CheckForStandbyTrigger(void) /* * Remove the files signaling a standby promotion request. */ -void -RemovePromoteSignalFiles(void) +void RemovePromoteSignalFiles(void) { unlink(PROMOTE_SIGNAL_FILE); } @@ -13233,8 +15446,7 @@ RemovePromoteSignalFiles(void) /* * Check to see if a promote request has arrived. */ -bool -CheckPromoteSignal(void) +bool CheckPromoteSignal(void) { struct stat stat_buf; @@ -13248,8 +15460,7 @@ CheckPromoteSignal(void) * Wake up startup process to replay newly arrived WAL, or to notice that * failover has been requested. */ -void -WakeupRecovery(void) +void WakeupRecovery(void) { SetLatch(&XLogCtl->recoveryWakeupLatch); } @@ -13257,8 +15468,7 @@ WakeupRecovery(void) /* * Update the WalWriterSleeping flag. */ -void -SetWalWriterSleeping(bool sleeping) +void SetWalWriterSleeping(bool sleeping) { SpinLockAcquire(&XLogCtl->info_lck); XLogCtl->WalWriterSleeping = sleeping; @@ -13268,8 +15478,320 @@ SetWalWriterSleeping(bool sleeping) /* * Schedule a walreceiver wakeup in the main recovery loop. 
*/ -void -XLogRequestWalReceiverReply(void) +void XLogRequestWalReceiverReply(void) { doRequestWalReceiverReply = true; } + +static void PrecacheHotDataByRules() +{ + char instanceName[NAMEDATALEN]; //default:master + char metaHost[16]; //default:127.0.0.1 + char metaUser[NAMEDATALEN]; //default:postgres + char metaPw[NAMEDATALEN]; //default:123456 + char metaPort[8]; //default:5432 + char localPort[8]; //default:PostPortNumber + StringInfoData cmd, metaConnStr, localConnStr; + + initStringInfo(&cmd); + initStringInfo(&metaConnStr); + initStringInfo(&localConnStr); + + memset(instanceName, 0, NAMEDATALEN); + memset(metaHost, 0, 16); + memset(metaUser, 0, NAMEDATALEN); + memset(metaPw, 0, NAMEDATALEN); + memset(metaPort, 0, 8); + memset(localPort, 0, 8); + + //parse + if (strlen(he3_meta_conninfo) > 0) + { + char *temStr; + char *temChr; + int temStrLen; + + //instanceName + temStr = strstr(he3_meta_conninfo, "application_name="); + temStrLen = strlen("application_name="); + + if (temStr != NULL) + { + temChr = strchr(temStr, ' '); + if (temChr != NULL) + { + memcpy(instanceName, temStr + temStrLen, temChr - temStr - temStrLen); + } + else + { + strcpy(instanceName, temStr + temStrLen); + } + } + else + { + strcpy(instanceName, "master"); + } + + //metaHost + temStr = strstr(he3_meta_conninfo, "host="); + temStrLen = strlen("host="); + + if (temStr != NULL) + { + temChr = strchr(temStr, ' '); + if (temChr != NULL) + { + memcpy(metaHost, temStr + temStrLen, temChr - temStr - temStrLen); + } + else + { + strcpy(metaHost, temStr + temStrLen); + } + } + else + { + strcpy(metaHost, "127.0.0.1"); + } + + //metaUser + temStr = strstr(he3_meta_conninfo, "user="); + temStrLen = strlen("user="); + + if (temStr != NULL) + { + temChr = strchr(temStr, ' '); + if (temChr != NULL) + { + memcpy(metaUser, temStr + temStrLen, temChr - temStr - temStrLen); + } + else + { + strcpy(metaUser, temStr + temStrLen); + } + } + else + { + strcpy(metaUser, "postgres"); + } + + //metaPw + 
temStr = strstr(he3_meta_conninfo, "password="); + temStrLen = strlen("password="); + + if (temStr != NULL) + { + temChr = strchr(temStr, ' '); + if (temChr != NULL) + { + memcpy(metaPw, temStr + temStrLen, temChr - temStr - temStrLen); + } + else + { + strcpy(metaPw, temStr + temStrLen); + } + } + else + { + strcpy(metaPw, "123456"); + } + + //metaPort + temStr = strstr(he3_meta_conninfo, "port="); + temStrLen = strlen("port="); + + if (temStr != NULL) + { + temChr = strchr(temStr, ' '); + if (temChr != NULL) + { + memcpy(metaPort, temStr + temStrLen, temChr - temStr - temStrLen); + } + else + { + strcpy(metaPort, temStr + temStrLen); + } + } + else + { + strcpy(metaPort, "5432"); + } + } + else + { + return; + } + + //assemble metaConnStr + appendStringInfoString(&metaConnStr, "host="); + appendStringInfoString(&metaConnStr, metaHost); + appendStringInfoString(&metaConnStr, " user="); + appendStringInfoString(&metaConnStr, metaUser); + appendStringInfoString(&metaConnStr, " password="); + appendStringInfoString(&metaConnStr, metaPw); + appendStringInfoString(&metaConnStr, " port="); + appendStringInfoString(&metaConnStr, metaPort); + appendStringInfoString(&metaConnStr, " dbname=postgres"); + + //local + sprintf(localPort, "%d", PostPortNumber); + + PGconn *metaConn = PQconnectdb(metaConnStr.data); + if (PQstatus(metaConn) != CONNECTION_OK) + { + PQfinish(metaConn); + return; + } + + appendStringInfoString(&cmd, "SELECT datname, relname, crules, ctype, indexname, keyname, keyvalue, keytype, action FROM pg_hot_data WHERE crulessettime>cachetime AND clientname='"); + appendStringInfoString(&cmd, instanceName); + appendStringInfoString(&cmd, "'"); + + //Query the corresponding precache policy + PGresult *ruleRes = PQexec(metaConn, cmd.data); + if (PQresultStatus(ruleRes) != PGRES_TUPLES_OK) + { + PQclear(ruleRes); + PQfinish(metaConn); + return; + } + int rows = PQntuples(ruleRes); + for(int i=0; iinfo_lck); + LogwrtResult = XLogCtl->LogwrtResult; + XLogParralFlush 
flushinfo = XLogCtl->LogFlush; + SpinLockRelease(&XLogCtl->info_lck); + *writtenlsn = LogwrtResult.Write; + *flushlsn = (XLogRecPtr) pg_atomic_read_u64(&LogwrtResult.Flush); + *totaltimes = flushinfo.last; + *parallels = flushinfo.last - pg_atomic_read_u64(&flushinfo.begin); +} \ No newline at end of file diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 64ec780..fac5e47 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -31,7 +31,15 @@ #include "replication/origin.h" #include "storage/bufmgr.h" #include "storage/proc.h" +#include "storage/spin.h" #include "utils/memutils.h" +#include "access/heapam_xlog.h" +#include "access/nbtxlog.h" +#include "access/nbtree.h" +#include "access/gistxlog.h" +#include "access/gist_private.h" +#include "access/spgxlog.h" +#include "access/brin_xlog.h" /* Buffer size required to store a compressed version of backup block image */ #define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) @@ -68,10 +76,12 @@ static int max_registered_block_id = 0; /* highest block_id + 1 currently int group_total_len; int grouo_rec_count; +int grouo_rec_cur_count; XLogRecord *grouphead[XLR_MAX_BLOCK_ID + 1]; int grouplens[XLR_MAX_BLOCK_ID + 1]; XLogRecData groupRecData[XLR_MAX_BLOCK_ID + 1]; +XLogRecPtr groupEndLsn[XLR_MAX_BLOCK_ID + 1]; /* * A chain of XLogRecDatas to hold the "main data" of a WAL record, registered @@ -525,7 +535,8 @@ XLogInsert(RmgrId rmid, uint8 info) EndPos = SizeOfXLogLongPHD; /* start of 1st chkpt record */ return EndPos; } - + + do { XLogRecPtr RedoRecPtr; @@ -543,15 +554,442 @@ XLogInsert(RmgrId rmid, uint8 info) rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, &fpw_lsn, &num_fpi); - - EndPos = XLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi); + EndPos = He3DBXLogInsertRecord(rdt, fpw_lsn, curinsert_flags, num_fpi); } while (EndPos == InvalidXLogRecPtr); - XLogResetInsertion(); return EndPos; } +static XLogRecData 
g_bkp_rdatas[XLR_MAX_BLOCK_ID + 1][2]; +static XLogRecData g_main_data; + +static void extendMainData(XLogReaderState *state) { + int extendSize = 64; + if (state->main_data_len + extendSize > state->main_data_bufsz) + { + + if (state->main_data) + pfree(state->main_data); + + /* + * main_data_bufsz must be MAXALIGN'ed. In many xlog record + * types, we omit trailing struct padding on-disk to save a few + * bytes; but compilers may generate accesses to the xlog struct + * that assume that padding bytes are present. If the palloc + * request is not large enough to include such padding bytes then + * we'll get valgrind complaints due to otherwise-harmless fetches + * of the padding bytes. + * + * In addition, force the initial request to be reasonably large + * so that we don't waste time with lots of trips through this + * stanza. BLCKSZ / 2 seems like a good compromise choice. + */ + state->main_data_bufsz = MAXALIGN(Max(state->main_data_len + extendSize, + BLCKSZ / 2)); + state->main_data = palloc(state->main_data_bufsz); + } +} + +static void convertMainData(XLogReaderState *state, OldXLogRecord *record) { + RmgrId rmid = record->xl_rmid; + uint8 info = (record->xl_info & ~XLR_INFO_MASK); + switch(rmid) { + case RM_HEAP2_ID: + { + if ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP2_VISIBLE) { + xl_old_heap_visible *xlrec = (xl_old_heap_visible *) XLogRecGetData(state); + xl_heap_visible xlrecNew; + xlrecNew.rnode = state->blocks[1].rnode; + xlrecNew.blkno = state->blocks[1].blkno; + xlrecNew.cutoff_xid = xlrec->cutoff_xid; + xlrecNew.flags = xlrec->flags; + extendMainData(state); + state->main_data_len = sizeof(xl_heap_visible); + memcpy(state->main_data,&xlrecNew,state->main_data_len); + } + break; + } + case RM_HEAP_ID: + { + if (((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_UPDATE) || + ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_HOT_UPDATE)) { + xl_old_heap_update *xlrec = (xl_old_heap_update *) XLogRecGetData(state); + xl_heap_update xlrecNew; + xlrecNew.old_xmax = 
xlrec->old_xmax; + xlrecNew.old_offnum = xlrec->old_offnum; + xlrecNew.old_infobits_set = xlrec->old_infobits_set; + xlrecNew.flags = xlrec->flags; + xlrecNew.new_xmax = xlrec->new_xmax; + xlrecNew.new_offnum = xlrec->new_offnum; + xlrecNew.newblk = state->blocks[0].blkno; + if(state->max_block_id == 0){ + xlrecNew.oldblk = state->blocks[0].blkno; + } else{ + xlrecNew.oldblk = state->blocks[1].blkno; + } + xlrecNew.rnode = state->blocks[0].rnode; + extendMainData(state); + state->main_data_len = sizeof(xl_heap_update); + memcpy(state->main_data,&xlrecNew,state->main_data_len); + } + break; + } + case RM_BTREE_ID: + { + if (info == XLOG_BTREE_SPLIT_L || info == XLOG_BTREE_SPLIT_R) { + xl_old_btree_split *xlrec = (xl_old_btree_split *) XLogRecGetData(state); + xl_btree_split xlrecNew; + xlrecNew.level = xlrec->level; + xlrecNew.firstrightoff = xlrec->firstrightoff; + xlrecNew.newitemoff = xlrec->newitemoff; + xlrecNew.postingoff = xlrec->postingoff; + xlrecNew.origpagenumber = state->blocks[0].blkno; + xlrecNew.rightpagenumber = state->blocks[1].blkno; + if (!XLogRecGetBlockTag(state, 2, NULL, NULL, &xlrecNew.spagenumber)) + xlrecNew.spagenumber = P_NONE; + extendMainData(state); + state->main_data_len = sizeof(xl_btree_split); + memcpy(state->main_data,&xlrecNew,state->main_data_len); + } + break; + } + case RM_GIST_ID: + { + if (info == XLOG_GIST_PAGE_SPLIT) { + gistoldxlogPageSplit *xlrec = (gistoldxlogPageSplit *) XLogRecGetData(state); + gistxlogPageSplit xlrecNew; + xlrecNew.markfollowright = xlrec->markfollowright; + xlrecNew.npage = xlrec->npage; + xlrecNew.origleaf = xlrec->origleaf; + xlrecNew.orignsn = xlrec->orignsn; + xlrecNew.origrlink = xlrec->origrlink; + xlrecNew.isroot = false; + if (xlrec->npage > 0) { + if (state->blocks[1].blkno == GIST_ROOT_BLKNO) { + xlrecNew.isroot = true; + } + } + extendMainData(state); + state->main_data_len = sizeof(gistxlogPageSplit); + memcpy(state->main_data,&xlrecNew,state->main_data_len); + } + break; + } + case 
RM_SPGIST_ID: + { + if (info == XLOG_SPGIST_ADD_LEAF) { + spgoldxlogAddLeaf *xlrec = (spgoldxlogAddLeaf *) XLogRecGetData(state); + spgxlogAddLeaf xlrecNew; + xlrecNew.newPage = xlrec->newPage; /* init dest page? */ + xlrecNew.storesNulls = xlrec->storesNulls; /* page is in the nulls tree? */ + xlrecNew.offnumLeaf = xlrec->offnumLeaf; /* offset where leaf tuple gets placed */ + xlrecNew.offnumHeadLeaf = xlrec->offnumHeadLeaf; /* offset of head tuple in chain, if any */ + + xlrecNew.offnumParent = xlrec->offnumParent; /* where the parent downlink is, if any */ + xlrecNew.nodeI = xlrec->nodeI; + xlrecNew.blknoLeaf = state->blocks[0].blkno; + extendMainData(state); + state->main_data_len = sizeof(spgxlogAddLeaf); + memcpy(state->main_data,&xlrecNew,state->main_data_len); + + } else if (info == XLOG_SPGIST_MOVE_LEAFS) { + spgoldxlogMoveLeafs *xlrec = (spgoldxlogMoveLeafs *) XLogRecGetData(state); + spgxlogMoveLeafs xlrecNew; + xlrecNew.nMoves = xlrec->nMoves; /* number of tuples moved from source page */ + xlrecNew.newPage = xlrec->newPage; /* init dest page? */ + xlrecNew.replaceDead = xlrec->replaceDead; /* are we replacing a DEAD source tuple? */ + xlrecNew.storesNulls = xlrec->storesNulls; /* pages are in the nulls tree? */ + + /* where the parent downlink is */ + xlrecNew.offnumParent = xlrec->offnumParent; + xlrecNew.nodeI = xlrec->nodeI; + + xlrecNew.stateSrc = xlrec->stateSrc; + + /* for he3pg */ + xlrecNew.blknoDst = state->blocks[1].blkno; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nMoves + * array of inserted tuple numbers, length nMoves + 1 or 1 + * list of leaf tuples, length nMoves + 1 or 1 (unaligned!) 
+ * + * Note: if replaceDead is true then there is only one inserted tuple + * number and only one leaf tuple in the data, because we are not copying + * the dead tuple from the source + *---------- + */ + char* tmp = palloc(state->main_data_len-SizeOfOldSpgxlogMoveLeafs); + memcpy(tmp,state->main_data+SizeOfOldSpgxlogMoveLeafs,state->main_data_len-SizeOfOldSpgxlogMoveLeafs); + extendMainData(state); + memcpy(state->main_data,&xlrecNew,SizeOfSpgxlogMoveLeafs); + memcpy(state->main_data + SizeOfSpgxlogMoveLeafs, tmp, state->main_data_len-SizeOfOldSpgxlogMoveLeafs); + state->main_data_len += SizeOfSpgxlogMoveLeafs-SizeOfOldSpgxlogMoveLeafs; + pfree(tmp); + } else if (info == XLOG_SPGIST_ADD_NODE) { + spgoldxlogAddNode *xlrec = (spgoldxlogAddNode *) XLogRecGetData(state); + spgxlogAddNode xlrecNew; + xlrecNew.offnum = xlrec->offnum; + /* + * Offset of the new tuple, on the new page (on backup block 1). Invalid, + * if we overwrote the old tuple in the original page). + */ + xlrecNew.offnumNew = xlrec->offnumNew; + xlrecNew.newPage = xlrec->newPage; /* init new page? */ + + /*---- + * Where is the parent downlink? parentBlk indicates which page it's on, + * and offnumParent is the offset within the page. 
The possible values for + * parentBlk are: + * + * 0: parent == original page + * 1: parent == new page + * 2: parent == different page (blk ref 2) + * -1: parent not updated + *---- + */ + xlrecNew.parentBlk = xlrec->parentBlk; + xlrecNew.offnumParent = xlrec->offnumParent; /* offset within the parent page */ + + xlrecNew.nodeI = xlrec->nodeI; + xlrecNew.blkno1 = state->blocks[0].blkno; + xlrecNew.stateSrc = xlrec->stateSrc; + extendMainData(state); + state->main_data_len = sizeof(spgxlogAddNode); + memcpy(state->main_data,&xlrecNew,state->main_data_len); + + } else if (info == XLOG_SPGIST_PICKSPLIT) { + spgoldxlogPickSplit *xlrec = (spgoldxlogPickSplit *) XLogRecGetData(state); + spgxlogPickSplit xlrecNew; + xlrecNew.isRootSplit = xlrec->isRootSplit; + + xlrecNew.nDelete = xlrec->nDelete; /* n to delete from Src */ + xlrecNew.nInsert = xlrec->nInsert; /* n to insert on Src and/or Dest */ + xlrecNew.initSrc = xlrec->initSrc; /* re-init the Src page? */ + xlrecNew.initDest = xlrec->initDest; /* re-init the Dest page? */ + + /* for he3pg */ + xlrecNew.blknoInner = state->blocks[2].blkno; + /* where to put new inner tuple */ + xlrecNew.offnumInner = xlrec->offnumInner; + xlrecNew.initInner = xlrec->initInner; /* re-init the Inner page? */ + + xlrecNew.storesNulls = xlrec->storesNulls; /* pages are in the nulls tree? */ + + /* where the parent downlink is, if any */ + xlrecNew.innerIsParent = xlrec->innerIsParent; /* is parent the same as inner page? */ + xlrecNew.offnumParent = xlrec->offnumParent; + xlrecNew.nodeI = xlrec->nodeI; + + xlrecNew.stateSrc = xlrec->stateSrc; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nDelete + * array of inserted tuple numbers, length nInsert + * array of page selector bytes for inserted tuples, length nInsert + * new inner tuple (unaligned!) + * list of leaf tuples, length nInsert (unaligned!) 
+ *---------- + */ + char* tmp = palloc(state->main_data_len-SizeOfOldSpgxlogPickSplit); + memcpy(tmp,state->main_data+SizeOfOldSpgxlogPickSplit,state->main_data_len-SizeOfOldSpgxlogPickSplit); + extendMainData(state); + memcpy(state->main_data,&xlrecNew,SizeOfSpgxlogPickSplit); + memcpy(state->main_data + SizeOfSpgxlogPickSplit, tmp, state->main_data_len-SizeOfOldSpgxlogPickSplit); + state->main_data_len += SizeOfSpgxlogPickSplit-SizeOfOldSpgxlogPickSplit; + pfree(tmp); + } + break; + } + case RM_BRIN_ID: + { + if (info == XLOG_BRIN_INSERT) { + xl_old_brin_insert *xlrec = (xl_old_brin_insert *) XLogRecGetData(state); + xl_brin_insert xlrecNew; + xlrecNew.heapBlk = xlrec->heapBlk; + /* extra information needed to update the revmap */ + xlrecNew.pagesPerRange = xlrec->pagesPerRange; + xlrecNew.block0 = state->blocks[0].blkno; + /* offset number in the main page to insert the tuple to. */ + xlrecNew.offnum = xlrec->offnum; + extendMainData(state); + state->main_data_len = sizeof(xl_brin_insert); + memcpy(state->main_data,&xlrecNew,state->main_data_len); + } else if ( info == XLOG_BRIN_UPDATE) { + xl_old_brin_update *xlrec = (xl_old_brin_update *) XLogRecGetData(state); + xl_brin_update xlrecUpdate; + xl_old_brin_insert *xlrecInsert = &xlrec->insert; + xl_brin_insert xlrecNew; + xlrecNew.heapBlk = xlrecInsert->heapBlk; + /* extra information needed to update the revmap */ + xlrecNew.pagesPerRange = xlrecInsert->pagesPerRange; + xlrecNew.block0 = state->blocks[0].blkno; + /* offset number in the main page to insert the tuple to. 
*/ + xlrecNew.offnum = xlrecInsert->offnum; + xlrecUpdate.oldOffnum = xlrec->oldOffnum; + xlrecUpdate.insert = xlrecNew; + extendMainData(state); + state->main_data_len = sizeof(xl_brin_update); + memcpy(state->main_data,&xlrecUpdate,state->main_data_len); + } + break; + } + default: + break; + } +} + +XLogRecData *DecodeXLogRecordAssemble(XLogReaderState *state, OldXLogRecord *record, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, int *num_fpi) +{ + + /* + * Make an rdata chain containing all the data portions of all block + * references. This includes the data for full-page images. Also append + * the headers for the block references in the scratch buffer. + */ + RmgrId rmid = record->xl_rmid; + uint8 info = record->xl_info; + *fpw_lsn = InvalidXLogRecPtr; + int block_id; + XLogRecord *rechdr = NULL; + group_total_len = 0; + grouo_rec_count = 0; + grouo_rec_cur_count = 0; + int maxidx = (state->max_block_id < 0 ? 1:state->max_block_id+1); + bool isDone = false; + for (block_id = 0; block_id < maxidx; block_id++) + { + XLogRecData* rdt; + uint32 total_len; + total_len = 0; + pg_crc32c rdata_crc = 0; + XLogRecData *rdt_datas_last; + char *scratch; + // char linkkey[36]; + groupRecData[grouo_rec_count].next = NULL; + rdt_datas_last = &groupRecData[grouo_rec_count]; + + scratch = hdr_scratch + grouo_rec_count * SINGLE_SCRATCH_SIZE; + groupRecData[grouo_rec_count].data = scratch; + /*group_total_len+=HEADER_SCRATCH_SIZE;*/ + grouphead[grouo_rec_count]=(XLogRecord *)scratch; + /* The record begins with the fixed-size header */ + rechdr = (XLogRecord *)scratch; + scratch += SizeOfXLogRecord; + if (state->max_block_id >= 0) { + DecodedBkpBlock *blkbuf = &state->blocks[block_id]; + if (!blkbuf->in_use) + continue; + XLogRecData* bkp_rdatas = g_bkp_rdatas[block_id]; + + XLogRecordBlockHeader bkpb; + XLogRecordBlockImageHeader bimg; + XLogRecordBlockCompressHeader cbimg = {0}; + bkpb.id = 0; + bkpb.fork_flags = blkbuf->flags; + bkpb.data_length = 
blkbuf->data_len; + //total_len += bkpb.data_length; + /* Ok, copy the header to the scratch buffer */ + memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); + scratch += SizeOfXLogRecordBlockHeader; + if (blkbuf->has_image) { + bimg.bimg_info = blkbuf->bimg_info; + bimg.hole_offset = blkbuf->hole_offset; + bimg.length = blkbuf->bimg_len; + memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); + scratch += SizeOfXLogRecordBlockImageHeader; + rdt_datas_last->next = &bkp_rdatas[0]; + rdt_datas_last = rdt_datas_last->next; + bkp_rdatas[0].data = blkbuf->bkp_image; + bkp_rdatas[0].len = blkbuf->bimg_len; + if (bimg.bimg_info & BKPIMAGE_IS_COMPRESSED) { + cbimg.hole_length = blkbuf->hole_length; + if (bimg.bimg_info & BKPIMAGE_HAS_HOLE) { + memcpy(scratch, &cbimg, + SizeOfXLogRecordBlockCompressHeader); + scratch += SizeOfXLogRecordBlockCompressHeader; + } + } + total_len += bimg.length; + *num_fpi += 1; + } + if (blkbuf->has_data) { + rdt_datas_last->next = &bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; + bkp_rdatas[1].data = blkbuf->data; + bkp_rdatas[1].len = blkbuf->data_len; + total_len += blkbuf->data_len; + } + memcpy(scratch, &blkbuf->rnode, sizeof(RelFileNode)); + scratch += sizeof(RelFileNode); + + memcpy(scratch, &blkbuf->blkno, sizeof(BlockNumber)); + scratch += sizeof(BlockNumber); + } + + if (state->record_origin != InvalidRepOriginId) { + *(scratch++) = (char)XLR_BLOCK_ID_ORIGIN; + memcpy(scratch, &state->record_origin, sizeof(RepOriginId)); + scratch += sizeof(RepOriginId); + } + + if (state->toplevel_xid != InvalidTransactionId) { + *(scratch++) = (char)XLR_BLOCK_ID_TOPLEVEL_XID; + memcpy(scratch, &state->toplevel_xid, sizeof(TransactionId)); + scratch += sizeof(TransactionId); + } + + if (state->main_data_len > 0) { + rdt_datas_last->next = &g_main_data; + rdt_datas_last = &g_main_data; + if (isDone == false) { + convertMainData(state,record); + g_main_data.data = state->main_data; + g_main_data.len = state->main_data_len; + isDone = 
true; + } + if (state->main_data_len > 255) { + *(scratch++) = (char)XLR_BLOCK_ID_DATA_LONG; + memcpy(scratch, &state->main_data_len, sizeof(uint32)); + scratch += sizeof(uint32); + } else { + *(scratch++) = (char)XLR_BLOCK_ID_DATA_SHORT; + *(scratch++) = (uint8)state->main_data_len; + } + total_len += state->main_data_len; + } + + rdt_datas_last->next = NULL; + groupRecData[grouo_rec_count].len = scratch - groupRecData[grouo_rec_count].data; + total_len += groupRecData[grouo_rec_count].len; + grouplens[grouo_rec_count] = total_len; + + /* + * Fill in the fields in the record header. Prev-link is filled in later, + * once we know where in the WAL the record will be inserted. The CRC does + * not include the record header yet. + */ + rechdr->xl_xid = record->xl_xid; + rechdr->xl_tot_len = total_len; + rechdr->xl_info = info; + rechdr->xl_rmid = rmid; + rechdr->xl_prev = InvalidXLogRecPtr; + rechdr->xl_crc = rdata_crc; + rechdr->blocknum = block_id; + rechdr->mtr = false; + + group_total_len += total_len; + grouo_rec_count++; + } + rechdr->mtr = true; + + return &groupRecData[0]; +} /* * Assemble a WAL record from the registered data and buffers into an * XLogRecData chain, ready for insertion with XLogInsertRecord(). 
@@ -598,8 +1036,10 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, pg_crc32c rdata_crc; XLogRecData *rdt_datas_last; char *scratch; + // char linkkey[36]; groupRecData[grouo_rec_count].next = NULL; rdt_datas_last = &groupRecData[grouo_rec_count]; + scratch = hdr_scratch + grouo_rec_count * SINGLE_SCRATCH_SIZE; groupRecData[grouo_rec_count].data = scratch; /*group_total_len+=HEADER_SCRATCH_SIZE;*/ @@ -609,6 +1049,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, scratch += SizeOfXLogRecord; if (max_registered_block_id != 0) { registered_buffer *regbuf = &registered_buffers[block_id]; + bool needs_backup; bool needs_data; XLogRecordBlockHeader bkpb; @@ -621,7 +1062,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if (!regbuf->in_use) continue; - /* + /* * Note: this function can be called multiple times for the same record. * All the modifications we do to the rdata chains below must handle that. */ @@ -823,7 +1264,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, /* followed by the record's origin, if any */ if ((curinsert_flags & XLOG_INCLUDE_ORIGIN) && - replorigin_session_origin != InvalidRepOriginId) + replorigin_session_origin != InvalidRepOriginId) //1 { *(scratch++) = (char)XLR_BLOCK_ID_ORIGIN; memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin)); @@ -831,7 +1272,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, } /* followed by toplevel XID, if not already included in previous record */ - if (IsSubTransactionAssignmentPending()) + if (IsSubTransactionAssignmentPending()) //2 { TransactionId xid = GetTopTransactionIdIfAny(); @@ -846,13 +1287,13 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, /* followed by main data, if any */ if (mainrdata_len > 0) { - if (mainrdata_len > 255) + if (mainrdata_len > 255) //3 { - *(scratch++) = (char)XLR_BLOCK_ID_DATA_LONG; + *(scratch++) = (char)XLR_BLOCK_ID_DATA_LONG; memcpy(scratch, &mainrdata_len, sizeof(uint32)); scratch += sizeof(uint32); } - else + else //4 { *(scratch++) = 
(char)XLR_BLOCK_ID_DATA_SHORT; *(scratch++) = (uint8)mainrdata_len; @@ -865,7 +1306,8 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rdt_datas_last->next = NULL; groupRecData[grouo_rec_count].len = scratch - groupRecData[grouo_rec_count].data; total_len += groupRecData[grouo_rec_count].len; - grouplens[grouo_rec_count]=MAXALIGN(total_len); + //grouplens[grouo_rec_count]=MAXALIGN(total_len); + grouplens[grouo_rec_count]=total_len; /* @@ -907,7 +1349,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rechdr->blocknum=block_id; rechdr->mtr = false; - group_total_len += MAXALIGN(total_len); + group_total_len += total_len; grouo_rec_count++; } rechdr->mtr = true; diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 0bd54c1..01817f2 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -20,18 +20,20 @@ #include <unistd.h> #include "access/transam.h" +#include "access/xlog.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "access/xlogrecord.h" #include "catalog/pg_control.h" #include "common/pg_lzcompress.h" #include "replication/origin.h" +#include "utils/hfs.h" #ifndef PG_NOREPLAY #include "access/pushpage.h" #include "utils/guc.h" +#include "access/pagehashqueue.h" #endif - #ifndef FRONTEND #include "miscadmin.h" #include "pgstat.h" @@ -43,15 +45,27 @@ static void report_invalid_record(XLogReaderState *state, const char *fmt,...) 
static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); static int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen); +static int He3DBReadWalInternal(XLogReaderState *state, XLogRecPtr startptr, + int reqLen); static void XLogReaderInvalReadState(XLogReaderState *state); + +static XLogRecPtr SplitXLogRecord(XLogReaderState *state, OldXLogRecord *record); + +static bool ValidOldXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, + XLogRecPtr PrevRecPtr, OldXLogRecord *record, bool randAccess); static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess); +static bool ValidOldXLogRecord(XLogReaderState *state, OldXLogRecord *record, + XLogRecPtr recptr); static bool ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr); static void ResetDecoder(XLogReaderState *state); static void WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, int segsize, const char *waldir); +static bool +DecodeOldXLogRecord(XLogReaderState *state, OldXLogRecord *record, char **errormsg); + /* size of the buffer allocated for error message. */ #define MAX_ERRORMSG_LEN 1000 @@ -100,7 +114,7 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir, * isn't guaranteed to have any particular alignment, whereas * palloc_extended() will provide MAXALIGN'd storage. */ - state->readBuf = (char *) palloc_extended(XLOG_BLCKSZ, + state->readBuf = (char *) palloc_extended(4 * XLOG_BLCKSZ, MCXT_ALLOC_NO_OOM); if (!state->readBuf) { @@ -109,8 +123,8 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir, } /* Initialize segment info. 
*/ - WALOpenSegmentInit(&state->seg, &state->segcxt, wal_segment_size, - waldir); + WALOpenSegmentInit(&state->seg, &state->segcxt, wal_segment_size, + waldir); /* system_identifier initialized to zeroes above */ state->private_data = private_data; @@ -138,6 +152,7 @@ XLogReaderAllocate(int wal_segment_size, const char *waldir, } state->tag = NULL; state->buffer = InvalidBuffer; + state->insertTikv = false; return state; } @@ -147,8 +162,8 @@ XLogReaderFree(XLogReaderState *state) { int block_id; - if (state->seg.ws_file != -1) - state->routine.segment_close(state); + // if (state->seg.ws_file != -1) + // state->routine.segment_close(state); for (block_id = 0; block_id <= XLR_MAX_BLOCK_ID; block_id++) { @@ -161,7 +176,9 @@ XLogReaderFree(XLogReaderState *state) pfree(state->errormsg_buf); if (state->readRecordBuf) pfree(state->readRecordBuf); - pfree(state->readBuf); + if (state->streamStart != true) { + pfree(state->readBuf); + } pfree(state); } @@ -219,18 +236,42 @@ allocate_recordbuf(XLogReaderState *state, uint32 reclength) /* * Initialize the passed segment structs. 
*/ -static void -WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, - int segsize, const char *waldir) -{ - seg->ws_file = -1; - seg->ws_segno = 0; - seg->ws_tli = 0; + static void + WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, + int segsize, const char *waldir) + { + seg->ws_file = -1; + seg->ws_segno = 0; + seg->ws_tli = 0; - segcxt->ws_segsize = segsize; - if (waldir) - snprintf(segcxt->ws_dir, MAXPGPATH, "%s", waldir); -} + segcxt->ws_segsize = segsize; + if (waldir) + snprintf(segcxt->ws_dir, MAXPGPATH, "%s", waldir); + } + +// static int +// xlogread(int64_t fd, void *buf, off_t offset, size_t size) +// { +// Bufrd result; +// size_t count; +// result = readfs(fd, offset, size); +// if (result.count <= 0) +// return (ssize_t)result.count; +// else if (result.count <= XLOG_BLCKSZ) +// { +// memcpy(buf, result.buf, result.count); +// count = result.count; +// } +// else +// { +// memcpy(buf, result.buf, XLOG_BLCKSZ); +// count = BLCKSZ; +// } + +// free_dataRead(result.buf, 1, 1); +// return (ssize_t)count; + +// } /* * Begin reading WAL at 'RecPtr'. @@ -246,7 +287,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + // Assert(!XLogRecPtrIsInvalid(RecPtr)); ResetDecoder(state); @@ -255,7 +296,386 @@ XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) state->ReadRecPtr = InvalidXLogRecPtr; } -extern XLogRecPtr GetFlushXlogPtr(); +extern XLogRecPtr GetFlushXlogPtr(void); + +#ifndef PG_NOREPLAY +static XLogRecPtr SplitXLogRecord(XLogReaderState *state, OldXLogRecord *record) { + XLogRecPtr EndPos = InvalidXLogRecPtr; + do + { + XLogRecPtr RedoRecPtr; + bool doPageWrites; + XLogRecPtr fpw_lsn; + XLogRecData *rdt; + int num_fpi = 0; + RelFileNode rnodes[XLR_MAX_BLOCK_ID + 1]; + BlockNumber blknos[XLR_MAX_BLOCK_ID + 1]; + + /* + * Get values needed to decide whether to do full-page writes. 
Since + * we don't yet have an insertion lock, these could change under us, + * but XLogInsertRecord will recheck them once it has a lock. + */ + GetFullPageWriteInfo(&RedoRecPtr, &doPageWrites); + + rdt = DecodeXLogRecordAssemble(state,record, RedoRecPtr, doPageWrites, + &fpw_lsn, &num_fpi); + if (state->insertTikv == true) { + report_invalid_record(state,"producerHe3dbXLog file startLSN: %X/%X endLSN: %X/%X",LSN_FORMAT_ARGS(state->ReadRecPtr),LSN_FORMAT_ARGS(state->EndRecPtr)); + EndPos = producerHe3dbXLog(rdt, fpw_lsn, 0, num_fpi,state->currRecPtr); + } else { + break; + } + } while (EndPos == InvalidXLogRecPtr); + + return EndPos; + +} + +XLogRecord * +StartupXLogReadRecord(XLogReaderState *state, char **errormsg) +{ + XLogRecPtr RecPtr; + OldXLogRecord *oldrecord; + XLogRecPtr targetPagePtr; + bool randAccess; + uint32 len, + total_len; + uint32 targetRecOff; + uint32 pageHeaderSize; + bool assembled; + bool gotheader; + int readOff; + + /* + * randAccess indicates whether to verify the previous-record pointer of + * the record we're reading. We only do this if we're reading + * sequentially, which is what we initially assume. + */ + randAccess = false; + + /* reset error state */ + *errormsg = NULL; + state->errormsg_buf[0] = '\0'; + + ResetDecoder(state); + state->abortedRecPtr = InvalidXLogRecPtr; + state->missingContrecPtr = InvalidXLogRecPtr; + + RecPtr = state->EndRecPtr; + + if (state->ReadRecPtr != InvalidXLogRecPtr) + { + /* read the record after the one we just read */ + + /* + * EndRecPtr is pointing to end+1 of the previous WAL record. If + * we're at a page boundary, no more records can fit on the current + * page. We must skip over the page header, but we can't do that until + * we've read in the page, since the header size is variable. + */ + } + else + { + /* + * Caller supplied a position to start at. + * + * In this case, EndRecPtr should already be pointing to a valid + * record starting position. 
+ */ + Assert(XRecOffIsValid(RecPtr)); + randAccess = true; + } + + restart: + + state->currRecPtr = RecPtr; + assembled = false; + + targetPagePtr = RecPtr - (RecPtr % XLOG_BLCKSZ); + targetRecOff = RecPtr % XLOG_BLCKSZ; + + /* + * Read the page containing the record into state->readBuf. Request enough + * byte to cover the whole record header, or at least the part of it that + * fits on the same page. + */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(targetRecOff + SizeOfOldXLogRecord, XLOG_BLCKSZ)); + + if (readOff < 0) + goto err; + + /* + * ReadPageInternal always returns at least the page header, so we can + * examine it now. + */ + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); + if (targetRecOff == 0) + { + /* + * At page start, so skip over page header. + */ + RecPtr += pageHeaderSize; + targetRecOff = pageHeaderSize; + } + else if (targetRecOff < pageHeaderSize) + { + report_invalid_record(state, "invalid record offset at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && + targetRecOff == pageHeaderSize) + { + report_invalid_record(state, "contrecord is requested by %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* ReadPageInternal has verified the page header */ + Assert(pageHeaderSize <= readOff); + + /* + * Read the record length. + * + * NB: Even though we use an XLogRecord pointer here, the whole record + * header might not fit on this page. xl_tot_len is the first field of the + * struct, so it must be on this page (the records are MAXALIGNed), but we + * cannot access any other fields until we've verified that we got the + * whole header. + */ + oldrecord = (OldXLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); + total_len = oldrecord->xl_tot_len; + + + /* + * If the whole record header is on this page, validate it immediately. 
+ * Otherwise do just a basic sanity check on xl_tot_len, and validate the + * rest of the header after reading it from the next page. The xl_tot_len + * check is necessary here to ensure that we enter the "Need to reassemble + * record" code path below; otherwise we might fail to apply + * ValidXLogRecordHeader at all. + */ + if (targetRecOff <= XLOG_BLCKSZ - SizeOfOldXLogRecord) + { + if (!ValidOldXLogRecordHeader(state, RecPtr, state->ReadRecPtr, oldrecord, + randAccess)) + goto err; + gotheader = true; + } + else + { + /* XXX: more validation should be done here */ + if (total_len < SizeOfOldXLogRecord) + { + report_invalid_record(state, + "StartupXLogReadRecord: invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfOldXLogRecord, total_len); + goto err; + } + gotheader = false; + } + + len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; + if (total_len > len) + { + /* Need to reassemble record */ + char *contdata; + XLogPageHeader pageHeader; + char *buffer; + uint32 gotlen; + + assembled = true; + + /* + * Enlarge readRecordBuf as needed. + */ + if (total_len > state->readRecordBufSize && + !allocate_recordbuf(state, total_len)) + { + /* We treat this as a "bogus data" condition */ + report_invalid_record(state, "record length %u at %X/%X too long", + total_len, LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* Copy the first fragment of the record from the first page. 
*/ + memcpy(state->readRecordBuf, + state->readBuf + RecPtr % XLOG_BLCKSZ, len); + buffer = state->readRecordBuf + len; + gotlen = len; + + do + { + /* Calculate pointer to beginning of next page */ + targetPagePtr += XLOG_BLCKSZ; + + /* Wait for the next page to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, + XLOG_BLCKSZ)); + if (readOff < 0) + goto err; + + Assert(SizeOfXLogShortPHD <= readOff); + + pageHeader = (XLogPageHeader) state->readBuf; + + /* + * If we were expecting a continuation record and got an + * "overwrite contrecord" flag, that means the continuation record + * was overwritten with a different record. Restart the read by + * assuming the address to read is the location where we found + * this flag; but keep track of the LSN of the record we were + * reading, for later verification. + */ + if (pageHeader->xlp_info & XLP_FIRST_IS_OVERWRITE_CONTRECORD) + { + state->overwrittenRecPtr = RecPtr; + ResetDecoder(state); + RecPtr = targetPagePtr; + goto restart; + } + + /* Check that the continuation on next page looks valid */ + if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) + { + report_invalid_record(state, + "there is no contrecord flag at %X/%X", + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* + * Cross-check that xlp_rem_len agrees with how much of the record + * we expect there to be left. 
+ */ + if (pageHeader->xlp_rem_len == 0 || + total_len != (pageHeader->xlp_rem_len + gotlen)) + { + report_invalid_record(state, + "invalid contrecord length %u (expected %lld) at %X/%X", + pageHeader->xlp_rem_len, + ((long long) total_len) - gotlen, + LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + /* Append the continuation from this page to the buffer */ + pageHeaderSize = XLogPageHeaderSize(pageHeader); + + if (readOff < pageHeaderSize) { + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize); + } + + Assert(pageHeaderSize <= readOff); + + contdata = (char *) state->readBuf + pageHeaderSize; + len = XLOG_BLCKSZ - pageHeaderSize; + if (pageHeader->xlp_rem_len < len) + len = pageHeader->xlp_rem_len; + + if (readOff < pageHeaderSize + len) { + readOff = ReadPageInternal(state, targetPagePtr, + pageHeaderSize + len); + } + memcpy(buffer, (char *) contdata, len); + buffer += len; + gotlen += len; + + /* If we just reassembled the record header, validate it. */ + if (!gotheader) + { + oldrecord = (OldXLogRecord *) state->readRecordBuf; + if (!ValidOldXLogRecordHeader(state, RecPtr, state->ReadRecPtr, + oldrecord, randAccess)) + goto err; + gotheader = true; + } + } while (gotlen < total_len); + + Assert(gotheader); + + oldrecord = (OldXLogRecord *) state->readRecordBuf; + if (!ValidOldXLogRecord(state, oldrecord, RecPtr)) + goto err; + + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); + state->ReadRecPtr = RecPtr; + state->EndRecPtr = targetPagePtr + pageHeaderSize + + MAXALIGN(pageHeader->xlp_rem_len); + } + else + { + /* Wait for the record data to become available */ + readOff = ReadPageInternal(state, targetPagePtr, + Min(targetRecOff + total_len, XLOG_BLCKSZ)); + if (readOff < 0) + goto err; + + /* Record does not cross a page boundary */ + if (!ValidOldXLogRecord(state, oldrecord, RecPtr)) + goto err; + + state->EndRecPtr = RecPtr + MAXALIGN(total_len); + + state->ReadRecPtr = RecPtr; + } + + /* + * Special processing if it's 
 an XLOG SWITCH record + */ + if (oldrecord->xl_rmid == RM_XLOG_ID && + (oldrecord->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) + { + /* Pretend it extends to end of segment */ + state->EndRecPtr += state->segcxt.ws_segsize - 1; + state->EndRecPtr -= XLogSegmentOffset(state->EndRecPtr, state->segcxt.ws_segsize); + } + + if (DecodeOldXLogRecord(state, oldrecord, errormsg)) { + SplitXLogRecord(state, oldrecord); + //Return this record head so the caller can check whether a checkpoint xlog record exists; the record head alone does not include the main data + return grouphead[grouo_rec_cur_count++]; + } else + return NULL; + + err: + if (assembled) + { + /* + * We get here when a record that spans multiple pages needs to be + * assembled, but something went wrong -- perhaps a contrecord piece + * was lost. If caller is WAL replay, it will know where the aborted + * record was and where to direct followup WAL to be written, marking + * the next piece with XLP_FIRST_IS_OVERWRITE_CONTRECORD, which will + * in turn signal downstream WAL consumers that the broken WAL record + * is to be ignored. + */ + state->abortedRecPtr = RecPtr; + state->missingContrecPtr = targetPagePtr; + } + + /* + * Invalidate the read state. We might read from a different source after + * failure. + */ + XLogReaderInvalReadState(state); + + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + if (state->localWalComplete == false) { + group_total_len = 0; + producerHe3dbXLog(NULL, 0, 0, 0, state->EndRecPtr); + state->localWalComplete = true; + } + return NULL; +} +#endif /* * Attempt to read an XLOG record. 
@@ -331,62 +751,37 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) restart: #ifndef PG_NOREPLAY if (IsBootstrapProcessingMode() != true && InitdbSingle!=true) { - if (push_standby == true && getpid() == startupPid) { - /* Update shared-memory status */ - if (PushPtr != PrePushPtr) { - PushCheckPointGuts(PushPtr,GlobalState); - int count = 0; - char buffer[20480]; - int len1 = snprintf(buffer,sizeof(buffer),"sadd boost_%ld_%ld ",PushPtr,PrePushPtr); - char oneV[128]; - int pos = len1; - while(!QueueEmpty() && QueueHeadEndLsn() <= PushPtr) { - QDataType data = QueuePop(); - if (data.startlsn != PrePushPtr||data.endlsn > PushPtr) { - printf("Error lsn =====================%ld,%ld,%ld,%ld\n",data.startlsn,PrePushPtr,data.endlsn,PushPtr); + if (getpid() == startupPid) { + if (push_standby == true) { + /* Update shared-memory status */ + XLogRecPtr prevPushPoint = PrePushPtr; + if (!XLogRecPtrIsInvalid(CheckPointPtr)) { + pushTikv(0,hashMapSize(),true); + PushCheckPointGuts(CheckPointPtr,GlobalState); + FlushNewRecoveryPoint(CheckPointPtr); + printf("curlsn==%x==prevPushPoint==%x==PushPoint==%x==last check point lsn====%x\n",RecPtr,prevPushPoint,PushPtr,CheckPointPtr); + CheckPointPtr = InvalidXLogRecPtr; + } + //need to tell push standby has new standby add + while(ApplyLsn != InvalidXLogRecPtr && RecPtr >= ApplyLsn) { + XLogRecPtr tmpLsn = InvalidXLogRecPtr; + if (!he3mirror) { + tmpLsn = QueryMinLsn(InvalidXLogRecPtr); } - count++; - int len2 = snprintf(oneV,sizeof(oneV),"%d_%d_%d_%d ",data.dbNode,data.relNode,data.forkNum,data.blockNum); - memcpy(buffer + pos,oneV,len2); - pos += len2; - if (count % 100 == 0) { - buffer[pos] = '\0'; - pushRedisList(buffer); - pos = len1; - } - } - if (count == 0) { - int len2 = snprintf(oneV,sizeof(oneV),"%s ","null"); - memcpy(buffer + pos,oneV,len2); - pos += len2; - } - if (count % 100 != 0 || count == 0) { - buffer[pos] = '\0'; - pushRedisList(buffer); - } - FlushNewRecoveryPoint(PushPtr); - printf("curlsn==%x==pre 
check point==%x==%ld==last check point lsn==%x==%ld\n",RecPtr,PrePushPtr,PrePushPtr,PushPtr,PushPtr); - PrePushPtr = PushPtr; - if (ApplyLsn < PushPtr) { - ApplyLsn = PushPtr; - } - } - //need to tell push standby has new standby add - while(ApplyLsn != InvalidXLogRecPtr && RecPtr >= ApplyLsn) { - //XLogRecPtr lsn = GetFlushXlogPtr(); - XLogRecPtr tmpLsn = QueryMinLsn(InvalidXLogRecPtr); - if (tmpLsn !=InvalidXLogRecPtr) { - if (tmpLsn <= RecPtr) { - sleep(3); - continue; + if (tmpLsn !=InvalidXLogRecPtr) { + pushTikv(0,hashMapSize(),true); + if (tmpLsn <= RecPtr) { + pg_usleep(10000); + continue; + } else { + ApplyLsn = tmpLsn; + } } else { - ApplyLsn = tmpLsn; + ApplyLsn = InvalidXLogRecPtr; } - } else { - ApplyLsn = InvalidXLogRecPtr; + break; } - break; - } + } } } #endif @@ -402,7 +797,7 @@ restart: * byte to cover the whole record header, or at least the part of it that * fits on the same page. */ - readOff = ReadPageInternal(state, targetPagePtr, + readOff = He3DBReadWalInternal(state, targetPagePtr, Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); if (readOff < 0) goto err; @@ -471,7 +866,7 @@ restart: if (total_len < SizeOfXLogRecord) { report_invalid_record(state, - "invalid record length at %X/%X: wanted %u, got %u", + "XLogReadRecord: invalid record length at %X/%X: wanted %u, got %u", LSN_FORMAT_ARGS(RecPtr), (uint32) SizeOfXLogRecord, total_len); goto err; @@ -514,7 +909,7 @@ restart: targetPagePtr += XLOG_BLCKSZ; /* Wait for the next page to become available */ - readOff = ReadPageInternal(state, targetPagePtr, + readOff = He3DBReadWalInternal(state, targetPagePtr, Min(total_len - gotlen + SizeOfXLogShortPHD, XLOG_BLCKSZ)); @@ -569,7 +964,7 @@ restart: pageHeaderSize = XLogPageHeaderSize(pageHeader); if (readOff < pageHeaderSize) - readOff = ReadPageInternal(state, targetPagePtr, + readOff = He3DBReadWalInternal(state, targetPagePtr, pageHeaderSize); Assert(pageHeaderSize <= readOff); @@ -580,7 +975,7 @@ restart: len = pageHeader->xlp_rem_len; if 
(readOff < pageHeaderSize + len) - readOff = ReadPageInternal(state, targetPagePtr, + readOff = He3DBReadWalInternal(state, targetPagePtr, pageHeaderSize + len); memcpy(buffer, (char *) contdata, len); @@ -612,7 +1007,7 @@ restart: else { /* Wait for the record data to become available */ - readOff = ReadPageInternal(state, targetPagePtr, + readOff = He3DBReadWalInternal(state, targetPagePtr, Min(targetRecOff + total_len, XLOG_BLCKSZ)); if (readOff < 0) goto err; @@ -629,13 +1024,13 @@ restart: /* * Special processing if it's an XLOG SWITCH record */ - if (record->xl_rmid == RM_XLOG_ID && - (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) - { - /* Pretend it extends to end of segment */ - state->EndRecPtr += state->segcxt.ws_segsize - 1; - state->EndRecPtr -= XLogSegmentOffset(state->EndRecPtr, state->segcxt.ws_segsize); - } + // if (record->xl_rmid == RM_XLOG_ID && + // (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) + // { + // /* Pretend it extends to end of segment */ + // state->EndRecPtr += state->segcxt.ws_segsize - 1; + // state->EndRecPtr -= XLogSegmentOffset(state->EndRecPtr, state->segcxt.ws_segsize); + // } if (DecodeXLogRecord(state, record, errormsg)) return record; @@ -670,6 +1065,173 @@ err: return NULL; } +XLogRecord * +He3DBXLogReadRecord(XLogReaderState *state, char **errormsg) +{ + XLogRecPtr RecPtr; + XLogRecord *record; + uint32 total_len; + int readOff; + + /* + * randAccess indicates whether to verify the previous-record pointer of + * the record we're reading. We only do this if we're reading + * sequentially, which is what we initially assume. 
+ */ + + /* reset error state */ + *errormsg = NULL; + state->errormsg_buf[0] = '\0'; + + ResetDecoder(state); + state->abortedRecPtr = InvalidXLogRecPtr; + state->missingContrecPtr = InvalidXLogRecPtr; + + RecPtr = state->EndRecPtr; + + // if (state->ReadRecPtr != InvalidXLogRecPtr) + // { + // /* read the record after the one we just read */ + + // /* + // * EndRecPtr is pointing to end+1 of the previous WAL record. If + // * we're at a page boundary, no more records can fit on the current + // * page. We must skip over the page header, but we can't do that until + // * we've read in the page, since the header size is variable. + // */ + // } + // else + // { + // /* + // * Caller supplied a position to start at. + // * + // * In this case, EndRecPtr should already be pointing to a valid + // * record starting position. + // */ + // Assert(XRecOffIsValid(RecPtr)); + // } + +restart: +#ifndef PG_NOREPLAY + if (IsBootstrapProcessingMode() != true && InitdbSingle!=true) { + if (getpid() == startupPid) { + if (push_standby == true) { + // Update shared-memory status + XLogRecPtr prevPushPoint = PrePushPtr; + if (!XLogRecPtrIsInvalid(CheckPointPtr)) { + pushTikv(0,hashMapSize(),true); + PushCheckPointGuts(CheckPointPtr,GlobalState); + FlushNewRecoveryPoint(CheckPointPtr); + printf("curlsn==%x==prevPushPoint==%x==PushPoint==%x==last check point lsn====%x\n",RecPtr,prevPushPoint,PushPtr,CheckPointPtr); + CheckPointPtr = InvalidXLogRecPtr; + } + //need to tell push standby has new standby add + while(ApplyLsn != InvalidXLogRecPtr && RecPtr >= ApplyLsn) { + XLogRecPtr tmpLsn = InvalidXLogRecPtr; + if (!he3mirror) { + tmpLsn = QueryMinLsn(InvalidXLogRecPtr); + } + if (tmpLsn !=InvalidXLogRecPtr) { + pushTikv(0,hashMapSize(),true); + if (tmpLsn <= RecPtr) { + pg_usleep(10000); + continue; + } else { + ApplyLsn = tmpLsn; + } + } else { + ApplyLsn = InvalidXLogRecPtr; + } + break; + } + } + } + } +#endif + + state->currRecPtr = RecPtr; + + // targetPagePtr = RecPtr - (RecPtr 
% XLOG_BLCKSZ); + // targetRecOff = RecPtr % XLOG_BLCKSZ; + + /* + * Read the page containing the record into state->readBuf. Request enough + * byte to cover the whole record header, or at least the part of it that + * fits on the same page. + */ + + if (state->bufoff >= state->readLen) + { + readOff = He3DBReadWalInternal(state, RecPtr, SizeOfXLogRecord); + if (readOff < 0) + goto err; + state->bufoff = 0; + } + + /* + * Read the record length. + * + * NB: Even though we use an XLogRecord pointer here, the whole record + * header might not fit on this page. xl_tot_len is the first field of the + * struct, so it must be on this page (the records are MAXALIGNed), but we + * cannot access any other fields until we've verified that we got the + * whole header. + */ + record = (XLogRecord *) (state->readBuf + state->bufoff); + total_len = record->xl_tot_len; + state->bufoff += total_len; + + /* + * If the whole record header is on this page, validate it immediately. + * Otherwise do just a basic sanity check on xl_tot_len, and validate the + * rest of the header after reading it from the next page. The xl_tot_len + * check is necessary here to ensure that we enter the "Need to reassemble + * record" code path below; otherwise we might fail to apply + * ValidXLogRecordHeader at all. 
+ */ + if (total_len < SizeOfXLogRecord) + { + report_invalid_record(state, + "He3DBXLogReadRecord: invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + goto err; + } + + + state->EndRecPtr = record->xl_end; + state->ReadRecPtr = record->xl_end - total_len; + + /* + * Special processing if it's an XLOG SWITCH record + */ + // if (record->xl_rmid == RM_XLOG_ID && + // (record->xl_info & ~XLR_INFO_MASK) == XLOG_SWITCH) + // { + // /* Pretend it extends to end of segment */ + // state->EndRecPtr += state->segcxt.ws_segsize - 1; + // state->EndRecPtr -= XLogSegmentOffset(state->EndRecPtr, state->segcxt.ws_segsize); + // } + + if (DecodeXLogRecord(state, record, errormsg)) + return record; + else + return NULL; + +err: + /* + * Invalidate the read state. We might read from a different source after + * failure. + */ + XLogReaderInvalReadState(state); + + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + + return NULL; +} + + /* * He3DBXLogListReadRecord * @@ -707,93 +1269,122 @@ He3DBXLogListReadRecord(XLogReaderState *state, char **errormsg, char *pageXlogB * We fetch the page from a reader-local cache if we know we have the required * data and if there hasn't been any error since caching the data. */ + static int + ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) + { + int readLen; + uint32 targetPageOff; + XLogSegNo targetSegNo; + XLogPageHeader hdr; + + Assert((pageptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(pageptr, targetSegNo, state->segcxt.ws_segsize); + targetPageOff = XLogSegmentOffset(pageptr, state->segcxt.ws_segsize); + + /* check whether we have all the requested data already */ + if (targetSegNo == state->seg.ws_segno && + targetPageOff == state->segoff && reqLen <= state->readLen) + return state->readLen; + + /* + * Data is not in our buffer. 
+ * + * Every time we actually read the segment, even if we looked at parts of + * it before, we need to do verification as the page_read callback might + * now be rereading data from a different source. + * + * Whenever switching to a new WAL segment, we read the first page of the + * file and validate its header, even if that's not where the target + * record is. This is so that we can check the additional identification + * info that is present in the first page's "long" header. + */ + if (targetSegNo != state->seg.ws_segno && targetPageOff != 0) + { + XLogRecPtr targetSegmentPtr = pageptr - targetPageOff; + + readLen = state->routine.page_read(state, targetSegmentPtr, XLOG_BLCKSZ, + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + + /* we can be sure to have enough WAL available, we scrolled back */ + Assert(readLen == XLOG_BLCKSZ); + + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, + state->readBuf)) + goto err; + } + + /* + * First, read the requested data length, but at least a short page header + * so that we can validate it. + */ + readLen = state->routine.page_read(state, pageptr, Max(reqLen, SizeOfXLogShortPHD), + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + + Assert(readLen <= XLOG_BLCKSZ); + + /* Do we have enough data to check the header length? */ + if (readLen <= SizeOfXLogShortPHD) + goto err; + + Assert(readLen >= reqLen); + + hdr = (XLogPageHeader) state->readBuf; + + /* still not enough */ + if (readLen < XLogPageHeaderSize(hdr)) + { + readLen = state->routine.page_read(state, pageptr, XLogPageHeaderSize(hdr), + state->currRecPtr, + state->readBuf); + if (readLen < 0) + goto err; + } + + /* + * Now that we know we have the full header, validate it. 
+ */ + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + goto err; + + /* update read state information */ + state->seg.ws_segno = targetSegNo; + state->segoff = targetPageOff; + state->readLen = readLen; + + return readLen; + + err: + XLogReaderInvalReadState(state); + return -1; + } + static int -ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) +He3DBReadWalInternal(XLogReaderState *state, XLogRecPtr startptr, int reqLen) { int readLen; - uint32 targetPageOff; - XLogSegNo targetSegNo; - XLogPageHeader hdr; - Assert((pageptr % XLOG_BLCKSZ) == 0); - XLByteToSeg(pageptr, targetSegNo, state->segcxt.ws_segsize); - targetPageOff = XLogSegmentOffset(pageptr, state->segcxt.ws_segsize); - - /* check whether we have all the requested data already */ - if (targetSegNo == state->seg.ws_segno && - targetPageOff == state->segoff && reqLen <= state->readLen) - return state->readLen; - - /* - * Data is not in our buffer. - * - * Every time we actually read the segment, even if we looked at parts of - * it before, we need to do verification as the page_read callback might - * now be rereading data from a different source. - * - * Whenever switching to a new WAL segment, we read the first page of the - * file and validate its header, even if that's not where the target - * record is. This is so that we can check the additional identification - * info that is present in the first page's "long" header. 
- */ - if (targetSegNo != state->seg.ws_segno && targetPageOff != 0) - { - XLogRecPtr targetSegmentPtr = pageptr - targetPageOff; - - readLen = state->routine.page_read(state, targetSegmentPtr, XLOG_BLCKSZ, - state->currRecPtr, - state->readBuf); - if (readLen < 0) - goto err; - - /* we can be sure to have enough WAL available, we scrolled back */ - Assert(readLen == XLOG_BLCKSZ); - - if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, - state->readBuf)) - goto err; - } /* * First, read the requested data length, but at least a short page header * so that we can validate it. */ - readLen = state->routine.page_read(state, pageptr, Max(reqLen, SizeOfXLogShortPHD), - state->currRecPtr, + readLen = state->routine.batch_read(state, startptr, reqLen, state->readBuf); if (readLen < 0) goto err; - Assert(readLen <= XLOG_BLCKSZ); + // Assert(readLen <= XLOG_BLCKSZ); - /* Do we have enough data to check the header length? */ - if (readLen <= SizeOfXLogShortPHD) - goto err; - - Assert(readLen >= reqLen); - - hdr = (XLogPageHeader) state->readBuf; - - /* still not enough */ - if (readLen < XLogPageHeaderSize(hdr)) - { - readLen = state->routine.page_read(state, pageptr, XLogPageHeaderSize(hdr), - state->currRecPtr, - state->readBuf); - if (readLen < 0) - goto err; - } - - /* - * Now that we know we have the full header, validate it. - */ - if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) - goto err; /* update read state information */ - state->seg.ws_segno = targetSegNo; - state->segoff = targetPageOff; state->readLen = readLen; return readLen; @@ -803,17 +1394,80 @@ err: return -1; } + /* * Invalidate the xlogreader's read state to force a re-read. 
*/ static void XLogReaderInvalReadState(XLogReaderState *state) { - state->seg.ws_segno = 0; - state->segoff = 0; +#ifndef PG_NOREPLAY + if (he3mirror){ + state->seg.ws_segno = 0; + state->segoff = 0; + } +#endif + // state->seg.ws_segno = 0; + // state->segoff = 0; state->readLen = 0; } + +static bool +ValidOldXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, + XLogRecPtr PrevRecPtr, OldXLogRecord *record, + bool randAccess) +{ + if (record->xl_tot_len < SizeOfOldXLogRecord) + { + report_invalid_record(state, + "ValidOldXLogRecordHeader: invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfOldXLogRecord, record->xl_tot_len); + return false; + } + if (record->xl_rmid > RM_MAX_ID) + { + report_invalid_record(state, + "invalid resource manager ID %u at %X/%X", + record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); + return false; + } + if (randAccess) + { + /* + * We can't exactly verify the prev-link, but surely it should be less + * than the record's own address. + */ + if (!(record->xl_prev < RecPtr)) + { + report_invalid_record(state, + "record with incorrect prev-link %X/%X at %X/%X", + LSN_FORMAT_ARGS(record->xl_prev), + LSN_FORMAT_ARGS(RecPtr)); + return false; + } + } + else + { + /* + * Record's prev-link should exactly match our previous location. This + * check guards against torn WAL pages where a stale but valid-looking + * WAL record starts on a sector boundary. + */ + if (record->xl_prev != PrevRecPtr) + { + report_invalid_record(state, + "record with incorrect prev-link %X/%X at %X/%X", + LSN_FORMAT_ARGS(record->xl_prev), + LSN_FORMAT_ARGS(RecPtr)); + return false; + } + } + + return true; +} + /* * Validate an XLOG record header. 
* @@ -828,7 +1482,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (record->xl_tot_len < SizeOfXLogRecord) { report_invalid_record(state, - "invalid record length at %X/%X: wanted %u, got %u", + "ValidXLogRecordHeader: invalid record length at %X/%X: wanted %u, got %u", LSN_FORMAT_ARGS(RecPtr), (uint32) SizeOfXLogRecord, record->xl_tot_len); return false; @@ -836,7 +1490,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, if (record->xl_rmid > RM_MAX_ID) { report_invalid_record(state, - "invalid resource manager ID %u at %X/%X", + "ValidXLogRecordHeader: invalid resource manager ID %u at %X/%X", record->xl_rmid, LSN_FORMAT_ARGS(RecPtr)); return false; } @@ -875,6 +1529,29 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, return true; } +static bool +ValidOldXLogRecord(XLogReaderState *state, OldXLogRecord *record, XLogRecPtr recptr) +{ + pg_crc32c crc; + + /* Calculate the CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, ((char *) record) + SizeOfOldXLogRecord, record->xl_tot_len - SizeOfOldXLogRecord); + /* include the record header last */ + COMP_CRC32C(crc, (char *) record, offsetof(OldXLogRecord, xl_crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(record->xl_crc, crc)) + { + report_invalid_record(state, + "incorrect resource manager data checksum in record at %X/%X===%d=====%d", + LSN_FORMAT_ARGS(recptr),record->xl_crc,crc); + return false; + } + + return true; +} + /* * CRC-check an XLOG record. 
We do not believe the contents of an XLOG @@ -909,147 +1586,167 @@ ValidXLogRecord(XLogReaderState *state, XLogRecord *record, XLogRecPtr recptr) return true; } +bool +He3DBValidXLogRecord( XLogRecord *record) +{ + pg_crc32c crc; + + /* Calculate the CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord); + /* include the record header last */ + COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(record->xl_crc, crc)) + { + return false; + } + + return true; +} + /* * Validate a page header. * * Check if 'phdr' is valid as the header of the XLog page at position * 'recptr'. */ -bool -XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, - char *phdr) -{ - XLogRecPtr recaddr; - XLogSegNo segno; - int32 offset; - XLogPageHeader hdr = (XLogPageHeader) phdr; + bool + XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, + char *phdr) + { + XLogRecPtr recaddr; + XLogSegNo segno; + int32 offset; + XLogPageHeader hdr = (XLogPageHeader) phdr; - Assert((recptr % XLOG_BLCKSZ) == 0); + Assert((recptr % XLOG_BLCKSZ) == 0); - XLByteToSeg(recptr, segno, state->segcxt.ws_segsize); - offset = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + XLByteToSeg(recptr, segno, state->segcxt.ws_segsize); + offset = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); - XLogSegNoOffsetToRecPtr(segno, offset, state->segcxt.ws_segsize, recaddr); + XLogSegNoOffsetToRecPtr(segno, offset, state->segcxt.ws_segsize, recaddr); - if (hdr->xlp_magic != XLOG_PAGE_MAGIC) - { - char fname[MAXFNAMELEN]; + if (hdr->xlp_magic != XLOG_PAGE_MAGIC) + { + char fname[MAXFNAMELEN]; - XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); - report_invalid_record(state, - "invalid magic number %04X in log segment %s, offset %u", - hdr->xlp_magic, - fname, - offset); 
- return false; - } + report_invalid_record(state, + "invalid magic number %04X in log segment %s, offset %u", + hdr->xlp_magic, + fname, + offset); + return false; + } - if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0) - { - char fname[MAXFNAMELEN]; + if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0) + { + char fname[MAXFNAMELEN]; - XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); - report_invalid_record(state, - "invalid info bits %04X in log segment %s, offset %u", - hdr->xlp_info, - fname, - offset); - return false; - } + report_invalid_record(state, + "invalid info bits %04X in log segment %s, offset %u", + hdr->xlp_info, + fname, + offset); + return false; + } - if (hdr->xlp_info & XLP_LONG_HEADER) - { - XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr; + if (hdr->xlp_info & XLP_LONG_HEADER) + { + XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr; - if (state->system_identifier && - longhdr->xlp_sysid != state->system_identifier) - { - report_invalid_record(state, - "WAL file is from different database system: WAL file database system identifier is %llu, pg_control database system identifier is %llu", - (unsigned long long) longhdr->xlp_sysid, - (unsigned long long) state->system_identifier); - return false; - } - else if (longhdr->xlp_seg_size != state->segcxt.ws_segsize) - { - report_invalid_record(state, - "WAL file is from different database system: incorrect segment size in page header"); - return false; - } - else if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ) - { - report_invalid_record(state, - "WAL file is from different database system: incorrect XLOG_BLCKSZ in page header"); - return false; - } - } - else if (offset == 0) - { - char fname[MAXFNAMELEN]; + if (state->system_identifier && + longhdr->xlp_sysid != state->system_identifier) + { + report_invalid_record(state, + "WAL file is from different database system: WAL file database system identifier is 
%llu, pg_control database system identifier is %llu", + (unsigned long long) longhdr->xlp_sysid, + (unsigned long long) state->system_identifier); + return false; + } + else if (longhdr->xlp_seg_size != state->segcxt.ws_segsize) + { + report_invalid_record(state, + "WAL file is from different database system: incorrect segment size in page header"); + return false; + } + else if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ) + { + report_invalid_record(state, + "WAL file is from different database system: incorrect XLOG_BLCKSZ in page header"); + return false; + } + } + else if (offset == 0) + { + char fname[MAXFNAMELEN]; - XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); - /* hmm, first page of file doesn't have a long header? */ - report_invalid_record(state, - "invalid info bits %04X in log segment %s, offset %u", - hdr->xlp_info, - fname, - offset); - return false; - } + /* hmm, first page of file doesn't have a long header? */ + report_invalid_record(state, + "invalid info bits %04X in log segment %s, offset %u", + hdr->xlp_info, + fname, + offset); + return false; + } - /* - * Check that the address on the page agrees with what we expected. This - * check typically fails when an old WAL segment is recycled, and hasn't - * yet been overwritten with new data yet. - */ - if (hdr->xlp_pageaddr != recaddr) - { - char fname[MAXFNAMELEN]; + /* + * Check that the address on the page agrees with what we expected. This + * check typically fails when an old WAL segment is recycled, and hasn't + * yet been overwritten with new data yet. 
+ */ + if (hdr->xlp_pageaddr != recaddr) + { + char fname[MAXFNAMELEN]; - XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); - report_invalid_record(state, - "unexpected pageaddr %X/%X in log segment %s, offset %u", - LSN_FORMAT_ARGS(hdr->xlp_pageaddr), - fname, - offset); - return false; - } + report_invalid_record(state, + "unexpected pageaddr %X/%X in log segment %s, offset %u", + LSN_FORMAT_ARGS(hdr->xlp_pageaddr), + fname, + offset); + return false; + } - /* - * Since child timelines are always assigned a TLI greater than their - * immediate parent's TLI, we should never see TLI go backwards across - * successive pages of a consistent WAL sequence. - * - * Sometimes we re-read a segment that's already been (partially) read. So - * we only verify TLIs for pages that are later than the last remembered - * LSN. - */ - if (recptr > state->latestPagePtr) - { - if (hdr->xlp_tli < state->latestPageTLI) - { - char fname[MAXFNAMELEN]; + /* + * Since child timelines are always assigned a TLI greater than their + * immediate parent's TLI, we should never see TLI go backwards across + * successive pages of a consistent WAL sequence. + * + * Sometimes we re-read a segment that's already been (partially) read. So + * we only verify TLIs for pages that are later than the last remembered + * LSN. 
+ */ + if (recptr > state->latestPagePtr) + { + if (hdr->xlp_tli < state->latestPageTLI) + { + char fname[MAXFNAMELEN]; - XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); + XLogFileName(fname, state->seg.ws_tli, segno, state->segcxt.ws_segsize); - report_invalid_record(state, - "out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u", - hdr->xlp_tli, - state->latestPageTLI, - fname, - offset); - return false; - } - } - state->latestPagePtr = recptr; - state->latestPageTLI = hdr->xlp_tli; + report_invalid_record(state, + "out-of-sequence timeline ID %u (after %u) in log segment %s, offset %u", + hdr->xlp_tli, + state->latestPageTLI, + fname, + offset); + return false; + } + } + state->latestPagePtr = recptr; + state->latestPageTLI = hdr->xlp_tli; - return true; -} + return true; + } #ifdef FRONTEND /* @@ -1072,92 +1769,22 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr) { - XLogRecPtr tmpRecPtr; XLogRecPtr found = InvalidXLogRecPtr; - XLogPageHeader header; char *errormsg; + int readLen; - Assert(!XLogRecPtrIsInvalid(RecPtr)); - /* - * skip over potential continuation data, keeping in mind that it may span - * multiple pages - */ - tmpRecPtr = RecPtr; - while (true) - { - XLogRecPtr targetPagePtr; - int targetRecOff; - uint32 pageHeaderSize; - int readLen; - - /* - * Compute targetRecOff. It should typically be equal or greater than - * short page-header since a valid record can't start anywhere before - * that, except when caller has explicitly specified the offset that - * falls somewhere there or when we are skipping multi-page - * continuation record. 
It doesn't matter though because - * ReadPageInternal() is prepared to handle that and will read at - * least short page-header worth of data - */ - targetRecOff = tmpRecPtr % XLOG_BLCKSZ; - - /* scroll back to page boundary */ - targetPagePtr = tmpRecPtr - targetRecOff; - - /* Read the page containing the record */ - readLen = ReadPageInternal(state, targetPagePtr, targetRecOff); - if (readLen < 0) - goto err; - - header = (XLogPageHeader) state->readBuf; - - pageHeaderSize = XLogPageHeaderSize(header); - - /* make sure we have enough data for the page header */ - readLen = ReadPageInternal(state, targetPagePtr, pageHeaderSize); - if (readLen < 0) - goto err; - - /* skip over potential continuation data */ - if (header->xlp_info & XLP_FIRST_IS_CONTRECORD) - { - /* - * If the length of the remaining continuation data is more than - * what can fit in this page, the continuation record crosses over - * this page. Read the next page and try again. xlp_rem_len in the - * next page header will contain the remaining length of the - * continuation data - * - * Note that record headers are MAXALIGN'ed - */ - if (MAXALIGN(header->xlp_rem_len) >= (XLOG_BLCKSZ - pageHeaderSize)) - tmpRecPtr = targetPagePtr + XLOG_BLCKSZ; - else - { - /* - * The previous continuation record ends in this page. Set - * tmpRecPtr to point to the first valid record - */ - tmpRecPtr = targetPagePtr + pageHeaderSize - + MAXALIGN(header->xlp_rem_len); - break; - } - } - else - { - tmpRecPtr = targetPagePtr + pageHeaderSize; - break; - } - } + readLen = He3DBReadWalInternal(state, RecPtr, SizeOfXLogRecord); + if (readLen < 0) + goto err; /* * we know now that tmpRecPtr is an address pointing to a valid XLogRecord * because either we're at the first record after the beginning of a page * or we just jumped over the remaining data of a continuation. 
*/ - XLogBeginRead(state, tmpRecPtr); - while (XLogReadRecord(state, &errormsg) != NULL) + XLogBeginRead(state, RecPtr); + while (He3DBXLogReadRecord(state, &errormsg) != NULL) { /* past the record we've found, break out */ if (RecPtr <= state->ReadRecPtr) @@ -1191,87 +1818,108 @@ err: * XXX probably this should be improved to suck data directly from the * WAL buffers when possible. */ -bool -WALRead(XLogReaderState *state, - char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, - WALReadError *errinfo) +// bool +// WALRead(XLogReaderState *state, +// char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, +// WALReadError *errinfo) +// { +// char *p; +// XLogRecPtr recptr; +// Size nbytes; + +// p = buf; +// recptr = startptr; +// nbytes = count; + +// while (nbytes > 0) +// { +// uint32 startoff; +// int segbytes; +// int readbytes; + +// startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); + +// /* +// * If the data we want is not in a segment we have open, close what we +// * have (if anything) and open the next one, using the caller's +// * provided openSegment callback. +// */ +// if (state->seg.ws_file < 0 || +// !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || +// tli != state->seg.ws_tli) +// { +// XLogSegNo nextSegNo; + +// if (state->seg.ws_file >= 0) +// state->routine.segment_close(state); + +// XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); +// state->routine.segment_open(state, nextSegNo, &tli); + +// /* This shouldn't happen -- indicates a bug in segment_open */ +// Assert(state->seg.ws_file >= 0); + +// /* Update the current segment info. */ +// state->seg.ws_tli = tli; +// state->seg.ws_segno = nextSegNo; +// } + +// /* How many bytes are within this segment? 
*/ +// if (nbytes > (state->segcxt.ws_segsize - startoff)) +// segbytes = state->segcxt.ws_segsize - startoff; +// else +// segbytes = nbytes; + +// #ifndef FRONTEND +// pgstat_report_wait_start(WAIT_EVENT_WAL_READ); +// #endif + +// /* Reset errno first; eases reporting non-errno-affecting errors */ +// errno = 0; +// readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + +// #ifndef FRONTEND +// pgstat_report_wait_end(); +// #endif + +// if (readbytes <= 0) +// { +// errinfo->wre_errno = errno; +// errinfo->wre_req = segbytes; +// errinfo->wre_read = readbytes; +// errinfo->wre_off = startoff; +// errinfo->wre_seg = state->seg; +// return false; +// } + +// /* Update state for read */ +// recptr += readbytes; +// nbytes -= readbytes; +// p += readbytes; +// } + +// return true; +// } + +int +He3DBWALRead(XLogReaderState *state, + XLogRecPtr startptr, int count, char *buf) { - char *p; - XLogRecPtr recptr; - Size nbytes; - - p = buf; - recptr = startptr; - nbytes = count; - - while (nbytes > 0) - { - uint32 startoff; - int segbytes; - int readbytes; - - startoff = XLogSegmentOffset(recptr, state->segcxt.ws_segsize); - - /* - * If the data we want is not in a segment we have open, close what we - * have (if anything) and open the next one, using the caller's - * provided openSegment callback. - */ - if (state->seg.ws_file < 0 || - !XLByteInSeg(recptr, state->seg.ws_segno, state->segcxt.ws_segsize) || - tli != state->seg.ws_tli) - { - XLogSegNo nextSegNo; - - if (state->seg.ws_file >= 0) - state->routine.segment_close(state); - - XLByteToSeg(recptr, nextSegNo, state->segcxt.ws_segsize); - state->routine.segment_open(state, nextSegNo, &tli); - - /* This shouldn't happen -- indicates a bug in segment_open */ - Assert(state->seg.ws_file >= 0); - - /* Update the current segment info. */ - state->seg.ws_tli = tli; - state->seg.ws_segno = nextSegNo; - } - - /* How many bytes are within this segment? 
*/ - if (nbytes > (state->segcxt.ws_segsize - startoff)) - segbytes = state->segcxt.ws_segsize - startoff; - else - segbytes = nbytes; + int nbytes; #ifndef FRONTEND pgstat_report_wait_start(WAIT_EVENT_WAL_READ); #endif - /* Reset errno first; eases reporting non-errno-affecting errors */ - errno = 0; - readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) startoff); + nbytes = batchReadForTools((uint8_t *) buf, state->currTLI, startptr, startptr+16384, false); #ifndef FRONTEND pgstat_report_wait_end(); #endif - if (readbytes <= 0) - { - errinfo->wre_errno = errno; - errinfo->wre_req = segbytes; - errinfo->wre_read = readbytes; - errinfo->wre_off = startoff; - errinfo->wre_seg = state->seg; - return false; - } - - /* Update state for read */ - recptr += readbytes; - nbytes -= readbytes; - p += readbytes; - } - - return true; + if (nbytes < count) + return -1; + return nbytes; } /* ---------------------------------------- @@ -1299,6 +1947,322 @@ ResetDecoder(XLogReaderState *state) state->max_block_id = -1; } +static bool +DecodeOldXLogRecord(XLogReaderState *state, OldXLogRecord *record, char **errormsg) +{ + /* + * read next _size bytes from record buffer, but check for overrun first. 
+ */ +#define COPY_HEADER_FIELD(_dst, _size) \ + do { \ + if (remaining < _size) \ + goto shortdata_err; \ + memcpy(_dst, ptr, _size); \ + ptr += _size; \ + remaining -= _size; \ + } while(0) + + char *ptr; + uint32 remaining; + uint32 datatotal; + RelFileNode *rnode = NULL; + uint8 block_id; + static int lastMaxIdx = XLR_MAX_BLOCK_ID; + state->max_block_id = lastMaxIdx; + ResetDecoder(state); + + state->decoded_record = NULL; + state->record_origin = InvalidRepOriginId; + state->toplevel_xid = InvalidTransactionId; + + ptr = (char *) record; + ptr += SizeOfOldXLogRecord; + + if (record->xl_tot_len < SizeOfOldXLogRecord) { + return false; + } + remaining = record->xl_tot_len - SizeOfOldXLogRecord; + + /* Decode the headers */ + datatotal = 0; + while (remaining > datatotal) + { + COPY_HEADER_FIELD(&block_id, sizeof(uint8)); + + if (block_id == XLR_BLOCK_ID_DATA_SHORT) + { + /* XLogRecordDataHeaderShort */ + uint8 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint8)); + + state->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_DATA_LONG) + { + /* XLogRecordDataHeaderLong */ + uint32 main_data_len; + + COPY_HEADER_FIELD(&main_data_len, sizeof(uint32)); + state->main_data_len = main_data_len; + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } + else if (block_id == XLR_BLOCK_ID_ORIGIN) + { + COPY_HEADER_FIELD(&state->record_origin, sizeof(RepOriginId)); + } + else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) + { + COPY_HEADER_FIELD(&state->toplevel_xid, sizeof(TransactionId)); + } + else if (block_id <= XLR_MAX_BLOCK_ID) + { + /* XLogRecordBlockHeader */ + DecodedBkpBlock *blk; + uint8 fork_flags; + + if (block_id <= state->max_block_id) + { + report_invalid_record(state, + "out-of-order block_id %u at %X/%X", + block_id, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + 
state->max_block_id = block_id; + + blk = &state->blocks[block_id]; + blk->in_use = true; + blk->apply_image = false; + + COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); + blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; + blk->flags = fork_flags; + blk->has_image = ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0); + blk->has_data = ((fork_flags & BKPBLOCK_HAS_DATA) != 0); + + COPY_HEADER_FIELD(&blk->data_len, sizeof(uint16)); + /* cross-check that the HAS_DATA flag is set iff data_length > 0 */ + if (blk->has_data && blk->data_len == 0) + { + report_invalid_record(state, + "BKPBLOCK_HAS_DATA set, but no data included at %X/%X", + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + if (!blk->has_data && blk->data_len != 0) + { + report_invalid_record(state, + "BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X", + (unsigned int) blk->data_len, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + datatotal += blk->data_len; + + if (blk->has_image) + { + COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16)); + COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); + COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8)); + + blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0); + + if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED) + { + if (blk->bimg_info & BKPIMAGE_HAS_HOLE) + COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); + else + blk->hole_length = 0; + } + else + blk->hole_length = BLCKSZ - blk->bimg_len; + datatotal += blk->bimg_len; + + /* + * cross-check that hole_offset > 0, hole_length > 0 and + * bimg_len < BLCKSZ if the HAS_HOLE flag is set. 
+ */ + if ((blk->bimg_info & BKPIMAGE_HAS_HOLE) && + (blk->hole_offset == 0 || + blk->hole_length == 0 || + blk->bimg_len == BLCKSZ)) + { + report_invalid_record(state, + "BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (unsigned int) blk->bimg_len, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + + /* + * cross-check that hole_offset == 0 and hole_length == 0 if + * the HAS_HOLE flag is not set. + */ + if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && + (blk->hole_offset != 0 || blk->hole_length != 0)) + { + report_invalid_record(state, + "BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + + /* + * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED + * flag is set. + */ + if ((blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && + blk->bimg_len == BLCKSZ) + { + report_invalid_record(state, + "BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", + (unsigned int) blk->bimg_len, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + + /* + * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor + * IS_COMPRESSED flag is set. 
+ */ + if (!(blk->bimg_info & BKPIMAGE_HAS_HOLE) && + !(blk->bimg_info & BKPIMAGE_IS_COMPRESSED) && + blk->bimg_len != BLCKSZ) + { + report_invalid_record(state, + "neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", + (unsigned int) blk->data_len, + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + } + if (!(fork_flags & BKPBLOCK_SAME_REL)) + { + COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode)); + rnode = &blk->rnode; + } + else + { + if (rnode == NULL) + { + report_invalid_record(state, + "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + + blk->rnode = *rnode; + } + COPY_HEADER_FIELD(&blk->blkno, sizeof(BlockNumber)); + } + else + { + report_invalid_record(state, + "invalid block_id %u at %X/%X", + block_id, LSN_FORMAT_ARGS(state->ReadRecPtr)); + goto err; + } + } + + if (remaining != datatotal) + goto shortdata_err; + + /* + * Ok, we've parsed the fragment headers, and verified that the total + * length of the payload in the fragments is equal to the amount of data + * left. Copy the data of each fragment to a separate buffer. + * + * We could just set up pointers into readRecordBuf, but we want to align + * the data for the convenience of the callers. Backup images are not + * copied, however; they don't need alignment. + */ + lastMaxIdx = state->max_block_id; + /* block data first */ + for (block_id = 0; block_id <= state->max_block_id; block_id++) + { + DecodedBkpBlock *blk = &state->blocks[block_id]; + + if (!blk->in_use) + continue; + + Assert(blk->has_image || !blk->apply_image); + + if (blk->has_image) + { + blk->bkp_image = ptr; + ptr += blk->bimg_len; + } + if (blk->has_data) + { + if (!blk->data || blk->data_len > blk->data_bufsz) + { + if (blk->data) + pfree(blk->data); + + /* + * Force the initial request to be BLCKSZ so that we don't + * waste time with lots of trips through this stanza as a + * result of WAL compression. 
+ */ + blk->data_bufsz = MAXALIGN(Max(blk->data_len, BLCKSZ)); + blk->data = palloc(blk->data_bufsz); + } + memcpy(blk->data, ptr, blk->data_len); + ptr += blk->data_len; + } + } + + /* and finally, the main data */ + if (state->main_data_len > 0) + { + if (!state->main_data || state->main_data_len > state->main_data_bufsz) + { + if (state->main_data) + pfree(state->main_data); + + /* + * main_data_bufsz must be MAXALIGN'ed. In many xlog record + * types, we omit trailing struct padding on-disk to save a few + * bytes; but compilers may generate accesses to the xlog struct + * that assume that padding bytes are present. If the palloc + * request is not large enough to include such padding bytes then + * we'll get valgrind complaints due to otherwise-harmless fetches + * of the padding bytes. + * + * In addition, force the initial request to be reasonably large + * so that we don't waste time with lots of trips through this + * stanza. BLCKSZ / 2 seems like a good compromise choice. + */ + state->main_data_bufsz = MAXALIGN(Max(state->main_data_len, + BLCKSZ / 2)); + state->main_data = palloc(state->main_data_bufsz); + } + memcpy(state->main_data, ptr, state->main_data_len); + ptr += state->main_data_len; + } + + return true; + + shortdata_err: + report_invalid_record(state, + "record with invalid length at %X/%X", + LSN_FORMAT_ARGS(state->ReadRecPtr)); + err: + *errormsg = state->errormsg_buf; + + return false; +} + /* * Decode the previously read record. 
* @@ -1503,23 +2467,10 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) goto err; } } - if (!(fork_flags & BKPBLOCK_SAME_REL)) - { - COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode)); - rnode = &blk->rnode; - } - else - { - if (rnode == NULL) - { - report_invalid_record(state, - "BKPBLOCK_SAME_REL set but no previous rel at %X/%X", - LSN_FORMAT_ARGS(state->ReadRecPtr)); - goto err; - } - blk->rnode = *rnode; - } + // He3DB save rnode in every records + COPY_HEADER_FIELD(&blk->rnode, sizeof(RelFileNode)); + rnode = &blk->rnode; COPY_HEADER_FIELD(&blk->blkno, sizeof(BlockNumber)); } else diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index a30031d..a85af1b 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -369,6 +369,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, /* Caller specified a bogus block_id */ elog(PANIC, "failed to locate backup block with ID %d", block_id); } +/* #ifndef PG_NOREPLAY if (IsBootstrapProcessingMode() != true && InitdbSingle != true) { //push standby collect dirty page @@ -384,6 +385,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, } } #endif +*/ /* * Make sure that if the block is marked with WILL_INIT, the caller is @@ -427,8 +429,8 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, * force the on-disk state of init forks to always be in sync with the * state in shared buffers. 
*/ - if (forknum == INIT_FORKNUM) - FlushOneBuffer(*buf); + // if (forknum == INIT_FORKNUM) + // FlushOneBuffer(*buf); return BLK_RESTORED; } @@ -630,13 +632,15 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, else { /* hm, page doesn't exist in file */ - if (mode == RBM_NORMAL) - { - log_invalid_page(rnode, forknum, blkno, false); - return InvalidBuffer; - } - if (mode == RBM_NORMAL_NO_LOG) - return InvalidBuffer; + if(!he3mirror && he3share){ + if (mode == RBM_NORMAL && EnableHotStandby != false && *isPromoteIsTriggered == false) + { + log_invalid_page(rnode, forknum, blkno, false); + return InvalidBuffer; + } + if (mode == RBM_NORMAL_NO_LOG) + return InvalidBuffer; + } /* OK to extend the file */ /* we do this in recovery only - no rel-extension lock needed */ Assert(InRecovery); @@ -664,7 +668,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, } } - if (mode == RBM_NORMAL) + if (he3share && !he3mirror && mode == RBM_NORMAL) { /* check that page has been initialized */ Page page = (Page) BufferGetPage(buffer); @@ -890,135 +894,135 @@ XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, * caller must also update ThisTimeLineID with the result of * GetXLogReplayRecPtr and must check RecoveryInProgress(). 
*/ -void -XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength) -{ - const XLogRecPtr lastReadPage = (state->seg.ws_segno * - state->segcxt.ws_segsize + state->segoff); +// void +// XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, uint32 wantLength) +// { +// const XLogRecPtr lastReadPage = (state->seg.ws_segno * +// state->segcxt.ws_segsize + state->segoff); - Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); - Assert(wantLength <= XLOG_BLCKSZ); - Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); +// Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); +// Assert(wantLength <= XLOG_BLCKSZ); +// Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); - /* - * If the desired page is currently read in and valid, we have nothing to - * do. - * - * The caller should've ensured that it didn't previously advance readOff - * past the valid limit of this timeline, so it doesn't matter if the - * current TLI has since become historical. - */ - if (lastReadPage == wantPage && - state->readLen != 0 && - lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1)) - return; +// /* +// * If the desired page is currently read in and valid, we have nothing to +// * do. +// * +// * The caller should've ensured that it didn't previously advance readOff +// * past the valid limit of this timeline, so it doesn't matter if the +// * current TLI has since become historical. +// */ +// if (lastReadPage == wantPage && +// state->readLen != 0 && +// lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1)) +// return; - /* - * If we're reading from the current timeline, it hasn't become historical - * and the page we're reading is after the last page read, we can again - * just carry on. (Seeking backwards requires a check to make sure the - * older page isn't on a prior timeline). 
- * - * ThisTimeLineID might've become historical since we last looked, but the - * caller is required not to read past the flush limit it saw at the time - * it looked up the timeline. There's nothing we can do about it if - * StartupXLOG() renames it to .partial concurrently. - */ - if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage) - { - Assert(state->currTLIValidUntil == InvalidXLogRecPtr); - return; - } +// /* +// * If we're reading from the current timeline, it hasn't become historical +// * and the page we're reading is after the last page read, we can again +// * just carry on. (Seeking backwards requires a check to make sure the +// * older page isn't on a prior timeline). +// * +// * ThisTimeLineID might've become historical since we last looked, but the +// * caller is required not to read past the flush limit it saw at the time +// * it looked up the timeline. There's nothing we can do about it if +// * StartupXLOG() renames it to .partial concurrently. +// */ +// if (state->currTLI == ThisTimeLineID && wantPage >= lastReadPage) +// { +// Assert(state->currTLIValidUntil == InvalidXLogRecPtr); +// return; +// } - /* - * If we're just reading pages from a previously validated historical - * timeline and the timeline we're reading from is valid until the end of - * the current segment we can just keep reading. - */ - if (state->currTLIValidUntil != InvalidXLogRecPtr && - state->currTLI != ThisTimeLineID && - state->currTLI != 0 && - ((wantPage + wantLength) / state->segcxt.ws_segsize) < - (state->currTLIValidUntil / state->segcxt.ws_segsize)) - return; +// /* +// * If we're just reading pages from a previously validated historical +// * timeline and the timeline we're reading from is valid until the end of +// * the current segment we can just keep reading. 
+// */ +// if (state->currTLIValidUntil != InvalidXLogRecPtr && +// state->currTLI != ThisTimeLineID && +// state->currTLI != 0 && +// ((wantPage + wantLength) / state->segcxt.ws_segsize) < +// (state->currTLIValidUntil / state->segcxt.ws_segsize)) +// return; - /* - * If we reach this point we're either looking up a page for random - * access, the current timeline just became historical, or we're reading - * from a new segment containing a timeline switch. In all cases we need - * to determine the newest timeline on the segment. - * - * If it's the current timeline we can just keep reading from here unless - * we detect a timeline switch that makes the current timeline historical. - * If it's a historical timeline we can read all the segment on the newest - * timeline because it contains all the old timelines' data too. So only - * one switch check is required. - */ - { - /* - * We need to re-read the timeline history in case it's been changed - * by a promotion or replay from a cascaded replica. - */ - List *timelineHistory = readTimeLineHistory(ThisTimeLineID); - XLogRecPtr endOfSegment; +// /* +// * If we reach this point we're either looking up a page for random +// * access, the current timeline just became historical, or we're reading +// * from a new segment containing a timeline switch. In all cases we need +// * to determine the newest timeline on the segment. +// * +// * If it's the current timeline we can just keep reading from here unless +// * we detect a timeline switch that makes the current timeline historical. +// * If it's a historical timeline we can read all the segment on the newest +// * timeline because it contains all the old timelines' data too. So only +// * one switch check is required. +// */ +// { +// /* +// * We need to re-read the timeline history in case it's been changed +// * by a promotion or replay from a cascaded replica. 
+// */ +// List *timelineHistory = readTimeLineHistory(ThisTimeLineID); +// XLogRecPtr endOfSegment; - endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) * - state->segcxt.ws_segsize - 1; - Assert(wantPage / state->segcxt.ws_segsize == - endOfSegment / state->segcxt.ws_segsize); +// endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) * +// state->segcxt.ws_segsize - 1; +// Assert(wantPage / state->segcxt.ws_segsize == +// endOfSegment / state->segcxt.ws_segsize); - /* - * Find the timeline of the last LSN on the segment containing - * wantPage. - */ - state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory); - state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory, - &state->nextTLI); +// /* +// * Find the timeline of the last LSN on the segment containing +// * wantPage. +// */ +// state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory); +// state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory, +// &state->nextTLI); - Assert(state->currTLIValidUntil == InvalidXLogRecPtr || - wantPage + wantLength < state->currTLIValidUntil); +// Assert(state->currTLIValidUntil == InvalidXLogRecPtr || +// wantPage + wantLength < state->currTLIValidUntil); - list_free_deep(timelineHistory); +// list_free_deep(timelineHistory); - elog(DEBUG3, "switched to timeline %u valid until %X/%X", - state->currTLI, - LSN_FORMAT_ARGS(state->currTLIValidUntil)); - } -} +// elog(DEBUG3, "switched to timeline %u valid until %X/%X", +// state->currTLI, +// LSN_FORMAT_ARGS(state->currTLIValidUntil)); +// } +// } /* XLogReaderRoutine->segment_open callback for local pg_wal files */ -void -wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p) -{ - TimeLineID tli = *tli_p; - char path[MAXPGPATH]; + void + wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, + TimeLineID *tli_p) + { + TimeLineID tli = *tli_p; + char path[MAXPGPATH]; - XLogFilePath(path, tli, nextSegNo, 
state->segcxt.ws_segsize); - state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); - if (state->seg.ws_file >= 0) - return; + XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); + state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); + if (state->seg.ws_file >= 0) + return; - if (errno == ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - path))); - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); -} + if (errno == ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("requested WAL segment %s has already been removed", + path))); + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + path))); + } /* stock XLogReaderRoutine->segment_close callback */ -void -wal_segment_close(XLogReaderState *state) -{ - close(state->seg.ws_file); - /* need to check errno? */ - state->seg.ws_file = -1; -} + void + wal_segment_close(XLogReaderState *state) + { + close(state->seg.ws_file); + /* need to check errno? */ + state->seg.ws_file = -1; + } /* * XLogReaderRoutine->page_read callback for reading local xlog files @@ -1031,9 +1035,132 @@ wal_segment_close(XLogReaderState *state) * exists for normal backends, so we have to do a check/sleep/repeat style of * loop for now. */ +// int +// read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, +// int reqLen, XLogRecPtr targetRecPtr, char *cur_page) +// { +// XLogRecPtr read_upto, +// loc; +// TimeLineID tli; +// int count; +// WALReadError errinfo; + +// loc = targetPagePtr + reqLen; + +// /* Loop waiting for xlog to be available if necessary */ +// while (1) +// { +// /* +// * Determine the limit of xlog we can currently read to, and what the +// * most recent timeline is. 
+// * +// * RecoveryInProgress() will update ThisTimeLineID when it first +// * notices recovery finishes, so we only have to maintain it for the +// * local process until recovery ends. +// */ +// if (!RecoveryInProgress()) +// read_upto = GetFlushRecPtr(); +// else +// read_upto = GetXLogReplayRecPtr(&ThisTimeLineID); +// tli = ThisTimeLineID; + +// /* +// * Check which timeline to get the record from. +// * +// * We have to do it each time through the loop because if we're in +// * recovery as a cascading standby, the current timeline might've +// * become historical. We can't rely on RecoveryInProgress() because in +// * a standby configuration like +// * +// * A => B => C +// * +// * if we're a logical decoding session on C, and B gets promoted, our +// * timeline will change while we remain in recovery. +// * +// * We can't just keep reading from the old timeline as the last WAL +// * archive in the timeline will get renamed to .partial by +// * StartupXLOG(). +// * +// * If that happens after our caller updated ThisTimeLineID but before +// * we actually read the xlog page, we might still try to read from the +// * old (now renamed) segment and fail. There's not much we can do +// * about this, but it can only happen when we're a leaf of a cascading +// * standby whose primary gets promoted while we're decoding, so a +// * one-off ERROR isn't too bad. +// */ +// XLogReadDetermineTimeline(state, targetPagePtr, reqLen); + +// if (state->currTLI == ThisTimeLineID) +// { + +// if (loc <= read_upto) +// break; + +// CHECK_FOR_INTERRUPTS(); +// pg_usleep(1000L); +// } +// else +// { +// /* +// * We're on a historical timeline, so limit reading to the switch +// * point where we moved to the next timeline. +// * +// * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know +// * about the new timeline, so we must've received past the end of +// * it. 
+// */ +// read_upto = state->currTLIValidUntil; + +// /* +// * Setting tli to our wanted record's TLI is slightly wrong; the +// * page might begin on an older timeline if it contains a timeline +// * switch, since its xlog segment will have been copied from the +// * prior timeline. This is pretty harmless though, as nothing +// * cares so long as the timeline doesn't go backwards. We should +// * read the page header instead; FIXME someday. +// */ +// tli = state->currTLI; + +// /* No need to wait on a historical timeline */ +// break; +// } +// } + +// if (targetPagePtr + XLOG_BLCKSZ <= read_upto) +// { +// /* +// * more than one block available; read only that block, have caller +// * come back if they need more. +// */ +// count = XLOG_BLCKSZ; +// } +// else if (targetPagePtr + reqLen > read_upto) +// { +// /* not enough data there */ +// return -1; +// } +// else +// { +// /* enough bytes available to satisfy the request */ +// count = read_upto - targetPagePtr; +// } + +// /* +// * Even though we just determined how much of the page can be validly read +// * as 'count', read the whole page anyway. It's guaranteed to be +// * zero-padded up to the page boundary if it's incomplete. 
+// */ +// if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli, +// &errinfo)) +// WALReadRaiseError(&errinfo); + +// /* number of valid bytes in the buffer */ +// return count; +// } + int -read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, - int reqLen, XLogRecPtr targetRecPtr, char *cur_page) +read_local_xlog_batch(XLogReaderState *state, + XLogRecPtr targetRecPtr, int reqLen, char *cur_page) { XLogRecPtr read_upto, loc; @@ -1041,7 +1168,7 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int count; WALReadError errinfo; - loc = targetPagePtr + reqLen; + loc = targetRecPtr + reqLen; /* Loop waiting for xlog to be available if necessary */ while (1) @@ -1084,7 +1211,7 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, * standby whose primary gets promoted while we're decoding, so a * one-off ERROR isn't too bad. */ - XLogReadDetermineTimeline(state, targetPagePtr, reqLen); + // XLogReadDetermineTimeline(state, targetRecPtr, reqLen); if (state->currTLI == ThisTimeLineID) { @@ -1122,7 +1249,7 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, } } - if (targetPagePtr + XLOG_BLCKSZ <= read_upto) + if (targetRecPtr + XLOG_BLCKSZ <= read_upto) { /* * more than one block available; read only that block, have caller @@ -1130,7 +1257,7 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, */ count = XLOG_BLCKSZ; } - else if (targetPagePtr + reqLen > read_upto) + else if (targetRecPtr + reqLen > read_upto) { /* not enough data there */ return -1; @@ -1138,7 +1265,7 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, else { /* enough bytes available to satisfy the request */ - count = read_upto - targetPagePtr; + count = read_upto - targetRecPtr; } /* @@ -1146,14 +1273,15 @@ read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, * as 'count', read the whole page anyway. 
It's guaranteed to be * zero-padded up to the page boundary if it's incomplete. */ - if (!WALRead(state, cur_page, targetPagePtr, XLOG_BLCKSZ, tli, - &errinfo)) - WALReadRaiseError(&errinfo); + // if (!He3DBWALRead(state, cur_page, targetRecPtr, XLOG_BLCKSZ, tli, + // &errinfo)) + // WALReadRaiseError(&errinfo); /* number of valid bytes in the buffer */ return count; } + /* * Backend-specific convenience code to handle read errors encountered by * WALRead(). diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 6c91b8c..cb98678 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -37,6 +37,7 @@ #include "postmaster/bgwriter.h" #include "postmaster/startup.h" #include "postmaster/walwriter.h" +#include "postmaster/secondbuffer.h" #include "replication/walreceiver.h" #include "storage/bufmgr.h" #include "storage/bufpage.h" @@ -50,6 +51,7 @@ #include "utils/ps_status.h" #include "utils/rel.h" #include "utils/relmapper.h" +#include "access/pagehashqueue.h" uint32 bootstrap_data_checksum_version = 0; /* No checksum */ @@ -333,6 +335,12 @@ AuxiliaryProcessMain(int argc, char *argv[]) case WalReceiverProcess: MyBackendType = B_WAL_RECEIVER; break; + case CleanLogIndexProcess: + MyBackendType = B_CLEAN_LOGINDEX; + break; + case SecondBufferProcess: + MyBackendType = B_SECONDBUFFER; + break; default: MyBackendType = B_INVALID; } @@ -467,7 +475,14 @@ AuxiliaryProcessMain(int argc, char *argv[]) case WalReceiverProcess: WalReceiverMain(); proc_exit(1); - + + case CleanLogIndexProcess: + CleanLogIndexMain(0,NULL); + proc_exit(1); + case SecondBufferProcess: + SecondBufferMain(); + proc_exit(1); + default: elog(PANIC, "unrecognized process type: %d", (int) MyAuxProcType); proc_exit(1); @@ -521,7 +536,6 @@ BootstrapModeMain(void) attrtypes[i] = NULL; Nulls[i] = false; } - ufs_init_client(); /* * Process bootstrap input. 
*/ diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 56a8815..b804564 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -44,8 +44,7 @@ OBJS = \ pg_subscription.o \ pg_type.o \ storage.o \ - toasting.o \ - pg_hot_data.o + toasting.o include $(top_srcdir)/src/backend/common.mk @@ -70,7 +69,7 @@ CATALOG_HEADERS := \ pg_default_acl.h pg_init_privs.h pg_seclabel.h pg_shseclabel.h \ pg_collation.h pg_partitioned_table.h pg_range.h pg_transform.h \ pg_sequence.h pg_publication.h pg_publication_rel.h pg_subscription.h \ - pg_subscription_rel.h pg_stat_share_storage.h pg_hot_data.h + pg_subscription_rel.h pg_stat_share_storage.h GENERATED_HEADERS := $(CATALOG_HEADERS:%.h=%_d.h) schemapg.h system_fk_info.h diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 2711458..7d8c1d5 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -40,7 +40,6 @@ #include "catalog/pg_stat_share_storage.h" #include "catalog/pg_tablespace.h" #include "catalog/pg_type.h" -#include "catalog/pg_hot_data.h" #include "miscadmin.h" #include "storage/fd.h" #include "utils/fmgroids.h" @@ -248,7 +247,6 @@ IsSharedRelation(Oid relationId) if (relationId == AuthIdRelationId || relationId == AuthMemRelationId || relationId == DatabaseRelationId || - relationId == HotDataRelationId || relationId == SharedDescriptionRelationId || relationId == SharedDependRelationId || relationId == SharedSecLabelRelationId || @@ -265,7 +263,6 @@ IsSharedRelation(Oid relationId) relationId == AuthMemMemRoleIndexId || relationId == DatabaseNameIndexId || relationId == DatabaseOidIndexId || - relationId == HotDataDatnameRelnameIndexId || relationId == SharedDescriptionObjIndexId || relationId == SharedDependDependerIndexId || relationId == SharedDependReferenceIndexId || diff --git a/src/backend/catalog/pg_hot_data.c b/src/backend/catalog/pg_hot_data.c deleted file mode 100644 index bd4ecf0..0000000 --- 
a/src/backend/catalog/pg_hot_data.c +++ /dev/null @@ -1,276 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_hot_data.c - * for hot data precache - * - *------------------------------------------------------------------------- - */ - -#include "postgres.h" -#include "catalog/pg_hot_data.h" -#include "libpq-fe.h" -#include "lib/stringinfo.h" -#include "utils/timestamp.h" -#include "access/xlog.h" -#include "postmaster/postmaster.h" -#include - -void PrecacheHotData() -{ - char instanceName[NAMEDATALEN]; //default:master - char primaryHost[16]; //default:127.0.0.1 - char primaryUser[NAMEDATALEN]; //default:postgres - char primaryPw[NAMEDATALEN]; //default:123456 - char primaryPort[8]; //default:PostPortNumber - char localPort[8]; //default:master - StringInfoData cmd, primaryConnStr, localConnStr; - - initStringInfo(&cmd); - initStringInfo(&primaryConnStr); - initStringInfo(&localConnStr); - - memset(instanceName, 0, NAMEDATALEN); - memset(primaryHost, 0, 16); - memset(primaryUser, 0, NAMEDATALEN); - memset(primaryPw, 0, NAMEDATALEN); - memset(primaryPort, 0, 8); - memset(localPort, 0, 8); - - //parse - if (strlen(PrimaryConnInfo) > 0) - { - char *temStr; - char *temChr; - int temStrLen; - - //instanceName - temStr = strstr(PrimaryConnInfo, "application_name="); - temStrLen = strlen("application_name="); - - if (temStr != NULL) - { - temChr = strchr(temStr, ' '); - if (temChr != NULL) - { - memcpy(instanceName, temStr + temStrLen, temChr - temStr - temStrLen); - } - else - { - strcpy(instanceName, temStr + temStrLen); - } - } - else - { - strcpy(instanceName, "master"); - } - - //primaryHost - temStr = strstr(PrimaryConnInfo, "host="); - temStrLen = strlen("host="); - - if (temStr != NULL) - { - temChr = strchr(temStr, ' '); - if (temChr != NULL) - { - memcpy(primaryHost, temStr + temStrLen, temChr - temStr - temStrLen); - } - else - { - strcpy(primaryHost, temStr + temStrLen); - } - } - else - { - strcpy(primaryHost, 
"127.0.0.1"); - } - - //primaryUser - temStr = strstr(PrimaryConnInfo, "user="); - temStrLen = strlen("user="); - - if (temStr != NULL) - { - temChr = strchr(temStr, ' '); - if (temChr != NULL) - { - memcpy(primaryUser, temStr + temStrLen, temChr - temStr - temStrLen); - } - else - { - strcpy(primaryUser, temStr + temStrLen); - } - } - else - { - strcpy(primaryUser, "postgres"); - } - - //primaryPw - temStr = strstr(PrimaryConnInfo, "password="); - temStrLen = strlen("password="); - - if (temStr != NULL) - { - temChr = strchr(temStr, ' '); - if (temChr != NULL) - { - memcpy(primaryPw, temStr + temStrLen, temChr - temStr - temStrLen); - } - else - { - strcpy(primaryPw, temStr + temStrLen); - } - } - else - { - strcpy(primaryPw, "123456"); - } - - //primaryPort - temStr = strstr(PrimaryConnInfo, "port="); - temStrLen = strlen("port="); - - if (temStr != NULL) - { - temChr = strchr(temStr, ' '); - if (temChr != NULL) - { - memcpy(primaryPort, temStr + temStrLen, temChr - temStr - temStrLen); - } - else - { - strcpy(primaryPort, temStr + temStrLen); - } - } - else - { - sprintf(primaryPort, "%d", PostPortNumber); - } - } - else - { - strcpy(instanceName, "master"); - strcpy(primaryHost, "127.0.0.1"); - strcpy(primaryUser, "postgres"); - strcpy(primaryPw, "123456"); - sprintf(primaryPort, "%d", PostPortNumber); - } - - //assemble primaryConnStr - appendStringInfoString(&primaryConnStr, "host="); - appendStringInfoString(&primaryConnStr, primaryHost); - appendStringInfoString(&primaryConnStr, " user="); - appendStringInfoString(&primaryConnStr, primaryUser); - appendStringInfoString(&primaryConnStr, " password="); - appendStringInfoString(&primaryConnStr, primaryPw); - appendStringInfoString(&primaryConnStr, " port="); - appendStringInfoString(&primaryConnStr, primaryPort); - appendStringInfoString(&primaryConnStr, " dbname=postgres"); - - //conn local - sprintf(localPort, "%d", PostPortNumber); - appendStringInfoString(&localConnStr, "host=127.0.0.1 port="); - 
appendStringInfoString(&localConnStr, localPort); - appendStringInfoString(&localConnStr, " user=postgres dbname=postgres"); - PGconn *localConn = PQconnectdb(localConnStr.data); - if (PQstatus(localConn) != CONNECTION_OK) - { - PQfinish(localConn); - //log - return; - } - - appendStringInfoString(&cmd, "SELECT datname, relname, crules FROM pg_hot_data WHERE crulessettime>cachetime AND clientname='"); - appendStringInfoString(&cmd, instanceName); - appendStringInfoString(&cmd, "'"); - - //Query the corresponding precache policy - PGresult *ruleRes = PQexec(localConn, cmd.data); - if (PQresultStatus(ruleRes) != PGRES_TUPLES_OK) - { - PQclear(ruleRes); - PQfinish(localConn); - //log - return; - } - int rows = PQntuples(ruleRes); - for(int i=0; ird_node; xlrec.flags = SMGR_TRUNCATE_ALL; XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); @@ -362,11 +377,49 @@ RelationTruncate(Relation rel, BlockNumber nblocks) */ if (fsm || vm) XLogFlush(lsn); + + if (IsBootstrapProcessingMode() != true && InitdbSingle != true) { + RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT + | CHECKPOINT_FLUSH_ALL); + } } /* Do the real work to truncate relation forks */ if (IsBootstrapProcessingMode()!=true && InitdbSingle!=true) { + /* + * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will + * just drop them without bothering to write the contents. + */ + DropRelFileNodeBuffers(rel->rd_smgr, &forks, nforks, &blocks); + + /* + * Send a shared-inval message to force other backends to close any smgr + * references they may have for this rel. This is useful because they + * might have open file pointers to segments that got removed, and/or + * smgr_targblock variables pointing past the new rel end. (The inval + * message will come back to our backend, too, causing a + * probably-unnecessary local smgr flush. 
But we don't expect that this + * is a performance-critical path.) As in the unlink code, we want to be + * sure the message is sent before we start changing things on-disk. + */ + CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode); smgrtruncatelsn(rel->rd_smgr, forks, nforks, blocks,lsn); + + for ( i = 0; i < nforks; i ++) + { + ldKey.sk.dbid = rel->rd_smgr->smgr_rnode.node.dbNode; + ldKey.sk.relid = rel->rd_smgr->smgr_rnode.node.relNode; + ldKey.sk.forkno = forks[i]; + ldKey.sk.blkno = blocks[i]; + SendInvalPage(&ldKey); + } + + ldKey.sk.dbid = 0; + ldKey.sk.relid = 0; + ldKey.sk.forkno = 32; + ldKey.sk.blkno = 0; + SendInvalPage(&ldKey); + } else { smgrtruncate(rel->rd_smgr, forks, nforks, blocks); } @@ -379,6 +432,13 @@ RelationTruncate(Relation rel, BlockNumber nblocks) */ if (need_fsm_vacuum) FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber); + + /* He3DB: Resume to enable cancel query */ + if (disable_cancel_query) + { + RESUME_INTERRUPTS(); + CHECK_FOR_INTERRUPTS(); + } } /* @@ -448,12 +508,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, CHECK_FOR_INTERRUPTS(); // smgrread(src, forkNum, blkno, buf.data, GetXLogWriteRecPtr()); - smgrread(src, forkNum, blkno, &dataPage, InvalidXLogRecPtr); - for(int i = 0; i < BLCKSZ; i ++) { - buf.data[i] = dataPage[i]; - } - - free(dataPage); + smgrread(src, forkNum, blkno, buf.data); if (!PageIsVerifiedExtended(page, blkno, PIV_LOG_WARNING | PIV_REPORT_STAT)) @@ -929,13 +984,14 @@ smgr_redo(XLogReaderState *record) reln = smgropen(xlrec->rnode, InvalidBackendId); /* He3DB: propeller instance and He3DB slave instance not create rel file*/ - if (!EnableHotStandby) + if (*isPromoteIsTriggered || !EnableHotStandby || he3mirror || !he3share) { smgrcreate(reln, xlrec->forkNum, true); } } else if (info == XLOG_SMGR_TRUNCATE) { + int i = 0; xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record); SMgrRelation reln; Relation rel; @@ -944,6 +1000,9 @@ smgr_redo(XLogReaderState *record) int nforks = 0; 
bool need_fsm_vacuum = false; + WalLdPageKey walkey; + LdPageKey ldKey; + reln = smgropen(xlrec->rnode, InvalidBackendId); /* @@ -953,7 +1012,7 @@ smgr_redo(XLogReaderState *record) * log as best we can until the drop is seen. */ /* He3DB: propeller instance and He3DB slave instance not create rel file*/ - if (!EnableHotStandby) + if (*isPromoteIsTriggered || !EnableHotStandby || he3mirror || !he3share) { smgrcreate(reln, MAIN_FORKNUM, true); } @@ -1011,9 +1070,60 @@ smgr_redo(XLogReaderState *record) } } + /* + * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will + * just drop them without bothering to write the contents. + */ + DropRelFileNodeBuffers(reln, &forks, nforks, &blocks); + + /* + * Send a shared-inval message to force other backends to close any smgr + * references they may have for this rel. This is useful because they + * might have open file pointers to segments that got removed, and/or + * smgr_targblock variables pointing past the new rel end. (The inval + * message will come back to our backend, too, causing a + * probably-unnecessary local smgr flush. But we don't expect that this + * is a performance-critical path.) As in the unlink code, we want to be + * sure the message is sent before we start changing things on-disk. 
+ */ + CacheInvalidateSmgr(reln->smgr_rnode); + /* Do the real work to truncate relation forks */ - if (nforks > 0) - smgrtruncate(reln, forks, nforks, blocks); + if (nforks > 0 && (!EnableHotStandby || *isPromoteIsTriggered || !he3share)) + smgrtruncatelsn(reln, forks, nforks, blocks, record->ReadRecPtr); + if (EnableHotStandby && !push_standby) + { + for (i = 0; i < nforks; i ++) + { + ldKey.sk.dbid = reln->smgr_rnode.node.dbNode; + ldKey.sk.relid = reln->smgr_rnode.node.relNode; + ldKey.sk.forkno = forks[i]; + ldKey.sk.blkno = blocks[i]; + SendInvalPage(&ldKey); + + walkey.sk.dbid = reln->smgr_rnode.node.dbNode; + walkey.sk.relid = reln->smgr_rnode.node.relNode; + walkey.sk.forkno = forks[i]; + walkey.sk.blkno = blocks[i]; + walkey.pageLsn = SwapLsnFromLittleToBig(record->ReadRecPtr); + walkey.partition = 1; + SendInvalWal(&walkey); + } + + ldKey.sk.dbid = 0; + ldKey.sk.relid = 0; + ldKey.sk.forkno = 32; + ldKey.sk.blkno = 0; + SendInvalPage(&ldKey); + + walkey.sk.dbid = 0; + walkey.sk.relid = 0; + walkey.sk.forkno = 32; + walkey.sk.blkno = 0; + walkey.pageLsn = 0; + walkey.partition = 0; + SendInvalWal(&walkey); + } /* * Update upper-level FSM pages to account for the truncation. 
This is diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 999d984..e93150c 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -866,6 +866,24 @@ CREATE VIEW pg_stat_replication AS JOIN pg_stat_get_wal_senders() AS W ON (S.pid = W.pid) LEFT JOIN pg_authid AS U ON (S.usesysid = U.oid); +CREATE VIEW pg_stat_he3walwrite AS + SELECT + s.write_lsn, + s.flush_lsn, + s.writekv_totaltimes, + s.writekv_parallels + FROM pg_stat_get_he3walwrite() AS s + ; + +CREATE VIEW pg_stat_he3_logindex AS +SELECT + s.memtable_total, + s.memtable_used, + s.memtable_start_index, + s.memtable_active_index, + s.page_total +FROM pg_stat_get_he3_logindex() AS s; + CREATE VIEW pg_stat_slru AS SELECT s.name, diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 7d5b15e..d58e96b 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -2236,7 +2236,7 @@ dbase_redo(XLogReaderState *record) if (stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode)) { /* He3DB: propeller instance and He3DB slave instance not create db file*/ - if (!EnableHotStandby) + if (!EnableHotStandby || *isPromoteIsTriggered) { if (!rmtree(dst_path, true)) /* If this failed, copydir() below is going to error. 
*/ @@ -2258,8 +2258,28 @@ dbase_redo(XLogReaderState *record) * We don't need to copy subdirectories */ /* He3DB: propeller instance and He3DB slave instance not create db file*/ - if (!EnableHotStandby) + if (!EnableHotStandby || *isPromoteIsTriggered || he3mirror || !he3share) { + // int count = 0; + // for (;;) + // { + // XLogRecPtr pushlsn; + // XLogRecPtr lastlsn = record->currRecPtr; + // pushlsn = QueryPushChkpointLsn(); + // if (pushlsn == InvalidXLogRecPtr) + // ereport(ERROR, + // (errcode(ERRCODE_INTERNAL_ERROR), + // errmsg("push standby's latest apply lsn shouldn't be 0"))); + // if (lastlsn <= pushlsn) + // break; + // if (count > 100) + // ereport(ERROR, + // (errcode(ERRCODE_INTERNAL_ERROR), + // errmsg("push standby's latest apply lsn(%X/%X) is still behind primary(%X/%X) after try 100 times.", + // LSN_FORMAT_ARGS(pushlsn), LSN_FORMAT_ARGS(lastlsn)))); + // pg_usleep(1000000L); + // count++; + // } copydir(src_path, dst_path, false); } } diff --git a/src/backend/commands/push_control.c b/src/backend/commands/push_control.c index c762ad5..deda0f8 100644 --- a/src/backend/commands/push_control.c +++ b/src/backend/commands/push_control.c @@ -9,7 +9,7 @@ #include "utils/timestamp.h" #include "fmgr.h" #include "utils/fmgrprotos.h" - +#include "catalog/indexing.h" void UpdateStatShareStorage(int64 vcl) { Relation pg_stat_share_storage_rel = NULL; diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 0f90a29..969886e 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -1881,9 +1881,6 @@ pg_sequence_last_value(PG_FUNCTION_ARGS) void seq_redo(XLogReaderState *record) { - if (data_buffer_for_replay(record) == false) { - return; - } XLogRecPtr lsn = record->EndRecPtr; uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; Buffer buffer; diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index ab85119..8e74118 100644 --- a/src/backend/executor/execMain.c +++ 
b/src/backend/executor/execMain.c @@ -1557,7 +1557,7 @@ ExecutePlan(EState *estate, if (TupIsNull(slot)) break; - if (!isPreCache) + if (!isPreCacheTable && !isPreCacheIndex) { /* * If we have a junk filter, then project a new tuple with the junk diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 2db1914..34a2d0f 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -51,6 +51,7 @@ #include "utils/rel.h" #include "utils/snapmgr.h" #include "utils/spccache.h" +#include "storage/bufmgr.h" static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node); static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate); @@ -81,6 +82,28 @@ BitmapHeapNext(BitmapHeapScanState *node) ParallelBitmapHeapState *pstate = node->pstate; dsa_area *dsa = node->ss.ps.state->es_query_dsa; + /* set preCacheNodeOid */ + if (isPreCacheIndex && preCacheNodeOid == 0) + { + preCacheNodeOid = ((BitmapIndexScanState *)((PlanState *)(node))->lefttree)->biss_ScanDesc->indexRelation->rd_node.relNode; + if (isPreCacheAction) + { + preCacheNodesPtr[(*preCacheNodesCountPtr)++] = preCacheNodeOid; + } + else + { + for(int i = 0; i < *preCacheNodesCountPtr; i++) + { + if (preCacheNodesPtr[i] == preCacheNodeOid) + { + preCacheNodesPtr[i] = preCacheNodesPtr[*preCacheNodesCountPtr - 1]; + (*preCacheNodesCountPtr)--; + break; + } + } + } + } + /* * extract necessary information from index scan node */ diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 8fee958..f50abdc 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -66,6 +66,28 @@ IndexOnlyNext(IndexOnlyScanState *node) TupleTableSlot *slot; ItemPointer tid; + /* set preCacheNodeOid */ + if (isPreCacheIndex && preCacheNodeOid == 0) + { + preCacheNodeOid = node->ioss_RelationDesc->rd_node.relNode; + if (isPreCacheAction) + { 
+ preCacheNodesPtr[(*preCacheNodesCountPtr)++] = preCacheNodeOid; + } + else + { + for(int i = 0; i < *preCacheNodesCountPtr; i++) + { + if (preCacheNodesPtr[i] == preCacheNodeOid) + { + preCacheNodesPtr[i] = preCacheNodesPtr[*preCacheNodesCountPtr - 1]; + (*preCacheNodesCountPtr)--; + break; + } + } + } + } + /* * extract necessary information from index scan node */ diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 2fffb1b..7951d91 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -43,6 +43,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/rel.h" +#include "storage/bufmgr.h" /* * When an ordering operator is used, tuples fetched from the index that @@ -86,6 +87,28 @@ IndexNext(IndexScanState *node) IndexScanDesc scandesc; TupleTableSlot *slot; + /* set preCacheNodeOid */ + if (isPreCacheIndex && preCacheNodeOid == 0) + { + preCacheNodeOid = node->iss_RelationDesc->rd_node.relNode; + if (isPreCacheAction) + { + preCacheNodesPtr[(*preCacheNodesCountPtr)++] = preCacheNodeOid; + } + else + { + for(int i = 0; i < *preCacheNodesCountPtr; i++) + { + if (preCacheNodesPtr[i] == preCacheNodeOid) + { + preCacheNodesPtr[i] = preCacheNodesPtr[*preCacheNodesCountPtr - 1]; + (*preCacheNodesCountPtr)--; + break; + } + } + } + } + /* * extract necessary information from index scan node */ diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index 066f9ae..50da2f4 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -32,6 +32,7 @@ #include "executor/execdebug.h" #include "executor/nodeSeqscan.h" #include "utils/rel.h" +#include "storage/bufmgr.h" static TupleTableSlot *SeqNext(SeqScanState *node); @@ -54,6 +55,28 @@ SeqNext(SeqScanState *node) ScanDirection direction; TupleTableSlot *slot; + /* set preCacheTableNode */ + if (isPreCacheTable && preCacheNodeOid == 0) + { + preCacheNodeOid = 
node->ss.ss_currentRelation->rd_node.relNode; + if (isPreCacheAction) + { + preCacheNodesPtr[(*preCacheNodesCountPtr)++] = preCacheNodeOid; + } + else + { + for(int i = 0; i < *preCacheNodesCountPtr; i++) + { + if (preCacheNodesPtr[i] == preCacheNodeOid) + { + preCacheNodesPtr[i] = preCacheNodesPtr[*preCacheNodesCountPtr - 1]; + (*preCacheNodesCountPtr)--; + break; + } + } + } + } + /* * get information from the estate and scan state */ diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile index bfdf6a8..c880660 100644 --- a/src/backend/postmaster/Makefile +++ b/src/backend/postmaster/Makefile @@ -21,6 +21,7 @@ OBJS = \ interrupt.o \ pgarch.o \ pgstat.o \ + secondbuffer.o \ postmaster.o \ startup.o \ syslogger.o \ diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 715643c..a50ef74 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -241,11 +241,11 @@ BackgroundWriterMain(void) /* * Do one cycle of dirty-buffer writing. */ - if (push_standby == true) { - can_hibernate = BgBufferSync(&wb_context); - } else { + //if (push_standby == true) { + // can_hibernate = BgBufferSync(&wb_context); + //} else { can_hibernate = true; - } + //} /* * Send off activity statistics to the stats collector diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 4809cf8..016e021 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -493,7 +493,7 @@ CheckpointerMain(void) } /* Check for archive_timeout and switch xlog files if necessary. */ - CheckArchiveTimeout(); + //CheckArchiveTimeout(); /* * Send off activity statistics to the stats collector. (The reason @@ -719,7 +719,7 @@ CheckpointWriteDelay(int flags, double progress) AbsorbSyncRequests(); absorb_counter = WRITES_PER_ABSORB; - CheckArchiveTimeout(); + //CheckArchiveTimeout(); /* * Report interim activity statistics to the stats collector. 
@@ -1346,3 +1346,9 @@ FirstCallSinceLastCheckpoint(void) return FirstCall; } + +pid_t +He3DBQueryCkpPid(void) +{ + return CheckpointerShmem->checkpointer_pid; +} diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 178de36..6fd4233 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -65,7 +65,6 @@ #include "postgres.h" -#include "utils/ufs.h" #include #include #include @@ -115,6 +114,7 @@ #include "postmaster/interrupt.h" #include "postmaster/pgarch.h" #include "postmaster/postmaster.h" +#include "postmaster/secondbuffer.h" #include "postmaster/syslogger.h" #include "replication/logicallauncher.h" #include "replication/walsender.h" @@ -133,7 +133,7 @@ #include "utils/timeout.h" #include "utils/timestamp.h" #include "utils/varlena.h" - +#include "access/pagehashqueue.h" #ifdef EXEC_BACKEND #include "storage/spin.h" #endif @@ -148,7 +148,8 @@ #define BACKEND_TYPE_AUTOVAC 0x0002 /* autovacuum worker process */ #define BACKEND_TYPE_WALSND 0x0004 /* walsender process */ #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ -#define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ +#define BACKEND_TYPE_FLUSHPAGE 0x0010 /* parallel flush pid*/ +#define BACKEND_TYPE_ALL 0x001F /* OR of all the above */ /* * List of active backends (or child processes anyway; we don't actually @@ -200,6 +201,8 @@ BackgroundWorker *MyBgworkerEntry = NULL; /* The socket number we are listening for connections on */ int PostPortNumber; +int flag = 1; + /* The directory names for Unix socket(s) */ char *Unix_socket_directories; @@ -255,6 +258,8 @@ static pid_t StartupPID = 0, AutoVacPID = 0, PgArchPID = 0, PgStatPID = 0, + SecondBufferPID = 0, + CleanLogIndexPID = 0, SysLoggerPID = 0; /* Startup process's status */ @@ -376,6 +381,10 @@ static volatile bool avlauncher_needs_signal = false; /* received START_WALRECEIVER signal */ static volatile sig_atomic_t WalReceiverRequested = false; +/* received 
START_PARALLEL PUSH signal */ +static volatile sig_atomic_t PageParallelPush = false; + + /* set when there's a worker that needs to be started up */ static volatile bool StartWorkerNeeded = true; static volatile bool HaveCrashedWorker = false; @@ -438,6 +447,7 @@ static pid_t StartChildProcess(AuxProcType type); static void StartAutovacuumWorker(void); static void MaybeStartWalReceiver(void); static void InitPostmasterDeathWatchHandle(void); +static void StartALLPageFlushWorker(void); /* * Archiver is allowed to start up at the current postmaster state? @@ -558,6 +568,8 @@ static void ShmemBackendArrayRemove(Backend *bn); #define StartCheckpointer() StartChildProcess(CheckpointerProcess) #define StartWalWriter() StartChildProcess(WalWriterProcess) #define StartWalReceiver() StartChildProcess(WalReceiverProcess) + #define StartSecondBuffer() StartChildProcess(SecondBufferProcess) +#define StartCleanLogIndex() StartChildProcess(CleanLogIndexProcess) /* Macros to check exit status of a child process */ #define EXIT_STATUS_0(st) ((st) == 0) @@ -589,7 +601,6 @@ PostmasterMain(int argc, char *argv[]) char *output_config_variable = NULL; InitProcessGlobals(); - PostmasterPid = MyProcPid; IsPostmasterEnvironment = true; @@ -1772,6 +1783,10 @@ ServerLoop(void) CheckpointerPID = StartCheckpointer(); if (BgWriterPID == 0) BgWriterPID = StartBackgroundWriter(); + if (CleanLogIndexPID == 0) + CleanLogIndexPID = StartCleanLogIndex(); + if (SecondBufferPID == 0) + SecondBufferPID = StartSecondBuffer(); } /* @@ -1782,6 +1797,9 @@ ServerLoop(void) if (WalWriterPID == 0 && pmState == PM_RUN) WalWriterPID = StartWalWriter(); + // if(SecondBufferPID == 0 && pmState == PM_RUN) + // SecondBufferPID = StartSecondBuffer(); + /* * If we have lost the autovacuum launcher, try to start a new one. 
We * don't want autovacuum to run in binary upgrade mode because @@ -2733,6 +2751,11 @@ SIGHUP_handler(SIGNAL_ARGS) signal_child(SysLoggerPID, SIGHUP); if (PgStatPID != 0) signal_child(PgStatPID, SIGHUP); + if (SecondBufferPID != 0) + signal_child(SecondBufferPID, SIGHUP); + if (CleanLogIndexPID != 0 ) + signal_child(CleanLogIndexPID, SIGHUP); + /* Reload authentication config files too */ if (!load_hba()) @@ -3052,6 +3075,10 @@ reaper(SIGNAL_ARGS) BgWriterPID = StartBackgroundWriter(); if (WalWriterPID == 0) WalWriterPID = StartWalWriter(); + if (SecondBufferPID == 0) + SecondBufferPID = StartSecondBuffer(); //作用? + if (CleanLogIndexPID == 0) + CleanLogIndexPID = StartCleanLogIndex(); /* * Likewise, start other special children as needed. In a restart @@ -3094,6 +3121,7 @@ reaper(SIGNAL_ARGS) continue; } + /* * Was it the checkpointer? */ @@ -3128,7 +3156,7 @@ reaper(SIGNAL_ARGS) */ SignalChildren(SIGUSR2); - pmState = PM_SHUTDOWN_2; + // pmState = PM_SHUTDOWN_2; /* * We can also shut down the stats collector now; there's @@ -3164,6 +3192,25 @@ reaper(SIGNAL_ARGS) continue; } + if (pid == SecondBufferPID) + { + SecondBufferPID = 0; + if (EXIT_STATUS_0(exitstatus) && pmState == PM_SHUTDOWN) + { + Assert(Shutdown > NoShutdown); + pmState = PM_SHUTDOWN_2; + } + else + { + /* + * Any unexpected exit of the checkpointer (including FATAL + * exit) is treated as a crash. + */ + HandleChildCrash(pid, exitstatus, + _("second buffer process")); + } + } + /* * Was it the wal receiver? If exit status is zero (normal) or one * (FATAL exit), we assume everything is all right just like normal @@ -3691,7 +3738,29 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) signal_child(PgStatPID, SIGQUIT); allow_immediate_pgstat_restart(); } + /* Take care of the clean logindex too */ + if (pid == CleanLogIndexPID) + CleanLogIndexPID = 0; + else if (CleanLogIndexPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? 
"SIGSTOP" : "SIGQUIT"), + (int) CleanLogIndexPID))); + signal_child(CleanLogIndexPID, (SendStop ? SIGSTOP : SIGQUIT)); + } + /* Take care of the walwriter too*/ + if (pid == SecondBufferPID) + SecondBufferPID = 0; + else if (SecondBufferPID != 0 && take_action) + { + ereport(DEBUG2, + (errmsg_internal("sending %s to process %d", + (SendStop ? "SIGSTOP" : "SIGQUIT"), + (int) SecondBufferPID))); + signal_child(SecondBufferPID, (SendStop ? SIGSTOP : SIGQUIT)); + } /* We do NOT restart the syslogger */ if (Shutdown != ImmediateShutdown) @@ -3839,8 +3908,14 @@ PostmasterStateMachine(void) signal_child(StartupPID, SIGTERM); if (WalReceiverPID != 0) signal_child(WalReceiverPID, SIGTERM); + if (CleanLogIndexPID != 0) + signal_child(CleanLogIndexPID, SIGTERM); + /*and the secondbuffer too*/ + if (SecondBufferPID != 0) + signal_child(SecondBufferPID,SIGTERM); /* checkpointer, archiver, stats, and syslogger may continue for now */ + /* Now transition to PM_WAIT_BACKENDS state to wait for them to die */ pmState = PM_WAIT_BACKENDS; } @@ -4176,6 +4251,10 @@ TerminateChildren(int signal) signal_child(PgArchPID, signal); if (PgStatPID != 0) signal_child(PgStatPID, signal); + if (CleanLogIndexPID !=0) + signal_child(CleanLogIndexPID, signal); + if (SecondBufferPID != 0) + signal_child(SecondBufferPID, signal); } /* @@ -4273,7 +4352,6 @@ BackendStartup(Port *port) report_fork_failure_to_client(port, save_errno); return STATUS_ERROR; } - /* in parent, successful fork */ ereport(DEBUG2, (errmsg_internal("forked new backend, pid=%d socket=%d", @@ -4530,7 +4608,7 @@ BackendRun(Port *port) if (port->privateConn == true) { privateConn = true; } - + client_application_name = port->application_name; /* * Make sure we aren't in PostmasterContext anymore. (We can't delete it * just yet, though, because InitPostgres will need the HBA data.) 
@@ -5139,6 +5217,11 @@ SubPostmasterMain(int argc, char *argv[]) static void ExitPostmaster(int status) { + + + ClosePageDBEnv(); + CloseWalDBEnv(); + #ifdef HAVE_PTHREAD_IS_THREADED_NP /* @@ -5203,7 +5286,6 @@ sigusr1_handler(SIGNAL_ARGS) CheckpointerPID = StartCheckpointer(); Assert(BgWriterPID == 0); BgWriterPID = StartBackgroundWriter(); - /* * Start the archiver if we're responsible for (re-)archiving received * files. @@ -5299,7 +5381,24 @@ sigusr1_handler(SIGNAL_ARGS) /* The autovacuum launcher wants us to start a worker process. */ StartAutovacuumWorker(); } - + + /* start Flush Page */ + if (!PageParallelPush && CheckPostmasterSignal(PMSIGNAL_PARALLEL_FLUSH_WORKER)) { + PageParallelPush = true; + StartALLPageFlushWorker(); + } + + if (CheckPostmasterSignal(PMSIGNAL_CLEAN_LOGINDEX_WORKER)) { + if ( CleanLogIndexPID == 0) { + CleanLogIndexPID = StartCleanLogIndex(); + } + } + + // if (CheckPostmasterSignal(PMSIGNAL_SECONDBUFFER_WORKER)) { + // if (SecondBufferPID == 0) { + // SecondBufferPID = StartSecondBuffer(); + // } +// } if (CheckPostmasterSignal(PMSIGNAL_START_WALRECEIVER)) { /* Startup Process wants us to start the walreceiver process. 
*/ @@ -5464,7 +5563,15 @@ StartChildProcess(AuxProcType type) av[ac++] = "--forkboot"; av[ac++] = NULL; /* filled in by postmaster_forkexec */ #endif - + + if (pageEnv == NULL) + { + InitPageDBEnv(); + } + if (walEnv == NULL) + { + InitWalDBEnv(); + } snprintf(typebuf, sizeof(typebuf), "-x%d", type); av[ac++] = typebuf; @@ -5487,7 +5594,6 @@ StartChildProcess(AuxProcType type) MemoryContextSwitchTo(TopMemoryContext); MemoryContextDelete(PostmasterContext); PostmasterContext = NULL; - AuxiliaryProcessMain(ac, av); /* does not return */ } #endif /* EXEC_BACKEND */ @@ -5545,6 +5651,67 @@ StartChildProcess(AuxProcType type) return pid; } +static void StartALLPageFlushWorker(void) { + for(int i = 0;icancel_key = MyCancelKey; + + /* parallel workers are not dead_end and need a child slot */ + bn->dead_end = false; + bn->child_slot = MyPMChildSlot = AssignPostmasterChildSlot(); + bn->bgworker_notify = false; + + bn->pid = StartPageFlushWorker(); + if (bn->pid > 0) + { + bn->bkend_type = BACKEND_TYPE_FLUSHPAGE; + dlist_push_head(&BackendList, &bn->elem); +#ifdef EXEC_BACKEND + ShmemBackendArrayAdd(bn); +#endif + /* all OK */ + continue; + } + + /* + * fork failed, fall through to report -- actual error message was + * logged by StartAutoVacWorker + */ + (void) ReleasePostmasterChildSlot(bn->child_slot); + free(bn); + } + else + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + } +} + + /* * StartAutovacuumWorker * Start an autovac worker process. 
@@ -5712,7 +5879,7 @@ int MaxLivePostmasterChildren(void) { return 2 * (MaxConnections + autovacuum_max_workers + 1 + - max_wal_senders + max_worker_processes); + max_wal_senders + max_parallel_flush_process + max_worker_processes); } /* diff --git a/src/backend/postmaster/secondbuffer.c b/src/backend/postmaster/secondbuffer.c new file mode 100644 index 0000000..26392b6 --- /dev/null +++ b/src/backend/postmaster/secondbuffer.c @@ -0,0 +1,1702 @@ +#include "postgres.h" +#include "access/xlogrecord.h" +#include +#include +#include +#include +#include "postmaster/secondbuffer.h" +#include +#include "utils/guc.h" + +#include +#include "postmaster/interrupt.h" +#include "libpq/pqsignal.h" +#include "storage/s_lock.h" +#include "storage/spin.h" +#include "storage/shmem.h" +#include "storage/bufpage.h" +#include "storage/pmsignal.h" +#include "storage/lwlock.h" +#include "utils/memutils.h" +#include "storage/procsignal.h" +#include "utils/dynahash.h" +#include "miscadmin.h" +#include "utils/ps_status.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "utils/wait_event.h" +#include +#include +#include +#include "storage/bufmgr.h" +#include +#include + +const char *socketfile = "/tmp/he3cleanwal"; +const char *p_socketfile = "/tmp/he3cleanpage"; +#define SizeOfCleanWal (offsetof(WalLdPageKey, partition) + sizeof(uint8)) +#define SizeOfCleanPage 16 + +typedef struct SocketFd +{ + int walSocketFd; + int pageSocketFd; +} SocketFd; + +typedef struct SingleKeyArray +{ + SdPageKey SdPageKeyList[SDLEN]; + uint16 head; + uint16 tail; + uint16 unused; + slock_t oplock; +} SingleKeyArray; + +typedef struct Statisticnum +{ + double totalunused; + slock_t change; +} Statisticnum; + +typedef struct DPageKeyArray +{ + DPageKey dpk[1024]; + uint16 unused; + uint16 head; + uint16 tail; + uint16 pageIndex; + uint16 walIndex; + slock_t append; +} DPageKeyArray; +/* +secondbufferhash code +*/ +static int IsDirExist(const char *path); +static void CleanWalsByPage(WalLdPageKey 
*walkey); +static void CleanWalsByTable(WalLdPageKey *walkey); +static void CleanPagesByTable(LdPageKey *lpk); + +static HTAB *SecondBufferHash = NULL; +extern bool EnableHotStandby; +SocketFd SocFd = {-1, -1}; +DPageKeyArray *DPArray = NULL; + +SingleKeyArray *MultiKeyArrays; + +MDB_env *pageEnv = NULL; +MDB_env *walEnv = NULL; + +MDB_dbi pageDbi; +MDB_dbi walDbi; + +MDB_txn *pageTxn = NULL; +MDB_txn *walTxn = NULL; +MDB_cursor *cursor = NULL; + +Statisticnum *statisticnum = NULL; + +LWLockPadded *SecondBufferMainLWLockArray = NULL; + +char *lmdb_page_directory; +char *lmdb_wal_directory; +Size SNBuffers = 1024; + +Size SecondBufferShmemSize(void) +{ + Size size; + size = mul_size(SNBuffers, BLKSZ); + return size; +} + +Size SecondBufferLWLockShmemSize(void) +{ + Size size; + int i; + int numLocks = NUM_LOCK_PARTITIONS; + + /* Space for the LWLock array. */ + size = mul_size(numLocks, sizeof(LWLockPadded)); + size = add_size(size, LWLOCK_PADDED_SIZE); + + return size; +} + +static void +InitializeSecondBufferLWLocks(void) +{ + int id; + int i; + LWLockPadded *lock; + + for (id = 0, lock = SecondBufferMainLWLockArray; id < NUM_LOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, id); +} + +void CreateSecondBufferLWLocks(void) +{ + if (!IsUnderPostmaster) + { + Size spaceLocks = SecondBufferLWLockShmemSize(); + char *ptr; + + /* Allocate space */ + ptr = (char *)ShmemAlloc(spaceLocks); + + /* Ensure desired alignment of LWLock array */ + ptr += LWLOCK_PADDED_SIZE - ((uintptr_t)ptr) % LWLOCK_PADDED_SIZE; + + SecondBufferMainLWLockArray = (LWLockPadded *)ptr; + + /* Initialize all LWLocks */ + InitializeSecondBufferLWLocks(); + } +} + +void InitSecondBufferMeta(void) +{ + bool found, found1; + int i; + MultiKeyArrays = (SingleKeyArray *) + ShmemInitStruct("multi page keys arrays", + sizeof(SingleKeyArray) * SDNUM, + &found); + + statisticnum = (Statisticnum *) + ShmemInitStruct("statistic num", + sizeof(Statisticnum), + &found1); + + if (MultiKeyArrays == 
NULL) + { + ereport(PANIC, (errmsg("init secondbuffer meta fail"))); + } + statisticnum->totalunused = SDLEN * SDNUM; + SpinLockInit(&statisticnum->change); + + for (i = 0; i < SDNUM; i++) + { + SpinLockInit(&MultiKeyArrays[i].oplock); + MultiKeyArrays[i].head = MultiKeyArrays[i].tail = 0; + MultiKeyArrays[i].unused = SDLEN; + } +} + +void InitDPageKeyArray(void) +{ + // ereport(LOG, (errmsg("initdp"))); + bool found; + int i; + DPArray = (DPageKeyArray *) + ShmemInitStruct("deleted page keys arrays", + sizeof(DPageKeyArray), + &found); + // ereport(LOG, (errmsg("initdp doing"))); + if (DPArray == NULL) + { + ereport(PANIC, (errmsg("init DPArray fail"))); + } + + SpinLockInit(&DPArray->append); + DPArray->head = DPArray->tail = DPArray->pageIndex = DPArray->walIndex = 0; + DPArray->unused = 1024; + // ereport(LOG, (errmsg("initdp done"))); +} +/* + init SecondBufferHash + */ +void InitSecondBufferHash(void) +{ + HASHCTL info; + long init_table_size, + max_table_size; + bool found; + + /* + * Compute init/max size to request for lock hashtables. Note these + * calculations must agree with SecondBufferhashShmemSize! 
+ */ + max_table_size = 200; + init_table_size = max_table_size / 2; + + info.keysize = sizeof(SdPageKey); + info.entrysize = sizeof(SdPageValue); + + info.num_partitions = NUM_LOCK_PARTITIONS; + + SecondBufferHash = ShmemInitHash("SecondBuffer hash", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | HASH_PARTITION); +} + +void InitPageDBEnv() +{ + if (!IsDirExist(lmdb_page_directory)) + { + pg_mkdir_p(lmdb_page_directory, 0777); + } + mdb_env_create(&pageEnv); + mdb_env_set_maxreaders(pageEnv, MAXREADERS); + mdb_env_set_mapsize(pageEnv, MAPSIE); + mdb_env_open(pageEnv, lmdb_page_directory, MDB_FIXEDMAP | MDB_NOSYNC, 0664); + mdb_txn_begin(pageEnv, NULL, 0, &pageTxn); + mdb_dbi_open(pageTxn, NULL, MDB_CREATE, &pageDbi); + mdb_txn_commit(pageTxn); +} + +void InitWalDBEnv() +{ + if (!IsDirExist(lmdb_wal_directory)) + { + pg_mkdir_p(lmdb_wal_directory, 0777); + } + mdb_env_create(&walEnv); + mdb_env_set_maxreaders(walEnv, MAXREADERS); + mdb_env_set_mapsize(walEnv, MAPSIE); + mdb_env_open(walEnv, lmdb_wal_directory, MDB_FIXEDMAP | MDB_NOSYNC, 0664); + mdb_txn_begin(walEnv, NULL, 0, &walTxn); + mdb_dbi_open(walTxn, NULL, MDB_CREATE | MDB_DUPSORT, &walDbi); + mdb_txn_commit(walTxn); +} +void ClosePageDBEnv() +{ + mdb_dbi_close(pageEnv, pageDbi); + mdb_env_close(pageEnv); + ereport(LOG, errmsg("close page success")); +} + +void CloseWalDBEnv() +{ + mdb_dbi_close(walEnv, walDbi); + mdb_env_close(walEnv); + ereport(LOG, errmsg("close wal success")); +} + +static void +convertKey(SdPageKey *sdkey, PageKey *pk) +{ + sdkey->dbid = pk->relfileNode.dbNode; + sdkey->relid = pk->relfileNode.relNode; + sdkey->forkno = pk->forkNo; + sdkey->blkno = pk->blkNo; +} + +static void +convertKeyLd(LdPageKey *ldkey, PageKey *pk) +{ + SdPageKey sdkey; + sdkey.dbid = pk->relfileNode.dbNode; + sdkey.relid = pk->relfileNode.relNode; + sdkey.forkno = pk->forkNo; + sdkey.blkno = pk->blkNo; + + ldkey->sk = sdkey; +} + +/* + * notification_match: match function to use with 
notification_hash + */ +static int +secondbuffer_match(const void *key1, const void *key2, Size keysize) +{ + const SdPageKey *k1 = (const SdPageKey *)key1; + const SdPageKey *k2 = (const SdPageKey *)key2; + + Assert(keysize == sizeof(SdPageKey)); + if (k1->dbid == k2->dbid && + k1->blkno == k2->blkno && k1->forkno == k2->forkno && k1->relid == k2->relid) + return 0; /* equal */ + return 1; /* not equal */ +} + +static uint32 +SecondBufferHashCode(const SdPageKey *pk) +{ + return get_hash_value(SecondBufferHash, (const void *)pk); +} + +static SdPageValue * +SetupSecondBufferInTable(const SdPageKey *pk) +{ + + SdPageValue *pv; + bool found; + + pv = (SdPageValue *) + hash_search(SecondBufferHash, pk, HASH_ENTER_NULL, &found); + + return pv; +} + +static bool +CleanUpSecondBuffer(const SdPageKey *pk) +{ + + // LWLock *partitionLock; + // uint32 newHash; + // newHash = SecondBufferHashCode(pk); + // partitionLock = SecondBufferMappingPartitionLock(newHash); + // LWLockAcquire(partitionLock, LW_EXCLUSIVE); + bool found; + hash_search(SecondBufferHash, + (void *)pk, + HASH_REMOVE, + &found); + // LWLockRelease(partitionLock); + return found; +} + +static SdPageValue * +FindSecondBufferInTable(const SdPageKey *pk) +{ + SdPageValue *pv; + bool found; + if (SecondBufferHash == NULL) + { + return NULL; + } + pv = (SdPageValue *) + hash_search(SecondBufferHash, + pk, + HASH_FIND, + &found); + if (!found) + { + return NULL; + } + return pv; +} + +// sb -> ssb +void ReceivePageFromDataBuffer(PageKey *pk, uint8_t *buffer) +{ + SdPageKey *sk; + sk = (SdPageKey *)malloc(sizeof(SdPageKey)); + convertKey(sk, pk); + + LWLock *partitionLock; + uint32 newHash; + newHash = SecondBufferHashCode(sk); + partitionLock = SecondBufferMappingPartitionLock(newHash); + SdPageValue *sdPageValue = NULL; + while (sdPageValue == NULL) + { + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + sdPageValue = SetupSecondBufferInTable(sk); + if (sdPageValue == NULL) + { + LWLockRelease(partitionLock); + 
continue; + } + sdPageValue->canDelete = false; + memcpy(sdPageValue->pagecontent, buffer, 8192); + LWLockRelease(partitionLock); + } + + srand((int)clock()); + int index = rand() % SDNUM; + SingleKeyArray *sa; + + for (;;) + { + sa = &MultiKeyArrays[index]; + SpinLockAcquire(&sa->oplock); + if (sa->unused > 0) + { + sa->SdPageKeyList[sa->tail] = *sk; + sa->tail = (sa->tail + 1) % SDLEN; + sa->unused--; + SpinLockAcquire(&statisticnum->change); + statisticnum->totalunused--; + SpinLockRelease(&statisticnum->change); + SpinLockRelease(&sa->oplock); + break; + } + else + { + SpinLockRelease(&sa->oplock); + index = (index + 1) % SDNUM; + } + } + free(sk); +} + +// ssb -> sb +static uint8_t * +GetPageFromSecondBuffer(PageKey *pk, uint8_t *buffer) +{ + SdPageKey *sk = NULL; + sk = (SdPageKey *)malloc(sizeof(SdPageKey)); + convertKey(sk, pk); + + LWLock *partitionLock; + uint32 newHash; + newHash = SecondBufferHashCode(sk); + partitionLock = SecondBufferMappingPartitionLock(newHash); + LWLockAcquire(partitionLock, LW_SHARED); + + SdPageValue *sv = FindSecondBufferInTable(sk); + if (sv == NULL) + { + if (sk != NULL) + { + free(sk); + } + LWLockRelease(partitionLock); + return NULL; + } + else + { + free(sk); + memcpy(buffer, sv->pagecontent, BLCKSZ); + LWLockRelease(partitionLock); + return buffer; + } +} + +// lc -> sb +static uint8_t * +GetPageFromLocalBuffer(PageKey *pk, uint8_t *buffer) +{ + int i = 0; + if (pageEnv == NULL) + { + return NULL; + } + + MDB_txn *tmptxn; + LdPageKey *lk; + lk = (LdPageKey *)malloc(sizeof(LdPageKey)); + convertKeyLd(lk, pk); + MDB_val key, data; + data.mv_data = NULL; + data.mv_size = 0; + key.mv_size = sizeof(LdPageKey); + key.mv_data = lk; + mdb_txn_begin(pageEnv, NULL, 0, &tmptxn); + mdb_get(tmptxn, pageDbi, &key, &data); + if (data.mv_data != NULL) + { + mdb_txn_abort(tmptxn); + free(lk); + memcpy(buffer, data.mv_data, data.mv_size); + return data.mv_data; + } + else + { + free(lk); + mdb_txn_abort(tmptxn); + return NULL; + } +} + 
+uint64_t +SwapLsnFromLittleToBig(uint64_t lsn) +{ +#ifndef WORDS_BIGENDIAN + /* trans lsn from little endian to big endian in memory + * eg: 0x12345678 ===> 0x78563412 + */ + + uint32 low, high; + low = (uint32)(lsn); + high = (uint32)((lsn) >> 32); + + low = (low << 16) | (low >> 16); + low = ((low & 0x00FF00FF) << 8) | ((low >> 8) & 0x00FF00FF); + + high = (high << 16) | (high >> 16); + high = ((high & 0x00FF00FF) << 8) | ((high >> 8) & 0x00FF00FF); + return ((uint64)(low)) << 32 | (uint64)(high); +#endif + return lsn; +} + +uint64_t +SwapLsnFromBigToLittle(uint64_t lsn) +{ +#ifndef WORDS_BIGENDIAN + /* trans lsn from big endian to little endian in memory + * eg: 0x78563412 ===> 0x12345678 + */ + + uint32 low, high; + low = (uint32)(lsn); + high = (uint32)((lsn) >> 32); + + low = (low << 16) | (low >> 16); + low = ((low & 0x00FF00FF) << 8) | ((low >> 8) & 0x00FF00FF); + + high = (high << 16) | (high >> 16); + high = ((high & 0x00FF00FF) << 8) | ((high >> 8) & 0x00FF00FF); + return ((uint64)(low)) << 32 | (uint64)(high); +#endif + return lsn; +} + +Bufrd GetWalFromLocalBuffer(WalLdPageKey *wpk, uint64_t replyLsn) +{ + MDB_txn *tmptxn; + MDB_cursor *tmpcursor; + + Bufrd bufrd; + MDB_val key, data; + int tb = -1, co = -1, cg = -1; + + data.mv_size = 0; + data.mv_data = NULL; + + key.mv_size = SizeOfCleanWal; + key.mv_data = wpk; + int waldatalen = 0, roomlen = 2048; + uint32 dbid, relid, forkno, blkno; + dbid = wpk->sk.dbid; + relid = wpk->sk.relid; + forkno = wpk->sk.forkno; + blkno = wpk->sk.blkno; + + bufrd.buf = NULL; + bufrd.cap = 0; + bufrd.count = 0; + + uint8_t *waldata = (uint8_t *)malloc(roomlen); + tb = mdb_txn_begin(walEnv, NULL, 0, &tmptxn); + if (tb != 0) + { + // TODO + ereport(PANIC, errmsg("mdb_txn_begin failed,error is:%d", tb)); + } + co = mdb_cursor_open(tmptxn, walDbi, &tmpcursor); + if (co != 0) + { + // TODO + ereport(PANIC, errmsg("mdb_txn_open failed,error is:%d", co)); + } + // ereport(LOG,errmsg("535 get key dbid %d, relid %d, fork %d, 
blk %d, pagelsn %ld, part %d", + // wpk->sk.dbid, wpk->sk.relid, wpk->sk.forkno, wpk->sk.blkno, SwapLsnFromBigToLittle(wpk->pageLsn), wpk->partition)); + if ((cg = mdb_cursor_get(tmpcursor, &key, &data, MDB_SET_RANGE)) != 0) + { + ereport(LOG, errmsg("mdb_txn_get failed,error is:%d, dbid %d, relid %d, fork %d, blk %d, pagelsn %ld, part %d", + cg, dbid, relid, forkno, blkno, SwapLsnFromBigToLittle(wpk->pageLsn), wpk->partition)); + bufrd.buf = waldata; + mdb_cursor_close(tmpcursor); + mdb_txn_abort(tmptxn); + return bufrd; + } + wpk = (WalLdPageKey *)key.mv_data; + + // ereport(LOG,errmsg("549 get key dbid %d, relid %d, fork %d, blk %d, pagelsn %ld, part %d", + // wpk->sk.dbid, wpk->sk.relid, wpk->sk.forkno, wpk->sk.blkno, SwapLsnFromBigToLittle(wpk->pageLsn), wpk->partition)); + while (wpk->sk.dbid == dbid && wpk->sk.relid == relid && wpk->sk.forkno == forkno && wpk->sk.blkno == blkno && SwapLsnFromBigToLittle(wpk->pageLsn) < replyLsn) + { + memcpy(waldata + waldatalen, data.mv_data, data.mv_size); + waldatalen += data.mv_size; + if (0 != mdb_cursor_get(tmpcursor, &key, &data, MDB_NEXT)) + { + bufrd.buf = waldata; + bufrd.cap = roomlen; + bufrd.count = waldatalen; + mdb_cursor_close(tmpcursor); + mdb_txn_abort(tmptxn); + return bufrd; + } + else + { + if (waldatalen + data.mv_size > roomlen) + { + roomlen += 1024; + waldata = (uint8_t *)realloc(waldata, roomlen); + } + + wpk = (WalLdPageKey *)key.mv_data; + // ereport(LOG,errmsg("get key dbid %d, relid %d, fork %d, blk %d, pagelsn %ld, part %d", + // wpk->sk.dbid, wpk->sk.relid, wpk->sk.forkno, wpk->sk.blkno, SwapLsnFromBigToLittle(wpk->pageLsn), wpk->partition)); + } + } + bufrd.buf = waldata; + bufrd.count = waldatalen; + bufrd.cap = roomlen; + + mdb_cursor_close(tmpcursor); + mdb_txn_abort(tmptxn); + tmpcursor = NULL; + tmptxn = NULL; + return bufrd; +} + +void AddOneItemToDPArray(OriginDPageKey odpk) +{ + // ereport(LOG, (errmsg("AddOneItemToDPArray in processing"))); + // DPageKey dpk; + // dpk.pk = odpk.pk; + 
// dpk.operation = odpk.opration; + // dpk.pagedeleted = false; + + // while(1) + // { + // SpinLockAcquire(&DPArray->append); + // if (DPArray->unused > 0) + // { + // DPArray->dpk[DPArray->tail] = dpk; + // DPArray->tail ++; + // DPArray->unused --; + // SpinLockRelease(&DPArray->append); + // break; + // } + // SpinLockRelease(&DPArray->append); + // pg_usleep(1); + + // } + // ereport(LOG, (errmsg("AddOneItemToDPArray done"))); +} + +void storeWalInLocalBuffer(kvStruct *ks, int32 length) +{ + // pthread_mutex_lock(&q_lock); + int tb = -1, co = -1, cp = -1, cc = -1, tc = -1; + MDB_txn *tmptxn = NULL; + MDB_val key, data; + MDB_cursor *tmpcursor = NULL; + + uint8_t *xlogContent = NULL; + uint8_t part = 0; + uint32_t totallen = 0; + WalLdPageKey wlpk; + + tb = mdb_txn_begin(walEnv, NULL, 0, &tmptxn); + if (tb != 0) + { + // TODO + ereport(LOG, errmsg("put mdb_txn_begin failed,error is:%d", tb)); + } + + co = mdb_cursor_open(tmptxn, walDbi, &tmpcursor); + if (co != 0) + { + // TODO + ereport(LOG, errmsg("put mdb_txn_open failed,error is:%d", co)); + } + + for (int i = 0; i < length; i++) + { + part = 0; + uint8_t *buf = ks[i].buf; + totallen = (uint32_t)buf[0] | (uint32_t)(buf[1] << 8) | (uint32_t)(buf[2] << 16) | (uint32_t)(buf[3] << 24); + + key.mv_size = SizeOfCleanWal; + wlpk.sk = ks[i].lpk.sk; + + if (totallen > 511) + { + while (totallen > 0) + { + if (totallen > 511) + { + data.mv_size = 511; + data.mv_data = NULL; + xlogContent = (uint8_t *)malloc(511); + wlpk.pageLsn = SwapLsnFromLittleToBig(ks[i].lsn); + wlpk.partition = part; + + memcpy(xlogContent, buf + (part * 511), 511); // 502 = 511 - 9 + part++; + totallen -= 511; + key.mv_data = &wlpk; + data.mv_data = xlogContent; + cp = mdb_cursor_put(tmpcursor, &key, &data, MDB_NODUPDATA); + if (cp != 0) + { + ereport(LOG, errmsg("mdb_txn_put big wal failed,error is:%d, rel %d, forkno %d, blk %d, pagelsn %ld, part %d", + cp, wlpk.sk.relid, wlpk.sk.forkno, wlpk.sk.blkno, SwapLsnFromBigToLittle(wlpk.pageLsn), 
wlpk.partition)); + if (cp == MDB_KEYEXIST) + { + break; + } + } + free(xlogContent); + xlogContent = NULL; + } + else + { + data.mv_size = totallen; + data.mv_data = NULL; + xlogContent = (uint8_t *)malloc(totallen); + wlpk.pageLsn = SwapLsnFromLittleToBig(ks[i].lsn); + wlpk.partition = part; + key.mv_data = &wlpk; + memcpy(xlogContent, buf + (part * 511), totallen); + data.mv_data = xlogContent; + cp = mdb_cursor_put(tmpcursor, &key, &data, MDB_NODUPDATA); + if (cp != 0) + { + ereport(LOG, errmsg("mdb_txn_put last of big wal failed,error is:%d, rel %d, forkno %d, blk %d", + cp, wlpk.sk.relid, wlpk.sk.forkno, wlpk.sk.blkno)); + if (cp == MDB_KEYEXIST) + { + break; + } + } + free(xlogContent); + xlogContent = NULL; + break; + } + } + } + else + { + data.mv_size = totallen; + data.mv_data = NULL; + xlogContent = (uint8_t *)malloc(totallen); + wlpk.pageLsn = SwapLsnFromLittleToBig(ks[i].lsn); + wlpk.partition = 0; + key.mv_data = &wlpk; + memcpy(xlogContent, buf, totallen); + data.mv_data = xlogContent; + cp = mdb_cursor_put(tmpcursor, &key, &data, MDB_NODUPDATA); + if (cp != 0) + { + ereport(LOG, errmsg("mdb_txn_put failed,error is:%d, dbid %d, rel %d, forkno %d, blk %d, pagelsn %ld, part %d", + cp, wlpk.sk.dbid, wlpk.sk.relid, wlpk.sk.forkno, wlpk.sk.blkno, + SwapLsnFromBigToLittle(wlpk.pageLsn), wlpk.partition)); + if (cp == MDB_KEYEXIST) + { + cp = mdb_cursor_get(tmpcursor, &key, &data, MDB_SET); + if (cp != 0) + ereport(LOG, errmsg(" mdb_txn_get failed when put exist,error is:%d, rel %d, forkno %d, blk %d", + cp, wlpk.sk.relid, wlpk.sk.forkno, wlpk.sk.blkno)); + continue; + } + } + free(xlogContent); + xlogContent = NULL; + } + } + mdb_cursor_close(tmpcursor); + + co = mdb_txn_commit(tmptxn); + if (co != 0) + ereport(LOG, errmsg("put mdb_txn_commit failed,error is:%d", co)); + tmpcursor = NULL; + tmptxn = NULL; + // pthread_mutex_unlock(&q_lock); +} + +void GetPageFromCurrentNode(PageKey pk, Bufrd *bufrd) +{ + uint8_t *page; + page = NULL; + + if (bufrd->buf == 
NULL) + { + bufrd->buf = (uint8_t *)malloc(BLKSZ); + } + page = GetPageFromSecondBuffer(&pk, bufrd->buf); + if (page == NULL) + { + page = GetPageFromLocalBuffer(&pk, bufrd->buf); + } + + if (page == NULL) + { + bufrd->buf = NULL; + bufrd->cap = 0; + bufrd->count = 0; + } + else + { + if (*isPromoteIsTriggered || EnableHotStandby == false || push_standby) + { + bufrd->count = 8192; + bufrd->cap = 8192; + } + else + { + WalLdPageKey wlpk; + wlpk.sk.dbid = pk.relfileNode.dbNode; + wlpk.sk.relid = pk.relfileNode.relNode; + wlpk.sk.forkno = pk.forkNo; + wlpk.sk.blkno = pk.blkNo; + wlpk.pageLsn = SwapLsnFromLittleToBig(pk.pageLsn); + wlpk.partition = 0; + Bufrd waldata = GetWalFromLocalBuffer(&wlpk, pk.replyLsn); + if (waldata.count > 0) + { + bufrd->buf = (uint8_t *)realloc(bufrd->buf, 8192 + waldata.count); + memcpy(bufrd->buf + 8192, waldata.buf, waldata.count); + wlpk.pageLsn = SwapLsnFromLittleToBig(pk.replyLsn); + SendInvalWal(&wlpk); + + wlpk.sk.dbid = 0; + wlpk.sk.relid = 0; + wlpk.sk.forkno = 32; + wlpk.sk.blkno = 0; + wlpk.pageLsn = 0; + wlpk.partition = 0; + SendInvalWal(&wlpk); + } + bufrd->cap = bufrd->count = 8192 + waldata.count; + free(waldata.buf); + } + } +} + +// void SendInvalWal(WalLdPageKey *walkey) { +// int sock_fd; +// struct sockaddr_un un; +// un.sun_family = AF_UNIX; +// strcpy(un.sun_path, socketfile); +// sock_fd = socket(AF_UNIX, SOCK_STREAM, 0); +// if (sock_fd < 0) +// { +// elog(WARNING, "request socket failed"); +// return; +// } + +// if (connect(sock_fd, (struct sockaddr *)&un, sizeof(un)) < 0) +// { +// elog(WARNING, "connect socket failed"); +// return; +// } +// send(sock_fd, walkey, SizeOfCleanWal, 0); +// close(sock_fd); +// return; +// } + +void SendInvalWal(WalLdPageKey *walkey) +{ + struct sockaddr_un un; + if (SocFd.walSocketFd < 0) + { + un.sun_family = AF_UNIX; + strcpy(un.sun_path, socketfile); + SocFd.walSocketFd = socket(AF_UNIX, SOCK_STREAM, 0); + if (SocFd.walSocketFd < 0) + { + elog(WARNING, "request socket failed"); 
+ return; + } + + if (connect(SocFd.walSocketFd, (struct sockaddr *)&un, sizeof(un)) < 0) + { + elog(WARNING, "connect socket failed"); + return; + } + } + + send(SocFd.walSocketFd, walkey, SizeOfCleanWal, 0); + // close(sock_fd); + return; +} + +void SendInvalPage(LdPageKey *ldKey) +{ + struct sockaddr_un un; + if (SocFd.pageSocketFd < 0) + { + un.sun_family = AF_UNIX; + strcpy(un.sun_path, p_socketfile); + SocFd.pageSocketFd = socket(AF_UNIX, SOCK_STREAM, 0); + if (SocFd.pageSocketFd < 0) + { + elog(WARNING, "request socket failed"); + return; + } + + if (connect(SocFd.pageSocketFd, (struct sockaddr *)&un, sizeof(un)) < 0) + { + elog(WARNING, "connect socket failed"); + return; + } + } + + send(SocFd.pageSocketFd, ldKey, SizeOfCleanPage, 0); + return; +} +void *doCleanWalInLmdb(void *fd) +{ + int syncFlag; + int new_fd = *(int *)fd; + static char data_buf[SizeOfCleanWal]; + while (1) + { + + memset(data_buf, 0, SizeOfCleanWal); + recv(new_fd, data_buf, SizeOfCleanWal, 0); + WalLdPageKey *wpk = (WalLdPageKey *)data_buf; + + if (0 == wpk->partition && 0 == wpk->pageLsn&& + 0 == wpk->sk.blkno && 0 == wpk->sk.dbid && + 32 == wpk->sk.forkno && 0 == wpk->sk.relid) + { + syncFlag = mdb_env_sync(walEnv, 1); + if (syncFlag != 0) + { + printf("wal mdb_env_sync is failed, errcode is :%d\n",syncFlag); + } + } + else if (0 == wpk->partition && 0 == wpk->pageLsn && + 0 == wpk->sk.blkno && 0 == wpk->sk.dbid && + 0 == wpk->sk.forkno && 0 == wpk->sk.relid) + { + close(new_fd); + break; + } + else + { + if (wpk->partition == 0) + { + CleanWalsByPage(wpk); + } + else + { + CleanWalsByTable(wpk); + } + } + } + mdb_env_sync(walEnv, 1); +} + +void *CleanWalsInLmdb(void *arg) +{ + int fd = -1, new_fd; + struct sockaddr_un un; + static char data_buf[SizeOfCleanWal]; + if (fd < 0) + { + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + { + elog(PANIC, "request cleanwal socket failed"); + } + } + un.sun_family = AF_UNIX; + unlink(socketfile); + strcpy(un.sun_path, socketfile); + + if 
(bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) + { + elog(PANIC, "bind cleanwal socket failed"); + } + if (listen(fd, MaxBackends + 8) < 0) + { + elog(PANIC, "listen cleanwal socket failed"); + } + + while (1) + { + pthread_t p; + new_fd = accept(fd, NULL, NULL); + if (new_fd < 0) + { + close(fd); + unlink(socketfile); + elog(PANIC, "cannot accept client connect request"); + } + + pthread_create(&p, NULL, doCleanWalInLmdb, &new_fd); + } +} + +void *doCleanPageInLmdb(void *fd) +{ + int new_fd = *(int *)fd; + int syncFlag; + static char data_buf[SizeOfCleanPage]; + while (1) + { + memset(data_buf, 0, SizeOfCleanPage); + recv(new_fd, data_buf, SizeOfCleanPage, 0); + LdPageKey *lpk = (LdPageKey *)data_buf; + if (0 == lpk->sk.blkno && 0 == lpk->sk.dbid && 32 == lpk->sk.forkno && 0 == lpk->sk.relid) + { + // close(new_fd); + // break; + syncFlag = mdb_env_sync(pageEnv, 1); + if (syncFlag != 0) + { + printf("page mdb_env_sync is failed,errcode is: %d\n",syncFlag); + } + + } + else if (0 == lpk->sk.blkno && 0 == lpk->sk.dbid && 0 == lpk->sk.forkno && 0 == lpk->sk.relid) + { + close(new_fd); + break; + } + else + { + CleanPagesByTable(lpk); + } + } +} +void *CleanPagesInLmdb(void *arg) +{ + int fd = -1, new_fd; + struct sockaddr_un un; + static char data_buf[SizeOfCleanPage]; + if (fd < 0) + { + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + { + elog(PANIC, "request cleanwal socket failed"); + } + } + un.sun_family = AF_UNIX; + unlink(p_socketfile); + strcpy(un.sun_path, p_socketfile); + + if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0) + { + elog(PANIC, "bind cleanwal socket failed"); + } + if (listen(fd, MaxBackends + 8) < 0) + { + elog(PANIC, "listen cleanwal socket failed"); + } + + while (1) + { + pthread_t p; + new_fd = accept(fd, NULL, NULL); + if (new_fd < 0) + { + close(fd); + unlink(p_socketfile); + elog(PANIC, "cannot accept client connect request"); + } + + pthread_create(&p, NULL, doCleanPageInLmdb, &new_fd); + } +} + +static int 
+IsDirExist(const char *path) +{ + return !access(path, F_OK); +} + +static void +CleanWalsByPage(WalLdPageKey *walkey) +{ + MDB_txn *tmptxn; + MDB_cursor *tmpcursor; + MDB_val key, data; + int success = -1; + uint64 replayLsn = SwapLsnFromBigToLittle(walkey->pageLsn); + + walkey->pageLsn = 0; + key.mv_size = SizeOfCleanWal; + key.mv_data = walkey; + + data.mv_size = 0; + data.mv_data = NULL; + + uint32 dbid, relid, forkno, blkno; + dbid = walkey->sk.dbid; + relid = walkey->sk.relid; + forkno = walkey->sk.forkno; + blkno = walkey->sk.blkno; + + success = mdb_txn_begin(walEnv, NULL, 0, &tmptxn); + if (success != 0) + { + elog(LOG, "mdb_txn_begin failed when clean wals, err %d", success); + return; + } + + success = mdb_cursor_open(tmptxn, walDbi, &tmpcursor); + if (success != 0) + { + elog(LOG, "mdb_cursor_open failed when clean wals, err %d", success); + return; + } + + success = mdb_cursor_get(tmpcursor, &key, &data, MDB_SET_RANGE); + + if (success != 0) + { + ereport(LOG, errmsg("mdb_cursor_get failed when clean wals, err %d, rel %d, fork %d, blk %d, lsn %ld", + success, relid, forkno, blkno, SwapLsnFromBigToLittle(walkey->pageLsn))); + mdb_cursor_close(tmpcursor); + mdb_txn_abort(tmptxn); + return; + } + + walkey = (WalLdPageKey *)key.mv_data; + // elog(LOG, "get wal rel %d, fork %d, blk %d, lsn %ld, part %d", + // walkey->sk.relid, walkey->sk.forkno, walkey->sk.blkno, SwapLsnFromBigToLittle(walkey->pageLsn), walkey->partition); + + while (walkey->sk.dbid == dbid && walkey->sk.relid == relid && walkey->sk.forkno == forkno && walkey->sk.blkno == blkno && SwapLsnFromBigToLittle(walkey->pageLsn) < replayLsn) + { + // elog(LOG, "del wal rel %d, fork %d, blk %d, lsn %ld, part %d", + // walkey->sk.relid, walkey->sk.forkno, walkey->sk.blkno, SwapLsnFromBigToLittle(walkey->pageLsn), walkey->partition); + success = mdb_cursor_del(tmpcursor, 0); + if (success != 0) + elog(WARNING, "del wal failed: err %d, rel %d, fork %d, blk %d, lsn %ld", + success, walkey->sk.relid, 
walkey->sk.forkno, walkey->sk.blkno, SwapLsnFromBigToLittle(walkey->pageLsn)); + if (0 != mdb_cursor_get(tmpcursor, &key, &data, MDB_NEXT)) + { + break; + } + + walkey = (WalLdPageKey *)key.mv_data; + // ereport(LOG,errmsg("get key dbid %d, relid %d, fork %d, blk %d, pagelsn %ld, part %d", + // wpk->sk.dbid, wpk->sk.relid, wpk->sk.forkno, wpk->sk.blkno, SwapLsnFromBigToLittle(wpk->pageLsn), wpk->partition)); + } + + mdb_cursor_close(tmpcursor); + mdb_txn_commit(tmptxn); + tmpcursor = NULL; + tmptxn = NULL; + return; +} + +static void +CleanWalsByTable(WalLdPageKey *walkey) +{ + MDB_txn *tmptxn; + MDB_cursor *tmpcursor; + MDB_val key, data; + int success = -1; + uint64 replayLsn = SwapLsnFromBigToLittle(walkey->pageLsn); + + walkey->pageLsn = 0; + walkey->partition = 0; + key.mv_size = SizeOfCleanWal; + key.mv_data = walkey; + + data.mv_size = 0; + data.mv_data = NULL; + + uint32 dbid, relid, forkno, blkno; + dbid = walkey->sk.dbid; + relid = walkey->sk.relid; + forkno = walkey->sk.forkno; + blkno = walkey->sk.blkno; + + success = mdb_txn_begin(walEnv, NULL, 0, &tmptxn); + if (success != 0) + { + elog(LOG, "mdb_txn_begin failed when clean wals, err %d", success); + return; + } + + success = mdb_cursor_open(tmptxn, walDbi, &tmpcursor); + if (success != 0) + { + elog(LOG, "mdb_cursor_open failed when clean wals, err %d", success); + return; + } + + success = mdb_cursor_get(tmpcursor, &key, &data, MDB_SET_RANGE); + + if (success != 0) + { + ereport(LOG, errmsg("mdb_cursor_get failed when clean wals, err %d, rel %d, fork %d, blk %d, lsn %ld", + success, relid, forkno, blkno, SwapLsnFromBigToLittle(walkey->pageLsn))); + mdb_cursor_close(tmpcursor); + mdb_txn_abort(tmptxn); + return; + } + + walkey = (WalLdPageKey *)key.mv_data; + // elog(LOG, "get wal rel %d, fork %d, blk %d, lsn %ld, part %d", + // walkey->sk.relid, walkey->sk.forkno, walkey->sk.blkno, SwapLsnFromBigToLittle(walkey->pageLsn), walkey->partition); + + while (walkey->sk.dbid == dbid && walkey->sk.relid == 
relid && walkey->sk.forkno == forkno + // && walkey->sk.blkno == blkno + && SwapLsnFromBigToLittle(walkey->pageLsn) < replayLsn) + { + // elog(LOG, "del wal rel %d, fork %d, blk %d, lsn %ld, part %d", + // walkey->sk.relid, walkey->sk.forkno, walkey->sk.blkno, SwapLsnFromBigToLittle(walkey->pageLsn), walkey->partition); + success = mdb_cursor_del(tmpcursor, 0); + if (success != 0) + elog(WARNING, "del wal failed: err %d, rel %d, fork %d, blk %d, lsn %ld", + success, walkey->sk.relid, walkey->sk.forkno, walkey->sk.blkno, SwapLsnFromBigToLittle(walkey->pageLsn)); + if (0 != mdb_cursor_get(tmpcursor, &key, &data, MDB_NEXT)) + { + break; + } + + walkey = (WalLdPageKey *)key.mv_data; + // ereport(LOG,errmsg("get key dbid %d, relid %d, fork %d, blk %d, pagelsn %ld, part %d", + // wpk->sk.dbid, wpk->sk.relid, wpk->sk.forkno, wpk->sk.blkno, SwapLsnFromBigToLittle(wpk->pageLsn), wpk->partition)); + } + + mdb_cursor_close(tmpcursor); + mdb_txn_commit(tmptxn); + tmpcursor = NULL; + tmptxn = NULL; + return; +} + +static void +CleanPagesByTable(LdPageKey *ldKey) +{ + MDB_txn *tmptxn; + MDB_cursor *tmpcursor; + MDB_val key, data; + int success = -1; + + key.mv_size = SizeOfCleanPage; + key.mv_data = ldKey; + + data.mv_size = 0; + data.mv_data = NULL; + + uint32 dbid, relid, forkno, blkno; + dbid = ldKey->sk.dbid; + relid = ldKey->sk.relid; + forkno = ldKey->sk.forkno; + blkno = ldKey->sk.blkno; + success = mdb_txn_begin(pageEnv, NULL, 0, &tmptxn); + // mdb_txn_begin(pageEnv, NULL, 0, &tmptxn); + if (success != 0) + { + elog(LOG, "mdb_txn_begin failed when clean pages, err %d", success); + return; + } + success = mdb_cursor_open(tmptxn, pageDbi, &tmpcursor); + if (success != 0) + { + elog(LOG, "mdb_cursor_open failed when clean pages, err %d", success); + return; + } + + success = mdb_cursor_get(tmpcursor, &key, &data, MDB_SET_RANGE); + if (success != 0) + { + elog(LOG, "mdb_cursor_get failed when clean pages, err %d, rel %d, fork %d, blk %d", + success, relid, forkno, blkno); + 
mdb_cursor_close(tmpcursor); + mdb_txn_abort(tmptxn); + return; + } + + ldKey = (LdPageKey *)key.mv_data; + // elog(LOG, "get page db %d, rel %d, fork %d, blk %d", + // ldKey->sk.dbid, ldKey->sk.relid, ldKey->sk.forkno, ldKey->sk.blkno); + + while (ldKey->sk.dbid == dbid && ldKey->sk.relid == relid && ldKey->sk.forkno == forkno) + { + elog(LOG, "del page dbid %d, rel %d, fork %d, blk %d", + ldKey->sk.dbid, ldKey->sk.relid, ldKey->sk.forkno, ldKey->sk.blkno); + success = mdb_cursor_del(tmpcursor, 0); + if (success != 0) + elog(WARNING, "del wal failed: err %d, rel %d, fork %d, blk %d", + success, ldKey->sk.relid, ldKey->sk.forkno, ldKey->sk.blkno); + if (0 != mdb_cursor_get(tmpcursor, &key, &data, MDB_NEXT)) + { + break; + } + + ldKey = (LdPageKey *)key.mv_data; + // ereport(LOG,errmsg("get key dbid %d, relid %d, fork %d, blk %d, pagelsn %ld, part %d", + // wpk->sk.dbid, wpk->sk.relid, wpk->sk.forkno, wpk->sk.blkno, SwapLsnFromBigToLittle(wpk->pageLsn), wpk->partition)); + } + + mdb_cursor_close(tmpcursor); + mdb_txn_commit(tmptxn); + tmpcursor = NULL; + tmptxn = NULL; + return; +} + +static void * +RemovePageOrWalFromCurrentNode() +{ + // ereport(INFO, (errmsg("RemovePageOrWalFromCurrentNode in processing"))); + MDB_txn *tmptxn; + MDB_cursor *tmpcursor; + MDB_val key, data; + LdPageKey *lpk = NULL; + PageKey *pk = NULL; + SdPageValue *spv = NULL; + int success = 1; + int PageOrWal = 1; + + lpk = (LdPageKey *)malloc(sizeof(LdPageKey)); + + LWLock *partitionLock = NULL; + uint32 newHash; + + for (;;) + { + if (PageOrWal == (int)PAGE) + { + if (DPArray->pageIndex >= DPArray->tail) + { + continue; + } + pk = &DPArray->dpk[DPArray->pageIndex].pk; + } + else + { + if (DPArray->walIndex >= DPArray->pageIndex) + { + continue; + } + if (DPArray->dpk[DPArray->walIndex].pagedeleted = false) + { + continue; + } + pk = &DPArray->dpk[DPArray->walIndex].pk; + } + + convertKeyLd(lpk, pk); + key.mv_size = sizeof(LdPageKey); + key.mv_data = lpk; + + if (PageOrWal == 1) + { + 
mdb_txn_begin(pageEnv, NULL, 0, &tmptxn); + mdb_cursor_open(tmptxn, pageDbi, &tmpcursor); + } + else + { + mdb_txn_begin(walEnv, NULL, 0, &tmptxn); + mdb_cursor_open(tmptxn, walDbi, &tmpcursor); + } + + success = mdb_cursor_get(tmpcursor, &key, &data, MDB_SET); + + if (success == 0) + { + mdb_cursor_del(tmpcursor, 0); + } + else + { + // DROP + mdb_cursor_get(tmpcursor, &key, &data, MDB_PREV); + mdb_cursor_del(tmpcursor, 0); + if (PageOrWal == (int)PAGE) + { + // CleanUpSecondBuffer(&((SdPageValue *)data.mv_data)->pk); + newHash = SecondBufferHashCode(&((SdPageValue *)data.mv_data)->pk); + partitionLock = SecondBufferMappingPartitionLock(newHash); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + CleanUpSecondBuffer(&((SdPageValue *)data.mv_data)->pk); + LWLockRelease(partitionLock); + } + } + + if (PageOrWal == (int)PAGE && success == 0) + { + // TRUNCATE + if (DPArray->dpk[DPArray->walIndex].operation == (int)TRUNCATE) + { + // CleanUpSecondBuffer(&((SdPageValue *)data.mv_data)->pk); + newHash = SecondBufferHashCode(&((SdPageValue *)data.mv_data)->pk); + partitionLock = SecondBufferMappingPartitionLock(newHash); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + CleanUpSecondBuffer(&((SdPageValue *)data.mv_data)->pk); + LWLockRelease(partitionLock); + } + else if (NULL == FindSecondBufferInTable(&lpk->sk)) // EVICT + { + mdb_put(tmptxn, pageDbi, &key, &data, MDB_NODUPDATA); + } + } + + mdb_cursor_close(tmpcursor); + mdb_txn_commit(tmptxn); + if (PageOrWal == (int)PAGE) + { + DPArray->dpk[DPArray->pageIndex].pagedeleted = true; + DPArray->pageIndex = (DPArray->pageIndex + 1) % 1024; + + if (EnableHotStandby == false || *isPromoteIsTriggered) + { + DPArray->head = (DPArray->head + 1) % 1024; + SpinLockAcquire(&DPArray->append); + DPArray->unused++; + SpinLockRelease(&DPArray->append); + } + } + else + { + DPArray->head++; + DPArray->walIndex = (DPArray->walIndex + 1) % 1024; + DPArray->head = (DPArray->head + 1) % 1024; + SpinLockAcquire(&DPArray->append); + 
DPArray->unused++; + SpinLockRelease(&DPArray->append); + } + } +} + +static void +MovePageFromSecondBufferToLocalBuffer() +{ + printf("MovePageFromSecondBufferToLocalBuffer\n"); + MDB_txn *tmptxn; + MDB_txn *txn = NULL; + MDB_dbi dbi; + MDB_val key, data; + + SingleKeyArray *ska = NULL; + int localHead = 0; + int localTail = 0; + int localUnused = 0; + int processNum = 0; + int i = 0; + int j = 0; + SdPageKey spk; + LdPageKey lpk; + + SdPageKeyEntity *spke = NULL; + SdPageKeyList spkl; + spkl.head = NULL; + spkl.tail = NULL; + + SdPageValue *spv = NULL; + int tb = -1, mp = -1, mtc = -1, mdo = -1; + long sleeptime = 1000L; + bool success; + + LWLock *partitionLock = NULL; + uint32 newHash; + bool canShutDown = false; + pid_t ckp_pid; + + for (;;) + { + // exit when postmaster stop + ResetLatch(MyLatch); + if (ShutdownRequestPending) + { + if (canShutDown) + proc_exit(0); + + ckp_pid = He3DBQueryCkpPid(); + if (ckp_pid == 0) + canShutDown = true; + + if (kill(ckp_pid, 0) == -1) + { + if (errno == ESRCH) + { + elog(LOG, "checkpoint process is shutdown, we can shutdown secondbuffer process after flush all buffer into lmdb"); + canShutDown = true; + } + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not check the existence of the backend with PID %d: %m", + ckp_pid))); + } + } + + for (i = 0; i < SDNUM; i++) + { + ska = &MultiKeyArrays[i]; + SpinLockAcquire(&ska->oplock); + localHead = ska->head; + localTail = ska->tail; + localUnused = ska->unused; + SpinLockRelease(&ska->oplock); + + if (localUnused == 0) // + { + processNum = SDLEN; + } + else if (localUnused == SDLEN) + { + continue; + } + else + { + processNum = (localTail + SDLEN - localHead) % SDLEN; + } + success = true; + tb = mdb_txn_begin(pageEnv, NULL, 0, &tmptxn); + if (tb != 0) + { + ereport(LOG, errmsg("mdb_txn_begin failed,error code is %d", tb)); + continue; + } + + for (j = 0; j < processNum; j++) + { + spk = ska->SdPageKeyList[(localHead + j) % SDLEN]; + spke = 
(SdPageKeyEntity *)malloc(sizeof(SdPageKeyEntity)); + spke->spk = spk; + spke->next = NULL; + + if (spkl.head == NULL) + { + spkl.head = spke; + spkl.tail = spkl.head; + } + else + { + spkl.tail->next = spke; + spkl.tail = spkl.tail->next; + } + + newHash = SecondBufferHashCode(&spk); + partitionLock = SecondBufferMappingPartitionLock(newHash); + LWLockAcquire(partitionLock, LW_SHARED); + spv = FindSecondBufferInTable(&spk); + if (spv == NULL || spv->pagecontent == NULL) + { + LWLockRelease(partitionLock); + continue; + } + lpk.sk = spk; + + key.mv_size = sizeof(LdPageKey); + key.mv_data = &lpk; + + data.mv_size = 8192; + data.mv_data = spv->pagecontent; + + mp = mdb_put(tmptxn, pageDbi, &key, &data, 0); + spv->canDelete = true; + LWLockRelease(partitionLock); + if (mp != 0) + { + success = false; + ereport(LOG, errmsg("mdb_put failed, mp is %d", mp)); + break; + } + } + + if (!success) + { + mdb_txn_abort(tmptxn); + } + else + { + mtc = mdb_txn_commit(tmptxn); + if (mtc != 0) + { + success = false; + ereport(LOG, errmsg("mdb_txn_commit failed,error is:%d", mtc)); + mdb_txn_abort(tmptxn); + } + } + + SdPageKeyEntity *s = NULL; + SdPageValue *spv = NULL; + while (spkl.head != NULL) + { + s = spkl.head; + if (success) + { + spv = FindSecondBufferInTable(&s->spk); + if (spv != NULL) + { + newHash = SecondBufferHashCode(&s->spk); + partitionLock = SecondBufferMappingPartitionLock(newHash); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + if (spv->canDelete) + CleanUpSecondBuffer(&s->spk); + LWLockRelease(partitionLock); + } + } + if (spkl.head->next != NULL) + { + spkl.head = spkl.head->next; + free(s); + } + else + { + free(spkl.head); + break; + } + } + + spkl.head = spkl.tail = NULL; + + if (!success) + { + continue; + } + + ska->head = localTail; + SpinLockAcquire(&ska->oplock); + if (ska->unused == 0) + { + ska->unused = SDLEN; + SpinLockAcquire(&statisticnum->change); + statisticnum->totalunused += SDLEN; + SpinLockRelease(&statisticnum->change); + } + else + { + 
ska->unused = ska->unused + (localTail + SDLEN - localHead) % SDLEN; + SpinLockAcquire(&statisticnum->change); + statisticnum->totalunused += ((localTail + SDLEN - localHead) % SDLEN); + SpinLockRelease(&statisticnum->change); + } + + SpinLockRelease(&ska->oplock); + } + + double rate = statisticnum->totalunused / (SDLEN * SDNUM); + if (rate < 0.1) + { + sleeptime = 0; + } + else if (rate > 0.6) + { + sleeptime += 1000L; + } + else if (rate < 0.5) + { + sleeptime = sleeptime / 2; + } + pg_usleep(sleeptime); + + (void)WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + 50L /* convert to ms */, + WAIT_EVENT_SECONDBUFFER_MAIN); + } +} + +void SignalStartSecondBuffer(void) +{ + SendPostmasterSignal(PMSIGNAL_SECONDBUFFER_WORKER); +} + +void SecondBufferMain(void) +{ + + MyBackendType = B_SECONDBUFFER; + MemoryContext SecondBuffer_context; + SecondBuffer_context = AllocSetContextCreate(TopMemoryContext, + "SecondBuffer", + ALLOCSET_DEFAULT_SIZES); + MemoryContextSwitchTo(SecondBuffer_context); + + init_ps_display(NULL); + + SetProcessingMode(InitProcessing); + + pqsignal(SIGHUP, SIG_IGN); + pqsignal(SIGINT, SIG_IGN); + pqsignal(SIGTERM, SignalHandlerForShutdownRequest); + + pqsignal(SIGALRM, SIG_IGN); + pqsignal(SIGPIPE, SIG_IGN); + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGUSR2, SIG_IGN); + + pqsignal(SIGCHLD, SIG_DFL); + + /* + * Unblock signals (they were blocked when the postmaster forked us) + */ + PG_SETMASK(&UnBlockSig); + + pthread_t ntid; + int err; + err = pthread_create(&ntid, NULL, CleanPagesInLmdb, NULL); + if (err != 0) + elog(PANIC, "pthread_create CleanPagesInLmdb failed %s", strerror(err)); + MovePageFromSecondBufferToLocalBuffer(); +} diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c index 69077bd..14e6e65 100644 --- a/src/backend/postmaster/startup.c +++ b/src/backend/postmaster/startup.c @@ -32,7 +32,8 @@ #include "storage/standby.h" #include "utils/guc.h" #include "utils/timeout.h" 
- +#include "access/pagehashqueue.h" +#include "utils/resowner_private.h" #ifndef USE_POSTMASTER_DEATH_SIGNAL /* @@ -51,6 +52,7 @@ static volatile sig_atomic_t got_SIGHUP = false; static volatile sig_atomic_t shutdown_requested = false; static volatile sig_atomic_t promote_signaled = false; +static volatile sig_atomic_t proc_exit_success = false; /* * Flag set when executing a restore command, to tell SIGTERM signal handler @@ -103,8 +105,10 @@ StartupProcShutdownHandler(SIGNAL_ARGS) if (in_restore_command) proc_exit(1); - else + else { shutdown_requested = true; + startup_shutdown_requested = true; + } WakeupRecovery(); errno = save_errno; @@ -164,8 +168,10 @@ HandleStartupProcInterrupts(void) /* * Check if we were requested to exit without finishing recovery. */ - if (shutdown_requested) + if (shutdown_requested) { + proc_exit_success = true; proc_exit(1); + } /* * Emergency bailout if postmaster has died. This is to avoid the @@ -185,6 +191,9 @@ HandleStartupProcInterrupts(void) ProcessProcSignalBarrier(); } +bool ProcHasReleaseFlag(void) { + return proc_exit_success; +} /* -------------------------------- * signal handler routines @@ -237,7 +246,18 @@ StartupProcessMain(void) * Unblock signals (they were blocked when the postmaster forked us) */ PG_SETMASK(&UnBlockSig); - + + //start flushWork +#ifndef PG_NOREPLAY + if (IsBootstrapProcessingMode() != true && InitdbSingle!=true) { + //if (push_standby == true) { + SignalStartFlushWork(); + //} + pg_usleep(1000); + SignalStartCleanLogIndexWork(); + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + } +#endif /* * Do what we came for. 
*/ diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c index 626fae8..34ce019 100644 --- a/src/backend/postmaster/walwriter.c +++ b/src/backend/postmaster/walwriter.c @@ -272,7 +272,7 @@ WalWriterMain(void) (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, - cur_timeout, + 1000, WAIT_EVENT_WAL_WRITER_MAIN); } } diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c index 1f38c5b..80f68f4 100644 --- a/src/backend/replication/logical/logicalfuncs.c +++ b/src/backend/replication/logical/logicalfuncs.c @@ -233,9 +233,7 @@ pg_logical_slot_get_changes_guts(FunctionCallInfo fcinfo, bool confirm, bool bin ctx = CreateDecodingContext(InvalidXLogRecPtr, options, false, - XL_ROUTINE(.page_read = read_local_xlog_page, - .segment_open = wal_segment_open, - .segment_close = wal_segment_close), + XL_ROUTINE(.page_read = read_local_xlog_batch), LogicalOutputPrepareWrite, LogicalOutputWrite, NULL); diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 31e74d3..ea69b66 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -153,9 +153,7 @@ create_logical_replication_slot(char *name, char *plugin, ctx = CreateInitDecodingContext(plugin, NIL, false, /* just catalogs is OK */ restart_lsn, - XL_ROUTINE(.page_read = read_local_xlog_page, - .segment_open = wal_segment_open, - .segment_close = wal_segment_close), + XL_ROUTINE(.page_read = read_local_xlog_batch), NULL, NULL, NULL); /* @@ -512,9 +510,7 @@ pg_logical_replication_slot_advance(XLogRecPtr moveto) ctx = CreateDecodingContext(InvalidXLogRecPtr, NIL, true, /* fast_forward */ - XL_ROUTINE(.page_read = read_local_xlog_page, - .segment_open = wal_segment_open, - .segment_close = wal_segment_close), + XL_ROUTINE(.page_read = read_local_xlog_batch), NULL, NULL, NULL); /* diff --git a/src/backend/replication/walreceiver.c 
b/src/backend/replication/walreceiver.c index a9f67ef..b1583a2 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -75,6 +75,7 @@ #include "utils/acl.h" #include "utils/builtins.h" #include "utils/guc.h" +#include "utils/hfs.h" #include "utils/pg_lsn.h" #include "utils/ps_status.h" #include "utils/resowner.h" @@ -94,14 +95,15 @@ bool hot_standby_feedback; static WalReceiverConn *wrconn = NULL; WalReceiverFunctionsType *WalReceiverFunctions = NULL; -#define NAPTIME_PER_CYCLE 100 /* max sleep time between cycles (100ms) */ +//#define NAPTIME_PER_CYCLE 100 /* max sleep time between cycles (100ms) */ +#define NAPTIME_PER_CYCLE 10 /* max sleep time between cycles (10ms) */ /* * These variables are used similarly to openLogFile/SegNo, * but for walreceiver to write the XLOG. recvFileTLI is the TimeLineID * corresponding the filename of recvFile. */ -static int recvFile = -1; +static int64_t recvFile = -1; static TimeLineID recvFileTLI = 0; static XLogSegNo recvSegNo = 0; @@ -408,7 +410,14 @@ WalReceiverMain(void) first_stream = false; /* Initialize LogstreamResult and buffers for processing messages */ - LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL); + if (he3mirror){ + /* when he3db restart, ReplayRecPtr may too bigger, so LogstreamResult.Write < LogstreamResult.Flush, + * can not flush wal normally and can not do replay. RedoRecPtr is suitable value. 
+ */ + LogstreamResult.Write = LogstreamResult.Flush = GetFileReplayLsn(); + } else{ + LogstreamResult.Write = LogstreamResult.Flush = GetXLogReplayRecPtr(NULL); + } initStringInfo(&reply_message); initStringInfo(&incoming_message); @@ -823,7 +832,11 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len) case 'w': /* WAL records */ { /* copy message to StringInfo */ - hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + if (he3mirror) { + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64); + } else { + hdrlen = sizeof(int64) + sizeof(int64) + sizeof(int64) + sizeof(int64); + } if (len < hdrlen) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), @@ -833,12 +846,21 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len) /* read the fields */ dataStart = pq_getmsgint64(&incoming_message); walEnd = pq_getmsgint64(&incoming_message); + if (he3mirror){ + len -= hdrlen; + } else{ + len = pq_getmsgint64(&incoming_message); + } sendTime = pq_getmsgint64(&incoming_message); ProcessWalSndrMessage(walEnd, sendTime); - buf += hdrlen; - len -= hdrlen; - XLogWalRcvWrite(buf, len, dataStart); + if (he3mirror) { + XLogWalRcvWrite(buf, len, dataStart); + } else { + LogstreamResult.Write = dataStart+len; + /* Update shared-memory status */ + pg_atomic_write_u64(&WalRcv->writtenUpto, LogstreamResult.Write); + } break; } case 'k': /* Keepalive */ @@ -871,7 +893,7 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len) } } -XLogRecPtr GetFlushXlogPtr() { +XLogRecPtr GetFlushXlogPtr(void) { SpinLockAcquire(&WalRcv->mutex); XLogRecPtr rcvlsn = WalRcv->flushedUpto; SpinLockRelease(&WalRcv->mutex); @@ -917,6 +939,8 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) errno = 0; byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff); + // byteswritten = writefs(recvFile, buf, segbytes, (off_t) startoff); + if (byteswritten <= 0) { char xlogfname[MAXFNAMELEN]; @@ -967,12 +991,12 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr 
recptr) static void XLogWalRcvFlush(bool dying) { + WalRcvData *walrcv = WalRcv; if (LogstreamResult.Flush < LogstreamResult.Write) - { - WalRcvData *walrcv = WalRcv; - + { +#ifdef PG_NOREPLAY issue_xlog_fsync(recvFile, recvSegNo); - +#endif LogstreamResult.Flush = LogstreamResult.Write; /* Update shared-memory status */ @@ -1108,6 +1132,11 @@ XLogWalRcvSendReply(bool force, bool requestReply) writePtr = LogstreamResult.Write; flushPtr = LogstreamResult.Flush; applyPtr = GetXLogReplayRecPtr(NULL); +#ifndef PG_NOREPLAY + if (!he3mirror && push_standby == true) { + applyPtr = GetXLogPushToDisk(); + } +#endif resetStringInfo(&reply_message); pq_sendbyte(&reply_message, 'r'); diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 3b245c6..a9297e3 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -48,6 +48,8 @@ #include #include +#include +#include #include "access/printtup.h" #include "access/timeline.h" @@ -93,6 +95,14 @@ #include "utils/ps_status.h" #include "utils/timeout.h" #include "utils/timestamp.h" +#include "access/heapam_xlog.h" +#include "catalog/pg_control.h" +#include "access/nbtxlog.h" +#include "access/gistxlog.h" +#include "access/spgxlog.h" +#include "access/brin_xlog.h" +#include "access/xlog.h" +#include "access/pg_mirror.h" /* * Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ. 
@@ -105,6 +115,10 @@ */ #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) +#define ONCE_READ_TIKV_WAL (XLOG_BLCKSZ * 2) +//bachread tikv 16k,but last record len mybe gt 8k,so DEFAULT_SEND_WAL_CAPCITY = 2 * ONCE_READ_TIKV_WAL +#define DEFAULT_SEND_WAL_CAPCITY (ONCE_READ_TIKV_WAL*2) + /* Array of WalSnds in shared memory */ WalSndCtlData *WalSndCtl = NULL; @@ -231,6 +245,7 @@ static void XLogSendPhysical(void); static void XLogSendLogical(void); static void WalSndDone(WalSndSendDataCallback send_data); static XLogRecPtr GetStandbyFlushRecPtr(void); +static XLogRecPtr GetStandbyReplayRecPtr(void); static void IdentifySystem(void); static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd); static void DropReplicationSlot(DropReplicationSlotCmd *cmd); @@ -255,7 +270,7 @@ static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID *tli_p); - +static void XLogSendTiKVPhysical(void); /* Initialize walsender process before entering the main command loop */ void @@ -299,8 +314,8 @@ WalSndErrorCleanup(void) ConditionVariableCancelSleep(); pgstat_report_wait_end(); - if (xlogreader != NULL && xlogreader->seg.ws_file >= 0) - wal_segment_close(xlogreader); + // if (xlogreader != NULL && xlogreader->seg.ws_file >= 0) + // wal_segment_close(xlogreader); if (MyReplicationSlot != NULL) ReplicationSlotRelease(); @@ -398,7 +413,7 @@ IdentifySystem(void) if (am_cascading_walsender) { /* this also updates ThisTimeLineID */ - logptr = GetStandbyFlushRecPtr(); + logptr = GetStandbyReplayRecPtr(); } else logptr = GetFlushRecPtr(); @@ -572,6 +587,10 @@ StartReplication(StartReplicationCmd *cmd) { StringInfoData buf; XLogRecPtr FlushPtr; + bool pgmirrorFlag = false; + if (client_application_name!=NULL && strncmp(client_application_name,"pgmirror",strlen("pgmirror")) == 0) { + pgmirrorFlag = true; + } if (ThisTimeLineID == 0) ereport(ERROR, @@ -581,8 +600,7 @@ StartReplication(StartReplicationCmd 
*cmd) /* create xlogreader for physical replication */ xlogreader = XLogReaderAllocate(wal_segment_size, NULL, - XL_ROUTINE(.segment_open = WalSndSegmentOpen, - .segment_close = wal_segment_close), + XL_ROUTINE(), NULL); if (!xlogreader) @@ -622,7 +640,7 @@ StartReplication(StartReplicationCmd *cmd) if (am_cascading_walsender) { /* this also updates ThisTimeLineID */ - FlushPtr = GetStandbyFlushRecPtr(); + FlushPtr = GetStandbyReplayRecPtr(); } else FlushPtr = GetFlushRecPtr(); @@ -718,7 +736,7 @@ StartReplication(StartReplicationCmd *cmd) * Don't allow a request to stream from a future point in WAL that * hasn't been flushed to disk in this server yet. */ - if (FlushPtr < cmd->startpoint) + if (pgmirrorFlag == false && FlushPtr < cmd->startpoint) { ereport(ERROR, (errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X", @@ -738,8 +756,21 @@ StartReplication(StartReplicationCmd *cmd) /* Main loop of walsender */ replication_active = true; - - WalSndLoop(XLogSendPhysical); + if (pgmirrorFlag == false) { + WalSndLoop(XLogSendPhysical); + } else { + readControlFile(DataDir); + SpinLockAcquire(&MyWalSnd->mutex); + if (walsenderLsn != 0) { + MyWalSnd->sentPtr = walsenderLsn; + sentPtr = walsenderLsn; + elog(LOG,"wal sender LSN %X/%X",LSN_FORMAT_ARGS(walsenderLsn)); + } else { + elog(ERROR,"WAL sender LSN 0/0"); + } + SpinLockRelease(&MyWalSnd->mutex); + WalSndLoop(XLogSendTiKVPhysical); + } replication_active = false; if (got_STOPPING) @@ -815,34 +846,34 @@ logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int req WALReadError errinfo; XLogSegNo segno; - XLogReadDetermineTimeline(state, targetPagePtr, reqLen); + // XLogReadDetermineTimeline(state, targetRecPtr, reqLen); sendTimeLineIsHistoric = (state->currTLI != ThisTimeLineID); sendTimeLine = state->currTLI; sendTimeLineValidUpto = state->currTLIValidUntil; sendTimeLineNextTLI = state->nextTLI; /* make sure we have enough WAL available */ - flushptr = 
WalSndWaitForWal(targetPagePtr + reqLen); + flushptr = WalSndWaitForWal(targetRecPtr + reqLen); /* fail if not (implies we are going to shut down) */ - if (flushptr < targetPagePtr + reqLen) + if (flushptr < targetRecPtr + reqLen) return -1; - if (targetPagePtr + XLOG_BLCKSZ <= flushptr) + if (targetRecPtr + XLOG_BLCKSZ <= flushptr) count = XLOG_BLCKSZ; /* more than one block available */ else - count = flushptr - targetPagePtr; /* part of the page available */ + count = flushptr - targetRecPtr; /* part of the page available */ /* now actually read the data, we know it's there */ - if (!WALRead(state, - cur_page, - targetPagePtr, - XLOG_BLCKSZ, - state->seg.ws_tli, /* Pass the current TLI because only - * WalSndSegmentOpen controls whether new - * TLI is needed. */ - &errinfo)) - WALReadRaiseError(&errinfo); + // if (!He3DBWALRead(state, + // cur_page, + // targetRecPtr, + // XLOG_BLCKSZ, + // state->currTLI, /* Pass the current TLI because only + // * WalSndSegmentOpen controls whether new + // * TLI is needed. */ + // &errinfo)) + // WALReadRaiseError(&errinfo); /* * After reading into the buffer, check that what we read was valid. We do @@ -851,8 +882,8 @@ logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int req * read() succeeds in that case, but the data we tried to read might * already have been overwritten with new WAL records. 
*/ - XLByteToSeg(targetPagePtr, segno, state->segcxt.ws_segsize); - CheckXLogRemoved(segno, state->seg.ws_tli); + // XLByteToSeg(targetPagePtr, segno, state->segcxt.ws_segsize); + // CheckXLogRemoved(segno, state->seg.ws_tli); return count; } @@ -1007,9 +1038,7 @@ CreateReplicationSlot(CreateReplicationSlotCmd *cmd) ctx = CreateInitDecodingContext(cmd->plugin, NIL, need_full_snapshot, InvalidXLogRecPtr, - XL_ROUTINE(.page_read = logical_read_xlog_page, - .segment_open = WalSndSegmentOpen, - .segment_close = wal_segment_close), + XL_ROUTINE(.page_read = logical_read_xlog_page), WalSndPrepareWrite, WalSndWriteData, WalSndUpdateProgress); @@ -1167,9 +1196,7 @@ StartLogicalReplication(StartReplicationCmd *cmd) */ logical_decoding_ctx = CreateDecodingContext(cmd->startpoint, cmd->options, false, - XL_ROUTINE(.page_read = logical_read_xlog_page, - .segment_open = WalSndSegmentOpen, - .segment_close = wal_segment_close), + XL_ROUTINE(.page_read = logical_read_xlog_page), WalSndPrepareWrite, WalSndWriteData, WalSndUpdateProgress); xlogreader = logical_decoding_ctx->reader; @@ -1305,7 +1332,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, break; sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp()); - + // sleeptime = 10; //10ms /* Sleep until something happens or we time out */ WalSndWait(WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, sleeptime, WAIT_EVENT_WAL_SENDER_WRITE_DATA); @@ -2379,7 +2406,7 @@ WalSndLoop(WalSndSendDataCallback send_data) * of reaching wal_sender_timeout before sending a keepalive. 
*/ sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp()); - + // sleeptime = 10; //10ms if (pq_is_send_pending()) wakeEvents |= WL_SOCKET_WRITEABLE; @@ -2467,73 +2494,360 @@ WalSndKill(int code, Datum arg) SpinLockRelease(&walsnd->mutex); } -/* XLogReaderRoutine->segment_open callback */ -static void -WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p) -{ - char path[MAXPGPATH]; - - /*------- - * When reading from a historic timeline, and there is a timeline switch - * within this segment, read from the WAL segment belonging to the new - * timeline. - * - * For example, imagine that this server is currently on timeline 5, and - * we're streaming timeline 4. The switch from timeline 4 to 5 happened at - * 0/13002088. In pg_wal, we have these files: - * - * ... - * 000000040000000000000012 - * 000000040000000000000013 - * 000000050000000000000013 - * 000000050000000000000014 - * ... - * - * In this situation, when requested to send the WAL from segment 0x13, on - * timeline 4, we read the WAL from file 000000050000000000000013. Archive - * recovery prefers files from newer timelines, so if the segment was - * restored from the archive on this server, the file belonging to the old - * timeline, 000000040000000000000013, might not exist. Their contents are - * equal up to the switchpoint, because at a timeline switch, the used - * portion of the old segment is copied to the new file. 
------- - */ - *tli_p = sendTimeLine; - if (sendTimeLineIsHistoric) - { - XLogSegNo endSegNo; - - XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize); - if (nextSegNo == endSegNo) - *tli_p = sendTimeLineNextTLI; +static void reConvertMainData(XLogRecord* sRecord, char*sMainData, uint32_t*sLen, char* dMainData, uint32_t* dLen) { + RmgrId rmid = sRecord->xl_rmid; + uint8 info = (sRecord->xl_info & ~XLR_INFO_MASK); + switch(rmid) { + case RM_HEAP2_ID: + { + if ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP2_VISIBLE) { + xl_heap_visible *xlrec = (xl_heap_visible *)sMainData; + xl_old_heap_visible xlrecOld; + xlrecOld.cutoff_xid = xlrec->cutoff_xid; + xlrecOld.flags = xlrec->flags; + *dLen = sizeof(xl_old_heap_visible); + memcpy(dMainData,&xlrecOld,*dLen); + } + break; + } + case RM_HEAP_ID: + { + if (((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_UPDATE) || + ((info & XLOG_HEAP_OPMASK) == XLOG_HEAP_HOT_UPDATE)) { + xl_heap_update *xlrec = (xl_heap_update *)sMainData; + xl_old_heap_update xlrecOld; + xlrecOld.old_xmax = xlrec->old_xmax; + xlrecOld.old_offnum = xlrec->old_offnum; + xlrecOld.old_infobits_set = xlrec->old_infobits_set; + xlrecOld.flags = xlrec->flags; + xlrecOld.new_xmax = xlrec->new_xmax; + xlrecOld.new_offnum = xlrec->new_offnum; + *dLen = sizeof(xl_old_heap_update); + memcpy(dMainData,&xlrecOld,*dLen); + } + break; + } + case RM_BTREE_ID: + { + if (info == XLOG_BTREE_SPLIT_L || info == XLOG_BTREE_SPLIT_R) { + xl_btree_split *xlrec = (xl_btree_split *)sMainData; + xl_old_btree_split xlrecOld; + xlrecOld.level = xlrec->level; + xlrecOld.firstrightoff = xlrec->firstrightoff; + xlrecOld.newitemoff = xlrec->newitemoff; + xlrecOld.postingoff = xlrec->postingoff; + *dLen = sizeof(xl_old_btree_split); + memcpy(dMainData,&xlrecOld,*dLen); + } + break; + } + case RM_GIST_ID: + { + if (info == XLOG_GIST_PAGE_SPLIT) { + gistxlogPageSplit *xlrec = (gistxlogPageSplit *)sMainData; + gistoldxlogPageSplit xlrecOld; + xlrecOld.origrlink = xlrec->origrlink; + 
xlrecOld.orignsn = xlrec->orignsn; + xlrecOld.origleaf = xlrec->origleaf; + xlrecOld.npage = xlrec->npage; + xlrecOld.markfollowright = xlrec->markfollowright; + *dLen = sizeof(gistoldxlogPageSplit); + memcpy(dMainData,&xlrecOld,*dLen); + } + break; + } + case RM_SPGIST_ID: + { + if (info == XLOG_SPGIST_ADD_LEAF) { + spgxlogAddLeaf *xlrec = (spgxlogAddLeaf *)sMainData; + spgoldxlogAddLeaf xlrecOld; + xlrecOld.newPage = xlrec->newPage; + xlrecOld.storesNulls = xlrec->storesNulls; + xlrecOld.offnumLeaf = xlrec->offnumLeaf; + xlrecOld.offnumHeadLeaf = xlrec->offnumHeadLeaf; + xlrecOld.offnumParent = xlrec->offnumParent; + xlrecOld.nodeI = xlrec->nodeI; + *dLen = sizeof(spgoldxlogAddLeaf); + memcpy(dMainData,&xlrecOld,*dLen); + } else if (info == XLOG_SPGIST_MOVE_LEAFS) { + spgxlogMoveLeafs *xlrec = (spgxlogMoveLeafs *)sMainData; + spgoldxlogMoveLeafs xlrecOld; + xlrecOld.nMoves = xlrec->nMoves; + xlrecOld.newPage = xlrec->newPage; + xlrecOld.replaceDead = xlrec->replaceDead; + xlrecOld.storesNulls = xlrec->storesNulls; + xlrecOld.offnumParent = xlrec->offnumParent; + xlrecOld.nodeI = xlrec->nodeI; + xlrecOld.stateSrc = xlrec->stateSrc; + *dLen = SizeOfOldSpgxlogMoveLeafs; + memcpy(dMainData,&xlrecOld,*dLen); + memcpy(dMainData+*dLen,xlrec->offsets,*sLen-SizeOfSpgxlogMoveLeafs); + *dLen += *sLen-SizeOfSpgxlogMoveLeafs; + } else if (info == XLOG_SPGIST_ADD_NODE) { + spgxlogAddNode *xlrec = (spgxlogAddNode *)sMainData; + spgoldxlogAddNode xlrecOld; + xlrecOld.offnum = xlrec->offnum; + xlrecOld.offnumNew = xlrec->offnumNew; + xlrecOld.newPage = xlrec->newPage; + xlrecOld.parentBlk = xlrec->parentBlk; + xlrecOld.offnumParent = xlrec->offnumParent; + xlrecOld.nodeI = xlrec->nodeI; + xlrecOld.stateSrc = xlrec->stateSrc; + *dLen = sizeof(spgoldxlogAddNode); + memcpy(dMainData,&xlrecOld,*dLen); + } else if (info == XLOG_SPGIST_PICKSPLIT) { + spgxlogPickSplit *xlrec = (spgxlogPickSplit *)sMainData; + spgoldxlogPickSplit xlrecOld; + xlrecOld.isRootSplit = xlrec->isRootSplit; + 
xlrecOld.nDelete = xlrec->nDelete; + xlrecOld.nInsert = xlrec->nInsert; + xlrecOld.initSrc = xlrec->initSrc; + xlrecOld.initDest = xlrec->initDest; + xlrecOld.offnumInner = xlrec->offnumInner; + xlrecOld.initInner = xlrec->initInner; + xlrecOld.storesNulls = xlrec->storesNulls; + xlrecOld.innerIsParent = xlrec->innerIsParent; + xlrecOld.offnumParent = xlrec->offnumParent; + xlrecOld.nodeI = xlrec->nodeI; + xlrecOld.stateSrc = xlrec->stateSrc; + *dLen = SizeOfOldSpgxlogPickSplit; + memcpy(dMainData,&xlrecOld,*dLen); + memcpy(dMainData+*dLen,xlrec->offsets,*sLen-SizeOfSpgxlogPickSplit); + *dLen += *sLen-SizeOfSpgxlogPickSplit; + } + break; + } + case RM_BRIN_ID: + { + if (info == XLOG_BRIN_INSERT) { + xl_brin_insert *xlrec = (xl_brin_insert *)sMainData; + xl_old_brin_insert xlrecOld; + xlrecOld.heapBlk = xlrec->heapBlk; + /* extra information needed to update the revmap */ + xlrecOld.pagesPerRange = xlrec->pagesPerRange; + xlrecOld.offnum = xlrec->offnum; + *dLen = sizeof(xl_old_brin_insert); + memcpy(dMainData,&xlrecOld,*dLen); + } else if ( info == XLOG_BRIN_UPDATE) { + xl_brin_update *xlrec = (xl_brin_update *) sMainData; + xl_old_brin_update xlrecUpdate; + xl_brin_insert *xlrecInsert = &xlrec->insert; + xl_old_brin_insert xlrecOld; + xlrecOld.heapBlk = xlrecInsert->heapBlk; + /* extra information needed to update the revmap */ + xlrecOld.pagesPerRange = xlrecInsert->pagesPerRange; + xlrecOld.offnum = xlrecInsert->offnum; + /* offset number of old tuple on old page */ + xlrecUpdate.oldOffnum = xlrec->oldOffnum; + xlrecUpdate.insert = xlrecOld; + *dLen = sizeof(xl_old_brin_update); + memcpy(dMainData,&xlrecUpdate,*dLen); + } + break; + } + default: + { + break; + } } +} - XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize); - state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); - if (state->seg.ws_file >= 0) - return; - - /* - * If the file is not found, assume it's because the standby asked for a - * too old WAL segment that has already been 
removed or recycled. - */ - if (errno == ENOENT) - { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; - - XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size); - errno = save_errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("requested WAL segment %s has already been removed", - xlogfname))); +//1.recomplete CRC 2.MTR as endpoint Merge 3.some struct convert 4.checkpoint redo reset 5.use file segment manage +static int MergeWalForPgMirror(char*source,char*destion,int limit,int*he3_pos) { + int pos1 = 0,pos2 = 0,prev_pos2 = 0; + bool isMtr = false; + *he3_pos = 0; + while(pos1= limit) { + return pos2; + } + XLogRecord*one = (XLogRecord*)(source + pos1); + old = (OldXLogRecord*)(destion + pos2); + old->xl_xid = one->xl_xid; + old->xl_info = one->xl_info; + old->xl_rmid = one->xl_rmid; + pos1 += sizeof(XLogRecord); + pos2 += sizeof(OldXLogRecord); + uint32 remaining = one->xl_tot_len - sizeof(XLogRecord); + uint32 datatotal = 0; + isMtr = one->mtr; + while(remaining > datatotal) { + uint8_t block_id = *(source + pos1); + if (block_id == XLR_BLOCK_ID_DATA_SHORT) { + /* XLogRecordDataHeaderShort */ + pos1 += sizeof(block_id); + if (isMtr == true) { + memcpy((destion + pos2),&block_id,sizeof(block_id)); + pos2 += sizeof(block_id); + } + uint32_t main_data_len = 0; + main_data_len = *((uint8_t*)(source + pos1)); + if (isMtr == true) { + reConvertMainData(one,source + pos1,&main_data_len,d_main_data,&d_main_data_len); + if (d_main_data_len == 0) { + memcpy(destion + pos2,source + pos1,sizeof(uint8_t)); + } + pos2 += sizeof(uint8_t); + } + pos1 += sizeof(uint8_t); + remaining -= sizeof(uint8_t); + datatotal += main_data_len; + break; + } else if (block_id == XLR_BLOCK_ID_DATA_LONG) { + /* XLogRecordDataHeaderLong */ + pos1 += sizeof(block_id); + if (isMtr == true) { + memcpy((destion + pos2),&block_id,sizeof(block_id)); + pos2 += sizeof(block_id); + } + uint32 main_data_len = 0,d_main_data_len = 0; + memcpy(&main_data_len,source + 
pos1,sizeof(uint32)); + if (isMtr == true) { + reConvertMainData(one,source + pos1,&main_data_len,d_main_data,&d_main_data_len); + if (d_main_data_len == 0) { + memcpy(destion + pos2,&main_data_len,sizeof(main_data_len)); + pos2 += sizeof(main_data_len); + } else { + if (d_main_data_len > 255) { + memcpy(destion + pos2,&d_main_data_len,sizeof(d_main_data_len)); + pos2 += sizeof(d_main_data_len); + } else { + *(destion + pos2 - 1) = XLR_BLOCK_ID_DATA_SHORT; + uint8_t tlen = d_main_data_len; + memcpy(destion + pos2,&tlen,sizeof(tlen)); + pos2 += sizeof(uint8_t); + } + } + } + pos1 += sizeof(main_data_len); + remaining -= sizeof(main_data_len); + datatotal += main_data_len; + break; /* by convention, the main data fragment is + * always last */ + } else if (block_id == XLR_BLOCK_ID_ORIGIN) { + pos1 += sizeof(block_id); + if (isMtr == true) { + memcpy((destion + pos2),&block_id,sizeof(block_id)); + pos2 += sizeof(block_id); + } + memcpy(&RepOriginId, source + pos1,sizeof(RepOriginId)); + if (isMtr == true) { + memcpy(destion + pos2,&RepOriginId,sizeof(RepOriginId)); + pos2 += sizeof(RepOriginId); + } + pos1 += sizeof(RepOriginId); + remaining -= sizeof(RepOriginId); + } else if (block_id == XLR_BLOCK_ID_TOPLEVEL_XID) { + pos1 += sizeof(block_id); + if (isMtr == true) { + memcpy((destion + pos2),&block_id,sizeof(block_id)); + pos2 += sizeof(block_id); + } + memcpy(&TransactionId,source + pos1,sizeof(TransactionId)); + if (isMtr == true) { + memcpy(destion + pos2,&TransactionId,sizeof(TransactionId)); + pos2 += sizeof(TransactionId); + } + pos1 += sizeof(TransactionId); + remaining -= sizeof(TransactionId); + } else if (block_id <= XLR_MAX_BLOCK_ID) { + /* Ok, copy the header to the scratch buffer */ + memcpy(destion + pos2, source + pos1, SizeOfXLogRecordBlockHeader); + uint8_t fork_flags = *(source + pos1 + sizeof(block_id)); + *(destion + pos2) = blkNum; + data_len[blkNum] = *((uint16_t*)(source + pos1 + sizeof(block_id) + sizeof(fork_flags))); + datatotal += 
data_len; + pos1 += SizeOfXLogRecordBlockHeader; + pos2 += SizeOfXLogRecordBlockHeader; + remaining -= SizeOfXLogRecordBlockHeader; + if ((fork_flags & BKPBLOCK_HAS_IMAGE) != 0) { + bimg_len[blkNum] = *((uint16_t*)(source + pos1)); + datatotal += bimg_len; + uint16_t hole_offset = *((uint16_t*)(source + pos1 + sizeof(bimg_len))); + uint8_t bimg_info = *((uint16_t*)(source + pos1 + sizeof(bimg_len) + sizeof(hole_offset))); + memcpy(destion + pos2, source + pos1, SizeOfXLogRecordBlockImageHeader); + pos1 += SizeOfXLogRecordBlockImageHeader; + pos2 += SizeOfXLogRecordBlockImageHeader; + remaining -= SizeOfXLogRecordBlockImageHeader; + if ((bimg_info & BKPIMAGE_IS_COMPRESSED) != 0) { + if ((bimg_info & BKPIMAGE_HAS_HOLE) != 0) { + memcpy(destion + pos2, source + pos1, SizeOfXLogRecordBlockCompressHeader); + pos1 += SizeOfXLogRecordBlockCompressHeader; + pos2 += SizeOfXLogRecordBlockCompressHeader; + remaining -= SizeOfXLogRecordBlockCompressHeader; + } + } + if (!(fork_flags & BKPBLOCK_SAME_REL)) { + memcpy(destion + pos2, source + pos1, sizeof(RelFileNode)); + pos1 += sizeof(RelFileNode); + pos2 += sizeof(RelFileNode); + remaining -= sizeof(RelFileNode); + } + memcpy(destion + pos2, source + pos1, sizeof(BlockNumber)); + pos1 += sizeof(BlockNumber); + pos2 += sizeof(BlockNumber); + remaining -= sizeof(BlockNumber); + } + } else { + elog(FATAL,"invalid block_id %u",block_id); + } + } + assert(remaining == datatotal); + if (bimg_len[blkNum] != 0 ) { + img_ptr[blkNum] = source + pos1; + pos1 += bimg_len[blkNum]; + } + if (data_len[blkNum] != 0) { + data_ptr[blkNum] = source + pos1; + pos1 += data_len[blkNum]; + } + blkNum++; + } + *he3_pos = pos1; + int idx = 0; + while(idx < blkNum) { + if (bimg_len[idx] != 0) { + memcpy(destion + pos2, img_ptr[idx], bimg_len[idx]); + pos2 += bimg_len[idx]; + } + if (data_len[blkNum] != 0){ + memcpy(destion + pos2, data_ptr[idx], data_len[idx]); + pos2 += data_len[idx]; + } + } + memcpy(destion + pos2, d_main_data, d_main_data_len); + 
pos2 += d_main_data_len; + old->xl_tot_len = pos2-prev_pos2; + isMtr = false; } - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); + return pos2; +} + +static int findFirstCheckPoint(char* source,int limit) { + XLogRecord* head = (XLogRecord*)source; + bool find = false; + int datalen = 0; + while(!(head->xl_rmid == RM_XLOG_ID && + ((head->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN || (head->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_ONLINE)) && + datalen < limit) { + datalen += head->xl_tot_len; + } + if (datalen == limit) { + return -1; + } + return datalen; } /* @@ -2546,8 +2860,9 @@ WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, * otherwise WalSndCaughtUp is set to false. */ -static void -XLogSendPhysical(void) + static uint64_t EndLsn = 0; + static void +XLogSendTiKVPhysical(void) { XLogRecPtr SendRqstPtr; XLogRecPtr startptr; @@ -2597,7 +2912,7 @@ XLogSendPhysical(void) */ bool becameHistoric = false; - SendRqstPtr = GetStandbyFlushRecPtr(); + SendRqstPtr = GetStandbyReplayRecPtr(); if (!RecoveryInProgress()) { @@ -2613,7 +2928,7 @@ XLogSendPhysical(void) /* * Still a cascading standby. But is the timeline we're sending * still the one recovery is recovering from? ThisTimeLineID was - * updated by the GetStandbyFlushRecPtr() call above. + * updated by the GetStandbyReplayRecPtr() call above. */ if (sendTimeLine != ThisTimeLineID) becameHistoric = true; @@ -2697,8 +3012,317 @@ XLogSendPhysical(void) if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) { /* close the current file. 
*/ - if (xlogreader->seg.ws_file >= 0) - wal_segment_close(xlogreader); + // if (xlogreader->seg.ws_file >= 0) + // wal_segment_close(xlogreader); + + /* Send CopyDone */ + pq_putmessage_noblock('c', NULL, 0); + streamingDoneSending = true; + + WalSndCaughtUp = true; + + elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)", + LSN_FORMAT_ARGS(sendTimeLineValidUpto), + LSN_FORMAT_ARGS(sentPtr)); + return; + } + + /* Do we have any work to do? */ + Assert(sentPtr <= SendRqstPtr); + if (SendRqstPtr <= sentPtr) + { + WalSndCaughtUp = true; + return; + } + + /* + * Figure out how much to send in one message. If there's no more than + * MAX_SEND_SIZE bytes to send, send everything. Otherwise send + * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. + * + * The rounding is not only for performance reasons. Walreceiver relies on + * the fact that we never split a WAL record across two messages. Since a + * long WAL record is split at page boundary into continuation records, + * page boundary is always a safe cut-off point. We also assume that + * SendRqstPtr never points to the middle of a WAL record. + */ + startptr = sentPtr; + endptr = startptr; + endptr += ONCE_READ_TIKV_WAL; + + /* if we went beyond SendRqstPtr, back off */ + if (SendRqstPtr <= endptr) + { + endptr = SendRqstPtr; + if (sendTimeLineIsHistoric) + WalSndCaughtUp = false; + else + WalSndCaughtUp = true; + } + else + { + /* round down to page boundary. */ + endptr -= (endptr % XLOG_BLCKSZ); + WalSndCaughtUp = false; + } + + nbytes = endptr - startptr; + Assert(nbytes <= ONCE_READ_TIKV_WAL); + + /* + * OK to read and send the slice. + */ + resetStringInfo(&output_message); + pq_sendbyte(&output_message, 'w'); + + pq_sendint64(&output_message, 0); /* dataStart */ + pq_sendint64(&output_message, 0); /* walEnd */ + pq_sendint64(&output_message, 0); /* sendtime, filled in last */ + + /* + * Read the log directly into the output buffer to avoid extra memcpy + * calls. 
+ */ + enlargeStringInfo(&output_message, DEFAULT_SEND_WAL_CAPCITY); + static char* he3_wal_cache = NULL; + uint64_t StartLsn = 0; + static uint64_t PrevLsn = 0; + if (he3_wal_cache == NULL) { + he3_wal_cache = malloc(DEFAULT_SEND_WAL_CAPCITY); + } +retry: + xlogreader->currTLI = ThisTimeLineID; + int ret = -1; + ret = He3DBWALRead(xlogreader, + startptr, + nbytes, + he3_wal_cache); + + if (ret < 0) { + WALReadRaiseError(&errinfo); + return; + } else { + nbytes = ret; + } + + int dLen = 0; + int mtrLen = ArrayXlogHe3ToPg(he3_wal_cache,nbytes,&output_message.data[output_message.len],&dLen,&StartLsn,&EndLsn); + output_message.len += dLen; + output_message.data[output_message.len] = '\0'; + /* + * Fill the send timestamp last, so that it is taken as late as possible. + */ + if (StartLsn % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(StartLsn, DEFAULT_XLOG_SEG_SIZE) > XLOG_BLCKSZ) { + StartLsn -= SizeOfXLogShortPHD; + } + else if (StartLsn % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(StartLsn, DEFAULT_XLOG_SEG_SIZE) < XLOG_BLCKSZ) { + StartLsn -= SizeOfXLogLongPHD; + } + endptr = startptr + mtrLen ; + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, StartLsn); + /* walStart */ + memcpy(&output_message.data[1], + tmpbuf.data, sizeof(int64)); + EndLsn = StartLsn+dLen; + resetStringInfo(&tmpbuf); + pq_sendint64(&tmpbuf, EndLsn); + /* walEnd */ + memcpy(&output_message.data[1 + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + resetStringInfo(&tmpbuf); + /* sendtime, filled in last */ + pq_sendint64(&tmpbuf, GetCurrentTimestamp()); + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); + pq_putmessage_noblock('d', output_message.data, output_message.len); + sentPtr = endptr; + + /* Update shared memory status */ + { + WalSnd *walsnd = MyWalSnd; + + SpinLockAcquire(&walsnd->mutex); + walsnd->sentPtr = sentPtr; + SpinLockRelease(&walsnd->mutex); + } + + /* Report progress of XLOG streaming in PS display */ + 
if (update_process_title) + { + char activitymsg[50]; + + snprintf(activitymsg, sizeof(activitymsg), "streaming %X/%X", + LSN_FORMAT_ARGS(EndLsn)); + set_ps_display(activitymsg); + } +} + +/* + * Send out the WAL in its normal physical/stored form. + * + * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, + * but not yet sent to the client, and buffer it in the libpq output + * buffer. + * + * If there is no unsent WAL remaining, WalSndCaughtUp is set to true, + * otherwise WalSndCaughtUp is set to false. + */ +static void +XLogSendPhysical(void) +{ + XLogRecPtr SendRqstPtr; + XLogRecPtr startptr; + XLogRecPtr endptr; + Size nbytes; + XLogSegNo segno; + WALReadError errinfo; + + /* If requested switch the WAL sender to the stopping state. */ + if (got_STOPPING) + WalSndSetState(WALSNDSTATE_STOPPING); + + if (streamingDoneSending) + { + WalSndCaughtUp = true; + return; + } + + /* Figure out how far we can safely send the WAL. */ + if (sendTimeLineIsHistoric) + { + /* + * Streaming an old timeline that's in this server's history, but is + * not the one we're currently inserting or replaying. It can be + * streamed up to the point where we switched off that timeline. + */ + SendRqstPtr = sendTimeLineValidUpto; + } + else if (am_cascading_walsender) + { + /* + * Streaming the latest timeline on a standby. + * + * Attempt to send all WAL that has already been replayed, so that we + * know it's valid. If we're receiving WAL through streaming + * replication, it's also OK to send any WAL that has been received + * but not replayed. + * + * The timeline we're recovering from can change, or we can be + * promoted. In either case, the current timeline becomes historic. We + * need to detect that so that we don't try to stream past the point + * where we switched to another timeline. 
We check for promotion or + * timeline switch after calculating FlushPtr, to avoid a race + * condition: if the timeline becomes historic just after we checked + * that it was still current, it's still be OK to stream it up to the + * FlushPtr that was calculated before it became historic. + */ + bool becameHistoric = false; + + SendRqstPtr = GetStandbyReplayRecPtr(); + + if (!RecoveryInProgress()) + { + /* + * We have been promoted. RecoveryInProgress() updated + * ThisTimeLineID to the new current timeline. + */ + am_cascading_walsender = false; + becameHistoric = true; + } + else + { + /* + * Still a cascading standby. But is the timeline we're sending + * still the one recovery is recovering from? ThisTimeLineID was + * updated by the GetStandbyReplayRecPtr() call above. + */ + if (sendTimeLine != ThisTimeLineID) + becameHistoric = true; + } + + if (becameHistoric) + { + /* + * The timeline we were sending has become historic. Read the + * timeline history file of the new timeline to see where exactly + * we forked off from the timeline we were sending. + */ + List *history; + + history = readTimeLineHistory(ThisTimeLineID); + sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI); + + Assert(sendTimeLine < sendTimeLineNextTLI); + list_free_deep(history); + + sendTimeLineIsHistoric = true; + + SendRqstPtr = sendTimeLineValidUpto; + } + } + else + { + /* + * Streaming the current timeline on a primary. + * + * Attempt to send all data that's already been written out and + * fsync'd to disk. We cannot go further than what's been written out + * given the current implementation of WALRead(). And in any case + * it's unsafe to send WAL that is not securely down to disk on the + * primary: if the primary subsequently crashes and restarts, standbys + * must not have applied any WAL that got lost on the primary. 
+ */ + SendRqstPtr = GetFlushRecPtr(); + } + + /* + * Record the current system time as an approximation of the time at which + * this WAL location was written for the purposes of lag tracking. + * + * In theory we could make XLogFlush() record a time in shmem whenever WAL + * is flushed and we could get that time as well as the LSN when we call + * GetFlushRecPtr() above (and likewise for the cascading standby + * equivalent), but rather than putting any new code into the hot WAL path + * it seems good enough to capture the time here. We should reach this + * after XLogFlush() runs WalSndWakeupProcessRequests(), and although that + * may take some time, we read the WAL flush pointer and take the time + * very close to together here so that we'll get a later position if it is + * still moving. + * + * Because LagTrackerWrite ignores samples when the LSN hasn't advanced, + * this gives us a cheap approximation for the WAL flush time for this + * LSN. + * + * Note that the LSN is not necessarily the LSN for the data contained in + * the present message; it's the end of the WAL, which might be further + * ahead. All the lag tracking machinery cares about is finding out when + * that arbitrary LSN is eventually reported as written, flushed and + * applied, so that it can measure the elapsed time. + */ + LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp()); + + /* + * If this is a historic timeline and we've reached the point where we + * forked to the next timeline, stop streaming. + * + * Note: We might already have sent WAL > sendTimeLineValidUpto. The + * startup process will normally replay all WAL that has been received + * from the primary, before promoting, but if the WAL streaming is + * terminated at a WAL page boundary, the valid portion of the timeline + * might end in the middle of a WAL record. We might've already sent the + * first half of that partial WAL record to the cascading standby, so that + * sentPtr > sendTimeLineValidUpto. 
That's OK; the cascading standby can't + * replay the partial WAL record either, so it can still follow our + * timeline switch. + */ + if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr) + { + /* close the current file. */ + // if (xlogreader->seg.ws_file >= 0) + // wal_segment_close(xlogreader); /* Send CopyDone */ pq_putmessage_noblock('c', NULL, 0); @@ -2762,28 +3386,33 @@ XLogSendPhysical(void) pq_sendint64(&output_message, startptr); /* dataStart */ pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ +#ifndef PG_NOREPLAY + pq_sendint64(&output_message, nbytes); /* walEnd */ +#endif pq_sendint64(&output_message, 0); /* sendtime, filled in last */ /* * Read the log directly into the output buffer to avoid extra memcpy * calls. */ +#ifdef PG_NOREPLAY enlargeStringInfo(&output_message, nbytes); +#endif retry: - if (!WALRead(xlogreader, - &output_message.data[output_message.len], - startptr, - nbytes, - xlogreader->seg.ws_tli, /* Pass the current TLI because - * only WalSndSegmentOpen controls - * whether new TLI is needed. */ - &errinfo)) - WALReadRaiseError(&errinfo); + // if (!He3DBWALRead(xlogreader, + // &output_message.data[output_message.len], + // startptr, + // nbytes, + // xlogreader->currTLI, /* Pass the current TLI because + // * only WalSndSegmentOpen controls + // * whether new TLI is needed. */ + // &errinfo)) + // WALReadRaiseError(&errinfo); /* See logical_read_xlog_page(). 
*/ - XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); - CheckXLogRemoved(segno, xlogreader->seg.ws_tli); + // XLByteToSeg(startptr, segno, xlogreader->segcxt.ws_segsize); + // CheckXLogRemoved(segno, xlogreader->seg.ws_tli); /* * During recovery, the currently-open WAL file might be replaced with the @@ -2801,15 +3430,15 @@ retry: walsnd->needreload = false; SpinLockRelease(&walsnd->mutex); - if (reload && xlogreader->seg.ws_file >= 0) + if (reload) { - wal_segment_close(xlogreader); - goto retry; } } - + +#ifdef PG_NOREPLAY output_message.len += nbytes; +#endif output_message.data[output_message.len] = '\0'; /* @@ -2817,8 +3446,13 @@ retry: */ resetStringInfo(&tmpbuf); pq_sendint64(&tmpbuf, GetCurrentTimestamp()); +#ifdef PG_NOREPLAY memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64)], tmpbuf.data, sizeof(int64)); +#else + memcpy(&output_message.data[1 + sizeof(int64) + sizeof(int64) + sizeof(int64)], + tmpbuf.data, sizeof(int64)); +#endif pq_putmessage_noblock('d', output_message.data, output_message.len); @@ -2994,6 +3628,28 @@ GetStandbyFlushRecPtr(void) return result; } +static XLogRecPtr +GetStandbyReplayRecPtr(void) +{ + XLogRecPtr replayPtr; + XLogRecPtr consistPtr; + XLogRecPtr result; + TimeLineID replayTLI; + + /* + * We can safely send what's already been replayed. + */ + + replayPtr = GetXLogReplayRecPtr(&replayTLI); + consistPtr = GetXLogPushToDisk(); + Assert(consistPtr <= replayPtr); + + ThisTimeLineID = replayTLI; + + + return consistPtr; +} + /* * Request walsenders to reload the currently-open WAL file */ @@ -3484,7 +4140,11 @@ WalSndKeepalive(bool requestReply) /* construct the message... */ resetStringInfo(&output_message); pq_sendbyte(&output_message, 'k'); - pq_sendint64(&output_message, sentPtr); + if (EndLsn != 0) { + pq_sendint64(&output_message, sentPtr); + } else { + pq_sendint64(&output_message, EndLsn); + } pq_sendint64(&output_message, GetCurrentTimestamp()); pq_sendbyte(&output_message, requestReply ? 
1 : 0); @@ -3528,6 +4188,9 @@ WalSndKeepaliveIfNecessary(void) /* Try to flush pending output to the client */ if (pq_flush_if_writable() != 0) WalSndShutdown(); + // } else { + // WalSndKeepalive(true); + // pg_usleep(10000); } } diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index a299be1..f9054e2 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -141,6 +141,22 @@ InitBufferPool(void) /* Init other shared buffer-management stuff */ StrategyInitialize(!foundDescs); + isPromoteIsTriggered = (bool *) + ShmemInitStruct("isPromoteIsTriggered", + sizeof(bool), &foundBufCkpt); + memset(isPromoteIsTriggered, 0, sizeof(bool)); + + /* Init preCacheNodes arrays */ + preCacheNodesPtr = (Oid *) + ShmemInitStruct("preCacheNodesPtr", + NPreCacheNodes * sizeof(Oid), &foundBufCkpt); + memset(preCacheNodesPtr, 0, NPreCacheNodes * sizeof(Oid)); + + preCacheNodesCountPtr = (uint16 *) + ShmemInitStruct("preCacheNodesCountPtr", + sizeof(uint16), &foundBufCkpt); + memset(preCacheNodesCountPtr, 0, sizeof(uint16)); + /* Initialize per-backend file flush context */ WritebackContextInit(&BackendWritebackContext, &backend_flush_after); @@ -167,6 +183,7 @@ BufferShmemSize(void) /* size of stuff controlled by freelist.c */ size = add_size(size, StrategyShmemSize()); + size = add_size(size, sizeof(bool)); /* size of I/O condition variables */ size = add_size(size, mul_size(NBuffers, @@ -177,5 +194,8 @@ BufferShmemSize(void) /* size of checkpoint sort array in bufmgr.c */ size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem))); + /* size of preCacheNodes */ + size = add_size(size, mul_size(NPreCacheNodes, sizeof(Oid)) + sizeof(uint16)); + return size; } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 7caf959..4e60c82 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -20,7 +20,7 @@ * is using it. 
* * ReleaseBuffer() -- unpin a buffer - * + *f * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty". * The disk write is delayed until buffer replacement or checkpoint. * @@ -49,6 +49,7 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "storage/standby.h" +#include "storage/md.h" #include "utils/memdebug.h" #include "utils/ps_status.h" #include "utils/rel.h" @@ -57,8 +58,11 @@ #include "access/xlog_internal.h" #include "access/pushpage.h" #include "utils/memutils.h" +#include "utils/hfs.h" +#include "storage/he3db_logindex.h" +#include "access/ringbuffer.h" - +bool *isPromoteIsTriggered; /* Note: these two macros only work on shared buffers, not local ones! */ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) @@ -492,7 +496,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, - bool *foundPtr); + bool *foundPtr,bool *exist); static BufferDesc *He3DBBufferAlloc_replay(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, @@ -515,7 +519,9 @@ static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg); static void he3db_apply_page(BufferDesc *bufHdr, char *pageXlogBuf, int nbytes); static int he3db_apply_one_record(XLogReaderState *state, Buffer buffer, char *pageXlogBuf); - +bool PinBufferForPush(void *buf, BufferAccessStrategy strategy) { + return PinBuffer(buf,strategy); +} /* * Implementation of PrefetchBuffer() for shared buffers. */ @@ -788,6 +794,18 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, * miss. 
*/ pgstat_count_buffer_read(reln); + /* precache or unprecache index */ + if (isPreCacheIndex && !isPreCacheIndexDone && preCacheNodeOid == reln->rd_node.relNode) + { + BlockNumber precacheblocks; + precacheblocks = smgrnblocks(reln->rd_smgr, forkNum); + for(BlockNumber i=0; i < precacheblocks; i++) + { + ReleaseBuffer(ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence, forkNum, i, mode, strategy, &hit)); + } + isPreCacheIndexDone = true; + } + buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence, forkNum, blockNum, mode, strategy, &hit); if (hit) @@ -836,8 +854,7 @@ He3DBReadBufferWithoutRelcache_replay(RelFileNode rnode, ForkNumber forkNum, return He3DBReadBuffer_replay(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum, mode, strategy, hit); } - - + /* * ReadBuffer_common -- common logic for all ReadBuffer variants * @@ -850,17 +867,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, { BufferDesc *bufHdr; Block bufBlock; + bool exist = false; bool found; bool isExtend; bool isLocalBuf = SmgrIsTemp(smgr); /* he3db: local tem buffer for pageXlog */ - char *pageXlogBuf; - /* he3db: Bytes he3dbsmgrread actually read */ - int nbytes; - *hit = false; - pageXlogBuf = NULL; - /* Make sure we will have room to remember the buffer pin */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); @@ -893,11 +905,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, relpath(smgr->smgr_rnode, forkNum), P_NEW))); } - + if (isLocalBuf) { bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found); - if (found) + if (found) pgBufferUsage.local_blks_hit++; else if (isExtend) pgBufferUsage.local_blks_written++; @@ -912,7 +924,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * not currently in memory. 
*/ bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum, - strategy, &found); + strategy, &found,&exist); if (found) pgBufferUsage.shared_blks_hit++; else if (isExtend) @@ -920,6 +932,16 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG || mode == RBM_ZERO_ON_ERROR) pgBufferUsage.shared_blks_read++; + // for precache: buf not be eliminated by clock algorithm + if (needPreCacheEscape && preCacheNodeOid == bufHdr->tag.rnode.relNode) + { + bufHdr->isPreCacheEscape=true; + } + // for unprecache: buf be eliminated by clock algorithm + if (needUnpreCacheEscape && preCacheNodeOid == bufHdr->tag.rnode.relNode) + { + bufHdr->isPreCacheEscape=false; + } } /* At this point we do NOT hold any locks. */ @@ -1012,7 +1034,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } while (!StartBufferIO(bufHdr, true)); } } - + /* * if we have gotten to this point, we have allocated a buffer for the * page but its contents are not yet valid. IO_IN_PROGRESS is set for it, @@ -1028,13 +1050,44 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */ bufBlock = isLocalBuf ? 
LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); - + XLogRecPtr replayLsn = InvalidXLogRecPtr; + TimeLineID tli; + int lsnLen = 0; + bool outdata = true; + Bufrd tWalRecord; + tWalRecord.count = 0; + tWalRecord.buf = NULL; + LsnNode* head = NULL; + char* pageXlogPtr = NULL; + int nbytes = 0; + walRecord_t walRecord; + walRecord.cap = 0; + walRecord.buf = NULL; + walRecord.count = 0; + if (isExtend) { /* new buffers are zero-filled */ + MemSet((char *) bufBlock, 0, BLCKSZ); - /* don't set checksum for all-zero page */ smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false); + + /* don't set checksum for all-zero page */ + + /* for new page precache */ + if (*preCacheNodesCountPtr > 0) + { + uint16 preCacheNodei = 0; + while (preCacheNodei < *preCacheNodesCountPtr) + { + if (preCacheNodesPtr[preCacheNodei] == bufHdr->tag.rnode.relNode) + { + bufHdr->isPreCacheEscape=true; + break; + } + preCacheNodei++; + } + } /* * NB: we're *not* doing a ScheduleBufferTagForWriteback here; @@ -1055,52 +1108,76 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, { instr_time io_start, io_time; - /* - * he3db: alloc local tem buffer for pageXlog - * first 8K is page data, after 8k is xlog data - * He3FS abandon - pageXlogBuf = (char *) palloc_extended(PAGEXLOG_BLCKSZ, MCXT_ALLOC_NO_OOM); - if (!pageXlogBuf) - { - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("invalid pageXlog local buffer alloc"))); - } - */ if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); - - /* he3db: read page and xlog Associated with it */ - if (EnableHotStandby == true) - { - /* propeller and slave instance */ - //nbytes = he3dbsmgrread(smgr, forkNum, blockNum, bufBlock, InvalidXLogRecPtr); - XLogRecPtr replayLsn = GetXLogReplayRecPtr(NULL); - if (IsBootstrapProcessingMode() || InitdbSingle) { - replayLsn = GetXLogWriteRecPtr(); - } - nbytes = he3dbsmgrread(smgr, forkNum, blockNum, &pageXlogBuf,replayLsn); - memcpy((char *) bufBlock, pageXlogBuf, 
BLCKSZ); - /* propeller instance no page xlog replay */ - if (push_standby) - { - free(pageXlogBuf); - pageXlogBuf = NULL; - } - } - else - { - /* primary instance */ - XLogRecPtr replayLsn = GetXLogWriteRecPtr(); - nbytes = he3dbsmgrread(smgr, forkNum, blockNum, &pageXlogBuf, replayLsn); - memcpy((char *) bufBlock, pageXlogBuf, BLCKSZ); - - if (nbytes <= BLCKSZ) - { - free(pageXlogBuf); - pageXlogBuf = NULL; + + if ((EnableHotStandby == true && *isPromoteIsTriggered == false) || InRecovery) { + if (IsBootstrapProcessingMode() == true || InitdbSingle == true) { + smgrread(smgr, forkNum, blockNum, (char *) bufBlock); + } else { + replayLsn = GetXLogReplayRecPtr(&tli); + if (exist == true) { + BufferTag pageTag; + pageTag.rnode = smgr->smgr_rnode.node; + pageTag.forkNum = forkNum; + pageTag.blockNum = blockNum; + // XLogRecPtr pageLsn = BufferGetLSN(bufHdr); + XLogRecPtr pageLsn = Max(GetXLogPushToDisk(), BufferGetLSN(bufHdr)); + head = GetLogIndexByPage(&pageTag,pageLsn,replayLsn); + if ((EnableHotStandby == true && *isPromoteIsTriggered == false) && push_standby == false) { + if (head->next != NULL) { + tWalRecord = ReadWalsByPage(pageTag.rnode.dbNode,pageTag.rnode.relNode,forkNum,blockNum,tli,head); + } + } else { + LsnNode* next = head->next; + if (next != NULL) { + walRecord.cap = 8192; + walRecord.buf = malloc(walRecord.cap); + } + while(next!=NULL) { + int count = walRecordQuery(&walRecord.buf,&walRecord.count,&walRecord.cap,next->lsn); + if (count == -1) { + elog(FATAL,"======walRecordQuery query wal Faild %X/%X===2===",LSN_FORMAT_ARGS(next->lsn)); + } + next = next->next; + } + } + } else { + nbytes = he3db_mdread(smgr, forkNum, blockNum, &pageXlogPtr,true, replayLsn); + if (nbytes < BLCKSZ) { + elog(FATAL,"smgrextend=>he3dbsmgrread rel %d flk %d blk %d nbytes %d",smgr->smgr_rnode.node.relNode,forkNum, blockNum,nbytes); + } else { + memcpy(bufBlock,pageXlogPtr,BLCKSZ); + if (push_standby == true || EnableHotStandby == false || *isPromoteIsTriggered) { + 
BufferTag pageTag; + pageTag.rnode = smgr->smgr_rnode.node; + pageTag.forkNum = forkNum; + pageTag.blockNum = blockNum; + // XLogRecPtr pageLsn = BufferGetLSN(bufHdr); + XLogRecPtr pageLsn = Max(GetXLogPushToDisk(), BufferGetLSN(bufHdr)); + head = GetLogIndexByPage(&pageTag,pageLsn,replayLsn); + if (head->next!=NULL) { + LsnNode* next = head->next; + if (next != NULL) { + walRecord.cap = 8192; + walRecord.buf = malloc(walRecord.cap); + } + while(next!=NULL) { + int count = walRecordQuery(&walRecord.buf,&walRecord.count,&walRecord.cap,next->lsn); + if (count == -1) { + elog(FATAL,"======walRecordQuery query wal Faild %X/%X===3===",LSN_FORMAT_ARGS(next->lsn)); + } + next = next->next; + } + } + } + } + } + bufHdr->pageIsVaild = true; } + } else { + smgrread(smgr, forkNum, blockNum, (char *) bufBlock); } if (track_io_timing) @@ -1123,12 +1200,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, blockNum, relpath(smgr->smgr_rnode, forkNum)))); MemSet((char *) bufBlock, 0, BLCKSZ); - /* He3DB: He3FS */ - if(pageXlogBuf != NULL) - { - free(pageXlogBuf); - pageXlogBuf = NULL; - } } else ereport(ERROR, @@ -1150,11 +1221,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * that we cannot use LockBuffer() or LockBufferForCleanup() here, because * they assert that the buffer is already valid.) */ + if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) && !isLocalBuf) { LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); - } + } if (isLocalBuf) { @@ -1166,24 +1238,40 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } else { - + //todo: read related wals in standby instance. /* * He3DB: page-replay. * * apply logs to this old page when read from disk. 
* */ - if (pageXlogBuf) + if (pageXlogPtr != NULL || tWalRecord.count != 0 || walRecord.count != 0) { - /* UnpinBuffer for xlog replay */ - //UnpinBuffer(bufHdr, true); - - he3db_apply_page(bufHdr, pageXlogBuf + BLCKSZ, nbytes - BLCKSZ); - - free(pageXlogBuf); - - /* re pin */ - //PinBuffer(bufHdr, strategy); + XLogRecPtr pageLsn = BufferGetLSN(bufHdr); + char *xlogStart = NULL; + if (pageXlogPtr != NULL) { + xlogStart = pageXlogPtr + BLCKSZ; + nbytes = nbytes - BLCKSZ; + } else if (tWalRecord.count != 0) { + xlogStart = tWalRecord.buf; + nbytes = tWalRecord.count; + } + if (walRecord.count != 0) { + xlogStart = walRecord.buf; + nbytes = walRecord.count; + } + he3db_apply_page(bufHdr, xlogStart, nbytes); + if (pageXlogPtr != NULL) { + free(pageXlogPtr); + pageXlogPtr = NULL; + } else if (tWalRecord.count != 0) { + free_dataRead(tWalRecord.buf,tWalRecord.count,tWalRecord.cap); + FreeLsnNode(head); + } + if (walRecord.count != 0) { + free(walRecord.buf); + FreeLsnNode(head); + } } /* He3DB end */ /* Set BM_VALID, terminate IO, and wake up any waiters */ @@ -1336,7 +1424,7 @@ static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, - bool *foundPtr) + bool *foundPtr,bool *exist) { BufferTag newTag; /* identity of requested block */ uint32 newHash; /* hash value for newTag */ @@ -1391,6 +1479,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * If we get here, previous attempts to read the buffer must * have failed ... but we shall bravely try again. 
*/ + if (buf->pageIsVaild == false) { + *exist = false; + } else { + *exist = true; + } *foundPtr = false; } } @@ -1487,12 +1580,12 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, smgr->smgr_rnode.node.relNode); /* he3db: FlushBuffer to He3DBFlushBuffer*/ - if (push_standby == true) { + // if (push_standby == true) { + // master/slave/push standby need to flush dirty page to release space FlushBuffer(buf, NULL); - } else { - - He3DBFlushBuffer(buf, NULL); - } + // } else { + // He3DBFlushBuffer(buf, NULL); + // } LWLockRelease(BufferDescriptorGetContentLock(buf)); @@ -1610,6 +1703,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * If we get here, previous attempts to read the buffer * must have failed ... but we shall bravely try again. */ + if (buf->pageIsVaild == false) { + *exist = false; + } else { + *exist = true; + } *foundPtr = false; } } @@ -1662,7 +1760,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE; else buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; - + buf->pageIsVaild = false; UnlockBufHdr(buf, buf_state); if (oldPartitionLock != NULL) @@ -1680,8 +1778,14 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * to read it before we did, so there's nothing left for BufferAlloc() to * do. 
*/ - if (StartBufferIO(buf, true)) + if (StartBufferIO(buf, true)) { + if (buf->pageIsVaild == false) { + *exist = false; + } else { + *exist = true; + } *foundPtr = false; + } else *foundPtr = true; @@ -1897,8 +2001,9 @@ MarkBufferDirty(Buffer buffer) bufHdr = GetBufferDescriptor(buffer - 1); Assert(BufferIsPinned(buffer)); - Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), - LW_EXCLUSIVE)); + //this assert will crash for mode == RBM_NORMAL_VALID + // Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), + // LW_EXCLUSIVE)); old_buf_state = pg_atomic_read_u32(&bufHdr->state); for (;;) @@ -2077,40 +2182,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) ref->refcount++; Assert(ref->refcount > 0); - // for precache: buf not be eliminated by clock algorithm - if (needPreCacheEscape) - { - uint32 buf_state; - uint32 old_buf_state; - - old_buf_state = pg_atomic_read_u32(&buf->state); - for (;;) - { - if (old_buf_state & BM_LOCKED) - old_buf_state = WaitBufHdrUnlocked(buf); - - buf_state = old_buf_state; - - /* increase refcount */ - buf_state += BUF_REFCOUNT_ONE; - - if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, - buf_state)) - { - result = (buf_state & BM_VALID) != 0; - - /* - * Assume that we acquired a buffer pin for the purposes of - * Valgrind buffer client checks (even in !result case) to - * keep things simple. Buffers that are unsafe to access are - * not generally guaranteed to be marked undefined or - * non-accessible in any case. 
- */ - VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); - break; - } - } - } ResourceOwnerRememberBuffer(CurrentResourceOwner, b); return result; @@ -2164,11 +2235,6 @@ PinBuffer_Locked(BufferDesc *buf) buf_state = pg_atomic_read_u32(&buf->state); Assert(buf_state & BM_LOCKED); buf_state += BUF_REFCOUNT_ONE; - // for precache: buf not be eliminated by clock algorithm - if (needPreCacheEscape) - { - buf_state += BUF_REFCOUNT_ONE; - } UnlockBufHdr(buf, buf_state); b = BufferDescriptorGetBuffer(buf); @@ -2912,11 +2978,11 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) PinBuffer_Locked(bufHdr); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); /* he3db: FlushBuffer to He3DBFlushBuffer*/ - if (push_standby == true) { + //if (push_standby == true) { FlushBuffer(bufHdr, NULL); - } else { + /*} else { He3DBFlushBuffer(bufHdr, NULL); - } + }*/ LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); @@ -3257,6 +3323,14 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) * buffer, other processes might be updating hint bits in it, so we must * copy the page to private storage if we do checksumming. */ + // PageKey pageKey; + // pageKey.relfileNode.dbNode = buf->tag.rnode.dbNode;; + // pageKey.relfileNode.relNode = buf->tag.rnode.relNode; + // pageKey.relfileNode.spcNode = buf->tag.rnode.spcNode; + + // pageKey.blkNo = buf->tag.blockNum; + // pageKey.forkNo = buf->tag.forkNum; + // pageKey.pageLsn = recptr; bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); if (track_io_timing) @@ -3265,11 +3339,15 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) /* * bufToWrite is either the shared buffer or a copy, as appropriate. 
*/ - smgrwrite(reln, + he3dbsmgrwrite(reln, buf->tag.forkNum, buf->tag.blockNum, bufToWrite, - false); + false, recptr); + + + //将page放到本地盘 + // EvictOnePageOutOfMemory(pageKey, bufToWrite); if (track_io_timing) { @@ -4047,12 +4125,13 @@ FlushRelationBuffers(Relation rel) error_context_stack = &errcallback; PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + XLogRecPtr lsn = BufferGetLSN(bufHdr); - smgrwrite(rel->rd_smgr, + he3dbsmgrwrite(rel->rd_smgr, bufHdr->tag.forkNum, bufHdr->tag.blockNum, localpage, - false); + false, lsn); buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED); pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); @@ -5008,7 +5087,18 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) if (clear_dirty && !(buf_state & BM_JUST_DIRTIED)) buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED); - buf_state |= set_flag_bits; + if (!(IsBootstrapProcessingMode() == true || InitdbSingle == true) && (InRecovery || (EnableHotStandby && *isPromoteIsTriggered == false)) && set_flag_bits == BM_VALID) + { + XLogRecPtr pageLsn = BufferGetLSN(buf); + XLogRecPtr replayLsn = GetXLogReplayRecPtr(NULL); + bool hasdata = CheckBufTagExistByLsnRange(&buf->tag, pageLsn, replayLsn); + if (hasdata) + buf_state &= ~BM_VALID; + else + buf_state |= set_flag_bits; + } else { + buf_state |= set_flag_bits; + } UnlockBufHdr(buf, buf_state); if (!bulk_io_is_in_progress) { @@ -5435,8 +5525,11 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation) * He3DB: page-replay. 
*/ static void -he3db_apply_page(BufferDesc *bufHdr, char *pageXlogBuf, int nbytes) +he3db_apply_page(BufferDesc *bufHdr, char *pageXlogBuf, int nbyte) { + if (nbyte == 0) { + return; + } XLogReaderState *state; Buffer buffer; @@ -5456,7 +5549,7 @@ he3db_apply_page(BufferDesc *bufHdr, char *pageXlogBuf, int nbytes) state->tag = &tag; state->buffer = buffer; memcpy(state->tag,&buf_desc->tag,sizeof(buf_desc->tag)); - while (nbytes > 0) + while (nbyte > 0) { int recordLen; recordLen = he3db_apply_one_record(state, buffer, pageXlogBuf); @@ -5465,7 +5558,7 @@ he3db_apply_page(BufferDesc *bufHdr, char *pageXlogBuf, int nbytes) break; } pageXlogBuf += recordLen; - nbytes -= recordLen; + nbyte -= recordLen; } /* set page lsn to read point lsn */ diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 6be8047..62f2a80 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -324,7 +324,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) */ local_buf_state = LockBufHdr(buf); - if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) + if (buf->isPreCacheEscape == false && BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) { if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0) { diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index eb0fadc..4513cec 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -223,13 +223,15 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, oreln = smgropen(bufHdr->tag.rnode, MyBackendId); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + XLogRecPtr lsn = PageGetLSN(localpage); /* And write... 
*/ - smgrwrite(oreln, + he3dbsmgrwrite(oreln, bufHdr->tag.forkNum, bufHdr->tag.blockNum, localpage, - false); + false, + lsn); /* Mark not-dirty now in case we error out below */ buf_state &= ~BM_DIRTY; diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile index 9135ab5..92ef9b4 100644 --- a/src/backend/storage/file/Makefile +++ b/src/backend/storage/file/Makefile @@ -11,7 +11,7 @@ subdir = src/backend/storage/file top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -override CPPFLAGS := -lfs $(CPPFLAGS) +override CPPFLAGS := -lrust_log $(CPPFLAGS) OBJS = \ buffile.o \ diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 981c09c..bac152a 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -72,12 +72,12 @@ #include "postgres.h" -#include "utils/ufs.h" #include #include #include #include #include +#include #ifndef WIN32 #include #endif @@ -97,10 +97,13 @@ #include "pgstat.h" #include "port/pg_iovec.h" #include "portability/mem.h" +#include "postmaster/secondbuffer.h" #include "storage/fd.h" #include "storage/ipc.h" +#include "storage/spin.h" #include "utils/guc.h" #include "utils/resowner_private.h" +//#include "utils/hfs.h" /* He3DB: He3FS */ //#include "storage/iport.h" @@ -192,6 +195,8 @@ int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; #define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */ #define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */ +extern bool PageIsHot(); + typedef struct vfd { int fd; /* current FD, or VFD_CLOSED if none */ @@ -254,7 +259,7 @@ typedef struct { FILE *file; DIR *dir; - int fd; + int64_t fd; } desc; } AllocateDesc; @@ -348,8 +353,8 @@ static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); static int fsync_parent_path(const char *fname, int elevel); /* He3DB: He3FS */ -ssize_t he3fs_pread(int fd, void **buf, off_t offset, XLogRecPtr lsn, uint16 type); -ssize_t 
he3fs_pwrite(int fd, const void *buf, size_t size, off_t offset); +//ssize_t he3fs_pread(int64_t fd, void **buf, off_t offset, XLogRecPtr lsn, uint16 type, uint32_t dbid, uint32_t relid, uint32_t segno, uint32_t forkno); +//ssize_t he3fs_pwrite(int64_t fd, const void *buf, size_t size, off_t offset); /* @@ -979,12 +984,7 @@ count_usable_fds(int max_to_probe, int *usable_fds, int *already_open) /* release the files we opened */ for (j = 0; j < used; j++) { - //close(fd[j]); - if(close(fd[j]) != 0) - { - /* He3DB: Add He3FS Compatibility*/ - he3Close(fd[j]); - } + close(fd[j]); } pfree(fd); @@ -1057,11 +1057,11 @@ BasicOpenFile(const char *fileName, int fileFlags) * * Modified points: He3FS replace OSFS. */ -int -He3DBBasicOpenFile(const char *fileName, int fileFlags) -{ - return He3DBBasicOpenFilePerm(fileName, fileFlags | PG_O_DIRECT, pg_file_create_mode); -} +// int64_t +// He3DBBasicOpenFile(const char *fileName, int fileFlags) +// { +// return He3DBBasicOpenFilePerm(fileName, fileFlags | PG_O_DIRECT, pg_file_create_mode); +// } /* * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed @@ -1112,43 +1112,53 @@ tryAgain: * * Modified points: He3FS replace OSFS. */ -int -He3DBBasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) -{ - fdInfo fi; +// int64_t +// He3DBBasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) +// { +// IOResult ioResult; -tryAgain: - //fd = open(fileName, fileFlags, fileMode); - /* He3DB: He3FS replace OSFS */ - if (push_standby) - { - /* propeller instance */ - fi = he3Open(fileName, fileFlags, fileMode, 3); - } - else - { - /* primary instance */ - fi = he3Open(fileName, fileFlags, fileMode, 1); - } - errno = fi.errNo; - if (fi.fd >= 0) - return fi.fd; /* success! 
*/ +// tryAgain: +// //fd = open(fileName, fileFlags, fileMode); +// /* He3DB: He3FS replace OSFS */ +// // if (push_standby) +// // { +// // /* propeller instance */ +// // fi = he3Open(fileName, fileFlags, fileMode, 3); +// // } +// // else +// // { +// // /* primary instance */ +// // fi = he3Open(fileName, fileFlags, fileMode, 1); +// // } +// // errno = fi.errNo; +// // if (fi.fd >= 0) +// // return fi.fd; /* success! */ - if (errno == EMFILE || errno == ENFILE) - { - int save_errno = errno; - ereport(LOG, - (errcode(ERRCODE_INSUFFICIENT_RESOURCES), - errmsg("out of file descriptors: %m; release and retry"))); - errno = 0; - if (ReleaseLruFile()) - goto tryAgain; - errno = save_errno; - } +// ioResult = openfs(fileName, fileFlags); + +// if (ioResult.fd >= 0){ +// return ioResult.fd; +// } - return -1; /* failure */ -} +// errno = ioResult.error; +// if (errno == EMFILE || errno == ENFILE) +// { +// int save_errno = errno; + +// ereport(LOG, +// (errcode(ERRCODE_INSUFFICIENT_RESOURCES), +// errmsg("out of file descriptors: %m; release and retry"))); +// errno = 0; +// if (ReleaseLruFile()) +// goto tryAgain; +// errno = save_errno; +// } + + + +// return -1; /* failure */ +// } /* * AcquireExternalFD - attempt to reserve an external file descriptor @@ -1280,9 +1290,7 @@ LruDelete(File file) * Close the file. We aren't expecting this to fail; if it does, better * to leak the FD than to mess up our internal state. */ - //if (close(vfdP->fd) != 0) - /* He3DB: Add He3FS Compatibility*/ - if (he3Close(vfdP->fd) != 0 && close(vfdP->fd) != 0) + if (close(vfdP->fd) != 0) elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG), "could not close file \"%s\": %m", vfdP->fileName); vfdP->fd = VFD_CLOSED; @@ -1336,7 +1344,7 @@ LruInsert(File file) * overall system file table being full. So, be prepared to release * another FD if necessary... 
*/ - vfdP->fd = He3DBBasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags, + vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags, vfdP->fileMode); if (vfdP->fd < 0) { @@ -1359,49 +1367,49 @@ LruInsert(File file) } /* returns 0 on success, -1 on re-open failure (with errno set) */ -int -He3LruInsert(File file) -{ - Vfd *vfdP; +// int +// He3LruInsert(File file) +// { +// Vfd *vfdP; - Assert(file != 0); +// Assert(file != 0); - DO_DB(elog(LOG, "LruInsert %d (%s)", - file, VfdCache[file].fileName)); +// DO_DB(elog(LOG, "LruInsert %d (%s)", +// file, VfdCache[file].fileName)); - vfdP = &VfdCache[file]; +// vfdP = &VfdCache[file]; - if (FileIsNotOpen(file)) - { - /* Close excess kernel FDs. */ - ReleaseLruFiles(); +// if (FileIsNotOpen(file)) +// { +// /* Close excess kernel FDs. */ +// ReleaseLruFiles(); - /* - * The open could still fail for lack of file descriptors, eg due to - * overall system file table being full. So, be prepared to release - * another FD if necessary... - */ - vfdP->fd = He3DBBasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags, - vfdP->fileMode); - if (vfdP->fd < 0) - { - DO_DB(elog(LOG, "re-open failed: %m")); - return -1; - } - else - { - ++nfile; - } - } +// /* +// * The open could still fail for lack of file descriptors, eg due to +// * overall system file table being full. So, be prepared to release +// * another FD if necessary... +// */ +// vfdP->fd = He3DBBasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags, +// vfdP->fileMode); +// if (vfdP->fd < 0) +// { +// DO_DB(elog(LOG, "re-open failed: %m")); +// return -1; +// } +// else +// { +// ++nfile; +// } +// } - /* - * put it at the head of the Lru ring - */ +// /* +// * put it at the head of the Lru ring +// */ - Insert(file); +// Insert(file); - return 0; -} +// return 0; +// } /* * Release one kernel FD by closing the least-recently-used VFD. 
*/ @@ -1549,38 +1557,38 @@ FileAccess(File file) } /* returns 0 on success, -1 on re-open failure (with errno set) */ -int -He3FileAccess(File file) -{ - int returnValue; +// int +// He3FileAccess(File file) +// { +// int returnValue; - DO_DB(elog(LOG, "FileAccess %d (%s)", - file, VfdCache[file].fileName)); +// DO_DB(elog(LOG, "FileAccess %d (%s)", +// file, VfdCache[file].fileName)); - /* - * Is the file open? If not, open it and put it at the head of the LRU - * ring (possibly closing the least recently used file to get an FD). - */ +// /* +// * Is the file open? If not, open it and put it at the head of the LRU +// * ring (possibly closing the least recently used file to get an FD). +// */ - if (FileIsNotOpen(file)) - { - returnValue = He3LruInsert(file); - if (returnValue != 0) - return returnValue; - } - else if (VfdCache[0].lruLessRecently != file) - { - /* - * We now know that the file is open and that it is not the last one - * accessed, so we need to move it to the head of the Lru ring. - */ +// if (FileIsNotOpen(file)) +// { +// returnValue = He3LruInsert(file); +// if (returnValue != 0) +// return returnValue; +// } +// else if (VfdCache[0].lruLessRecently != file) +// { +// /* +// * We now know that the file is open and that it is not the last one +// * accessed, so we need to move it to the head of the Lru ring. +// */ - Delete(file); - Insert(file); - } +// Delete(file); +// Insert(file); +// } - return 0; -} +// return 0; +// } /* * Called whenever a temporary file is deleted to report its size. */ @@ -1643,11 +1651,16 @@ PathNameOpenFile(const char *fileName, int fileFlags) * * Modified points: He3FS replace OSFS. 
*/ -File -He3DBPathNameOpenFile(const char *fileName, int fileFlags) -{ - return He3DBPathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode); -} +// File +// He3DBPathNameOpenFile(const char *fileName, int fileFlags) +// { +// File fd; +// // clock_t start1 = clock(); +// fd = He3DBPathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode); +// // clock_t end1 = clock(); +// // elog(LOG,"===He3DBPathNameOpenFile==fileName=%s=fileFlags=%d===%lu=",fileName,fileFlags,end1-start1); +// return fd; +// } /* * open a file in an arbitrary directory @@ -1715,59 +1728,59 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) * * Modified points: He3FS replace OSFS. */ -File -He3DBPathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) -{ - char *fnamecopy; - File file; - Vfd *vfdP; +// File +// He3DBPathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) +// { +// char *fnamecopy; +// File file; +// Vfd *vfdP; - DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o", - fileName, fileFlags, fileMode)); +// DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o", +// fileName, fileFlags, fileMode)); - /* - * We need a malloc'd copy of the file name; fail cleanly if no room. - */ - fnamecopy = strdup(fileName); - if (fnamecopy == NULL) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); +// /* +// * We need a malloc'd copy of the file name; fail cleanly if no room. +// */ +// fnamecopy = strdup(fileName); +// if (fnamecopy == NULL) +// ereport(ERROR, +// (errcode(ERRCODE_OUT_OF_MEMORY), +// errmsg("out of memory"))); - file = AllocateVfd(); - vfdP = &VfdCache[file]; +// file = AllocateVfd(); +// vfdP = &VfdCache[file]; - /* Close excess kernel FDs. */ - ReleaseLruFiles(); +// /* Close excess kernel FDs. 
*/ +// ReleaseLruFiles(); - /* He3DB: He3FS replace OSFS */ - vfdP->fd = He3DBBasicOpenFilePerm(fileName, fileFlags, fileMode); +// /* He3DB: He3FS replace OSFS */ +// vfdP->fd = He3DBBasicOpenFilePerm(fileName, fileFlags, fileMode); - if (vfdP->fd < 0) - { - int save_errno = errno; +// if (vfdP->fd < 0) +// { +// int save_errno = errno; - FreeVfd(file); - free(fnamecopy); - errno = save_errno; - return -1; - } - ++nfile; - DO_DB(elog(LOG, "PathNameOpenFile: success %d", - vfdP->fd)); +// FreeVfd(file); +// free(fnamecopy); +// errno = save_errno; +// return -1; +// } +// ++nfile; +// DO_DB(elog(LOG, "PathNameOpenFile: success %d", +// vfdP->fd)); - vfdP->fileName = fnamecopy; - /* Saved flags are adjusted to be OK for re-opening file */ - vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL); - vfdP->fileMode = fileMode; - vfdP->fileSize = 0; - vfdP->fdstate = 0x0; - vfdP->resowner = NULL; +// vfdP->fileName = fnamecopy; +// /* Saved flags are adjusted to be OK for re-opening file */ +// vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL); +// vfdP->fileMode = fileMode; +// vfdP->fileSize = 0; +// vfdP->fdstate = 0x0; +// vfdP->resowner = NULL; - Insert(file); +// Insert(file); - return file; -} +// return file; +// } /* * Create directory 'directory'. 
If necessary, create 'basedir', which must @@ -2107,8 +2120,7 @@ FileClose(File file) if (!FileIsNotOpen(file)) { /* close the file */ - //if (close(vfdP->fd) != 0) - if (he3Close(vfdP->fd) != 0 && close(vfdP->fd) != 0) + if (close(vfdP->fd) != 0) { /* * We may need to panic on failure to close non-temporary files; @@ -2184,6 +2196,99 @@ FileClose(File file) FreeVfd(file); } +// void +// He3DBFileClose(File file) +// { +// Vfd *vfdP; + +// Assert(FileIsValid(file)); + +// DO_DB(elog(LOG, "FileClose: %d (%s)", +// file, VfdCache[file].fileName)); + +// vfdP = &VfdCache[file]; + +// if (!FileIsNotOpen(file)) +// { +// /* close the file */ +// //if (close(vfdP->fd) != 0) +// if (closefs(vfdP->fd) != 0) +// { +// /* +// * We may need to panic on failure to close non-temporary files; +// * see LruDelete. +// */ +// elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG), +// "could not close file \"%s\": %m", vfdP->fileName); +// } + +// --nfile; +// vfdP->fd = VFD_CLOSED; + +// /* remove the file from the lru ring */ +// Delete(file); +// } + +// if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) +// { +// /* Subtract its size from current usage (do first in case of error) */ +// temporary_files_size -= vfdP->fileSize; +// vfdP->fileSize = 0; +// } + +// /* +// * Delete the file if it was temporary, and make a log entry if wanted +// */ +// if (vfdP->fdstate & FD_DELETE_AT_CLOSE) +// { +// struct stat filestats; +// int stat_errno; + +// /* +// * If we get an error, as could happen within the ereport/elog calls, +// * we'll come right back here during transaction abort. Reset the +// * flag to ensure that we can't get into an infinite loop. This code +// * is arranged to ensure that the worst-case consequence is failing to +// * emit log message(s), not failing to attempt the unlink. 
+// */ +// vfdP->fdstate &= ~FD_DELETE_AT_CLOSE; + + +// /* first try the stat() */ +// if (stat(vfdP->fileName, &filestats)) +// stat_errno = errno; +// else +// stat_errno = 0; + +// /* in any case do the unlink */ +// if (unlink(vfdP->fileName)) +// ereport(LOG, +// (errcode_for_file_access(), +// errmsg("could not delete file \"%s\": %m", vfdP->fileName))); + +// /* and last report the stat results */ +// if (stat_errno == 0) +// ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size); +// else +// { +// errno = stat_errno; +// ereport(LOG, +// (errcode_for_file_access(), +// errmsg("could not stat file \"%s\": %m", vfdP->fileName))); +// } +// } + +// /* Unregister it from the resource owner */ +// if (vfdP->resowner) +// ResourceOwnerForgetFile(vfdP->resowner, file); + +// /* +// * Return the Vfd slot to the free list +// */ +// FreeVfd(file); +// } + + /* * FilePrefetch - initiate asynchronous read of a given range of the file. * @@ -2244,60 +2349,108 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) pgstat_report_wait_end(); } -int -He3DBFileRead(File file, char **buffer, off_t offset, - uint32 wait_event_info, XLogRecPtr lsn) -{ - int returnCode; - Vfd *vfdP; +// int +// He3DBFileRead(File file, char **buffer, off_t offset, +// uint32 wait_event_info, XLogRecPtr lsn, +// BufferTag pageTag) +// { +// return 0; + //TODO 先从本地盘读取数据,如果存在则返回 +// PageKey pageKey; +// Bufrd bufrd; +// bufrd.count = 0; + +// pageKey.relfileNode.dbNode = pageTag.rnode.dbNode; +// pageKey.relfileNode.relNode = pageTag.rnode.relNode; +// pageKey.forkNo = pageTag.forkNum; +// pageKey.blkNo = pageTag.blockNum; +// pageKey.pageLsn = 0; +// pageKey.replyLsn = lsn; - Assert(FileIsValid(file)); +// bufrd = MoveOnePageToMemory(pageKey); +// if (bufrd.count > 0) +// { +// *buffer = (uint8_t *)malloc(bufrd.count); +// memcpy(buffer, bufrd.buf,bufrd.count); + +// if (push_standby) +// { +// Assert(bufrd.count == BLCKSZ); +// pageKey.pageLsn = 
PageGetLSN(bufrd.buf); +// LsnNode *head = GetLogIndexByPage(&pageTag, pageKey.pageLsn, pageKey.replyLsn); +// Bufrd result; +// result = ReadWalsByPage(pageKey.relfileNode.dbNode, pageKey.relfileNode.relNode, +// pageKey.forkNo, pageKey.blkNo, ThisTimeLineID, head); +// buffer = (uint8_t *)realloc(buffer, BLCKSZ + result.count); +// strcat(buffer,result.buf); +// //TODO free result +// free_dataRead(result.buf, result.count, result.cap); +// } +// free_dataRead(bufrd.buf, bufrd.count, bufrd.cap); +// // *buffer = bufrd.buf; +// return bufrd.count; +// } +// else +// { +// //TODO 如果本地盘不存在,则调用标准接口读取page,再调用tikv的借口获取范围的wal +// uint8_t *buf = (uint8_t *)malloc(BLCKSZ); - DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %p", - file, VfdCache[file].fileName, - (int64) offset, - buffer)); +// FileRead(file,buf,BLCKSZ,offset,wait_event_info); + +// pageKey.pageLsn = PageGetLSN(buf);; +// pageKey.replyLsn = lsn; - returnCode = FileAccess(file); - if (returnCode < 0) - return returnCode; +// LsnNode *head = GetLogIndexByPage(&pageTag, pageKey.pageLsn, pageKey.replyLsn); +// if (head->next != NULL) +// { +// Bufrd result; +// result = GetWalsFromDisk(pageKey); +// if (result.count == 0) { +// result = ReadWalsByPage(pageKey.relfileNode.dbNode,pageKey.relfileNode.relNode, +// pageKey.forkNo,pageKey.blkNo, ThisTimeLineID, head); +// } - vfdP = &VfdCache[file]; +// buf = (uint8_t *)realloc(buf, BLCKSZ + result.count); +// strcat(buf,result.buf); +// //TODO free result +// free_dataRead(result.buf, result.count, result.cap); +// *buffer = buf; +// return BLCKSZ + result.count; +// } +// return BLCKSZ; +// } +// } -retry: - pgstat_report_wait_start(wait_event_info); - returnCode = he3fs_pread(vfdP->fd, buffer, offset, lsn, DataRead); - pgstat_report_wait_end(); +int +MasterFileRead(char *buffer,uint32_t dbid, uint32_t relid, uint32_t forkno, uint32_t blockno){ + OriginDPageKey odpk; - if (returnCode < 0) + PageKey pageKey; + Bufrd *bufrd = NULL; + bufrd = (Bufrd 
*)malloc(sizeof(Bufrd)); + bufrd->count = 0; + bufrd->cap = 0; + bufrd->buf = buffer; + int count = 0; + + pageKey.relfileNode.dbNode = dbid; + pageKey.relfileNode.relNode = relid; + pageKey.forkNo = forkno; + pageKey.blkNo = blockno; + pageKey.pageLsn = 0; + pageKey.replyLsn = GetXLogWriteRecPtr(); + + odpk.pk = pageKey; + odpk.opration = (int)EVICT; + GetPageFromCurrentNode(pageKey,bufrd); + count = bufrd->count; + if (count > 0) { - /* - * Windows may run out of kernel buffers and return "Insufficient - * system resources" error. Wait a bit and retry to solve it. - * - * It is rumored that EINTR is also possible on some Unix filesystems, - * in which case immediate retry is indicated. - */ -#ifdef WIN32 - DWORD error = GetLastError(); - - switch (error) - { - case ERROR_NO_SYSTEM_RESOURCES: - pg_usleep(1000L); - errno = EINTR; - break; - default: - _dosmaperr(error); - break; - } -#endif - /* OK to retry if interrupted */ - if (errno == EINTR) - goto retry; + AddOneItemToDPArray(odpk); + bufrd->buf = NULL; } - - return returnCode; + free(bufrd); + return count; } int @@ -2356,103 +2509,100 @@ retry: return returnCode; } -int -He3DBFileWrite(File file, char *buffer, int amount, off_t offset, - uint32 wait_event_info) -{ - int returnCode; - Vfd *vfdP; +// int +// He3DBFileWrite(File file, char *buffer, int amount, off_t offset, +// uint32 wait_event_info) +// { +// int returnCode; +// Vfd *vfdP; - Assert(FileIsValid(file)); +// Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p", - file, VfdCache[file].fileName, - (int64) offset, - amount, buffer)); +// DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p", +// file, VfdCache[file].fileName, +// (int64) offset, +// amount, buffer)); - returnCode = FileAccess(file); - if (returnCode < 0) - return returnCode; +// returnCode = FileAccess(file); +// if (returnCode < 0) +// return returnCode; +// /* +// * If enforcing temp_file_limit and it's a temp file, check to see if the 
+// * write would overrun temp_file_limit, and throw error if so. Note: it's +// * really a modularity violation to throw error here; we should set errno +// * and return -1. However, there's no way to report a suitable error +// * message if we do that. All current callers would just throw error +// * immediately anyway, so this is safe at present. +// */ +// if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) +// { +// off_t past_write = offset + amount; - vfdP = &VfdCache[file]; +// if (past_write > vfdP->fileSize) +// { +// uint64 newTotal = temporary_files_size; - /* - * If enforcing temp_file_limit and it's a temp file, check to see if the - * write would overrun temp_file_limit, and throw error if so. Note: it's - * really a modularity violation to throw error here; we should set errno - * and return -1. However, there's no way to report a suitable error - * message if we do that. All current callers would just throw error - * immediately anyway, so this is safe at present. 
- */ - if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) - { - off_t past_write = offset + amount; +// newTotal += past_write - vfdP->fileSize; +// if (newTotal > (uint64) temp_file_limit * (uint64) 1024) +// ereport(ERROR, +// (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), +// errmsg("temporary file size exceeds temp_file_limit (%dkB)", +// temp_file_limit))); +// } +// } - if (past_write > vfdP->fileSize) - { - uint64 newTotal = temporary_files_size; +// retry: +// errno = 0; +// pgstat_report_wait_start(wait_event_info); +// returnCode = he3fs_pwrite(VfdCache[file].fd, buffer, amount, offset); +// pgstat_report_wait_end(); - newTotal += past_write - vfdP->fileSize; - if (newTotal > (uint64) temp_file_limit * (uint64) 1024) - ereport(ERROR, - (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), - errmsg("temporary file size exceeds temp_file_limit (%dkB)", - temp_file_limit))); - } - } +// /* if write didn't set errno, assume problem is no disk space */ +// if (returnCode != amount && errno == 0) +// errno = ENOSPC; -retry: - errno = 0; - pgstat_report_wait_start(wait_event_info); - returnCode = he3fs_pwrite(VfdCache[file].fd, buffer, amount, offset); - pgstat_report_wait_end(); +// if (returnCode >= 0) +// { +// /* +// * Maintain fileSize and temporary_files_size if it's a temp file. +// */ +// if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) +// { +// off_t past_write = offset + amount; - /* if write didn't set errno, assume problem is no disk space */ - if (returnCode != amount && errno == 0) - errno = ENOSPC; +// if (past_write > vfdP->fileSize) +// { +// temporary_files_size += past_write - vfdP->fileSize; +// vfdP->fileSize = past_write; +// } +// } +// } +// else +// { +// /* +// * See comments in FileRead() +// */ +// #ifdef WIN32 +// DWORD error = GetLastError(); - if (returnCode >= 0) - { - /* - * Maintain fileSize and temporary_files_size if it's a temp file. 
- */ - if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) - { - off_t past_write = offset + amount; +// switch (error) +// { +// case ERROR_NO_SYSTEM_RESOURCES: +// pg_usleep(1000L); +// errno = EINTR; +// break; +// default: +// _dosmaperr(error); +// break; +// } +// #endif +// /* OK to retry if interrupted */ +// if (errno == EINTR) +// goto retry; +// } - if (past_write > vfdP->fileSize) - { - temporary_files_size += past_write - vfdP->fileSize; - vfdP->fileSize = past_write; - } - } - } - else - { - /* - * See comments in FileRead() - */ -#ifdef WIN32 - DWORD error = GetLastError(); - - switch (error) - { - case ERROR_NO_SYSTEM_RESOURCES: - pg_usleep(1000L); - errno = EINTR; - break; - default: - _dosmaperr(error); - break; - } -#endif - /* OK to retry if interrupted */ - if (errno == EINTR) - goto retry; - } - - return returnCode; -} +// return returnCode; +// } int FileWrite(File file, char *buffer, int amount, off_t offset, @@ -2595,22 +2745,22 @@ FileSize(File file) /* * He3DB: He3FS replace OSFS. 
*/ -off_t -He3DBFileSize(File file) -{ - Assert(FileIsValid(file)); +// off_t +// He3DBFileSize(File file) +// { +// Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileSize %d (%s)", - file, VfdCache[file].fileName)); +// DO_DB(elog(LOG, "FileSize %d (%s)", +// file, VfdCache[file].fileName)); - if (FileIsNotOpen(file)) - { - if (FileAccess(file) < 0) - return (off_t) -1; - } +// if (FileIsNotOpen(file)) +// { +// if (FileAccess(file) < 0) +// return (off_t) -1; +// } - return (off_t)he3Lseek(VfdCache[file].fd, 0, SEEK_END); -} +// return (off_t)lseekfs(VfdCache[file].fd, 0, SEEK_END); +// } int FileTruncate(File file, off_t offset, uint32 wait_event_info) @@ -2641,36 +2791,36 @@ FileTruncate(File file, off_t offset, uint32 wait_event_info) return returnCode; } -int -He3FileTruncate(File file, off_t offset, uint32 wait_event_info,bool isTemp) -{ - int returnCode; +// int +// He3FileTruncate(File file, off_t offset, uint32 wait_event_info,bool isTemp) +// { +// int returnCode; - Assert(FileIsValid(file)); +// Assert(FileIsValid(file)); - DO_DB(elog(LOG, "FileTruncate %d (%s)", - file, VfdCache[file].fileName)); +// DO_DB(elog(LOG, "FileTruncate %d (%s)", +// file, VfdCache[file].fileName)); - returnCode = He3FileAccess(file); - if (returnCode < 0) - return returnCode; +// returnCode = He3FileAccess(file); +// if (returnCode < 0) +// return returnCode; - pgstat_report_wait_start(wait_event_info); - if (isTemp == true || push_standby == true) { - returnCode = he3Truncate(VfdCache[file].fd, offset); - } - pgstat_report_wait_end(); +// pgstat_report_wait_start(wait_event_info); +// if (isTemp == true || push_standby == true) { +// returnCode = truncatefs(VfdCache[file].fd, offset); +// } +// pgstat_report_wait_end(); - if (returnCode == 0 && VfdCache[file].fileSize > offset) - { - /* adjust our state for truncation of a temp file */ - Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT); - temporary_files_size -= VfdCache[file].fileSize - offset; - VfdCache[file].fileSize 
= offset; - } +// if (returnCode == 0 && VfdCache[file].fileSize > offset) +// { +// /* adjust our state for truncation of a temp file */ +// Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT); +// temporary_files_size -= VfdCache[file].fileSize - offset; +// VfdCache[file].fileSize = offset; +// } - return returnCode; -} +// return returnCode; +// } /* * Return the pathname associated with an open file. * @@ -2977,16 +3127,7 @@ FreeDesc(AllocateDesc *desc) result = closedir(desc->desc.dir); break; case AllocateDescRawFD: - //result = close(desc->desc.fd); - /* He3DB: Add He3FS Compatibility*/ - if(close(desc->desc.fd) == 0 || he3Close(desc->desc.fd) == 0) - { - result = 0; - } - else - { - result = 1; - } + result = close(desc->desc.fd); break; default: elog(ERROR, "AllocateDesc kind not recognized"); @@ -3054,16 +3195,7 @@ CloseTransientFile(int fd) /* Only get here if someone passes us a file not in allocatedDescs */ elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile"); - //return close(fd); - /* He3DB: Add He3FS Compatibility*/ - if(close(fd) == 0 || he3Close(fd) == 0) - { - return 0; - } - else - { - return 1; - } + return close(fd); } /* @@ -4231,31 +4363,55 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) /* * He3DB: He3FS replace OSFS. 
*/ -ssize_t -he3fs_pread(int fd, void **buf, off_t offset, XLogRecPtr lsn, uint16 type) -{ - ProtobufCBinaryData result; +// ssize_t +// he3fs_pread(int64_t fd, void **buf, off_t offset, XLogRecPtr lsn, uint16 type, uint32_t dbid, uint32_t relid, uint32_t segno, uint32_t forkno) +// { +// Bufrd result; - switch (type) - { - case DataRead: - result = dataRead(fd, offset, lsn); - *buf = result.data; - return (ssize_t)result.len; - default: - elog(ERROR, "unrecognized strategy number: %d", type); - } -} +// switch (type) +// { +// case DataRead: +// result = dataRead(fd, offset, lsn, dbid, relid, segno, forkno); +// *buf = result.buf; +// return (ssize_t)result.count; +// default: +// elog(ERROR, "unrecognized strategy number: %d", type); +// } +// } /* * He3DB: He3FS replace OSFS. */ -ssize_t -he3fs_pwrite(int fd, const void *buf, size_t size, off_t offset) -{ - ProtobufCBinaryData request; +//ssize_t +// he3fs_pwrite(int64_t fd, const void *buf, size_t size, off_t offset) +// { +// return (ssize_t)writefs(fd, (char *)buf, size, offset); +// } - request.data = (char *)buf; - request.len = size; - return (ssize_t)he3Write(fd, request, offset); -} +// ssize_t +// he3fs_xlogread(int64_t fd, void *buf, off_t offset, size_t size) +// { +// Bufrd result; +// size_t count; +// // printf("start read xlog %ld\n", fd); +// // pg_usleep(20000000); +// // printf("end sleep, offset %d, size %d\n", offset, size); +// result = readfs(fd, offset, size); +// if (result.count <= 0) +// return (ssize_t)result.count; +// else if (result.count <= XLOG_BLCKSZ) +// { +// memcpy(buf, result.buf, result.count); +// count = result.count; +// } +// else +// { +// memcpy(buf, result.buf, XLOG_BLCKSZ); +// count = BLCKSZ; +// } + +// free_dataRead(result.buf, 1, 1); +// // *buf = result.buf; +// return (ssize_t)count; + +// } diff --git a/src/backend/storage/file/libfs.a b/src/backend/storage/file/libfs.a deleted file mode 100644 index 6934521..0000000 Binary files 
a/src/backend/storage/file/libfs.a and /dev/null differ diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 8c12dda..42077ba 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -30,6 +30,7 @@ #include "storage/fsm_internals.h" #include "storage/lmgr.h" #include "storage/smgr.h" +#include "postmaster/secondbuffer.h" /* @@ -641,6 +642,18 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now, pg.data, false); + if (!(InitdbSingle || IsBootstrapProcessingMode() == true) && !push_standby && !he3mirror) + { + PageKey pageKey; + pageKey.relfileNode.dbNode = rel->rd_smgr->smgr_rnode.node.dbNode; + pageKey.relfileNode.relNode = rel->rd_smgr->smgr_rnode.node.relNode; + + pageKey.blkNo = fsm_nblocks_now; + pageKey.forkNo = FSM_FORKNUM; + pageKey.pageLsn = 0; + + ReceivePageFromDataBuffer(&pageKey, (uint8_t *) pg.data); + } fsm_nblocks_now++; } diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 3e4ec53..0001ccc 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -29,6 +29,7 @@ #include "postmaster/bgworker_internals.h" #include "postmaster/bgwriter.h" #include "postmaster/postmaster.h" +#include "postmaster/secondbuffer.h" #include "replication/logicallauncher.h" #include "replication/origin.h" #include "replication/slot.h" @@ -36,6 +37,7 @@ #include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/dsm.h" +#include "storage/he3db_logindex.h" #include "storage/ipc.h" #include "storage/pg_shmem.h" #include "storage/pmsignal.h" @@ -46,7 +48,9 @@ #include "storage/sinvaladt.h" #include "storage/spin.h" #include "utils/snapmgr.h" - +#include "access/pagehashqueue.h" +#include "access/ringbuffer.h" +#include "storage/filecache.h" /* GUCs */ int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE; @@ -150,6 +154,11 @@ 
CreateSharedMemoryAndSemaphores(void) size = add_size(size, BTreeShmemSize()); size = add_size(size, SyncScanShmemSize()); size = add_size(size, AsyncShmemSize()); + //size = add_size(size, RelCutShmemSize()); + size = add_size(size, PageHashQueueShmemSize()); + size = add_size(size, PageHashMapSize()); + //size = add_size(size, LogindexHashAllShmemSize()); + size = add_size(size,WalReadBufferShmemSize()); #ifdef EXEC_BACKEND size = add_size(size, ShmemBackendArraySize()); #endif @@ -158,6 +167,15 @@ CreateSharedMemoryAndSemaphores(void) addin_request_allowed = false; size = add_size(size, total_addin_request); + /* secondbufferhash code. */ + //TODO the size should be calculated base on data buffer size. + size = add_size(size, SecondBufferShmemSize()); + size = add_size(size, SecondBufferLWLockShmemSize()); + size = add_size(size, He3dbLogIndexShmemSize()); + + /* cache file size */ + size = add_size(size, FileCacheSize()); + /* might as well round it off to a multiple of a typical page size */ size = add_size(size, 8192 - (size % 8192)); @@ -206,6 +224,8 @@ CreateSharedMemoryAndSemaphores(void) */ CreateLWLocks(); + CreateSecondBufferLWLocks(); + /* * Set up shmem.c index hashtable */ @@ -227,7 +247,25 @@ CreateSharedMemoryAndSemaphores(void) * Set up lock manager */ InitLocks(); + InitCacheRel(); + /* + * set up wal log hash + */ + He3dbLogIndexTblListInit(); + //InitCleanupInfo(); + /* + * set up second buffer hash + */ + InitSecondBufferHash(); + InitSecondBufferMeta(); + InitDPageKeyArray(); + + /* + * set up fs meta + */ + // InitFSMetaHash(); + /* * Set up predicate lock manager */ @@ -255,6 +293,10 @@ CreateSharedMemoryAndSemaphores(void) ProcSignalShmemInit(); CheckpointerShmemInit(); AutoVacuumShmemInit(); + PageHashQueueShmemInit(); + InitBufferPoolHashMap(); + //InitLogindexHashBrucket(); + InitRingBufferSpace(); ReplicationSlotsShmemInit(); ReplicationOriginShmemInit(); WalSndShmemInit(); diff --git a/src/backend/storage/lmgr/Makefile 
b/src/backend/storage/lmgr/Makefile index 829b792..6944cde 100644 --- a/src/backend/storage/lmgr/Makefile +++ b/src/backend/storage/lmgr/Makefile @@ -22,7 +22,8 @@ OBJS = \ predicate.o \ proc.o \ s_lock.o \ - spin.o + spin.o \ + he3db_logindex.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/lmgr/he3db_logindex.c b/src/backend/storage/lmgr/he3db_logindex.c new file mode 100644 index 0000000..7a3c50a --- /dev/null +++ b/src/backend/storage/lmgr/he3db_logindex.c @@ -0,0 +1,861 @@ + +#include "postgres.h" + +#include "storage/he3db_logindex.h" +#include "storage/shmem.h" +#include "storage/spin.h" + +static LogIndexMemList *log_index_mem_list; +static uint64 logindex_mem_tbl_size; + +static Size +LogIndexMemListSize(uint64 he3db_logindex_mem_size) +{ + Size size; + + logindex_mem_tbl_size = (he3db_logindex_mem_size * 1024L * 1024L) / sizeof(LogIndexMemTBL); + size = offsetof(LogIndexMemList, mem_table); // 去除柔性数组之外的空间大小 + size = add_size(size, mul_size(sizeof(LogIndexMemTBL), logindex_mem_tbl_size)); + + size = MAXALIGN(size);//为了使sizeof(struct)向上对齐,成为8的倍数的大小 + + /* The number of logindex memory table is at least 3 */ + if (logindex_mem_tbl_size < 3) + elog(FATAL, "The number=%ld of logindex memory table is less than 3", logindex_mem_tbl_size); + else + ereport(LOG, (errmsg("The total log index memory table size is %ld, number logindex mem-table size is %ld", size, logindex_mem_tbl_size))); + + return size; +} + +static void SetNewPageItem(LogIndexMemTBL *mem_tbl, const BufferTag *page) +{ + // set page item + LogIndexMemItemHead *page_head = &(mem_tbl->page_head[mem_tbl->meta.page_free_head-1]); + memcpy(&(page_head->tag), page, sizeof(BufferTag)); + page_head->next_item = LOG_INDEX_TBL_INVALID_SEG; + page_head->next_seg = mem_tbl->meta.lsn_free_head; + page_head->tail_seg = mem_tbl->meta.lsn_free_head; +} + +// When active table is full, get next free mem table and will change to active mem. 
+static LogIndexMemTBL *GetNextFreeMemTbl(void) +{ + // TODO change to Lightweight Lock + uint64 active_tbl_index = (log_index_mem_list->active_table_index + 1)%(log_index_mem_list->table_cap); + // if all mem table is full, waiting for recycle + if(active_tbl_index == log_index_mem_list->table_start_index) + { + elog(LOG, "Mem table is full, waiting for cleanup. Total size: %ld", logindex_mem_tbl_size); + } + while(active_tbl_index == log_index_mem_list->table_start_index) + { + pg_usleep(10); /* 10 us */ + } + elog(DEBUG5, "Find next free mem table and set active_table_index + 1: %ld", active_tbl_index); + LWLockAcquire(LogIndexMemListLock,LW_EXCLUSIVE); + // Circular List + log_index_mem_list->active_table_index = active_tbl_index; + LWLockRelease(LogIndexMemListLock); + // if it finds free mem table will return directly. + return &(log_index_mem_list->mem_table[log_index_mem_list->active_table_index]); +} + +static void SetLsnSeg(LogIndexMemItemSeg *lsn_seg, XLogRecPtr lsn){ + LOG_INDEX_INSERT_LSN_INFO(lsn_seg, lsn_seg->number, lsn); + lsn_seg->number++; +} + +static void SetNewLsnSeg(LogIndexMemTBL *mem_tbl, XLogRecPtr lsn) +{ + // set lsn seg + // first seg index start with 0, seg_item[0] + LogIndexMemItemSeg *lsn_seg = &(mem_tbl->seg_item[mem_tbl->meta.lsn_free_head-1]); + lsn_seg->prev_seg = LOG_INDEX_TBL_INVALID_SEG; + lsn_seg->next_seg = LOG_INDEX_TBL_INVALID_SEG; + SetLsnSeg(lsn_seg, lsn); +} + +static void SetNextLsnSeg(LogIndexMemItemHead *page_head, LogIndexMemItemSeg *lsn_seg_old, LogIndexMemTBL *mem_tbl, XLogRecPtr lsn) +{ + // set lsn next seg + LogIndexMemItemSeg *lsn_seg_next = &(mem_tbl->seg_item[mem_tbl->meta.lsn_free_head-1]); + lsn_seg_old->next_seg = mem_tbl->meta.lsn_free_head; + lsn_seg_next->prev_seg = page_head->tail_seg; + lsn_seg_next->next_seg = LOG_INDEX_TBL_INVALID_SEG; + page_head->tail_seg = mem_tbl->meta.lsn_free_head; + SetLsnSeg(lsn_seg_next, lsn); +} + +static void UpdateMemTableMetaWithNewPage(LogIndexMemTBL *mem_tbl, 
XLogRecPtr lsn) +{ + // set metadata for active mem table + SpinLockAcquire(&(mem_tbl->meta.meta_lock)); + // set prefix_lsn, min_lsn and max_lsn + LOG_INDEX_MEM_TBL_SET_PREFIX_LSN(mem_tbl, lsn); + mem_tbl->meta.max_lsn = Max(lsn, mem_tbl->meta.max_lsn); + mem_tbl->meta.min_lsn = Min(lsn, mem_tbl->meta.min_lsn); + // page,lsn free index ++ + mem_tbl->meta.page_free_head++; + mem_tbl->meta.lsn_free_head++; + SpinLockRelease(&(mem_tbl->meta.meta_lock)); +} + +static void UpdateMemTableMetaWithNextPage(LogIndexMemTBL *mem_tbl, XLogRecPtr lsn) +{ + // set metadata for active mem table + SpinLockAcquire(&(mem_tbl->meta.meta_lock)); + // set prefix_lsn, min_lsn and max_lsn + mem_tbl->meta.max_lsn = Max(lsn, mem_tbl->meta.max_lsn); + mem_tbl->meta.min_lsn = Min(lsn, mem_tbl->meta.min_lsn); + // page,lsn free index ++ + mem_tbl->meta.page_free_head++; + mem_tbl->meta.lsn_free_head++; + SpinLockRelease(&(mem_tbl->meta.meta_lock)); +} + +static void UpdateMemTableMetaWithNextSeg(LogIndexMemTBL *mem_tbl, XLogRecPtr lsn) +{ + // set metadata for active mem table + SpinLockAcquire(&(mem_tbl->meta.meta_lock)); + mem_tbl->meta.max_lsn = Max(lsn, mem_tbl->meta.max_lsn); + mem_tbl->meta.min_lsn = Min(lsn, mem_tbl->meta.min_lsn); + mem_tbl->meta.lsn_free_head++; + SpinLockRelease(&(mem_tbl->meta.meta_lock)); +} + +static void UpdateMemTableMetaWithCurrentSeg(LogIndexMemTBL *mem_tbl, XLogRecPtr lsn) +{ + // set metadata for active mem table + SpinLockAcquire(&(mem_tbl->meta.meta_lock)); + mem_tbl->meta.max_lsn = Max(lsn, mem_tbl->meta.max_lsn); + mem_tbl->meta.min_lsn = Min(lsn, mem_tbl->meta.min_lsn); + SpinLockRelease(&(mem_tbl->meta.meta_lock)); +} + +static void SetActiveTblWithFirstPage(LogIndexMemTBL *mem_tbl, const BufferTag *page, XLogRecPtr lsn) +{ + uint32 hash_key; + + // set mem table state to active + pg_atomic_write_u32(&(mem_tbl->meta.state), LOG_INDEX_MEM_TBL_STATE_ACTIVE); + + // index start with 1, 0 means INVALID. 
hash[] all values will be 0 after init, so set to 1 when first use. + mem_tbl->meta.id = log_index_mem_list->active_table_index; + mem_tbl->meta.lsn_free_head = 1; + mem_tbl->meta.page_free_head = 1; + // calculate hashcode by buffer tag + hash_key = LOG_INDEX_MEM_TBL_HASH_PAGE(page); + mem_tbl->hash[hash_key] = mem_tbl->meta.page_free_head; + + // set page item + SetNewPageItem(mem_tbl, page); + + // set lsn seg + SetNewLsnSeg(mem_tbl, lsn); + + // set metadata for active mem table + UpdateMemTableMetaWithNewPage(mem_tbl, lsn); +} + +static void InsertLsnWhenOldTblIsFull(LogIndexMemTBL *mem_tbl_old, const BufferTag *page, XLogRecPtr lsn) +{ + LogIndexMemTBL *mem_tbl_new; + + // set mem table state to inactive + pg_atomic_write_u32(&(mem_tbl_old->meta.state), LOG_INDEX_MEM_TBL_STATE_INACTIVE); + mem_tbl_new = GetNextFreeMemTbl(); + SetActiveTblWithFirstPage(mem_tbl_new, page, lsn); +} + +static void SetNextPageItem(LogIndexMemTBL *mem_tbl, const BufferTag *page, XLogRecPtr lsn) +{ + // there's no free page_head or lsn_seg, means current active is full, will apply for new mem table as active table + if (mem_tbl->meta.page_free_head > LOG_INDEX_MEM_TBL_PAGE_NUM || mem_tbl->meta.lsn_free_head > LOG_INDEX_MEM_TBL_SEG_NUM) + { + // no free page head in active mem table, will apply for new mem table + InsertLsnWhenOldTblIsFull(mem_tbl, page, lsn); + } + else + { + // set new page and lsn seg when active mem table have free resource + SetNewPageItem(mem_tbl, page); + SetNewLsnSeg(mem_tbl, lsn); + UpdateMemTableMetaWithNewPage(mem_tbl, lsn); + } +} + +static void RestMemTable(LogIndexMemTBL *mem_tbl) +{ + // reset table's metadata + mem_tbl->meta.id = LOG_INDEX_TABLE_INVALID_ID; + pg_atomic_write_u32(&(mem_tbl->meta.state), LOG_INDEX_MEM_TBL_STATE_FREE); + mem_tbl->meta.page_free_head = LOG_INDEX_TBL_INVALID_SEG; + mem_tbl->meta.lsn_free_head = LOG_INDEX_TBL_INVALID_SEG; + mem_tbl->meta.min_lsn = UINT64_MAX; + mem_tbl->meta.max_lsn = InvalidXLogRecPtr; + 
mem_tbl->meta.prefix_lsn = 0; + + // reset hash[] and page head[] + for(int i = 0; i < LOG_INDEX_MEM_TBL_PAGE_NUM; i++) + { + mem_tbl->hash[i] = LOG_INDEX_TBL_INVALID_SEG; + CLEAR_BUFFERTAG(mem_tbl->page_head[i].tag); + mem_tbl->page_head[i].next_item = LOG_INDEX_TBL_INVALID_SEG; + mem_tbl->page_head[i].next_seg = LOG_INDEX_TBL_INVALID_SEG; + mem_tbl->page_head[i].tail_seg = LOG_INDEX_TBL_INVALID_SEG; + // reset seg_item[] + mem_tbl->seg_item[i].prev_seg = LOG_INDEX_TBL_INVALID_SEG; + mem_tbl->seg_item[i].next_seg = LOG_INDEX_TBL_INVALID_SEG; + mem_tbl->seg_item[i].number = 0; + } + // reset seg_item[] + for(int i = LOG_INDEX_MEM_TBL_PAGE_NUM; i < LOG_INDEX_MEM_TBL_SEG_NUM; i++){ + mem_tbl->seg_item[i].prev_seg = LOG_INDEX_TBL_INVALID_SEG; + mem_tbl->seg_item[i].next_seg = LOG_INDEX_TBL_INVALID_SEG; + mem_tbl->seg_item[i].number = 0; + } +} + +static LsnNode *InitLsnNode() +{ + LsnNode *head; + + head = (LsnNode *)malloc(sizeof(LsnNode)); + head->next = NULL; + return head; +} + +// insert nodelist from head, eg: before: head-->node1-->NULL, after: head-->newNode-->node1-->NULL +static void InsertLsnNodeByHead(LsnNode *head, XLogRecPtr lsn) +{ + LsnNode *new_node; + + new_node = (LsnNode *)malloc(sizeof(LsnNode)); + new_node->lsn = lsn; + new_node->next = head->next; + head->next = new_node; +} + +// eg: before: head-->node1-->NULL, after: head-->node1-->newNode-->NULL +static LsnNode *InsertLsnNodeByTail(LsnNode *head, XLogRecPtr lsn) +{ + LsnNode *new_node; + new_node = (LsnNode *)malloc(sizeof(LsnNode)); + head->next = new_node; + new_node->lsn = lsn; + new_node->next = NULL; + return new_node; +} + +// print nodelist +static void PrintLsnNode(LsnNode *head) +{ + LsnNode *p; + p = head->next; + while (p) { + printf(" %d\t ", p->lsn); + p = p->next; + } +} + +static void ReverseLsnNode(LsnNode *head) +{ + if (head == NULL || head->next == NULL) { + return; + } + LsnNode *p = NULL; + LsnNode *q = head->next; + LsnNode *next ; + while (q != NULL) { + next = 
q->next; + q->next = p; + p = q; + q = next; + } + head->next=p; +} + +static uint16 FindFirstLsnSegInMemTblByPageTag(LogIndexMemTBL *mem_tbl, const BufferTag *page, XLogRecPtr start_lsn, XLogRecPtr end_lsn) +{ + LogIndexMemItemHead *page_head; + uint32 hash_key; + + // end_lsn <= min_lsn or start_lsn > max_lsn means the request lsn region not in this mem table + if(mem_tbl->meta.min_lsn >= end_lsn || mem_tbl->meta.max_lsn < start_lsn) + { + return LOG_INDEX_TBL_INVALID_SEG; + }else{ + hash_key = LOG_INDEX_MEM_TBL_HASH_PAGE(page); + if(mem_tbl->hash[hash_key] != LOG_INDEX_TBL_INVALID_SEG) + { + page_head = &(mem_tbl->page_head[mem_tbl->hash[hash_key]-1]); + while(!BUFFERTAGS_EQUAL(page_head->tag, *page)){ + if(page_head->next_item == LOG_INDEX_TBL_INVALID_SEG) + { + return LOG_INDEX_TBL_INVALID_SEG; + } + page_head = &(mem_tbl->page_head[page_head->next_item-1]); + } + // find request page, return lsn seg + return (page_head->next_seg); + }else + { + return LOG_INDEX_TBL_INVALID_SEG; + } + } +} + +static TagNode *InitTagNode() +{ + TagNode *head; + + head = (TagNode *)malloc(sizeof(TagNode)); + head->next = NULL; + return head; +} + +// insert nodelist from head, eg: before: head-->node1-->NULL, after: head-->newNode-->node1-->NULL +static void InsertTagNodeByHead(TagNode *head, BufferTag tag) +{ + TagNode *new_node; + + new_node = (TagNode *)malloc(sizeof(TagNode)); + new_node->tag.tag = tag; + new_node->next = head->next; + head->next = new_node; +} + +void He3dbLogIndexTblListInit(void) +{ + bool found_logindex; + log_index_mem_list = (LogIndexMemList *) + ShmemInitStruct("log index", LogIndexMemListSize(he3db_logindex_mem_size), &found_logindex); + Assert(log_index_mem_list != NULL); + log_index_mem_list->table_start_index = 0; + log_index_mem_list->active_table_index = 0; + log_index_mem_list->table_cap = logindex_mem_tbl_size; + //SpinLockInit(&(log_index_mem_list->lock)); + for (uint64 i = 0; i < log_index_mem_list->table_cap; i++) { + // set mem table init 
values + SpinLockInit(&(log_index_mem_list->mem_table[i].meta.meta_lock)); + log_index_mem_list->mem_table[i].meta.id = i + 1; + log_index_mem_list->mem_table[i].meta.min_lsn = UINT64_MAX; + log_index_mem_list->mem_table[i].meta.max_lsn = InvalidXLogRecPtr; + SpinLockInit(&(log_index_mem_list->mem_table[i].meta.meta_lock)); + pg_atomic_write_u32(&(log_index_mem_list->mem_table[i].meta.state), LOG_INDEX_MEM_TBL_STATE_FREE); + } + //SpinLockInit(&(log_index_mem_list->lock)); +} + +uint64 GetMemTblSize(void) +{ + return log_index_mem_list->table_cap; +} + +void InsertLogIndexByPage(const BufferTag *page, XLogRecPtr lsn) +{ + LogIndexMemItemSeg *lsn_seg; + uint32 hash_key; + LogIndexMemTBL *mem_tbl; + LogIndexMemItemHead *page_head; + + // calculate hashcode by buffer tag + hash_key = LOG_INDEX_MEM_TBL_HASH_PAGE(page); + // get active mem table + mem_tbl = &(log_index_mem_list->mem_table[log_index_mem_list->active_table_index]); + // first time to use active mem table + if(pg_atomic_read_u32(&mem_tbl->meta.state) == LOG_INDEX_MEM_TBL_STATE_FREE) + { + SetActiveTblWithFirstPage(mem_tbl, page, lsn); + } + else + { + // if have same lsn prefix with active table + if(LOG_INDEX_SAME_TABLE_LSN_PREFIX(mem_tbl, lsn)) + { + // 0 means INVALID, also means page don't exist in active mem table + if(mem_tbl->hash[hash_key] == 0) + { + // set hash value to next free head + if (!(mem_tbl->meta.page_free_head > LOG_INDEX_MEM_TBL_PAGE_NUM || mem_tbl->meta.lsn_free_head > LOG_INDEX_MEM_TBL_SEG_NUM)) + mem_tbl->hash[hash_key] = mem_tbl->meta.page_free_head; + SetNextPageItem(mem_tbl, page, lsn); + } + else + { + // page already exist or hash conflict + // get exist page item + page_head = &(mem_tbl->page_head[mem_tbl->hash[hash_key]-1]); + /* if item page tag equal to current tag, true insert lsn to lsn_seg, + * false loop for next_item until equal or not found one. Then apply new page_item and lsn_seg. 
+ */ + while(!BUFFERTAGS_EQUAL(page_head->tag, *page)){ + if(page_head->next_item == LOG_INDEX_TBL_INVALID_SEG) + { + // apply new page item + // there's no free page_head or lsn_seg, means current active is full, will apply for new mem table as active table + if (mem_tbl->meta.page_free_head > LOG_INDEX_MEM_TBL_PAGE_NUM || mem_tbl->meta.lsn_free_head > LOG_INDEX_MEM_TBL_SEG_NUM) + { + // no free page head in active mem table, will apply for new mem table + InsertLsnWhenOldTblIsFull(mem_tbl, page, lsn); + } + else + { + // set new page and lsn seg when active mem table have free resource + // set old page item's next_item to new one. + page_head->next_item = mem_tbl->meta.page_free_head; + // set page item + SetNewPageItem(mem_tbl, page); + SetNewLsnSeg(mem_tbl, lsn); + UpdateMemTableMetaWithNextPage(mem_tbl, lsn); + } + return; + } + page_head = &(mem_tbl->page_head[page_head->next_item-1]); + } + + // find same tag's page_head + lsn_seg = &(mem_tbl->seg_item[page_head->tail_seg-1]); + // if current seg full? + if(lsn_seg->number < LOG_INDEX_MEM_ITEM_SEG_LSN_NUM) + { + // insert lsn to seg + SetLsnSeg(lsn_seg, lsn); + UpdateMemTableMetaWithCurrentSeg(mem_tbl, lsn); + } + else + { + if(mem_tbl->meta.lsn_free_head > LOG_INDEX_MEM_TBL_SEG_NUM) + { + // no free page head in active mem table, will apply for new mem table + InsertLsnWhenOldTblIsFull(mem_tbl, page, lsn); + } + else + { + // apply new seg and insert lsn + SetNextLsnSeg(page_head, lsn_seg, mem_tbl, lsn); + UpdateMemTableMetaWithNextSeg(mem_tbl, lsn); + } + } + } + } + else + { + // prefix of lsn is different, so cannot use current active table, will apply new mem table + InsertLsnWhenOldTblIsFull(mem_tbl, page, lsn); + } + } +} + +LsnNode *GetLogIndexByPage(const BufferTag *page, XLogRecPtr start_lsn, XLogRecPtr end_lsn) +{ + LsnNode *head_node; + LsnNode *tail; + uint64 tbl_index; + + // Prevent metadata changes during discovery. 
+ // TODO change to Lightweight Lock + head_node = InitLsnNode(); + tail = head_node; + LWLockAcquire(LogIndexMemListLock,LW_SHARED); + tbl_index = log_index_mem_list->table_start_index; + while(tbl_index != log_index_mem_list->active_table_index) + { + LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[tbl_index]); + tbl_index = (tbl_index + 1)%(log_index_mem_list->table_cap); + // current mem table no suitability lsn_list + if(mem_tbl->meta.max_lsn < start_lsn) + { + continue; + }else if(mem_tbl->meta.min_lsn >= end_lsn) + { + // there is no suitability lsn_list after this mem table + break; + } else + { + // get index of current table's seg + uint16 seg_index = FindFirstLsnSegInMemTblByPageTag(mem_tbl, page, start_lsn, end_lsn); + while (seg_index != LOG_INDEX_TBL_INVALID_SEG) + { + LogIndexMemItemSeg *item_seg = &(mem_tbl->seg_item[seg_index - 1]); + // loop for lsn list + for(int i=0; i < item_seg->number; i++){ + XLogRecPtr lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, item_seg->suffix_lsn[i]); + if(lsn >= start_lsn) + { + if(lsn < end_lsn) + { + tail = InsertLsnNodeByTail(tail, lsn); + }else{ + LWLockRelease(LogIndexMemListLock); + return head_node; + } + }else + { + continue; + } + } + seg_index = item_seg->next_seg; + } + } + } + // loop for active table + if(tbl_index == log_index_mem_list->active_table_index) + { + LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[log_index_mem_list->active_table_index]); + // get index of current table's seg + uint16 seg_index = FindFirstLsnSegInMemTblByPageTag(mem_tbl, page, start_lsn, end_lsn); + while (seg_index != LOG_INDEX_TBL_INVALID_SEG) + { + LogIndexMemItemSeg *item_seg = &(mem_tbl->seg_item[seg_index - 1]); + // loop for lsn list + for(int i=0; i < item_seg->number; i++){ + XLogRecPtr lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, item_seg->suffix_lsn[i]); + if(lsn >= start_lsn) + { + if(lsn < end_lsn) + { + tail = InsertLsnNodeByTail(tail, lsn); + }else{ + LWLockRelease(LogIndexMemListLock); + return head_node; + 
} + }else + { + continue; + } + } + seg_index = item_seg->next_seg; + } + LWLockRelease(LogIndexMemListLock); + return head_node; + } + LWLockRelease(LogIndexMemListLock); + return head_node; +} + +/* cleanup useless mem table which max_lsn less than consist_lsn, + * and reset mem table to reuse. + */ +void CleanLogIndexByPage(XLogRecPtr consist_lsn) +{ + // TODO change to Lightweight Lock + LWLockAcquire(LogIndexMemListLock,LW_EXCLUSIVE); + // loop mem table from table_start_index + while(log_index_mem_list->table_start_index != log_index_mem_list->active_table_index) + { + LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[log_index_mem_list->table_start_index]); + // max_lsn large than consistLsn? true: cannot cleanup and reuse just break; false: cleanup + if (mem_tbl->meta.max_lsn >= consist_lsn || pg_atomic_read_u32(&mem_tbl->meta.state) != LOG_INDEX_MEM_TBL_STATE_INACTIVE) + { + break; + } + elog(DEBUG5, "Reset Mem table id=%ld by consist_lsn=%ld ", mem_tbl->meta.id, consist_lsn); + RestMemTable(mem_tbl); + log_index_mem_list->table_start_index = (log_index_mem_list->table_start_index + 1)%(log_index_mem_list->table_cap); + } + LWLockRelease(LogIndexMemListLock); +} + +Size He3dbLogIndexShmemSize(void) +{ + Size size = 0; + if (he3db_logindex_mem_size <= 0) + return size; + size = LogIndexMemListSize(he3db_logindex_mem_size); + size = CACHELINEALIGN(size); + elog(DEBUG5, "Mem table size=%ld in share memory", size); + return size; +} + +void FreeLsnNode(LsnNode *head) +{ + LsnNode* ln; + while (head != NULL) + { + ln = head; + head = head->next; + free(ln); + ln = NULL; + } +} + +TagNode *GetBufTagByLsnRange(XLogRecPtr start_lsn, XLogRecPtr end_lsn) +{ + TagNode *head_node; + uint64 tbl_index; + LogIndexMemItemHead *item_page; + LogIndexMemItemSeg *first_seg; + LogIndexMemItemSeg *last_seg; + XLogRecPtr page_min_lsn; + XLogRecPtr page_max_lsn; + + // Prevent metadata changes during discovery. 
+ // change to Lightweight Lock + head_node = InitTagNode(); + if (end_lsn < start_lsn) + { + return head_node; + } + LWLockAcquire(LogIndexMemListLock,LW_SHARED); + tbl_index = log_index_mem_list->table_start_index; + while(tbl_index != log_index_mem_list->active_table_index) + { + LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[tbl_index]); + tbl_index = (tbl_index + 1)%(log_index_mem_list->table_cap); + // current mem table no suitability lsn_list + if(mem_tbl->meta.max_lsn < start_lsn) + { + continue; + }else if(mem_tbl->meta.min_lsn > end_lsn) + { + // there is no suitability lsn_list after this mem table + LWLockRelease(LogIndexMemListLock); + return head_node; + } + else + { + end_lsn = Min(end_lsn, mem_tbl->meta.max_lsn); + head_node->tag.lsn = end_lsn; + // loop for page list + for(int i = 0; i < (mem_tbl->meta.page_free_head - 1); i++) + { + item_page = &(mem_tbl->page_head[i]); + if(item_page->next_seg == LOG_INDEX_TBL_INVALID_SEG || item_page->tail_seg == LOG_INDEX_TBL_INVALID_SEG) + { + continue; + } + else + { + first_seg = &(mem_tbl->seg_item[item_page->next_seg - 1]); + last_seg = &(mem_tbl->seg_item[item_page->tail_seg - 1]); + page_min_lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, first_seg->suffix_lsn[0]); + uint8 id = Min(LOG_INDEX_MEM_ITEM_SEG_LSN_NUM - 1, last_seg->number - 1); + page_max_lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, last_seg->suffix_lsn[id]); + if(page_min_lsn > end_lsn || page_max_lsn < start_lsn) + { + continue; + } + else + { + InsertTagNodeByHead(head_node, item_page->tag); + } + } + } + LWLockRelease(LogIndexMemListLock); + return head_node; + } + } + if (tbl_index == log_index_mem_list->active_table_index){ + + LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[tbl_index]); + // current mem table no suitability lsn_list + if(!(mem_tbl->meta.max_lsn < start_lsn || mem_tbl->meta.min_lsn > end_lsn)) + { + end_lsn = Min(end_lsn, mem_tbl->meta.max_lsn); + head_node->tag.lsn = end_lsn; + // loop for page list + for(int i = 0; 
i < (mem_tbl->meta.page_free_head - 1); i++)
+            {
+                item_page = &(mem_tbl->page_head[i]);
+                if(item_page->next_seg == LOG_INDEX_TBL_INVALID_SEG || item_page->tail_seg == LOG_INDEX_TBL_INVALID_SEG)
+                {
+                    continue;
+                }
+                else
+                {
+                    first_seg = &(mem_tbl->seg_item[item_page->next_seg - 1]);
+                    last_seg = &(mem_tbl->seg_item[item_page->tail_seg - 1]);
+                    page_min_lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, first_seg->suffix_lsn[0]);
+                    uint8 id = Min(LOG_INDEX_MEM_ITEM_SEG_LSN_NUM - 1, last_seg->number - 1);
+                    page_max_lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, last_seg->suffix_lsn[id]);
+                    if(page_min_lsn > end_lsn || page_max_lsn < start_lsn)
+                    {
+                        continue;
+                    }
+                    else
+                    {
+                        InsertTagNodeByHead(head_node, item_page->tag);
+                    }
+                }
+            }
+        }
+    }
+    LWLockRelease(LogIndexMemListLock);
+
+    return head_node;
+}
+
+bool CheckBufTagExistByLsnRange(const BufferTag *page, XLogRecPtr start_lsn, XLogRecPtr end_lsn)
+{
+    uint64 tbl_index;
+    LogIndexMemItemSeg *first_seg;
+    LogIndexMemItemSeg *last_seg;
+    XLogRecPtr page_min_lsn;
+    XLogRecPtr page_max_lsn;
+    uint32 hash_key;
+    LogIndexMemItemHead *page_head;
+
+    // Prevent metadata changes during discovery.
+    LWLockAcquire(LogIndexMemListLock,LW_SHARED);
+    tbl_index = log_index_mem_list->table_start_index;
+loop:
+    while(tbl_index != log_index_mem_list->active_table_index)
+    {
+        LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[tbl_index]);
+        tbl_index = (tbl_index + 1)%(log_index_mem_list->table_cap);
+        // current mem table no suitability lsn_list
+        if(mem_tbl->meta.max_lsn < start_lsn)
+        {
+            continue;
+        }else if(mem_tbl->meta.min_lsn >= end_lsn)
+        {
+            // there is no suitability lsn_list after this mem table
+            goto outerloop;
+        }
+        else
+        {
+            // find page from current mem table
+            hash_key = LOG_INDEX_MEM_TBL_HASH_PAGE(page);
+            if(mem_tbl->hash[hash_key] != LOG_INDEX_TBL_INVALID_SEG)
+            {
+                page_head = &(mem_tbl->page_head[mem_tbl->hash[hash_key]-1]);
+                while(!BUFFERTAGS_EQUAL(page_head->tag, *page)){
+                    if(page_head->next_item == LOG_INDEX_TBL_INVALID_SEG)
+                    {
+                        // cannot find page from current mem table
+                        goto loop;
+                    }
+                    page_head = &(mem_tbl->page_head[page_head->next_item-1]);
+                }
+                // find request page, but not lsn
+                if(page_head->next_seg == LOG_INDEX_TBL_INVALID_SEG || page_head->tail_seg == LOG_INDEX_TBL_INVALID_SEG)
+                {
+                    continue;
+                }
+                else
+                {
+                    first_seg = &(mem_tbl->seg_item[page_head->next_seg - 1]);
+                    last_seg = &(mem_tbl->seg_item[page_head->tail_seg - 1]);
+                    page_min_lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, first_seg->suffix_lsn[0]);
+                    uint8 id = Min(LOG_INDEX_MEM_ITEM_SEG_LSN_NUM - 1, last_seg->number - 1);
+                    page_max_lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, last_seg->suffix_lsn[id]);
+                    // lsn not correspond with request
+                    // NOTE(review): this copy tests page_min_lsn >= end_lsn while the
+                    // sibling scan above uses '>' — confirm which bound is intended.
+                    if(page_min_lsn >= end_lsn || page_max_lsn < start_lsn)
+                    {
+                        continue;
+                    }
+                    else
+                    {
+                        // find one
+                        LWLockRelease(LogIndexMemListLock);
+                        return true;
+                    }
+                }
+            }else
+            {
+                continue;
+            }
+        }
+    }
+
+    // NOTE(review): the block below duplicates the loop body for the active
+    // table (with 'continue' replaced by 'goto outerloop') — candidate for
+    // extraction into a shared helper.
+    if (tbl_index == log_index_mem_list->active_table_index){
+        LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[tbl_index]);
+        tbl_index = (tbl_index + 1)%(log_index_mem_list->table_cap);
+        // current mem table no suitability lsn_list
+        if(mem_tbl->meta.max_lsn < start_lsn)
+        {
+            goto outerloop;
+        }else if(mem_tbl->meta.min_lsn >= end_lsn)
+        {
+            // there is no suitability lsn_list after this mem table
+            goto outerloop;
+        }
+        else
+        {
+            // find page from current mem table
+            hash_key = LOG_INDEX_MEM_TBL_HASH_PAGE(page);
+            if(mem_tbl->hash[hash_key] != LOG_INDEX_TBL_INVALID_SEG)
+            {
+                page_head = &(mem_tbl->page_head[mem_tbl->hash[hash_key]-1]);
+                while(!BUFFERTAGS_EQUAL(page_head->tag, *page)){
+                    if(page_head->next_item == LOG_INDEX_TBL_INVALID_SEG)
+                    {
+                        // cannot find page from current mem table
+                        goto outerloop;
+                    }
+                    page_head = &(mem_tbl->page_head[page_head->next_item-1]);
+                }
+                // find request page
+                if(page_head->next_seg == LOG_INDEX_TBL_INVALID_SEG || page_head->tail_seg == LOG_INDEX_TBL_INVALID_SEG)
+                {
+                    goto outerloop;
+                }
+                else
+                {
+                    first_seg = &(mem_tbl->seg_item[page_head->next_seg - 1]);
+                    last_seg = &(mem_tbl->seg_item[page_head->tail_seg - 1]);
+                    page_min_lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, first_seg->suffix_lsn[0]);
+                    uint8 id = Min(LOG_INDEX_MEM_ITEM_SEG_LSN_NUM - 1, last_seg->number - 1);
+                    page_max_lsn = LOG_INDEX_COMBINE_LSN(mem_tbl, last_seg->suffix_lsn[id]);
+                    if(page_min_lsn >= end_lsn || page_max_lsn < start_lsn)
+                    {
+                        goto outerloop;
+                    }
+                    else
+                    {
+                        // find one
+                        LWLockRelease(LogIndexMemListLock);
+                        return true;
+                    }
+                }
+            }else
+            {
+                goto outerloop;
+            }
+        }
+    }
+outerloop:
+    LWLockRelease(LogIndexMemListLock);
+    return false;
+}
+
+// Free a whole TagNode chain. NOTE(review): uses raw free(); confirm the
+// nodes are malloc'd, not palloc'd, or switch to pfree.
+void FreeTagNode(TagNode *head)
+{
+    TagNode* tn;
+    while (head != NULL)
+    {
+        tn = head;
+        head = head->next;
+        free(tn);
+        tn = NULL;
+    }
+}
+
+void He3DBGetLogindexStats(uint64 *memtable_total, uint64 *memtable_used, uint64 *memtable_active_index,
+                           uint64 *memtable_start_index, uint64 *page_total)
+{
+    LWLockAcquire(LogIndexMemListLock,LW_SHARED);
+    *memtable_start_index = log_index_mem_list->table_start_index;
+    *memtable_active_index = log_index_mem_list->active_table_index;
+    *memtable_total = 
log_index_mem_list->table_cap; + LWLockRelease(LogIndexMemListLock); + *memtable_used = ((*memtable_active_index - *memtable_start_index) + *memtable_total)%*memtable_total + 1; + uint64 tbl_index = *memtable_start_index; + uint64 page_num = 0; + while(tbl_index != *memtable_active_index) + { + LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[tbl_index]); + tbl_index = (tbl_index + 1)%(*memtable_total); + page_num = page_num + mem_tbl->meta.page_free_head - 2; + } + if (tbl_index == *memtable_active_index) + { + LogIndexMemTBL *mem_tbl = &(log_index_mem_list->mem_table[tbl_index]); + if (pg_atomic_read_u32(&mem_tbl->meta.state) != LOG_INDEX_MEM_TBL_STATE_FREE){ + page_num = page_num + mem_tbl->meta.page_free_head - 2; + } + } + *page_total = page_num; +} diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 818666f..6775a8b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -49,14 +49,13 @@ #include "utils/ps_status.h" #include "utils/resowner_private.h" - /* This configuration variable is used to set the lock table size */ -int max_locks_per_xact; /* set by guc.c */ +int max_locks_per_xact; /* set by guc.c */ #define NLOCKENTS() \ mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) - +static int secondbuffer_match(const void *key1, const void *key2, Size keysize); /* * Data structures defining the semantics of the standard lock methods. 
* @@ -73,57 +72,56 @@ static const LOCKMASK LockConflicts[] = { /* RowExclusiveLock */ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | - LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), /* ShareUpdateExclusiveLock */ LOCKBIT_ON(ShareUpdateExclusiveLock) | - LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | - LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), /* ShareLock */ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | - LOCKBIT_ON(ShareRowExclusiveLock) | - LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), /* ShareRowExclusiveLock */ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | - LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | - LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), /* ExclusiveLock */ LOCKBIT_ON(RowShareLock) | - LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | - LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | - LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), /* AccessExclusiveLock */ LOCKBIT_ON(AccessShareLock) | LOCKBIT_ON(RowShareLock) | - LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | - LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | - LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock) + LOCKBIT_ON(RowExclusiveLock) | 
LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock) }; /* Names of lock modes, for debug printouts */ static const char *const lock_mode_names[] = -{ - "INVALID", - "AccessShareLock", - "RowShareLock", - "RowExclusiveLock", - "ShareUpdateExclusiveLock", - "ShareLock", - "ShareRowExclusiveLock", - "ExclusiveLock", - "AccessExclusiveLock" -}; + { + "INVALID", + "AccessShareLock", + "RowShareLock", + "RowExclusiveLock", + "ShareUpdateExclusiveLock", + "ShareLock", + "ShareRowExclusiveLock", + "ExclusiveLock", + "AccessExclusiveLock"}; #ifndef LOCK_DEBUG static bool Dummy_trace = false; #endif static const LockMethodData default_lockmethod = { - AccessExclusiveLock, /* highest valid lock mode number */ + AccessExclusiveLock, /* highest valid lock mode number */ LockConflicts, lock_mode_names, #ifdef LOCK_DEBUG @@ -134,7 +132,7 @@ static const LockMethodData default_lockmethod = { }; static const LockMethodData user_lockmethod = { - AccessExclusiveLock, /* highest valid lock mode number */ + AccessExclusiveLock, /* highest valid lock mode number */ LockConflicts, lock_mode_names, #ifdef LOCK_DEBUG @@ -150,25 +148,22 @@ static const LockMethodData user_lockmethod = { static const LockMethod LockMethods[] = { NULL, &default_lockmethod, - &user_lockmethod -}; - + &user_lockmethod}; /* Record that's written to 2PC state file when a lock is persisted */ typedef struct TwoPhaseLockRecord { - LOCKTAG locktag; - LOCKMODE lockmode; + LOCKTAG locktag; + LOCKMODE lockmode; } TwoPhaseLockRecord; - /* * Count of the number of fast path lock slots we believe to be used. This * might be higher than the real number if another backend has transferred * our locks to the primary lock table, but it can never be lower than the * real value, since only we can acquire locks on our own behalf. 
*/ -static int FastPathLocalUseCount = 0; +static int FastPathLocalUseCount = 0; /* * Flag to indicate if the relation extension lock is held by this backend. @@ -198,22 +193,22 @@ static bool IsRelationExtensionLockHeld PG_USED_FOR_ASSERTS_ONLY = false; static bool IsPageLockHeld PG_USED_FOR_ASSERTS_ONLY = false; /* Macros for manipulating proc->fpLockBits */ -#define FAST_PATH_BITS_PER_SLOT 3 -#define FAST_PATH_LOCKNUMBER_OFFSET 1 -#define FAST_PATH_MASK ((1 << FAST_PATH_BITS_PER_SLOT) - 1) +#define FAST_PATH_BITS_PER_SLOT 3 +#define FAST_PATH_LOCKNUMBER_OFFSET 1 +#define FAST_PATH_MASK ((1 << FAST_PATH_BITS_PER_SLOT) - 1) #define FAST_PATH_GET_BITS(proc, n) \ (((proc)->fpLockBits >> (FAST_PATH_BITS_PER_SLOT * n)) & FAST_PATH_MASK) -#define FAST_PATH_BIT_POSITION(n, l) \ - (AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \ - AssertMacro((l) < FAST_PATH_BITS_PER_SLOT+FAST_PATH_LOCKNUMBER_OFFSET), \ - AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \ - ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n))) +#define FAST_PATH_BIT_POSITION(n, l) \ + (AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \ + AssertMacro((l) < FAST_PATH_BITS_PER_SLOT + FAST_PATH_LOCKNUMBER_OFFSET), \ + AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \ + ((l)-FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n))) #define FAST_PATH_SET_LOCKMODE(proc, n, l) \ - (proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l) + (proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l) #define FAST_PATH_CLEAR_LOCKMODE(proc, n, l) \ - (proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)) + (proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)) #define FAST_PATH_CHECK_LOCKMODE(proc, n, l) \ - ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))) + ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))) /* * The fast-path lock mechanism is concerned only with relation locks on @@ -223,17 +218,17 @@ 
static bool IsPageLockHeld PG_USED_FOR_ASSERTS_ONLY = false; * self-conflicting, it can't use the fast-path mechanism; but it also does * not conflict with any of the locks that do, so we can ignore it completely. */ -#define EligibleForRelationFastPath(locktag, mode) \ +#define EligibleForRelationFastPath(locktag, mode) \ ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \ - (locktag)->locktag_type == LOCKTAG_RELATION && \ - (locktag)->locktag_field1 == MyDatabaseId && \ - MyDatabaseId != InvalidOid && \ - (mode) < ShareUpdateExclusiveLock) -#define ConflictsWithRelationFastPath(locktag, mode) \ + (locktag)->locktag_type == LOCKTAG_RELATION && \ + (locktag)->locktag_field1 == MyDatabaseId && \ + MyDatabaseId != InvalidOid && \ + (mode) < ShareUpdateExclusiveLock) +#define ConflictsWithRelationFastPath(locktag, mode) \ ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \ - (locktag)->locktag_type == LOCKTAG_RELATION && \ - (locktag)->locktag_field1 != InvalidOid && \ - (mode) > ShareUpdateExclusiveLock) + (locktag)->locktag_type == LOCKTAG_RELATION && \ + (locktag)->locktag_field1 != InvalidOid && \ + (mode) > ShareUpdateExclusiveLock) static bool FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode); static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode); @@ -256,7 +251,7 @@ static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock); * the fast-path queues to the main lock table. 
*/ -#define FAST_PATH_STRONG_LOCK_HASH_BITS 10 +#define FAST_PATH_STRONG_LOCK_HASH_BITS 10 #define FAST_PATH_STRONG_LOCK_HASH_PARTITIONS \ (1 << FAST_PATH_STRONG_LOCK_HASH_BITS) #define FastPathStrongLockHashPartition(hashcode) \ @@ -264,12 +259,22 @@ static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock); typedef struct { - slock_t mutex; - uint32 count[FAST_PATH_STRONG_LOCK_HASH_PARTITIONS]; + slock_t mutex; + uint32 count[FAST_PATH_STRONG_LOCK_HASH_PARTITIONS]; } FastPathStrongRelationLockData; + + static volatile FastPathStrongRelationLockData *FastPathStrongRelationLocks; +typedef struct +{ + slock_t mutex; + uint64 offset; + uint64 ino; +} globalOffset; + +static volatile globalOffset *secondBufferGlobalOffset; /* * Pointers to hash tables containing lock state @@ -282,12 +287,17 @@ static HTAB *LockMethodProcLockHash; static HTAB *LockMethodLocalHash; + +/* +fs meta code +*/ +static HTAB *FSMetaHash; + /* private state for error cleanup */ static LOCALLOCK *StrongLockInProgress; static LOCALLOCK *awaitedLock; static ResourceOwner awaitedOwner; - #ifdef LOCK_DEBUG /*------ @@ -307,24 +317,21 @@ static ResourceOwner awaitedOwner; * -------- */ -int Trace_lock_oidmin = FirstNormalObjectId; -bool Trace_locks = false; -bool Trace_userlocks = false; -int Trace_lock_table = 0; -bool Debug_deadlocks = false; - +int Trace_lock_oidmin = FirstNormalObjectId; +bool Trace_locks = false; +bool Trace_userlocks = false; +int Trace_lock_table = 0; +bool Debug_deadlocks = false; inline static bool LOCK_DEBUG_ENABLED(const LOCKTAG *tag) { - return - (*(LockMethods[tag->locktag_lockmethodid]->trace_flag) && - ((Oid) tag->locktag_field2 >= (Oid) Trace_lock_oidmin)) - || (Trace_lock_table && + return (*(LockMethods[tag->locktag_lockmethodid]->trace_flag) && + ((Oid)tag->locktag_field2 >= (Oid)Trace_lock_oidmin)) || + (Trace_lock_table && (tag->locktag_field2 == Trace_lock_table)); } - inline static void LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type) { 
@@ -348,7 +355,6 @@ LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type) LockMethods[LOCK_LOCKMETHOD(*lock)]->lockModeNames[type]); } - inline static void PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP) { @@ -357,14 +363,13 @@ PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP) "%s: proclock(%p) lock(%p) method(%u) proc(%p) hold(%x)", where, proclockP, proclockP->tag.myLock, PROCLOCK_LOCKMETHOD(*(proclockP)), - proclockP->tag.myProc, (int) proclockP->holdMask); + proclockP->tag.myProc, (int)proclockP->holdMask); } -#else /* not LOCK_DEBUG */ - -#define LOCK_PRINT(where, lock, type) ((void) 0) -#define PROCLOCK_PRINT(where, proclockP) ((void) 0) -#endif /* not LOCK_DEBUG */ +#else /* not LOCK_DEBUG */ +#define LOCK_PRINT(where, lock, type) ((void)0) +#define PROCLOCK_PRINT(where, proclockP) ((void)0) +#endif /* not LOCK_DEBUG */ static uint32 proclock_hash(const void *key, Size keysize); static void RemoveLocalLock(LOCALLOCK *locallock); @@ -387,7 +392,6 @@ static void LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data); - /* * InitLocks -- Initialize the lock manager's data structures. * @@ -400,13 +404,12 @@ static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc, * backend re-executes this code to obtain pointers to the already existing * shared hash tables and to create its locallock hash table. */ -void -InitLocks(void) +void InitLocks(void) { - HASHCTL info; - long init_table_size, - max_table_size; - bool found; + HASHCTL info; + long init_table_size, + max_table_size; + bool found; /* * Compute init/max size to request for lock hashtables. 
Note these
@@ -479,6 +482,42 @@ InitLocks(void)
 }
 
+// Publish a new global offset/inode pair under the spinlock; a zero
+// argument means "leave that field unchanged".
+void setglobaloffset(uint64 offset,uint64 ino){
+    SpinLockAcquire(&secondBufferGlobalOffset->mutex);
+    if (offset !=0){
+        secondBufferGlobalOffset->offset=offset;
+    }
+    if (ino!=0){
+        secondBufferGlobalOffset->ino = ino;
+    }
+    SpinLockRelease(&secondBufferGlobalOffset->mutex);
+}
+
+/*
+ fs meta
+ */
+void InitFSMetaHash(void)
+{
+    HASHCTL info;
+    long init_table_size,
+        max_table_size;
+    bool found;
+
+    max_table_size = 200;
+    init_table_size = max_table_size / 2;
+    info.keysize = sizeof(FSKey);
+    info.entrysize = sizeof(FSValue);
+
+    info.num_partitions = NUM_LOCK_PARTITIONS;
+
+    FSMetaHash = ShmemInitHash("FSMeta hash",
+                               init_table_size,
+                               max_table_size,
+                               &info,
+                               HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
+}
+
+
 /*
  * Fetch the lock method table associated with a given lock
  */
@@ -497,13 +536,12 @@ GetLocksMethodTable(const LOCK *lock)
 LockMethod
 GetLockTagsMethodTable(const LOCKTAG *locktag)
 {
-	LOCKMETHODID lockmethodid = (LOCKMETHODID) locktag->locktag_lockmethodid;
+	LOCKMETHODID lockmethodid = (LOCKMETHODID)locktag->locktag_lockmethodid;
 
 	Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
 
 	return LockMethods[lockmethodid];
 }
-
 /*
  * Compute the hash code associated with a LOCKTAG.
* @@ -515,7 +553,7 @@ GetLockTagsMethodTable(const LOCKTAG *locktag) uint32 LockTagHashCode(const LOCKTAG *locktag) { - return get_hash_value(LockMethodLockHash, (const void *) locktag); + return get_hash_value(LockMethodLockHash, (const void *)locktag); } /* @@ -532,9 +570,9 @@ LockTagHashCode(const LOCKTAG *locktag) static uint32 proclock_hash(const void *key, Size keysize) { - const PROCLOCKTAG *proclocktag = (const PROCLOCKTAG *) key; - uint32 lockhash; - Datum procptr; + const PROCLOCKTAG *proclocktag = (const PROCLOCKTAG *)key; + uint32 lockhash; + Datum procptr; Assert(keysize == sizeof(PROCLOCKTAG)); @@ -549,7 +587,7 @@ proclock_hash(const void *key, Size keysize) * intermediate variable to suppress cast-pointer-to-int warnings. */ procptr = PointerGetDatum(proclocktag->myProc); - lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS; + lockhash ^= ((uint32)procptr) << LOG2_NUM_LOCK_PARTITIONS; return lockhash; } @@ -563,14 +601,14 @@ proclock_hash(const void *key, Size keysize) static inline uint32 ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode) { - uint32 lockhash = hashcode; - Datum procptr; + uint32 lockhash = hashcode; + Datum procptr; /* * This must match proclock_hash()! */ procptr = PointerGetDatum(proclocktag->myProc); - lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS; + lockhash ^= ((uint32)procptr) << LOG2_NUM_LOCK_PARTITIONS; return lockhash; } @@ -578,10 +616,9 @@ ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode) /* * Given two lock modes, return whether they would conflict. 
*/ -bool -DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2) +bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2) { - LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; + LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; if (lockMethodTable->conflictTab[mode1] & LOCKBIT_ON(mode2)) return true; @@ -593,11 +630,10 @@ DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2) * LockHeldByMe -- test whether lock 'locktag' is held with mode 'lockmode' * by the current transaction */ -bool -LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode) +bool LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode) { LOCALLOCKTAG localtag; - LOCALLOCK *locallock; + LOCALLOCK *locallock; /* * See if there is a LOCALLOCK entry for this lock and lockmode @@ -606,9 +642,9 @@ LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode) localtag.lock = *locktag; localtag.mode = lockmode; - locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, - (void *) &localtag, - HASH_FIND, NULL); + locallock = (LOCALLOCK *)hash_search(LockMethodLocalHash, + (void *)&localtag, + HASH_FIND, NULL); return (locallock && locallock->nLocks > 0); } @@ -629,17 +665,16 @@ GetLockMethodLocalHash(void) * LockHasWaiters -- look up 'locktag' and check if releasing this * lock would wake up other processes waiting for it. 
*/ -bool -LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) +bool LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) { LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; - LockMethod lockMethodTable; + LockMethod lockMethodTable; LOCALLOCKTAG localtag; - LOCALLOCK *locallock; - LOCK *lock; - PROCLOCK *proclock; - LWLock *partitionLock; - bool hasWaiters = false; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + LWLock *partitionLock; + bool hasWaiters = false; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -661,9 +696,9 @@ LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) localtag.lock = *locktag; localtag.mode = lockmode; - locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, - (void *) &localtag, - HASH_FIND, NULL); + locallock = (LOCALLOCK *)hash_search(LockMethodLocalHash, + (void *)&localtag, + HASH_FIND, NULL); /* * let the caller print its own error message, too. Do not ereport(ERROR). 
@@ -775,17 +810,17 @@ LockAcquireExtended(const LOCKTAG *locktag, LOCALLOCK **locallockp) { LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; - LockMethod lockMethodTable; + LockMethod lockMethodTable; LOCALLOCKTAG localtag; - LOCALLOCK *locallock; - LOCK *lock; - PROCLOCK *proclock; - bool found; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + bool found; ResourceOwner owner; - uint32 hashcode; - LWLock *partitionLock; - bool found_conflict; - bool log_lock = false; + uint32 hashcode; + LWLock *partitionLock; + bool found_conflict; + bool log_lock = false; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -823,9 +858,9 @@ LockAcquireExtended(const LOCKTAG *locktag, localtag.lock = *locktag; localtag.mode = lockmode; - locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, - (void *) &localtag, - HASH_ENTER, &found); + locallock = (LOCALLOCK *)hash_search(LockMethodLocalHash, + (void *)&localtag, + HASH_ENTER, &found); /* * if it's a new locallock object, initialize it @@ -840,7 +875,7 @@ LockAcquireExtended(const LOCKTAG *locktag, locallock->lockCleared = false; locallock->numLockOwners = 0; locallock->maxLockOwners = 8; - locallock->lockOwners = NULL; /* in case next line fails */ + locallock->lockOwners = NULL; /* in case next line fails */ locallock->lockOwners = (LOCALLOCKOWNER *) MemoryContextAlloc(TopMemoryContext, locallock->maxLockOwners * sizeof(LOCALLOCKOWNER)); @@ -850,7 +885,7 @@ LockAcquireExtended(const LOCKTAG *locktag, /* Make sure there will be room to remember the lock */ if (locallock->numLockOwners >= locallock->maxLockOwners) { - int newsize = locallock->maxLockOwners * 2; + int newsize = locallock->maxLockOwners * 2; locallock->lockOwners = (LOCALLOCKOWNER *) repalloc(locallock->lockOwners, @@ -926,8 +961,8 @@ LockAcquireExtended(const LOCKTAG *locktag, if (EligibleForRelationFastPath(locktag, lockmode) && FastPathLocalUseCount < 
FP_LOCK_SLOTS_PER_BACKEND) { - uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); - bool acquired; + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + bool acquired; /* * LWLockAcquire acts as a memory sequencing point, so it's safe to @@ -964,7 +999,7 @@ LockAcquireExtended(const LOCKTAG *locktag, */ if (ConflictsWithRelationFastPath(locktag, lockmode)) { - uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); BeginStrongLockAcquire(locallock, fasthashcode); if (!FastPathTransferRelationLocks(lockMethodTable, locktag, @@ -1054,13 +1089,13 @@ LockAcquireExtended(const LOCKTAG *locktag, AbortStrongLockAcquire(); if (proclock->holdMask == 0) { - uint32 proclock_hashcode; + uint32 proclock_hashcode; proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode); SHMQueueDelete(&proclock->lockLink); SHMQueueDelete(&proclock->procLink); if (!hash_search_with_hash_value(LockMethodProcLockHash, - (void *) &(proclock->tag), + (void *)&(proclock->tag), proclock_hashcode, HASH_REMOVE, NULL)) @@ -1169,20 +1204,20 @@ static PROCLOCK * SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode) { - LOCK *lock; - PROCLOCK *proclock; + LOCK *lock; + PROCLOCK *proclock; PROCLOCKTAG proclocktag; - uint32 proclock_hashcode; - bool found; + uint32 proclock_hashcode; + bool found; /* * Find or create a lock with this tag. 
*/ - lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, - (const void *) locktag, - hashcode, - HASH_ENTER_NULL, - &found); + lock = (LOCK *)hash_search_with_hash_value(LockMethodLockHash, + (const void *)locktag, + hashcode, + HASH_ENTER_NULL, + &found); if (!lock) return NULL; @@ -1220,11 +1255,11 @@ SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, /* * Find or create a proclock entry with this tag */ - proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, - (void *) &proclocktag, - proclock_hashcode, - HASH_ENTER_NULL, - &found); + proclock = (PROCLOCK *)hash_search_with_hash_value(LockMethodProcLockHash, + (void *)&proclocktag, + proclock_hashcode, + HASH_ENTER_NULL, + &found); if (!proclock) { /* Oops, not enough shmem for the proclock */ @@ -1238,7 +1273,7 @@ SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, */ Assert(SHMQueueEmpty(&(lock->procLocks))); if (!hash_search_with_hash_value(LockMethodLockHash, - (void *) &(lock->tag), + (void *)&(lock->tag), hashcode, HASH_REMOVE, NULL)) @@ -1252,7 +1287,7 @@ SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, */ if (!found) { - uint32 partition = LockHashPartition(hashcode); + uint32 partition = LockHashPartition(hashcode); /* * It might seem unsafe to access proclock->groupLeader without a @@ -1264,8 +1299,7 @@ SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, * lock group leader without first releasing all of its locks (and in * particular the one we are currently transferring). */ - proclock->groupLeader = proc->lockGroupLeader != NULL ? - proc->lockGroupLeader : proc; + proclock->groupLeader = proc->lockGroupLeader != NULL ? proc->lockGroupLeader : proc; proclock->holdMask = 0; proclock->releaseMask = 0; /* Add proclock to appropriate lists */ @@ -1296,16 +1330,16 @@ SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, * better to use a table. For now, though, this works. 
*/ { - int i; + int i; for (i = lockMethodTable->numLockModes; i > 0; i--) { if (proclock->holdMask & LOCKBIT_ON(i)) { - if (i >= (int) lockmode) - break; /* safe: we have a lock >= req level */ + if (i >= (int)lockmode) + break; /* safe: we have a lock >= req level */ elog(LOG, "deadlock risk: raising lock level" - " from %s to %s on object %u/%u/%u", + " from %s to %s on object %u/%u/%u", lockMethodTable->lockModeNames[i], lockMethodTable->lockModeNames[lockmode], lock->tag.locktag_field1, lock->tag.locktag_field2, @@ -1314,7 +1348,7 @@ SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, } } } -#endif /* CHECK_DEADLOCK_RISK */ +#endif /* CHECK_DEADLOCK_RISK */ } /* @@ -1365,7 +1399,7 @@ CheckAndSetLockHeld(LOCALLOCK *locallock, bool acquired) static void RemoveLocalLock(LOCALLOCK *locallock) { - int i; + int i; for (i = locallock->numLockOwners - 1; i >= 0; i--) { @@ -1379,7 +1413,7 @@ RemoveLocalLock(LOCALLOCK *locallock) if (locallock->holdsStrongLockCount) { - uint32 fasthashcode; + uint32 fasthashcode; fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode); @@ -1391,7 +1425,7 @@ RemoveLocalLock(LOCALLOCK *locallock) } if (!hash_search(LockMethodLocalHash, - (void *) &(locallock->tag), + (void *)&(locallock->tag), HASH_REMOVE, NULL)) elog(WARNING, "locallock table corrupted"); @@ -1415,20 +1449,19 @@ RemoveLocalLock(LOCALLOCK *locallock) * the same group. So, we must subtract off these locks when determining * whether the requested new lock conflicts with those already held. 
*/ -bool -LockCheckConflicts(LockMethod lockMethodTable, - LOCKMODE lockmode, - LOCK *lock, - PROCLOCK *proclock) +bool LockCheckConflicts(LockMethod lockMethodTable, + LOCKMODE lockmode, + LOCK *lock, + PROCLOCK *proclock) { - int numLockModes = lockMethodTable->numLockModes; - LOCKMASK myLocks; - int conflictMask = lockMethodTable->conflictTab[lockmode]; - int conflictsRemaining[MAX_LOCKMODES]; - int totalConflictsRemaining = 0; - int i; - SHM_QUEUE *procLocks; - PROCLOCK *otherproclock; + int numLockModes = lockMethodTable->numLockModes; + LOCKMASK myLocks; + int conflictMask = lockMethodTable->conflictTab[lockmode]; + int conflictsRemaining[MAX_LOCKMODES]; + int totalConflictsRemaining = 0; + int i; + SHM_QUEUE *procLocks; + PROCLOCK *otherproclock; /* * first check for global conflicts: If no locks conflict with my request, @@ -1509,7 +1542,7 @@ LockCheckConflicts(LockMethod lockMethodTable, proclock->groupLeader == otherproclock->groupLeader && (otherproclock->holdMask & conflictMask) != 0) { - int intersectMask = otherproclock->holdMask & conflictMask; + int intersectMask = otherproclock->holdMask & conflictMask; for (i = 1; i <= numLockModes; i++) { @@ -1550,8 +1583,7 @@ LockCheckConflicts(LockMethod lockMethodTable, * table entry; but since we may be awaking some other process, we can't do * that here; it's done by GrantLockLocal, instead. 
*/ -void -GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode) +void GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode) { lock->nGranted++; lock->granted[lockmode]++; @@ -1577,7 +1609,7 @@ static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode, PROCLOCK *proclock, LockMethod lockMethodTable) { - bool wakeupNeeded = false; + bool wakeupNeeded = false; Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0)); Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0)); @@ -1641,14 +1673,14 @@ CleanUpLock(LOCK *lock, PROCLOCK *proclock, */ if (proclock->holdMask == 0) { - uint32 proclock_hashcode; + uint32 proclock_hashcode; PROCLOCK_PRINT("CleanUpLock: deleting", proclock); SHMQueueDelete(&proclock->lockLink); SHMQueueDelete(&proclock->procLink); proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode); if (!hash_search_with_hash_value(LockMethodProcLockHash, - (void *) &(proclock->tag), + (void *)&(proclock->tag), proclock_hashcode, HASH_REMOVE, NULL)) @@ -1664,7 +1696,7 @@ CleanUpLock(LOCK *lock, PROCLOCK *proclock, LOCK_PRINT("CleanUpLock: deleting", lock, 0); Assert(SHMQueueEmpty(&(lock->procLocks))); if (!hash_search_with_hash_value(LockMethodLockHash, - (void *) &(lock->tag), + (void *)&(lock->tag), hashcode, HASH_REMOVE, NULL)) @@ -1688,7 +1720,7 @@ static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner) { LOCALLOCKOWNER *lockOwners = locallock->lockOwners; - int i; + int i; Assert(locallock->numLockOwners < locallock->maxLockOwners); /* Count the total */ @@ -1752,11 +1784,10 @@ FinishStrongLockAcquire(void) * AbortStrongLockAcquire - undo strong lock state changes performed by * BeginStrongLockAcquire. 
*/ -void -AbortStrongLockAcquire(void) +void AbortStrongLockAcquire(void) { - uint32 fasthashcode; - LOCALLOCK *locallock = StrongLockInProgress; + uint32 fasthashcode; + LOCALLOCK *locallock = StrongLockInProgress; if (locallock == NULL) return; @@ -1781,8 +1812,7 @@ AbortStrongLockAcquire(void) * We could just export GrantLockLocal, but that would require including * resowner.h in lock.h, which creates circularity. */ -void -GrantAwaitedLock(void) +void GrantAwaitedLock(void) { GrantLockLocal(awaitedLock, awaitedOwner); } @@ -1794,8 +1824,7 @@ GrantAwaitedLock(void) * sessions generated before we acquired this lock, and so we can confidently * assume we know about any catalog changes protected by this lock. */ -void -MarkLockClear(LOCALLOCK *locallock) +void MarkLockClear(LOCALLOCK *locallock) { Assert(locallock->nLocks > 0); locallock->lockCleared = true; @@ -1813,8 +1842,8 @@ static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) { LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock); - LockMethod lockMethodTable = LockMethods[lockmethodid]; - char *volatile new_status = NULL; + LockMethod lockMethodTable = LockMethods[lockmethodid]; + char *volatile new_status = NULL; LOCK_PRINT("WaitOnLock: sleeping on lock", locallock->lock, locallock->tag.mode); @@ -1823,10 +1852,10 @@ WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) if (update_process_title) { const char *old_status; - int len; + int len; old_status = get_ps_display(&len); - new_status = (char *) palloc(len + 8 + 1); + new_status = (char *)palloc(len + 8 + 1); memcpy(new_status, old_status, len); strcpy(new_status + len, " waiting"); set_ps_display(new_status); @@ -1913,12 +1942,11 @@ WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) * * NB: this does not clean up any locallock object that may exist for the lock. 
*/ -void -RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode) +void RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode) { - LOCK *waitLock = proc->waitLock; - PROCLOCK *proclock = proc->waitProcLock; - LOCKMODE lockmode = proc->waitLockMode; + LOCK *waitLock = proc->waitLock; + PROCLOCK *proclock = proc->waitProcLock; + LOCKMODE lockmode = proc->waitLockMode; LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*waitLock); /* Make sure proc is waiting */ @@ -1970,17 +1998,16 @@ RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode) * the waking process and any new process to * come along and request the lock.) */ -bool -LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) +bool LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) { LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; - LockMethod lockMethodTable; + LockMethod lockMethodTable; LOCALLOCKTAG localtag; - LOCALLOCK *locallock; - LOCK *lock; - PROCLOCK *proclock; - LWLock *partitionLock; - bool wakeupNeeded; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + LWLock *partitionLock; + bool wakeupNeeded; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -2002,9 +2029,9 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) localtag.lock = *locktag; localtag.mode = lockmode; - locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, - (void *) &localtag, - HASH_FIND, NULL); + locallock = (LOCALLOCK *)hash_search(LockMethodLocalHash, + (void *)&localtag, + HASH_FIND, NULL); /* * let the caller print its own error message, too. Do not ereport(ERROR). 
@@ -2022,7 +2049,7 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) { LOCALLOCKOWNER *lockOwners = locallock->lockOwners; ResourceOwner owner; - int i; + int i; /* Identify owner for lock */ if (sessionLock) @@ -2078,7 +2105,7 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) if (EligibleForRelationFastPath(locktag, lockmode) && FastPathLocalUseCount > 0) { - bool released; + bool released; /* * We might not find the lock here, even if we originally entered it @@ -2116,21 +2143,21 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) PROCLOCKTAG proclocktag; Assert(EligibleForRelationFastPath(locktag, lockmode)); - lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, - (const void *) locktag, - locallock->hashcode, - HASH_FIND, - NULL); + lock = (LOCK *)hash_search_with_hash_value(LockMethodLockHash, + (const void *)locktag, + locallock->hashcode, + HASH_FIND, + NULL); if (!lock) elog(ERROR, "failed to re-find shared lock object"); locallock->lock = lock; proclocktag.myLock = lock; proclocktag.myProc = MyProc; - locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash, - (void *) &proclocktag, - HASH_FIND, - NULL); + locallock->proclock = (PROCLOCK *)hash_search(LockMethodProcLockHash, + (void *)&proclocktag, + HASH_FIND, + NULL); if (!locallock->proclock) elog(ERROR, "failed to re-find shared proclock object"); } @@ -2175,18 +2202,17 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) * allLocks == true: release all locks including session locks. * allLocks == false: release all non-session locks. 
*/ -void -LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) +void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) { HASH_SEQ_STATUS status; - LockMethod lockMethodTable; - int i, - numLockModes; - LOCALLOCK *locallock; - LOCK *lock; - PROCLOCK *proclock; - int partition; - bool have_fast_path_lwlock = false; + LockMethod lockMethodTable; + int i, + numLockModes; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + int partition; + bool have_fast_path_lwlock = false; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -2218,7 +2244,7 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) */ hash_seq_init(&status, LockMethodLocalHash); - while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + while ((locallock = (LOCALLOCK *)hash_seq_search(&status)) != NULL) { /* * If the LOCALLOCK entry is unused, we must've run out of shared @@ -2273,8 +2299,8 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) */ if (locallock->proclock == NULL || locallock->lock == NULL) { - LOCKMODE lockmode = locallock->tag.mode; - Oid relid; + LOCKMODE lockmode = locallock->tag.mode; + Oid relid; /* Verify that a fast-path lock is what we've got. 
*/ if (!EligibleForRelationFastPath(&locallock->tag.lock, lockmode)) @@ -2340,9 +2366,9 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) */ for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++) { - LWLock *partitionLock; - SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]); - PROCLOCK *nextplock; + LWLock *partitionLock; + SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]); + PROCLOCK *nextplock; partitionLock = LockHashPartitionLockByIndex(partition); @@ -2367,16 +2393,16 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) */ if (SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, procLink)) == NULL) - continue; /* needn't examine this partition */ + continue; /* needn't examine this partition */ LWLockAcquire(partitionLock, LW_EXCLUSIVE); - for (proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, - offsetof(PROCLOCK, procLink)); + for (proclock = (PROCLOCK *)SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, procLink)); proclock; proclock = nextplock) { - bool wakeupNeeded = false; + bool wakeupNeeded = false; /* Get link first, since we may unlink/delete this proclock */ nextplock = (PROCLOCK *) @@ -2434,10 +2460,10 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) lockMethodTable, LockTagHashCode(&lock->tag), wakeupNeeded); - } /* loop over PROCLOCKs within this partition */ + } /* loop over PROCLOCKs within this partition */ LWLockRelease(partitionLock); - } /* loop over partitions */ + } /* loop over partitions */ #ifdef LOCK_DEBUG if (*(lockMethodTable->trace_flag)) @@ -2449,18 +2475,17 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) * LockReleaseSession -- Release all session locks of the specified lock method * that are held by the current process. 
*/ -void -LockReleaseSession(LOCKMETHODID lockmethodid) +void LockReleaseSession(LOCKMETHODID lockmethodid) { HASH_SEQ_STATUS status; - LOCALLOCK *locallock; + LOCALLOCK *locallock; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); hash_seq_init(&status, LockMethodLocalHash); - while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + while ((locallock = (LOCALLOCK *)hash_seq_search(&status)) != NULL) { /* Ignore items that are not of the specified lock method */ if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid) @@ -2479,22 +2504,21 @@ LockReleaseSession(LOCKMETHODID lockmethodid) * Otherwise, pass NULL for locallocks, and we'll traverse through our hash * table to find them. */ -void -LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks) +void LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks) { if (locallocks == NULL) { HASH_SEQ_STATUS status; - LOCALLOCK *locallock; + LOCALLOCK *locallock; hash_seq_init(&status, LockMethodLocalHash); - while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + while ((locallock = (LOCALLOCK *)hash_seq_search(&status)) != NULL) ReleaseLockIfHeld(locallock, false); } else { - int i; + int i; for (i = nlocks - 1; i >= 0; i--) ReleaseLockIfHeld(locallocks[i], false); @@ -2519,7 +2543,7 @@ ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock) { ResourceOwner owner; LOCALLOCKOWNER *lockOwners; - int i; + int i; /* Identify owner for lock (must match LockRelease!) */ if (sessionLock) @@ -2574,8 +2598,7 @@ ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock) * (e.g pg_dump with a large schema). Otherwise, pass NULL for locallocks, * and we'll traverse through our hash table to find them. 
*/ -void -LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks) +void LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks) { ResourceOwner parent = ResourceOwnerGetParent(CurrentResourceOwner); @@ -2584,16 +2607,16 @@ LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks) if (locallocks == NULL) { HASH_SEQ_STATUS status; - LOCALLOCK *locallock; + LOCALLOCK *locallock; hash_seq_init(&status, LockMethodLocalHash); - while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + while ((locallock = (LOCALLOCK *)hash_seq_search(&status)) != NULL) LockReassignOwner(locallock, parent); } else { - int i; + int i; for (i = nlocks - 1; i >= 0; i--) LockReassignOwner(locallocks[i], parent); @@ -2608,9 +2631,9 @@ static void LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent) { LOCALLOCKOWNER *lockOwners; - int i; - int ic = -1; - int ip = -1; + int i; + int ic = -1; + int ip = -1; /* * Scan to see if there are any locks belonging to current owner or its @@ -2626,7 +2649,7 @@ LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent) } if (ic < 0) - return; /* no current locks */ + return; /* no current locks */ if (ip < 0) { @@ -2653,8 +2676,8 @@ LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent) static bool FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode) { - uint32 f; - uint32 unused_slot = FP_LOCK_SLOTS_PER_BACKEND; + uint32 f; + uint32 unused_slot = FP_LOCK_SLOTS_PER_BACKEND; /* Scan for existing entry for this relid, remembering empty slot. 
*/ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) @@ -2690,14 +2713,13 @@ FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode) static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode) { - uint32 f; - bool result = false; + uint32 f; + bool result = false; FastPathLocalUseCount = 0; for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) { - if (MyProc->fpRelId[f] == relid - && FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode)) + if (MyProc->fpRelId[f] == relid && FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode)) { Assert(!result); FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode); @@ -2721,9 +2743,9 @@ static bool FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag, uint32 hashcode) { - LWLock *partitionLock = LockHashPartitionLock(hashcode); - Oid relid = locktag->locktag_field2; - uint32 i; + LWLock *partitionLock = LockHashPartitionLock(hashcode); + Oid relid = locktag->locktag_field2; + uint32 i; /* * Every PGPROC that can potentially hold a fast-path lock is present in @@ -2733,8 +2755,8 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag */ for (i = 0; i < ProcGlobal->allProcCount; i++) { - PGPROC *proc = &ProcGlobal->allProcs[i]; - uint32 f; + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); @@ -2761,7 +2783,7 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) { - uint32 lockmode; + uint32 lockmode; /* Look for an allocated slot matching the given relid. 
*/ if (relid != proc->fpRelId[f] || FAST_PATH_GET_BITS(proc, f) == 0) @@ -2773,7 +2795,7 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag lockmode < FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT; ++lockmode) { - PROCLOCK *proclock; + PROCLOCK *proclock; if (!FAST_PATH_CHECK_LOCKMODE(proc, f, lockmode)) continue; @@ -2808,18 +2830,18 @@ FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag static PROCLOCK * FastPathGetRelationLockEntry(LOCALLOCK *locallock) { - LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; - LOCKTAG *locktag = &locallock->tag.lock; - PROCLOCK *proclock = NULL; - LWLock *partitionLock = LockHashPartitionLock(locallock->hashcode); - Oid relid = locktag->locktag_field2; - uint32 f; + LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; + LOCKTAG *locktag = &locallock->tag.lock; + PROCLOCK *proclock = NULL; + LWLock *partitionLock = LockHashPartitionLock(locallock->hashcode); + Oid relid = locktag->locktag_field2; + uint32 f; LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) { - uint32 lockmode; + uint32 lockmode; /* Look for an allocated slot matching the given relid. */ if (relid != MyProc->fpRelId[f] || FAST_PATH_GET_BITS(MyProc, f) == 0) @@ -2858,17 +2880,17 @@ FastPathGetRelationLockEntry(LOCALLOCK *locallock) /* Lock may have already been transferred by some other backend. 
*/ if (proclock == NULL) { - LOCK *lock; + LOCK *lock; PROCLOCKTAG proclocktag; - uint32 proclock_hashcode; + uint32 proclock_hashcode; LWLockAcquire(partitionLock, LW_SHARED); - lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, - (void *) locktag, - locallock->hashcode, - HASH_FIND, - NULL); + lock = (LOCK *)hash_search_with_hash_value(LockMethodLockHash, + (void *)locktag, + locallock->hashcode, + HASH_FIND, + NULL); if (!lock) elog(ERROR, "failed to re-find shared lock object"); @@ -2878,7 +2900,7 @@ FastPathGetRelationLockEntry(LOCALLOCK *locallock) proclock_hashcode = ProcLockHashCode(&proclocktag, locallock->hashcode); proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, - (void *) &proclocktag, + (void *)&proclocktag, proclock_hashcode, HASH_FIND, NULL); @@ -2914,15 +2936,15 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) { static VirtualTransactionId *vxids; LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; - LockMethod lockMethodTable; - LOCK *lock; - LOCKMASK conflictMask; - SHM_QUEUE *procLocks; - PROCLOCK *proclock; - uint32 hashcode; - LWLock *partitionLock; - int count = 0; - int fast_count = 0; + LockMethod lockMethodTable; + LOCK *lock; + LOCKMASK conflictMask; + SHM_QUEUE *procLocks; + PROCLOCK *proclock; + uint32 hashcode; + LWLock *partitionLock; + int count = 0; + int fast_count = 0; if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) elog(ERROR, "unrecognized lock method: %d", lockmethodid); @@ -2941,7 +2963,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) vxids = (VirtualTransactionId *) MemoryContextAlloc(TopMemoryContext, sizeof(VirtualTransactionId) * - (MaxBackends + max_prepared_xacts + 1)); + (MaxBackends + max_prepared_xacts + 1)); } else vxids = (VirtualTransactionId *) @@ -2960,8 +2982,8 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) */ if (ConflictsWithRelationFastPath(locktag, lockmode)) { - 
int i; - Oid relid = locktag->locktag_field2; + int i; + Oid relid = locktag->locktag_field2; VirtualTransactionId vxid; /* @@ -2976,8 +2998,8 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) */ for (i = 0; i < ProcGlobal->allProcCount; i++) { - PGPROC *proc = &ProcGlobal->allProcs[i]; - uint32 f; + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; /* A backend never blocks itself */ if (proc == MyProc) @@ -3001,7 +3023,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) { - uint32 lockmask; + uint32 lockmask; /* Look for an allocated slot matching the given relid. */ if (relid != proc->fpRelId[f]) @@ -3041,11 +3063,11 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) */ LWLockAcquire(partitionLock, LW_SHARED); - lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, - (const void *) locktag, - hashcode, - HASH_FIND, - NULL); + lock = (LOCK *)hash_search_with_hash_value(LockMethodLockHash, + (const void *)locktag, + hashcode, + HASH_FIND, + NULL); if (!lock) { /* @@ -3066,14 +3088,14 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) procLocks = &(lock->procLocks); - proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, - offsetof(PROCLOCK, lockLink)); + proclock = (PROCLOCK *)SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, lockLink)); while (proclock) { if (conflictMask & proclock->holdMask) { - PGPROC *proc = proclock->tag.myProc; + PGPROC *proc = proclock->tag.myProc; /* A backend never blocks itself */ if (proc != MyProc) @@ -3084,7 +3106,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) if (VirtualTransactionIdIsValid(vxid)) { - int i; + int i; /* Avoid duplicate entries. 
*/ for (i = 0; i < fast_count; ++i) @@ -3097,13 +3119,13 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) } } - proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink, - offsetof(PROCLOCK, lockLink)); + proclock = (PROCLOCK *)SHMQueueNext(procLocks, &proclock->lockLink, + offsetof(PROCLOCK, lockLink)); } LWLockRelease(partitionLock); - if (count > MaxBackends + max_prepared_xacts) /* should never happen */ + if (count > MaxBackends + max_prepared_xacts) /* should never happen */ elog(PANIC, "too many conflicting locks found"); vxids[count].backendId = InvalidBackendId; @@ -3129,13 +3151,13 @@ LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, LOCKTAG *locktag, LOCKMODE lockmode, bool decrement_strong_lock_count) { - LOCK *lock; - PROCLOCK *proclock; + LOCK *lock; + PROCLOCK *proclock; PROCLOCKTAG proclocktag; - uint32 hashcode; - uint32 proclock_hashcode; - LWLock *partitionLock; - bool wakeupNeeded; + uint32 hashcode; + uint32 proclock_hashcode; + LWLock *partitionLock; + bool wakeupNeeded; hashcode = LockTagHashCode(locktag); partitionLock = LockHashPartitionLock(hashcode); @@ -3145,11 +3167,11 @@ LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, /* * Re-find the lock object (it had better be there). 
*/ - lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, - (void *) locktag, - hashcode, - HASH_FIND, - NULL); + lock = (LOCK *)hash_search_with_hash_value(LockMethodLockHash, + (void *)locktag, + hashcode, + HASH_FIND, + NULL); if (!lock) elog(PANIC, "failed to re-find shared lock object"); @@ -3161,11 +3183,11 @@ LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode); - proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, - (void *) &proclocktag, - proclock_hashcode, - HASH_FIND, - NULL); + proclock = (PROCLOCK *)hash_search_with_hash_value(LockMethodProcLockHash, + (void *)&proclocktag, + proclock_hashcode, + HASH_FIND, + NULL); if (!proclock) elog(PANIC, "failed to re-find shared proclock object"); @@ -3196,10 +3218,9 @@ LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, /* * Decrement strong lock count. This logic is needed only for 2PC. */ - if (decrement_strong_lock_count - && ConflictsWithRelationFastPath(locktag, lockmode)) + if (decrement_strong_lock_count && ConflictsWithRelationFastPath(locktag, lockmode)) { - uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); SpinLockAcquire(&FastPathStrongRelationLocks->mutex); Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0); @@ -3233,15 +3254,15 @@ CheckForSessionAndXactLocks(void) { typedef struct { - LOCKTAG lock; /* identifies the lockable object */ - bool sessLock; /* is any lockmode held at session level? */ - bool xactLock; /* is any lockmode held at xact level? */ + LOCKTAG lock; /* identifies the lockable object */ + bool sessLock; /* is any lockmode held at session level? */ + bool xactLock; /* is any lockmode held at xact level? 
*/ } PerLockTagEntry; - HASHCTL hash_ctl; - HTAB *lockhtab; + HASHCTL hash_ctl; + HTAB *lockhtab; HASH_SEQ_STATUS status; - LOCALLOCK *locallock; + LOCALLOCK *locallock; /* Create a local hash table keyed by LOCKTAG only */ hash_ctl.keysize = sizeof(LOCKTAG); @@ -3256,12 +3277,12 @@ CheckForSessionAndXactLocks(void) /* Scan local lock table to find entries for each LOCKTAG */ hash_seq_init(&status, LockMethodLocalHash); - while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + while ((locallock = (LOCALLOCK *)hash_seq_search(&status)) != NULL) { LOCALLOCKOWNER *lockOwners = locallock->lockOwners; PerLockTagEntry *hentry; - bool found; - int i; + bool found; + int i; /* * Ignore VXID locks. We don't want those to be held by prepared @@ -3275,10 +3296,10 @@ CheckForSessionAndXactLocks(void) continue; /* Otherwise, find or make an entry in lockhtab */ - hentry = (PerLockTagEntry *) hash_search(lockhtab, - (void *) &locallock->tag.lock, - HASH_ENTER, &found); - if (!found) /* initialize, if newly created */ + hentry = (PerLockTagEntry *)hash_search(lockhtab, + (void *)&locallock->tag.lock, + HASH_ENTER, &found); + if (!found) /* initialize, if newly created */ hentry->sessLock = hentry->xactLock = false; /* Scan to see if we hold lock at session or xact level or both */ @@ -3316,11 +3337,10 @@ CheckForSessionAndXactLocks(void) * Fast-path locks are an exception, however: we move any such locks to * the main table before allowing PREPARE TRANSACTION to succeed. 
*/ -void -AtPrepare_Locks(void) +void AtPrepare_Locks(void) { HASH_SEQ_STATUS status; - LOCALLOCK *locallock; + LOCALLOCK *locallock; /* First, verify there aren't locks of both xact and session level */ CheckForSessionAndXactLocks(); @@ -3328,13 +3348,13 @@ AtPrepare_Locks(void) /* Now do the per-locallock cleanup work */ hash_seq_init(&status, LockMethodLocalHash); - while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + while ((locallock = (LOCALLOCK *)hash_seq_search(&status)) != NULL) { TwoPhaseLockRecord record; LOCALLOCKOWNER *lockOwners = locallock->lockOwners; - bool haveSessionLock; - bool haveXactLock; - int i; + bool haveSessionLock; + bool haveXactLock; + int i; /* * Ignore VXID locks. We don't want those to be held by prepared @@ -3412,16 +3432,15 @@ AtPrepare_Locks(void) * and leave the LOCALLOCK entries to be garbage-collected by LockReleaseAll, * but that probably costs more cycles. */ -void -PostPrepare_Locks(TransactionId xid) +void PostPrepare_Locks(TransactionId xid) { - PGPROC *newproc = TwoPhaseGetDummyProc(xid, false); + PGPROC *newproc = TwoPhaseGetDummyProc(xid, false); HASH_SEQ_STATUS status; - LOCALLOCK *locallock; - LOCK *lock; - PROCLOCK *proclock; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; PROCLOCKTAG proclocktag; - int partition; + int partition; /* Can't prepare a lock group follower. 
*/ Assert(MyProc->lockGroupLeader == NULL || @@ -3441,12 +3460,12 @@ PostPrepare_Locks(TransactionId xid) */ hash_seq_init(&status, LockMethodLocalHash); - while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + while ((locallock = (LOCALLOCK *)hash_seq_search(&status)) != NULL) { LOCALLOCKOWNER *lockOwners = locallock->lockOwners; - bool haveSessionLock; - bool haveXactLock; - int i; + bool haveSessionLock; + bool haveXactLock; + int i; if (locallock->proclock == NULL || locallock->lock == NULL) { @@ -3496,9 +3515,9 @@ PostPrepare_Locks(TransactionId xid) */ for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++) { - LWLock *partitionLock; - SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]); - PROCLOCK *nextplock; + LWLock *partitionLock; + SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]); + PROCLOCK *nextplock; partitionLock = LockHashPartitionLockByIndex(partition); @@ -3512,12 +3531,12 @@ PostPrepare_Locks(TransactionId xid) */ if (SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, procLink)) == NULL) - continue; /* needn't examine this partition */ + continue; /* needn't examine this partition */ LWLockAcquire(partitionLock, LW_EXCLUSIVE); - for (proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, - offsetof(PROCLOCK, procLink)); + for (proclock = (PROCLOCK *)SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, procLink)); proclock; proclock = nextplock) { @@ -3583,8 +3602,8 @@ PostPrepare_Locks(TransactionId xid) * given lock with my own proc. 
*/ if (!hash_update_hash_key(LockMethodProcLockHash, - (void *) proclock, - (void *) &proclocktag)) + (void *)proclock, + (void *)&proclocktag)) elog(PANIC, "duplicate entry found while reassigning a prepared transaction's locks"); /* Re-link into the new proc's proclock list */ @@ -3592,23 +3611,21 @@ PostPrepare_Locks(TransactionId xid) &proclock->procLink); PROCLOCK_PRINT("PostPrepare_Locks: updated", proclock); - } /* loop over PROCLOCKs within this partition */ + } /* loop over PROCLOCKs within this partition */ LWLockRelease(partitionLock); - } /* loop over partitions */ + } /* loop over partitions */ END_CRIT_SECTION(); } - /* * Estimate shared-memory space used for lock tables */ -Size -LockShmemSize(void) +Size LockShmemSize(void) { - Size size = 0; - long max_table_size; + Size size = 0; + long max_table_size; /* lock hash table */ max_table_size = NLOCKENTS(); @@ -3644,19 +3661,19 @@ LockShmemSize(void) LockData * GetLockStatusData(void) { - LockData *data; - PROCLOCK *proclock; + LockData *data; + PROCLOCK *proclock; HASH_SEQ_STATUS seqstat; - int els; - int el; - int i; + int els; + int el; + int i; - data = (LockData *) palloc(sizeof(LockData)); + data = (LockData *)palloc(sizeof(LockData)); /* Guess how much space we'll need. */ els = MaxBackends; el = 0; - data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * els); + data->locks = (LockInstanceData *)palloc(sizeof(LockInstanceData) * els); /* * First, we iterate through the per-backend fast-path arrays, locking @@ -3672,15 +3689,15 @@ GetLockStatusData(void) */ for (i = 0; i < ProcGlobal->allProcCount; ++i) { - PGPROC *proc = &ProcGlobal->allProcs[i]; - uint32 f; + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; LWLockAcquire(&proc->fpInfoLock, LW_SHARED); for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; ++f) { LockInstanceData *instance; - uint32 lockbits = FAST_PATH_GET_BITS(proc, f); + uint32 lockbits = FAST_PATH_GET_BITS(proc, f); /* Skip unallocated slots. 
*/ if (!lockbits) @@ -3773,10 +3790,10 @@ GetLockStatusData(void) /* Now scan the tables to copy the data */ hash_seq_init(&seqstat, LockMethodProcLockHash); - while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat))) + while ((proclock = (PROCLOCK *)hash_seq_search(&seqstat))) { - PGPROC *proc = proclock->tag.myProc; - LOCK *lock = proclock->tag.myLock; + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; LockInstanceData *instance = &data->locks[el]; memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG)); @@ -3790,7 +3807,7 @@ GetLockStatusData(void) instance->pid = proc->pid; instance->leaderPid = proclock->groupLeader->pid; instance->fastpath = false; - instance->waitStart = (TimestampTz) pg_atomic_read_u64(&proc->waitStart); + instance->waitStart = (TimestampTz)pg_atomic_read_u64(&proc->waitStart); el++; } @@ -3837,10 +3854,10 @@ BlockedProcsData * GetBlockerStatusData(int blocked_pid) { BlockedProcsData *data; - PGPROC *proc; - int i; + PGPROC *proc; + int i; - data = (BlockedProcsData *) palloc(sizeof(BlockedProcsData)); + data = (BlockedProcsData *)palloc(sizeof(BlockedProcsData)); /* * Guess how much space we'll need, and preallocate. 
Most of the time @@ -3850,9 +3867,9 @@ GetBlockerStatusData(int blocked_pid) */ data->nprocs = data->nlocks = data->npids = 0; data->maxprocs = data->maxlocks = data->maxpids = MaxBackends; - data->procs = (BlockedProcData *) palloc(sizeof(BlockedProcData) * data->maxprocs); - data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * data->maxlocks); - data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids); + data->procs = (BlockedProcData *)palloc(sizeof(BlockedProcData) * data->maxprocs); + data->locks = (LockInstanceData *)palloc(sizeof(LockInstanceData) * data->maxlocks); + data->waiter_pids = (int *)palloc(sizeof(int) * data->maxpids); /* * In order to search the ProcArray for blocked_pid and assume that that @@ -3887,11 +3904,11 @@ GetBlockerStatusData(int blocked_pid) else { /* Examine all procs in proc's lock group */ - dlist_iter iter; + dlist_iter iter; dlist_foreach(iter, &proc->lockGroupLeader->lockGroupMembers) { - PGPROC *memberProc; + PGPROC *memberProc; memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur); GetSingleProcBlockerStatusData(memberProc, data); @@ -3916,14 +3933,14 @@ GetBlockerStatusData(int blocked_pid) static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data) { - LOCK *theLock = blocked_proc->waitLock; + LOCK *theLock = blocked_proc->waitLock; BlockedProcData *bproc; - SHM_QUEUE *procLocks; - PROCLOCK *proclock; + SHM_QUEUE *procLocks; + PROCLOCK *proclock; PROC_QUEUE *waitQueue; - PGPROC *proc; - int queue_size; - int i; + PGPROC *proc; + int queue_size; + int i; /* Nothing to do if this proc is not blocked */ if (theLock == NULL) @@ -3942,12 +3959,12 @@ GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data) /* Collect all PROCLOCKs associated with theLock */ procLocks = &(theLock->procLocks); - proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, - offsetof(PROCLOCK, lockLink)); + proclock = (PROCLOCK *)SHMQueueNext(procLocks, procLocks, + 
offsetof(PROCLOCK, lockLink)); while (proclock) { - PGPROC *proc = proclock->tag.myProc; - LOCK *lock = proclock->tag.myLock; + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; LockInstanceData *instance; if (data->nlocks >= data->maxlocks) @@ -3971,8 +3988,8 @@ GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data) instance->fastpath = false; data->nlocks++; - proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink, - offsetof(PROCLOCK, lockLink)); + proclock = (PROCLOCK *)SHMQueueNext(procLocks, &proclock->lockLink, + offsetof(PROCLOCK, lockLink)); } /* Enlarge waiter_pids[] if it's too small to hold all wait queue PIDs */ @@ -3983,18 +4000,18 @@ GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data) { data->maxpids = Max(data->maxpids + MaxBackends, data->npids + queue_size); - data->waiter_pids = (int *) repalloc(data->waiter_pids, - sizeof(int) * data->maxpids); + data->waiter_pids = (int *)repalloc(data->waiter_pids, + sizeof(int) * data->maxpids); } /* Collect PIDs from the lock's wait queue, stopping at blocked_proc */ - proc = (PGPROC *) waitQueue->links.next; + proc = (PGPROC *)waitQueue->links.next; for (i = 0; i < queue_size; i++) { if (proc == blocked_proc) break; data->waiter_pids[data->npids++] = proc->pid; - proc = (PGPROC *) proc->links.next; + proc = (PGPROC *)proc->links.next; } bproc->num_locks = data->nlocks - bproc->first_lock; @@ -4017,11 +4034,11 @@ xl_standby_lock * GetRunningTransactionLocks(int *nlocks) { xl_standby_lock *accessExclusiveLocks; - PROCLOCK *proclock; + PROCLOCK *proclock; HASH_SEQ_STATUS seqstat; - int i; - int index; - int els; + int i; + int index; + int els; /* * Acquire lock on the entire shared lock data structure. @@ -4051,14 +4068,14 @@ GetRunningTransactionLocks(int *nlocks) * non-exclusive lock types. 
*/ index = 0; - while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat))) + while ((proclock = (PROCLOCK *)hash_seq_search(&seqstat))) { /* make sure this definition matches the one used in LockAcquire */ if ((proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)) && proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION) { - PGPROC *proc = proclock->tag.myProc; - LOCK *lock = proclock->tag.myLock; + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; TransactionId xid = proc->xid; /* @@ -4109,13 +4126,12 @@ GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode) * * Caller is responsible for having acquired appropriate LWLocks. */ -void -DumpLocks(PGPROC *proc) +void DumpLocks(PGPROC *proc) { - SHM_QUEUE *procLocks; - PROCLOCK *proclock; - LOCK *lock; - int i; + SHM_QUEUE *procLocks; + PROCLOCK *proclock; + LOCK *lock; + int i; if (proc == NULL) return; @@ -4127,8 +4143,8 @@ DumpLocks(PGPROC *proc) { procLocks = &(proc->myProcLocks[i]); - proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, - offsetof(PROCLOCK, procLink)); + proclock = (PROCLOCK *)SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, procLink)); while (proclock) { @@ -4151,12 +4167,11 @@ DumpLocks(PGPROC *proc) * * Caller is responsible for having acquired appropriate LWLocks. 
*/ -void -DumpAllLocks(void) +void DumpAllLocks(void) { - PGPROC *proc; - PROCLOCK *proclock; - LOCK *lock; + PGPROC *proc; + PROCLOCK *proclock; + LOCK *lock; HASH_SEQ_STATUS status; proc = MyProc; @@ -4166,7 +4181,7 @@ DumpAllLocks(void) hash_seq_init(&status, LockMethodProcLockHash); - while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL) + while ((proclock = (PROCLOCK *)hash_seq_search(&status)) != NULL) { PROCLOCK_PRINT("DumpAllLocks", proclock); @@ -4177,7 +4192,7 @@ DumpAllLocks(void) elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL"); } } -#endif /* LOCK_DEBUG */ +#endif /* LOCK_DEBUG */ /* * LOCK 2PC resource manager's routines @@ -4208,24 +4223,23 @@ DumpAllLocks(void) * replaying the WAL record that needs to acquire a lock will throw an error * and PANIC anyway. */ -void -lock_twophase_recover(TransactionId xid, uint16 info, - void *recdata, uint32 len) +void lock_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) { - TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - PGPROC *proc = TwoPhaseGetDummyProc(xid, false); - LOCKTAG *locktag; - LOCKMODE lockmode; + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *)recdata; + PGPROC *proc = TwoPhaseGetDummyProc(xid, false); + LOCKTAG *locktag; + LOCKMODE lockmode; LOCKMETHODID lockmethodid; - LOCK *lock; - PROCLOCK *proclock; + LOCK *lock; + PROCLOCK *proclock; PROCLOCKTAG proclocktag; - bool found; - uint32 hashcode; - uint32 proclock_hashcode; - int partition; - LWLock *partitionLock; - LockMethod lockMethodTable; + bool found; + uint32 hashcode; + uint32 proclock_hashcode; + int partition; + LWLock *partitionLock; + LockMethod lockMethodTable; Assert(len == sizeof(TwoPhaseLockRecord)); locktag = &rec->locktag; @@ -4245,11 +4259,11 @@ lock_twophase_recover(TransactionId xid, uint16 info, /* * Find or create a lock with this tag. 
*/ - lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, - (void *) locktag, - hashcode, - HASH_ENTER_NULL, - &found); + lock = (LOCK *)hash_search_with_hash_value(LockMethodLockHash, + (void *)locktag, + hashcode, + HASH_ENTER_NULL, + &found); if (!lock) { LWLockRelease(partitionLock); @@ -4293,11 +4307,11 @@ lock_twophase_recover(TransactionId xid, uint16 info, /* * Find or create a proclock entry with this tag */ - proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, - (void *) &proclocktag, - proclock_hashcode, - HASH_ENTER_NULL, - &found); + proclock = (PROCLOCK *)hash_search_with_hash_value(LockMethodProcLockHash, + (void *)&proclocktag, + proclock_hashcode, + HASH_ENTER_NULL, + &found); if (!proclock) { /* Oops, not enough shmem for the proclock */ @@ -4311,7 +4325,7 @@ lock_twophase_recover(TransactionId xid, uint16 info, */ Assert(SHMQueueEmpty(&(lock->procLocks))); if (!hash_search_with_hash_value(LockMethodLockHash, - (void *) &(lock->tag), + (void *)&(lock->tag), hashcode, HASH_REMOVE, NULL)) @@ -4375,7 +4389,7 @@ lock_twophase_recover(TransactionId xid, uint16 info, */ if (ConflictsWithRelationFastPath(&lock->tag, lockmode)) { - uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); SpinLockAcquire(&FastPathStrongRelationLocks->mutex); FastPathStrongRelationLocks->count[fasthashcode]++; @@ -4389,13 +4403,12 @@ lock_twophase_recover(TransactionId xid, uint16 info, * Re-acquire a lock belonging to a transaction that was prepared, when * starting up into hot standby mode. 
*/ -void -lock_twophase_standby_recover(TransactionId xid, uint16 info, - void *recdata, uint32 len) +void lock_twophase_standby_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) { - TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - LOCKTAG *locktag; - LOCKMODE lockmode; + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *)recdata; + LOCKTAG *locktag; + LOCKMODE lockmode; LOCKMETHODID lockmethodid; Assert(len == sizeof(TwoPhaseLockRecord)); @@ -4410,26 +4423,24 @@ lock_twophase_standby_recover(TransactionId xid, uint16 info, locktag->locktag_type == LOCKTAG_RELATION) { StandbyAcquireAccessExclusiveLock(xid, - locktag->locktag_field1 /* dboid */ , - locktag->locktag_field2 /* reloid */ ); + locktag->locktag_field1 /* dboid */, + locktag->locktag_field2 /* reloid */); } } - /* * 2PC processing routine for COMMIT PREPARED case. * * Find and release the lock indicated by the 2PC record. */ -void -lock_twophase_postcommit(TransactionId xid, uint16 info, - void *recdata, uint32 len) +void lock_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len) { - TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; - PGPROC *proc = TwoPhaseGetDummyProc(xid, true); - LOCKTAG *locktag; + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *)recdata; + PGPROC *proc = TwoPhaseGetDummyProc(xid, true); + LOCKTAG *locktag; LOCKMETHODID lockmethodid; - LockMethod lockMethodTable; + LockMethod lockMethodTable; Assert(len == sizeof(TwoPhaseLockRecord)); locktag = &rec->locktag; @@ -4447,9 +4458,8 @@ lock_twophase_postcommit(TransactionId xid, uint16 info, * * This is actually just the same as the COMMIT case. 
*/ -void -lock_twophase_postabort(TransactionId xid, uint16 info, - void *recdata, uint32 len) +void lock_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len) { lock_twophase_postcommit(xid, info, recdata, len); } @@ -4471,8 +4481,7 @@ lock_twophase_postabort(TransactionId xid, uint16 info, * only ever released at the end of a transaction. Instead, * LockReleaseAll() calls VirtualXactLockTableCleanup(). */ -void -VirtualXactLockTableInsert(VirtualTransactionId vxid) +void VirtualXactLockTableInsert(VirtualTransactionId vxid) { Assert(VirtualTransactionIdIsValid(vxid)); @@ -4494,10 +4503,9 @@ VirtualXactLockTableInsert(VirtualTransactionId vxid) * Check whether a VXID lock has been materialized; if so, release it, * unblocking waiters. */ -void -VirtualXactLockTableCleanup(void) +void VirtualXactLockTableCleanup(void) { - bool fastpath; + bool fastpath; LocalTransactionId lxid; Assert(MyProc->backendId != InvalidBackendId); @@ -4521,7 +4529,7 @@ VirtualXactLockTableCleanup(void) if (!fastpath && LocalTransactionIdIsValid(lxid)) { VirtualTransactionId vxid; - LOCKTAG locktag; + LOCKTAG locktag; vxid.backendId = MyBackendId; vxid.localTransactionId = lxid; @@ -4547,7 +4555,7 @@ static bool XactLockForVirtualXact(VirtualTransactionId vxid, TransactionId xid, bool wait) { - bool more = false; + bool more = false; /* There is no point to wait for 2PCs if you have no 2PCs. */ if (max_prepared_xacts == 0) @@ -4556,7 +4564,7 @@ XactLockForVirtualXact(VirtualTransactionId vxid, do { LockAcquireResult lar; - LOCKTAG tag; + LOCKTAG tag; /* Clear state from previous iterations. */ if (more) @@ -4594,11 +4602,10 @@ XactLockForVirtualXact(VirtualTransactionId vxid, * If wait = false, just check whether that VXID or one of those XIDs is still * running, and return true or false. 
*/ -bool -VirtualXactLock(VirtualTransactionId vxid, bool wait) +bool VirtualXactLock(VirtualTransactionId vxid, bool wait) { - LOCKTAG tag; - PGPROC *proc; + LOCKTAG tag; + PGPROC *proc; TransactionId xid = InvalidTransactionId; Assert(VirtualTransactionIdIsValid(vxid)); @@ -4628,8 +4635,7 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) */ LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); - if (proc->backendId != vxid.backendId - || proc->fpLocalTransactionId != vxid.localTransactionId) + if (proc->backendId != vxid.backendId || proc->fpLocalTransactionId != vxid.localTransactionId) { /* VXID ended */ LWLockRelease(&proc->fpInfoLock); @@ -4653,9 +4659,9 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) */ if (proc->fpVXIDLock) { - PROCLOCK *proclock; - uint32 hashcode; - LWLock *partitionLock; + PROCLOCK *proclock; + uint32 hashcode; + LWLock *partitionLock; hashcode = LockTagHashCode(&tag); @@ -4694,7 +4700,7 @@ VirtualXactLock(VirtualTransactionId vxid, bool wait) LWLockRelease(&proc->fpInfoLock); /* Time to wait. 
 */
-	(void) LockAcquire(&tag, ShareLock, false, false);
+	(void)LockAcquire(&tag, ShareLock, false, false);
 
 	LockRelease(&tag, ShareLock, false);
 
 	return XactLockForVirtualXact(vxid, xid, wait);
@@ -4705,15 +4711,14 @@
 *
 * Find the number of lock requester on this locktag
 */
-int
-LockWaiterCount(const LOCKTAG *locktag)
+int LockWaiterCount(const LOCKTAG *locktag)
 {
 	LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
-	LOCK	   *lock;
-	bool		found;
-	uint32		hashcode;
-	LWLock	   *partitionLock;
-	int			waiters = 0;
+	LOCK *lock;
+	bool found;
+	uint32 hashcode;
+	LWLock *partitionLock;
+	int waiters = 0;
 
 	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
 		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
@@ -4722,11 +4727,11 @@ LockWaiterCount(const LOCKTAG *locktag)
 	partitionLock = LockHashPartitionLock(hashcode);
 
 	LWLockAcquire(partitionLock, LW_EXCLUSIVE);
-	lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
-												(const void *) locktag,
-												hashcode,
-												HASH_FIND,
-												&found);
+	lock = (LOCK *)hash_search_with_hash_value(LockMethodLockHash,
+											   (const void *)locktag,
+											   hashcode,
+											   HASH_FIND,
+											   &found);
 	if (found)
 	{
 		Assert(lock != NULL);
@@ -4736,3 +4741,53 @@
 
 	return waiters;
 }
+
+
+/*
+for fs meta
+*/
+
+FSValue *
+SetupFSMetaInTable(const FSKey *fsk)
+{
+
+	FSValue *fsm;
+	bool found;
+
+	fsm = (FSValue *)
+		hash_search(FSMetaHash, fsk, HASH_ENTER, &found);
+
+	if (found)
+	{
+		printf("found fskey cap is %d \n", fsm->cap);
+	}
+	else
+	{
+		printf("not found fs key \n");
+		fsm->cap = 200;
+		fsm->len = 0;
+	}
+
+	return fsm;
+}
+
+void CleanUpFSMeta(const FSKey *fsk)
+{
+
+	hash_search(FSMetaHash, fsk, HASH_REMOVE, NULL);
+}
+
+FSValue *
+FindFSMetaInTable(const FSKey *fsk)
+{
+
+	FSValue *fsm;
+	bool found;
+
+
+
+	fsm = (FSValue *)
+		hash_search(FSMetaHash, fsk, HASH_FIND, &found);
+
+	return fsm;
+}
\ No newline at end of file
diff --git 
a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 6c7cf6c..e84a575 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -53,3 +53,4 @@ XactTruncationLock 44 # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 +LogIndexMemListLock 48 \ No newline at end of file diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 2575ea1..4b6315f 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -260,6 +260,13 @@ InitProcGlobal(void) ProcGlobal->bgworkerFreeProcs = &procs[i]; procs[i].procgloballist = &ProcGlobal->bgworkerFreeProcs; } + else if (i < MaxConnections + autovacuum_max_workers + 1 + max_worker_processes + max_parallel_flush_process) + { + /* PGPROC for parallel flush, add to parallelFlushProcs list */ + procs[i].links.next = (SHM_QUEUE *) ProcGlobal->parallelFlushFreeProcs; + ProcGlobal->parallelFlushFreeProcs = &procs[i]; + procs[i].procgloballist = &ProcGlobal->parallelFlushFreeProcs; + } else if (i < MaxBackends) { /* PGPROC for walsender, add to walsenderFreeProcs list */ @@ -319,6 +326,8 @@ InitProcess(void) procgloballist = &ProcGlobal->autovacFreeProcs; else if (IsBackgroundWorker) procgloballist = &ProcGlobal->bgworkerFreeProcs; + else if (IsParallelFlushWorker) + procgloballist = &ProcGlobal->parallelFlushFreeProcs; else if (am_walsender) procgloballist = &ProcGlobal->walsenderFreeProcs; else diff --git a/src/backend/storage/sharedisk/sharedisk.c b/src/backend/storage/sharedisk/sharedisk.c index 8dc8106..edef2e5 100644 --- a/src/backend/storage/sharedisk/sharedisk.c +++ b/src/backend/storage/sharedisk/sharedisk.c @@ -1,5 +1,6 @@ #include "storage/sharedisk.h" - +#include "utils/palloc.h" +#include "storage/shmem.h" static ShareDiskInfo *ShareDiskCtl = NULL; Size diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile 
index 596b564..ce735f3 100644 --- a/src/backend/storage/smgr/Makefile +++ b/src/backend/storage/smgr/Makefile @@ -14,6 +14,7 @@ include $(top_builddir)/src/Makefile.global OBJS = \ md.o \ - smgr.o + smgr.o \ + filecache.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/smgr/filecache.c b/src/backend/storage/smgr/filecache.c new file mode 100644 index 0000000..bcef702 --- /dev/null +++ b/src/backend/storage/smgr/filecache.c @@ -0,0 +1,70 @@ +#include "storage/filecache.h" +#include +#include "utils/palloc.h" +#include "storage/shmem.h" + + +static HTAB *CacheRelHash; + +Size FileCacheSize(void) { + return mul_size(MAX_CACHE_RELATION, sizeof(RelFileNode) + sizeof(CachedRelInfo)); +} + +void +InitCacheRel(void) +{ + HASHCTL info; + long init_table_size, + max_table_size; + info.keysize = sizeof(RelFileNode); + info.entrysize = sizeof(CachedRelInfo); + init_table_size = 100; + max_table_size = MAX_CACHE_RELATION; + CacheRelHash = ShmemInitHash("CacheRel", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS); +} + +CachedRelInfo * +SetupRelCache(const RelFileNode *reln, ForkNumber forkno, BlockNumber nblocks) +{ + + CachedRelInfo *ri; + bool found; + + + ri = (CachedRelInfo *) + hash_search(CacheRelHash, reln, HASH_ENTER, &found); + + if (!found) + { + for (int i = 0; i <= MAX_FORKNUM; ++i) + ri->cached_nblocks[i] = InvalidBlockNumber; + ri->cached_nblocks[forkno] = nblocks; + } + + return ri; +} + +void RemoveCacheRel(const RelFileNode *reln) +{ + + hash_search(CacheRelHash, (const void *) reln, + HASH_REMOVE, NULL); +} + +CachedRelInfo * +FindCacheRel(const RelFileNode *reln) +{ + + CachedRelInfo *ri; + bool found; + + + ri = (CachedRelInfo *) + hash_search(CacheRelHash, reln, HASH_FIND, &found); + + return ri; +} diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index b052f28..3ffdf0a 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -28,19 +28,25 @@ #include 
"access/xlog.h" #include "access/xlogutils.h" #include "commands/tablespace.h" +#include "common/file_perm.h" #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" #include "postmaster/bgwriter.h" +#include "postmaster/secondbuffer.h" #include "storage/bufmgr.h" +#include "storage/buf_internals.h" #include "storage/fd.h" #include "storage/md.h" #include "storage/relfilenode.h" #include "storage/smgr.h" #include "storage/sync.h" +#include "storage/filecache.h" #include "utils/hsearch.h" #include "utils/memutils.h" #include "utils/guc.h" +// #include "utils/hfs.h" +#include "storage/he3db_logindex.h" /* * The magnetic disk storage manager keeps track of open file @@ -82,33 +88,30 @@ typedef struct _MdfdVec { - File mdfd_vfd; /* fd number in fd.c's pool */ - BlockNumber mdfd_segno; /* segment number, from 0 */ + File mdfd_vfd; /* fd number in fd.c's pool */ + BlockNumber mdfd_segno; /* segment number, from 0 */ } MdfdVec; -static MemoryContext MdCxt; /* context for all MdfdVec objects */ - +static MemoryContext MdCxt; /* context for all MdfdVec objects */ /* Populate a file tag describing an md.c segment file. 
*/ -#define INIT_MD_FILETAG(a,xx_rnode,xx_forknum,xx_segno) \ -( \ - memset(&(a), 0, sizeof(FileTag)), \ - (a).handler = SYNC_HANDLER_MD, \ - (a).rnode = (xx_rnode), \ - (a).forknum = (xx_forknum), \ - (a).segno = (xx_segno) \ -) - +#define INIT_MD_FILETAG(a, xx_rnode, xx_forknum, xx_segno) \ + ( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = SYNC_HANDLER_MD, \ + (a).rnode = (xx_rnode), \ + (a).forknum = (xx_forknum), \ + (a).segno = (xx_segno)) /*** behavior for mdopen & _mdfd_getseg ***/ /* ereport if segment not present */ -#define EXTENSION_FAIL (1 << 0) +#define EXTENSION_FAIL (1 << 0) /* return NULL if segment not present */ -#define EXTENSION_RETURN_NULL (1 << 1) +#define EXTENSION_RETURN_NULL (1 << 1) /* create new segments as needed */ -#define EXTENSION_CREATE (1 << 2) +#define EXTENSION_CREATE (1 << 2) /* create new segments if needed during recovery */ -#define EXTENSION_CREATE_RECOVERY (1 << 3) +#define EXTENSION_CREATE_RECOVERY (1 << 3) /* * Allow opening segments which are preceded by segments smaller than * RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks @@ -116,8 +119,7 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */ * because this is only required in the checkpointer which never uses * mdnblocks(). */ -#define EXTENSION_DONT_CHECK_SIZE (1 << 4) - +#define EXTENSION_DONT_CHECK_SIZE (1 << 4) /* local routines */ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, @@ -141,12 +143,10 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); - /* * mdinit() -- Initialize private state for magnetic disk storage manager. 
*/ -void -mdinit(void) +void mdinit(void) { MdCxt = AllocSetContextCreate(TopMemoryContext, "MdSmgr", @@ -158,8 +158,7 @@ mdinit(void) * * Note: this will return true for lingering files, with pending deletions */ -bool -mdexists(SMgrRelation reln, ForkNumber forkNum) +bool mdexists(SMgrRelation reln, ForkNumber forkNum) { /* * Close it first, to ensure that we notice if the fork has been unlinked @@ -175,15 +174,14 @@ mdexists(SMgrRelation reln, ForkNumber forkNum) * * If isRedo is true, it's okay for the relation to exist already. */ -void -mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) +void mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { - MdfdVec *mdfd; - char *path; - File fd; + MdfdVec *mdfd; + char *path; + File fd; if (isRedo && reln->md_num_open_segs[forkNum] > 0) - return; /* created and opened already... */ + return; /* created and opened already... */ Assert(reln->md_num_open_segs[forkNum] == 0); @@ -202,15 +200,15 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) path = relpath(reln->smgr_rnode, forkNum); -// fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY | PG_O_DIRECT); + fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); - fd = He3DBPathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY | PG_O_DIRECT); + // fd = He3DBPathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY | PG_O_DIRECT); if (fd < 0) { - int save_errno = errno; + int save_errno = errno; if (isRedo) - fd = He3DBPathNameOpenFile(path, O_RDWR | PG_BINARY | PG_O_DIRECT); + fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); if (fd < 0) { /* be sure to report the error reported by create, not open */ @@ -275,17 +273,40 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * Note: any failure should be reported as WARNING not ERROR, because * we are usually not in a transaction anymore when this is called. 
*/ -void -mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) +void mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { + LdPageKey ldKey; /* Now do the per-fork work */ if (forkNum == InvalidForkNumber) { for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) + { mdunlinkfork(rnode, forkNum, isRedo); + ldKey.sk.dbid = rnode.node.dbNode; + ldKey.sk.relid = rnode.node.relNode; + ldKey.sk.forkno = forkNum; + ldKey.sk.blkno = 0; + SendInvalPage(&ldKey); + } } else + { mdunlinkfork(rnode, forkNum, isRedo); + ldKey.sk.dbid = rnode.node.dbNode; + ldKey.sk.relid = rnode.node.relNode; + ldKey.sk.forkno = forkNum; + ldKey.sk.blkno = 0; + SendInvalPage(&ldKey); + } + + ldKey.sk.dbid = 0; + ldKey.sk.relid = 0; + ldKey.sk.forkno = 32; + ldKey.sk.blkno = 0; + SendInvalPage(&ldKey); + + + } /* @@ -294,17 +315,17 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) static int do_truncate(const char *path) { - int save_errno; - int ret; + int save_errno; + int64_t ret; /* * He3DB: He3FS replace OS FS * only propeller instance can release disk space */ - //ret = pg_truncate(path, 0); + // ret = pg_truncate(path, 0); if (push_standby) { - ret = he3Truncate(path, 0); + ret = pg_truncate(path, 0); } else { @@ -327,8 +348,8 @@ do_truncate(const char *path) static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) { - char *path; - int ret; + char *path; + int ret; path = relpath(rnode, forkNum); @@ -343,7 +364,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) ret = do_truncate(path); /* Forget any pending sync requests for the first segment */ - register_forget_request(rnode, forkNum, 0 /* first seg */ ); + register_forget_request(rnode, forkNum, 0 /* first seg */); } else ret = 0; @@ -351,7 +372,8 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) /* Next unlink the file, unless it was already found to be missing */ if (ret == 0 || errno != ENOENT) { - if 
(push_standby == true || RelFileNodeBackendIsTemp(rnode)) { + if (push_standby == true || RelFileNodeBackendIsTemp(rnode)) + { ret = unlink(path); } if (ret < 0 && errno != ENOENT) @@ -366,7 +388,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) ret = do_truncate(path); /* Register request to unlink first segment later */ - register_unlink_segment(rnode, forkNum, 0 /* first seg */ ); + register_unlink_segment(rnode, forkNum, 0 /* first seg */); } /* @@ -374,7 +396,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) */ if (ret >= 0) { - char *segpath = (char *) palloc(strlen(path) + 12); + char *segpath = (char *)palloc(strlen(path) + 12); BlockNumber segno; /* @@ -400,7 +422,8 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) */ register_forget_request(rnode, forkNum, segno); } - if (push_standby == true || RelFileNodeBackendIsTemp(rnode)) { + if (push_standby == true || RelFileNodeBackendIsTemp(rnode)) + { if (unlink(segpath) < 0) { /* ENOENT is expected after the last segment... */ @@ -410,7 +433,9 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) errmsg("could not remove file \"%s\": %m", segpath))); break; } - } else { + } + else + { break; } } @@ -429,13 +454,12 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) * EOF). Note that we assume writing a block beyond current EOF * causes intervening file space to become filled with zeroes. */ -void -mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) +void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) { - off_t seekpos; - int nbytes; - MdfdVec *v; + off_t seekpos; + int nbytes; + MdfdVec *v; /* This assert is too expensive to have on normally ... 
*/ #ifdef CHECK_WRITE_VS_EXTEND @@ -457,11 +481,11 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); - if ((nbytes = He3DBFileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) + if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { if (nbytes < 0) ereport(ERROR, @@ -481,7 +505,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (!skipFsync && !SmgrIsTemp(reln)) register_dirty_segment(reln, forknum, v); - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber)RELSEG_SIZE)); } /* @@ -497,9 +521,9 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, static MdfdVec * mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) { - MdfdVec *mdfd; - char *path; - File fd; + MdfdVec *mdfd; + char *path; + File fd; /* No work if already open */ if (reln->md_num_open_segs[forknum] > 0) @@ -508,7 +532,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) path = relpath(reln->smgr_rnode, forknum); /* he3db: He3FS replace OSFS and Use the direct method to open the page file */ - fd = He3DBPathNameOpenFile(path, O_RDWR | PG_BINARY | PG_O_DIRECT); + fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); if (fd < 0) { @@ -530,7 +554,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; - Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber)RELSEG_SIZE)); return mdfd; } @@ -538,8 +562,7 @@ 
mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) /* * mdopen() -- Initialize newly-opened relation. */ -void -mdopen(SMgrRelation reln) +void mdopen(SMgrRelation reln) { /* mark it not open */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) @@ -549,10 +572,9 @@ mdopen(SMgrRelation reln) /* * mdclose() -- Close the specified relation, if it isn't closed already. */ -void -mdclose(SMgrRelation reln, ForkNumber forknum) +void mdclose(SMgrRelation reln, ForkNumber forknum) { - int nopensegs = reln->md_num_open_segs[forknum]; + int nopensegs = reln->md_num_open_segs[forknum]; /* No work if already closed */ if (nopensegs == 0) @@ -561,7 +583,7 @@ mdclose(SMgrRelation reln, ForkNumber forknum) /* close segments starting from the end */ while (nopensegs > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1]; + MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1]; FileClose(v->mdfd_vfd); _fdvec_resize(reln, forknum, nopensegs - 1); @@ -572,24 +594,23 @@ mdclose(SMgrRelation reln, ForkNumber forknum) /* * mdprefetch() -- Initiate asynchronous read of the specified block of a relation */ -bool -mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { #ifdef USE_PREFETCH - off_t seekpos; - MdfdVec *v; + off_t seekpos; + MdfdVec *v; v = _mdfd_getseg(reln, forknum, blocknum, false, InRecovery ? 
EXTENSION_RETURN_NULL : EXTENSION_FAIL); if (v == NULL) return false; - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); - (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH); -#endif /* USE_PREFETCH */ + (void)FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH); +#endif /* USE_PREFETCH */ return true; } @@ -600,9 +621,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) * This accepts a range of blocks because flushing several pages at once is * considerably more efficient than doing so individually. */ -void -mdwriteback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) +void mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) { /* * Issue flush requests in as few requests as possible; have to split at @@ -611,12 +631,12 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, while (nblocks > 0) { BlockNumber nflush = nblocks; - off_t seekpos; - MdfdVec *v; - int segnum_start, - segnum_end; + off_t seekpos; + MdfdVec *v; + int segnum_start, + segnum_end; - v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , + v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */, EXTENSION_RETURN_NULL); /* @@ -632,14 +652,14 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, /* compute number of desired writes within the current segment */ segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE; if (segnum_start != segnum_end) - nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)); + nflush = RELSEG_SIZE - (blocknum % ((BlockNumber)RELSEG_SIZE)); Assert(nflush >= 1); Assert(nflush <= nblocks); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t)BLCKSZ * (blocknum % 
((BlockNumber)RELSEG_SIZE)); - FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); + FileWriteback(v->mdfd_vfd, seekpos, (off_t)BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); nblocks -= nflush; blocknum += nflush; @@ -649,13 +669,13 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, /* * mdread() -- Read the specified block from a relation. */ -void -mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, XLogRecPtr lsn) +void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer) { - off_t seekpos; - int nbytes; - MdfdVec *v; + off_t seekpos; + int nbytes = 0; + MdfdVec *v; + uint32_t segno; TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -666,11 +686,30 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); + segno = (uint32_t)blocknum / ((BlockNumber)RELSEG_SIZE); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); - nbytes = He3DBFileRead(v->mdfd_vfd, buffer, seekpos, WAIT_EVENT_DATA_FILE_READ, lsn); + // TODO read page from disk + if (!(InitdbSingle || IsBootstrapProcessingMode() == true)) + nbytes = MasterFileRead(buffer, reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, forknum, blocknum); + + if (nbytes == 0) + { + nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); + if (!(InitdbSingle || IsBootstrapProcessingMode() == true)) + { + PageKey pageKey; + pageKey.relfileNode.dbNode = reln->smgr_rnode.node.dbNode; + pageKey.relfileNode.relNode = reln->smgr_rnode.node.relNode; + pageKey.forkNo = forknum; + pageKey.blkNo = blocknum; + pageKey.pageLsn = PageGetLSN(buffer); + pageKey.replyLsn = 0; 
+ ReceivePageFromDataBuffer(&pageKey, (uint8_t *)buffer); + } + } TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -721,71 +760,200 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * 1)return read bytes * 2)add parameter to control pageXlog read or only page read */ -int -he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char **buffer, XLogRecPtr lsn) +int he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char **buffer, XLogRecPtr lsn) { - off_t seekpos; - int nbytes; - MdfdVec *v; + off_t seekpos; + int nbytes; + MdfdVec *v; + uint32_t segno; - TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - reln->smgr_rnode.backend); + BufferTag pageTag; + pageTag.rnode = reln->smgr_rnode.node; + pageTag.forkNum = forknum; + pageTag.blockNum = blocknum; - v = _mdfd_getseg(reln, forknum, blocknum, false, - EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + // TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, + // reln->smgr_rnode.node.spcNode, + // reln->smgr_rnode.node.dbNode, + // reln->smgr_rnode.node.relNode, + // reln->smgr_rnode.backend); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + OriginDPageKey odpk; + PageKey pageKey; - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Bufrd *bufrd = NULL; + int count = 0; - nbytes = He3DBFileRead(v->mdfd_vfd, buffer, seekpos, WAIT_EVENT_DATA_FILE_READ, lsn); - TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, - reln->smgr_rnode.node.spcNode, - reln->smgr_rnode.node.dbNode, - reln->smgr_rnode.node.relNode, - reln->smgr_rnode.backend, - nbytes, - PAGEXLOG_BLCKSZ); - - if (nbytes < BLCKSZ) + if (!push_standby) { - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read block %u in file \"%s\": %m", - blocknum, FilePathName(v->mdfd_vfd)))); + 
bufrd = (Bufrd *)malloc(sizeof(Bufrd)); + bufrd->count = 0; + bufrd->cap = 0; + bufrd->buf = NULL; - /* - * Short read: we are at or past EOF, or we read a partial block at - * EOF. Normally this is an error; upper levels should never try to - * read a nonexistent block. However, if zero_damaged_pages is ON or - * we are InRecovery, we should instead return zeroes without - * complaining. This allows, for example, the case of trying to - * update a block that was later truncated away. - * He3DB: ban - if (zero_damaged_pages || InRecovery) - MemSet(buffer, 0, BLCKSZ); - else - ereport(ERROR, + pageKey.relfileNode.dbNode = pageTag.rnode.dbNode; + pageKey.relfileNode.relNode = pageTag.rnode.relNode; + pageKey.forkNo = (uint32)pageTag.forkNum; + pageKey.blkNo = pageTag.blockNum; + pageKey.pageLsn = GetXLogPushToDisk(); + pageKey.replyLsn = lsn; + + odpk.pk = pageKey; + odpk.opration = (int)EVICT; + + GetPageFromCurrentNode(pageKey, bufrd); + count = bufrd->count; + // elog(LOG, "read page from local rel %d, fork %d, blk %d, nbytes %d, replaylsn %X/%X", + // pageTag.rnode.relNode, pageTag.forkNum, pageTag.blockNum, count, LSN_FORMAT_ARGS(lsn)); + } + + if (count > 0) + { + *buffer = bufrd->buf; + free(bufrd); + AddOneItemToDPArray(odpk); + return count; + } + else + { + *buffer = (uint8_t *)malloc(BLCKSZ); + // TODO 如果本地盘不存在,则调用标准接口读取page,再调用tikv的借口获取范围的wal + v = _mdfd_getseg(reln, forknum, blocknum, false, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); + + Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); + segno = (uint32_t)blocknum / ((BlockNumber)RELSEG_SIZE); + + nbytes = FileRead(v->mdfd_vfd, *buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); + if (nbytes < BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in file \"%s\": %m", + blocknum, FilePathName(v->mdfd_vfd)))); + if (he3mirror && nbytes == 0) + MemSet(*buffer, 0, BLCKSZ); + + 
ereport(PANIC, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", blocknum, FilePathName(v->mdfd_vfd), nbytes, BLCKSZ))); - */ - /* He3DB; ERROR */ - ereport(PANIC, + } + + if (push_standby || !EnableHotStandby || *isPromoteIsTriggered) + { + return nbytes; + } + + pageKey.pageLsn = Max(GetXLogPushToDisk(), PageGetLSN(*buffer)); + + pageKey.replyLsn = lsn; + + LsnNode *head = GetLogIndexByPage(&pageTag, pageKey.pageLsn, pageKey.replyLsn); + if (head->next != NULL) + { + TimeLineID tli; + GetXLogReplayRecPtr(&tli); + Bufrd result; + WalLdPageKey wlpk; + wlpk.sk.dbid = pageKey.relfileNode.dbNode; + wlpk.sk.relid = pageKey.relfileNode.relNode; + wlpk.sk.forkno = pageKey.forkNo; + wlpk.sk.blkno = pageKey.blkNo; + wlpk.pageLsn = SwapLsnFromLittleToBig(pageKey.pageLsn); + wlpk.partition = 0; + result.count = 0; + result = GetWalFromLocalBuffer(&wlpk, lsn); + + if (result.count == 0) + { + free(result.buf); + result = ReadWalsByPage(pageKey.relfileNode.dbNode, pageKey.relfileNode.relNode, + pageKey.forkNo, pageKey.blkNo, tli, head); + } + Assert(result.count != 0); + nbytes += result.count; + *buffer = (uint8_t *)realloc(*buffer, BLCKSZ + result.count); + memcpy((*buffer) + BLCKSZ, result.buf, result.count); + wlpk.pageLsn = SwapLsnFromLittleToBig(lsn); + SendInvalWal(&wlpk); + + wlpk.sk.dbid = 0; + wlpk.sk.relid = 0; + wlpk.sk.forkno = 32; + wlpk.sk.blkno = 0; + wlpk.pageLsn = 0; + wlpk.partition = 0; + SendInvalWal(&wlpk); + // TODO free result + free_dataRead(result.buf, result.count, result.cap); + } + else + { + ReceivePageFromDataBuffer(&pageKey, *buffer); + } + FreeLsnNode(head); + return nbytes; + } + + // v = _mdfd_getseg(reln, forknum, blocknum, false, + // EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + // seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + // Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + // segno = (uint32_t)blocknum /((BlockNumber) RELSEG_SIZE); + + // nbytes 
= He3DBFileRead(v->mdfd_vfd, buffer, seekpos, WAIT_EVENT_DATA_FILE_READ, lsn, pageTag); + + // TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, + // reln->smgr_rnode.node.spcNode, + // reln->smgr_rnode.node.dbNode, + // reln->smgr_rnode.node.relNode, + // reln->smgr_rnode.backend, + // nbytes, + // PAGEXLOG_BLCKSZ); + + // if (nbytes < BLCKSZ) + // { + // if (nbytes < 0) + // ereport(ERROR, + // (errcode_for_file_access(), + // errmsg("could not read block %u in file \"%s\": %m", + // blocknum, FilePathName(v->mdfd_vfd)))); + + /* + * Short read: we are at or past EOF, or we read a partial block at + * EOF. Normally this is an error; upper levels should never try to + * read a nonexistent block. However, if zero_damaged_pages is ON or + * we are InRecovery, we should instead return zeroes without + * complaining. This allows, for example, the case of trying to + * update a block that was later truncated away. + * He3DB: ban + if (zero_damaged_pages || InRecovery) + MemSet(buffer, 0, BLCKSZ); + else + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", blocknum, FilePathName(v->mdfd_vfd), nbytes, BLCKSZ))); - } + */ + /* He3DB; ERROR */ + // ereport(PANIC, + // (errcode(ERRCODE_DATA_CORRUPTED), + // errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", + // blocknum, FilePathName(v->mdfd_vfd), + // nbytes, BLCKSZ))); + // } - return nbytes; + // return nbytes; } /* @@ -795,17 +963,16 @@ he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknu * 1)return read bytes * 2)add parameter to control pageXlog read or only page read */ -int -he3db_mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char **buffer, bool pagexlog, XLogRecPtr lsn) +int he3db_mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char **buffer, bool pagexlog, XLogRecPtr lsn) { -// if (likely(pagexlog)) -// { - return he3db_mdread_pagexlog(reln, 
forknum, blocknum, buffer, lsn); -// } + // if (likely(pagexlog)) + // { + return he3db_mdread_pagexlog(reln, forknum, blocknum, buffer, lsn); + // } -// mdread(reln, forknum, blocknum, *buffer, lsn); -// return 0; + // mdread(reln, forknum, blocknum, *buffer, lsn); + // return 0; } /* @@ -815,13 +982,12 @@ he3db_mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * relation (ie, those before the current EOF). To extend a relation, * use mdextend(). */ -void -mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, bool skipFsync) +void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync) { - off_t seekpos; - int nbytes; - MdfdVec *v; + off_t seekpos; + int nbytes; + MdfdVec *v; /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND @@ -837,11 +1003,11 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + seekpos = (off_t)BLCKSZ * (blocknum % ((BlockNumber)RELSEG_SIZE)); - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + Assert(seekpos < (off_t)BLCKSZ * RELSEG_SIZE); - nbytes = He3DBFileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -883,7 +1049,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum) { - MdfdVec *v; + MdfdVec *v; BlockNumber nblocks; BlockNumber segno; @@ -911,10 +1077,10 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) for (;;) { nblocks = _mdnblocks(reln, forknum, v); - if (nblocks > ((BlockNumber) RELSEG_SIZE)) + if (nblocks > ((BlockNumber)RELSEG_SIZE)) 
elog(FATAL, "segment too big"); - if (nblocks < ((BlockNumber) RELSEG_SIZE)) - return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks; + if (nblocks < ((BlockNumber)RELSEG_SIZE)) + return (segno * ((BlockNumber)RELSEG_SIZE)) + nblocks; /* * If segment is exactly RELSEG_SIZE, advance to next one. @@ -930,19 +1096,18 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) */ v = _mdfd_openseg(reln, forknum, segno, 0); if (v == NULL) - return segno * ((BlockNumber) RELSEG_SIZE); + return segno * ((BlockNumber)RELSEG_SIZE); } } /* * mdtruncate() -- Truncate relation to specified number of blocks. */ -void -mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { BlockNumber curnblk; BlockNumber priorblocks; - int curopensegs; + int curopensegs; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -960,7 +1125,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) nblocks, curnblk))); } if (nblocks == curnblk) - return; /* no work */ + return; /* no work */ /* * Truncate segments, starting at the last one. Starting at the end makes @@ -969,7 +1134,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) curopensegs = reln->md_num_open_segs[forknum]; while (curopensegs > 0) { - MdfdVec *v; + MdfdVec *v; priorblocks = (curopensegs - 1) * RELSEG_SIZE; @@ -981,7 +1146,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * This segment is no longer active. We truncate the file, but do * not delete it, for reasons explained in the header comments. 
*/ - if (He3FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE,SmgrIsTemp(reln)) < 0) + if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\": %m", @@ -996,7 +1161,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) FileClose(v->mdfd_vfd); _fdvec_resize(reln, forknum, curopensegs - 1); } - else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) + else if (priorblocks + ((BlockNumber)RELSEG_SIZE) > nblocks) { /* * This is the last segment we want to keep. Truncate the file to @@ -1007,7 +1172,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) */ BlockNumber lastsegblocks = nblocks - priorblocks; - if (He3FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE,SmgrIsTemp(reln)) < 0) + if (FileTruncate(v->mdfd_vfd, (off_t)lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not truncate file \"%s\" to %u blocks: %m", @@ -1026,6 +1191,10 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } curopensegs--; } + // CachedRelInfo *cached_rel = NULL; + // cached_rel = FindCacheRel(&reln->smgr_rnode.node); + // if (cached_rel != NULL) + // RemoveCacheRel(&reln->smgr_rnode.node); } /* @@ -1039,11 +1208,10 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * crash before the next checkpoint syncs the newly-inactive segment, that * segment may survive recovery, reintroducing unwanted data into the table. 
*/ -void -mdimmedsync(SMgrRelation reln, ForkNumber forknum) +void mdimmedsync(SMgrRelation reln, ForkNumber forknum) { - int segno; - int min_inactive_seg; + int segno; + int min_inactive_seg; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -1064,7 +1232,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) while (segno > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) ereport(data_sync_elevel(ERROR), @@ -1095,14 +1263,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { - FileTag tag; + FileTag tag; INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno); /* Temp relations should never be fsync'd */ Assert(!SmgrIsTemp(reln)); - if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */)) { ereport(DEBUG1, (errmsg_internal("could not forward fsync request because request queue is full"))); @@ -1122,14 +1290,14 @@ static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, BlockNumber segno) { - FileTag tag; + FileTag tag; INIT_MD_FILETAG(tag, rnode.node, forknum, segno); /* Should never be used with temp relations */ Assert(!RelFileNodeBackendIsTemp(rnode)); - RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ ); + RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */); } /* @@ -1139,20 +1307,19 @@ static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum, BlockNumber segno) { - FileTag tag; + FileTag tag; INIT_MD_FILETAG(tag, rnode.node, forknum, segno); - RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ ); + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */); } /* * 
ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB */ -void -ForgetDatabaseSyncRequests(Oid dbid) +void ForgetDatabaseSyncRequests(Oid dbid) { - FileTag tag; + FileTag tag; RelFileNode rnode; rnode.dbNode = dbid; @@ -1161,17 +1328,16 @@ ForgetDatabaseSyncRequests(Oid dbid) INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber); - RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ ); + RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */); } /* * DropRelationFiles -- drop files of all given relations */ -void -DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo) +void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo) { SMgrRelation *srels; - int i; + int i; srels = palloc(sizeof(SMgrRelation) * ndelrels); for (i = 0; i < ndelrels; i++) @@ -1180,7 +1346,7 @@ DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo) if (isRedo) { - ForkNumber fork; + ForkNumber fork; for (fork = 0; fork <= MAX_FORKNUM; fork++) XLogDropRelation(delrels[i], fork); @@ -1195,7 +1361,6 @@ DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo) pfree(srels); } - /* * _fdvec_resize() -- Resize the fork's open segments array */ @@ -1240,8 +1405,8 @@ _fdvec_resize(SMgrRelation reln, static char * _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) { - char *path, - *fullpath; + char *path, + *fullpath; path = relpath(reln->smgr_rnode, forknum); @@ -1264,15 +1429,14 @@ static MdfdVec * _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, int oflags) { - MdfdVec *v; - File fd; - char *fullpath; + MdfdVec *v; + File fd; + char *fullpath; fullpath = _mdfd_segpath(reln, forknum, segno); /* open the file */ - /* he3db: He3FS replace OSFS and Use the direct method to open the page file */ - fd = He3DBPathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags | PG_O_DIRECT); + fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags); 
pfree(fullpath); @@ -1292,7 +1456,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, v->mdfd_vfd = fd; v->mdfd_segno = segno; - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber)RELSEG_SIZE)); /* all done */ return v; @@ -1310,7 +1474,7 @@ static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior) { - MdfdVec *v; + MdfdVec *v; BlockNumber targetseg; BlockNumber nextsegno; @@ -1318,7 +1482,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, Assert(behavior & (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL)); - targetseg = blkno / ((BlockNumber) RELSEG_SIZE); + targetseg = blkno / ((BlockNumber)RELSEG_SIZE); /* if an existing and opened segment, we're done */ if (targetseg < reln->md_num_open_segs[forknum]) @@ -1340,18 +1504,18 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, { v = mdopenfork(reln, forknum, behavior); if (!v) - return NULL; /* if behavior & EXTENSION_RETURN_NULL */ + return NULL; /* if behavior & EXTENSION_RETURN_NULL */ } for (nextsegno = reln->md_num_open_segs[forknum]; nextsegno <= targetseg; nextsegno++) { BlockNumber nblocks = _mdnblocks(reln, forknum, v); - int flags = 0; + int flags = 0; Assert(nextsegno == v->mdfd_segno + 1); - if (nblocks > ((BlockNumber) RELSEG_SIZE)) + if (nblocks > ((BlockNumber)RELSEG_SIZE)) elog(FATAL, "segment too big"); if ((behavior & EXTENSION_CREATE) || @@ -1371,19 +1535,19 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, * matters if in recovery, or if the caller is extending the * relation discontiguously, but that can happen in hash indexes.) 
*/ - if (nblocks < ((BlockNumber) RELSEG_SIZE)) + if (nblocks < ((BlockNumber)RELSEG_SIZE)) { - char *zerobuf = palloc0(BLCKSZ); + char *zerobuf = palloc0(BLCKSZ); mdextend(reln, forknum, - nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, + nextsegno * ((BlockNumber)RELSEG_SIZE) - 1, zerobuf, skipFsync); pfree(zerobuf); } flags = O_CREAT; } else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) && - nblocks < ((BlockNumber) RELSEG_SIZE)) + nblocks < ((BlockNumber)RELSEG_SIZE)) { /* * When not extending (or explicitly including truncated @@ -1434,16 +1598,16 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { - off_t len; + off_t len; - len = He3DBFileSize(seg->mdfd_vfd); + len = FileSize(seg->mdfd_vfd); if (len < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek to end of file \"%s\": %m", FilePathName(seg->mdfd_vfd)))); /* note that this calculation will ignore any partial block at EOF */ - return (BlockNumber) (len / BLCKSZ); + return (BlockNumber)(len / BLCKSZ); } /* @@ -1452,14 +1616,13 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) * * Return 0 on success, -1 on failure, with errno set. */ -int -mdsyncfiletag(const FileTag *ftag, char *path) +int mdsyncfiletag(const FileTag *ftag, char *path) { SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId); - File file; - bool need_to_close; - int result, - save_errno; + File file; + bool need_to_close; + int result, + save_errno; /* See if we already have the file open, or need to open it. 
*/ if (ftag->segno < reln->md_num_open_segs[ftag->forknum]) @@ -1470,15 +1633,15 @@ mdsyncfiletag(const FileTag *ftag, char *path) } else { - char *p; + char *p; p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); strlcpy(path, p, MAXPGPATH); pfree(p); /* He3DB:He3FS replace OSFS */ - //file = PathNameOpenFile(path, O_RDWR | PG_BINARY); - file = He3DBPathNameOpenFile(path, O_RDWR | PG_BINARY); + file = PathNameOpenFile(path, O_RDWR | PG_BINARY); + // file = He3DBPathNameOpenFile(path, O_RDWR | PG_BINARY); if (file < 0) return -1; need_to_close = true; @@ -1501,10 +1664,9 @@ mdsyncfiletag(const FileTag *ftag, char *path) * * Return 0 on success, -1 on failure, with errno set. */ -int -mdunlinkfiletag(const FileTag *ftag, char *path) +int mdunlinkfiletag(const FileTag *ftag, char *path) { - char *p; + char *p; /* Compute the path. */ p = relpathperm(ftag->rnode, MAIN_FORKNUM); @@ -1520,8 +1682,7 @@ mdunlinkfiletag(const FileTag *ftag, char *path) * a SYNC_FILTER_REQUEST request. This will be called for all pending * requests to find out whether to forget them. 
*/ -bool -mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) +bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) { /* * For now we only use filter requests as a way to drop all scheduled diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 2daf75a..dbd5993 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -22,7 +22,12 @@ #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/md.h" +#include "storage/pmsignal.h" #include "storage/smgr.h" +#include "storage/filecache.h" +#include "postmaster/secondbuffer.h" +//#include "utils/hfs.h" +#include "utils/backend_status.h" #include "utils/hsearch.h" #include "utils/inval.h" #include "utils/guc.h" @@ -53,8 +58,8 @@ typedef struct f_smgr BlockNumber blocknum, char *buffer, bool skipFsync); bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); - int (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char **buffer, bool onlyPage, XLogRecPtr lsn); + void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, @@ -77,7 +82,8 @@ static const f_smgr smgrsw[] = { .smgr_unlink = mdunlink, .smgr_extend = mdextend, .smgr_prefetch = mdprefetch, - .smgr_read = he3db_mdread, + // .smgr_read = he3db_mdread, + .smgr_read = mdread, .smgr_write = mdwrite, .smgr_writeback = mdwriteback, .smgr_nblocks = mdnblocks, @@ -390,7 +396,6 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) if (nrels == 0) return; - /* * Get rid of any remaining buffers for the relations. bufmgr will just * drop them without bothering to write the contents. 
@@ -443,7 +448,7 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) int which = rels[i]->smgr_which; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo); + smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo); } pfree(rnodes); @@ -467,9 +472,10 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, // return; // } - if ((push_standby != true && EnableHotStandby != true) || IsBootstrapProcessingMode() || InitdbSingle) { + if (!he3share || (push_standby != true && (EnableHotStandby != true || *isPromoteIsTriggered)) || IsBootstrapProcessingMode() || InitdbSingle || he3mirror) { smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, buffer, skipFsync); + // elog(LOG,"smgrextend reln %d,flk %d,blk %d",reln->smgr_rnode.node.relNode,forknum,blocknum); } /* @@ -481,6 +487,14 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, reln->smgr_cached_nblocks[forknum] = blocknum + 1; else reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; + + // CachedRelInfo *cached_reln; + // cached_reln = FindCacheRel(&reln->smgr_rnode.node); + // if (cached_reln != NULL) + // cached_reln->cached_nblocks[forknum] = blocknum +1; + // else + // SetupRelCache(&reln->smgr_rnode.node, forknum, blocknum+1); + } /* @@ -506,9 +520,9 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) */ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char **buffer, XLogRecPtr lsn) + char *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer, false, lsn); + smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); } /* @@ -517,12 +531,12 @@ smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * Modified points: * 1)return read bytes */ -int -he3dbsmgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char **buffer, XLogRecPtr lsn) -{ - return 
smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer, true, lsn); -} +// int +// he3dbsmgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +// char **buffer, XLogRecPtr lsn) +// { +// return smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer, true, lsn); +// } /* * smgrwrite() -- Write the supplied buffer out. @@ -543,12 +557,33 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - if (push_standby == true || SmgrIsTemp(reln)) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, - buffer, skipFsync); - } + smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, buffer, skipFsync); } +void +he3dbsmgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer, bool skipFsync, XLogRecPtr lsn) +{ + if (!(InitdbSingle || IsBootstrapProcessingMode() == true)) { + PageKey pageKey; + pageKey.relfileNode.dbNode = reln->smgr_rnode.node.dbNode; + pageKey.relfileNode.relNode = reln->smgr_rnode.node.relNode; + + pageKey.blkNo = blocknum; + pageKey.forkNo = forknum; + pageKey.pageLsn = lsn; + + if (push_standby || he3mirror) { + smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, buffer, skipFsync); + } else { + ReceivePageFromDataBuffer(&pageKey, (uint8_t *) buffer); + } + } + else + { + smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, buffer, skipFsync); + } +} /* * smgrwriteback() -- Trigger kernel writeback for the supplied range of @@ -558,10 +593,11 @@ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { - if (push_standby == true || SmgrIsTemp(reln)) { + //if (push_standby == true || SmgrIsTemp(reln)) { smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, nblocks); - } + // elog(LOG,"smgrwriteback reln %d,flk %d,blk %d",reln->smgr_rnode.node.relNode,forknum,blocknum); + //} } /* @@ -573,6 +609,42 @@ smgrnblocks(SMgrRelation reln, ForkNumber 
forknum) { BlockNumber result; + /* Check and return if we get the cached value for the number of blocks. */ + //if (push_standby != true ) + //{ + // result = smgrnblocks_cached(reln, forknum); + // if (result != InvalidBlockNumber) + // return result; + //} + // CachedRelInfo *cached_reln = NULL; + // cached_reln = FindCacheRel(&reln->smgr_rnode.node); + // if (cached_reln != NULL && cached_reln->cached_nblocks[forknum] != InvalidBlockNumber) + // { + // reln->smgr_cached_nblocks[forknum] = cached_reln->cached_nblocks[forknum]; + // return cached_reln->cached_nblocks[forknum]; + // } + + result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + // elog(LOG, "===exec lseek ==="); + // if (cached_reln == NULL) + // SetupRelCache(&reln->smgr_rnode.node, forknum, result); + // else + // cached_reln->cached_nblocks[forknum] = result; + + reln->smgr_cached_nblocks[forknum] = result; + + return result; +} + +/* + * smgrnblocks() -- Calculate the number of blocks in the + * supplied relation. + */ +BlockNumber +startupsmgrnblocks(SMgrRelation reln, ForkNumber forknum) +{ + BlockNumber result; + /* Check and return if we get the cached value for the number of blocks. */ result = smgrnblocks_cached(reln, forknum); if (result != InvalidBlockNumber) @@ -585,6 +657,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) return result; } + /* * smgrnblocks_cached() -- Get the cached number of blocks in the supplied * relation. @@ -619,6 +692,8 @@ void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks) { int i; + PageKey pk; + OriginDPageKey odpk; /* * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will @@ -646,6 +721,18 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + //remove unused pages and related wals in localdisk cache. 
+// RemoveBufferFromLocal(reln->smgr_rnode.node.dbNode, reln->smgr_rnode.node.relNode, forknum[i], nblocks[i]); + if (IsBootstrapProcessingMode() != true && InitdbSingle != true) + { + pk.relfileNode.dbNode = reln->smgr_rnode.node.dbNode; + pk.relfileNode.relNode = reln->smgr_rnode.node.relNode; + pk.forkNo = forknum[i]; + pk.blkNo = nblocks[i]; + odpk.pk = pk; + odpk.opration = (int)TRUNCATE; + AddOneItemToDPArray(odpk); + } /* * We might as well update the local smgr_cached_nblocks values. The * smgr cache inval message that this function sent will cause other @@ -671,24 +758,6 @@ void smgrtruncatelsn(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks,XLogRecPtr lsn) { int i; - - /* - * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will - * just drop them without bothering to write the contents. - */ - DropRelFileNodeBuffers(reln, forknum, nforks, nblocks); - - /* - * Send a shared-inval message to force other backends to close any smgr - * references they may have for this rel. This is useful because they - * might have open file pointers to segments that got removed, and/or - * smgr_targblock variables pointing past the new rel end. (The inval - * message will come back to our backend, too, causing a - * probably-unnecessary local smgr flush. But we don't expect that this - * is a performance-critical path.) As in the unlink code, we want to be - * sure the message is sent before we start changing things on-disk. 
- */ - CacheInvalidateSmgr(reln->smgr_rnode); //push to truncate bool flag = false; /* Do the truncation */ @@ -698,17 +767,23 @@ smgrtruncatelsn(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; if(!SmgrIsTemp(reln)) { if (false == flag) { - XLogRecPtr pushLsn; + XLogRecPtr minApplyLsn; do { - sleep(1); - pushLsn = QueryPushLsn(); - printf("====pushlsn=%lx==lsn==%lx==\n",pushLsn,lsn); - } while(pushLsn!=InvalidXLogRecPtr && pushLsnsmgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); - + smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The * smgr cache inval message that this function sent will cause other diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index a12b357..e96ca23 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -221,20 +221,23 @@ SyncPostCheckpoint(void) break; /* Unlink the file */ - if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, - path) < 0) + if (push_standby) { - /* - * There's a race condition, when the database is dropped at the - * same time that we process the pending unlink requests. If the - * DROP DATABASE deletes the file before we do, we will get ENOENT - * here. rmtree() also has to ignore ENOENT errors, to deal with - * the possibility that we delete the file first. - */ - if (errno != ENOENT) - ereport(WARNING, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); + if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, + path) < 0) + { + /* + * There's a race condition, when the database is dropped at the + * same time that we process the pending unlink requests. If the + * DROP DATABASE deletes the file before we do, we will get ENOENT + * here. rmtree() also has to ignore ENOENT errors, to deal with + * the possibility that we delete the file first. 
+ */ + if (errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } } /* Mark the list entry as canceled, just in case */ @@ -376,7 +379,7 @@ ProcessSyncRequests(void) * all. (We delay checking until this point so that changing fsync on * the fly behaves sensibly.) */ - if (enableFsync) + if (enableFsync && push_standby) { /* * If in checkpointer, we want to absorb pending requests every so diff --git a/src/backend/tcop/dest.c b/src/backend/tcop/dest.c index 33db412..eadbb79 100644 --- a/src/backend/tcop/dest.c +++ b/src/backend/tcop/dest.c @@ -42,6 +42,7 @@ #include "utils/portal.h" #include "utils/guc.h" #include "access/xlog.h" +#include "storage/bufmgr.h" /* ---------------- @@ -282,7 +283,7 @@ ReadyForQuery(CommandDest dest,bool PrivateConn) if (PrivateConn == true) { StringInfoData privateBuf; pq_beginmessage(&privateBuf, 'L'); - if (EnableHotStandby == false) { + if (EnableHotStandby == false || *isPromoteIsTriggered) { pq_sendint64(&privateBuf,(uint64)GetXLogWriteRecPtr()); } else { pq_sendint64(&privateBuf,(uint64)GetXLogReplayRecPtr(NULL)); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 3a91478..55184b8 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -86,8 +86,15 @@ * global variables * ---------------- */ -bool isPreCache = false; +bool isPreCacheTable = false; +bool isPreCacheIndex = false; +bool isPreCacheIndexDone = false; bool needPreCacheEscape = false; +bool needUnpreCacheEscape = false; +bool isPreCacheAction = true; +Oid preCacheNodeOid = 0; +uint16 *preCacheNodesCountPtr = NULL; +Oid *preCacheNodesPtr = NULL; const char *debug_query_string; /* client-supplied query string */ /* Note: whereToSendOutput is initialized for the bootstrap/standalone case */ @@ -1213,9 +1220,23 @@ exec_simple_query(const char *query_string) */ MemoryContextSwitchTo(oldcontext); - if (isPreCache) + if (isPreCacheTable || isPreCacheIndex) { - 
needPreCacheEscape = true; + if (isPreCacheAction) + { + needPreCacheEscape = true; + needUnpreCacheEscape = false; + } + else + { + needPreCacheEscape = false; + needUnpreCacheEscape = true; + } + } + else + { + needPreCacheEscape = false; + needUnpreCacheEscape = false; } /* * Run the portal to completion, and then drop it (and the receiver). @@ -1228,9 +1249,10 @@ exec_simple_query(const char *query_string) receiver, &qc); - if (isPreCache) + if (isPreCacheTable || isPreCacheIndex) { needPreCacheEscape = false; + needUnpreCacheEscape = false; } receiver->rDestroy(receiver); @@ -1329,6 +1351,55 @@ exec_simple_query(const char *query_string) debug_query_string = NULL; } +static void +he3_exec_simple_query(const char *query_string) +{ + if (strstr(query_string, "precache table ") != NULL && query_string - strstr(query_string, "precache table ") == 0) + { + isPreCacheTable = true; + preCacheNodeOid = 0; + isPreCacheAction = true; + exec_simple_query(query_string + strlen("precache table ")); + preCacheNodeOid = 0; + isPreCacheTable = false; + } + else if (strstr(query_string, "precache index ") != NULL && query_string - strstr(query_string, "precache index ") == 0) + { + isPreCacheIndex = true; + isPreCacheIndexDone = false; + preCacheNodeOid = 0; + isPreCacheAction = true; + exec_simple_query(query_string + strlen("precache index ")); + preCacheNodeOid = 0; + isPreCacheIndexDone = false; + isPreCacheIndex = false; + } + else if (strstr(query_string, "unprecache table ") != NULL && query_string - strstr(query_string, "unprecache table ") == 0) + { + isPreCacheTable = true; + preCacheNodeOid = 0; + isPreCacheAction = false; + exec_simple_query(query_string + strlen("unprecache table ")); + preCacheNodeOid = 0; + isPreCacheTable = false; + } + else if (strstr(query_string, "unprecache index ") != NULL && query_string - strstr(query_string, "unprecache index ") == 0) + { + isPreCacheIndex = true; + isPreCacheIndexDone = false; + preCacheNodeOid = 0; + isPreCacheAction 
= false; + exec_simple_query(query_string + strlen("unprecache index ")); + preCacheNodeOid = 0; + isPreCacheIndexDone = false; + isPreCacheIndex = false; + } + else + { + exec_simple_query(query_string); + } +} + /* * exec_parse_message * @@ -4504,16 +4575,7 @@ PostgresMain(int argc, char *argv[], bool PrivateConn, } else { - if (strstr(query_string, "precache ") != NULL && query_string - strstr(query_string, "precache ") == 0) - { - isPreCache = true; - exec_simple_query(query_string + strlen("precache ")); - isPreCache = false; - } - else - { - exec_simple_query(query_string); - } + he3_exec_simple_query(query_string); } send_ready_for_query = true; diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index 7229598..8c3418f 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -17,6 +17,8 @@ #include "pg_trace.h" #include "pgstat.h" #include "port/atomics.h" /* for memory barriers */ +#include "replication/walsender.h" +#include "replication/walsender_private.h" #include "storage/ipc.h" #include "storage/proc.h" /* for MyProc */ #include "storage/sinvaladt.h" @@ -1148,3 +1150,55 @@ pgstat_clip_activity(const char *raw_activity) return activity; } + +XLogRecPtr He3DBQueryMinLsnFromAllStanby() +{ + int i; + XLogRecPtr minApplyLsn = 0; + int *procpids; + int maxid = 0; + procpids = (int *) malloc(max_wal_senders * sizeof(int)); + for (i = 0; i < NumBackendStatSlots; i++) + { + if (strcmp(BackendStatusArray[i].st_appname, "pgmirror") == 0 || memcmp(BackendStatusArray[i].st_appname, "priv", 4) == 0) + { + procpids[maxid] = BackendStatusArray[i].st_procpid; + maxid++; + } + } + Assert(WalSndCtl != NULL); + + for (i = 0; i < max_wal_senders; i++) + { + int pid; + XLogRecPtr apply; + WalSnd *walsnd = &WalSndCtl->walsnds[i]; + SpinLockAcquire(&walsnd->mutex); + if (walsnd->pid == 0) + { + SpinLockRelease(&walsnd->mutex); + continue; + } + pid = walsnd->pid; + apply = 
walsnd->apply; + SpinLockRelease(&walsnd->mutex); + int j; + bool exist = false; + for (j = 0; j < maxid; j++) + { + if (pid == procpids[j]) + { + exist = true; + break; + } + } + if (!exist) + { + if (apply < minApplyLsn || minApplyLsn == 0) + minApplyLsn = apply; + } + } + free(procpids); + return minApplyLsn; + +} \ No newline at end of file diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 6baf677..7ce234f 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -248,6 +248,15 @@ pgstat_get_wait_activity(WaitEventActivity w) case WAIT_EVENT_WAL_WRITER_MAIN: event_name = "WalWriterMain"; break; + case WAIT_EVENT_PAGEFLUSH_MAIN: + event_name = "PageFlushMain"; + break; + case WAIT_EVENT_CLEAN_LOGINDEX_MAIN: + event_name = "CleanLogindexMain"; + break; + case WAIT_EVENT_SECONDBUFFER_MAIN: + event_name = "SecondBufferMain"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index d899ba8..0801acb 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -25,12 +25,15 @@ #include "postmaster/bgworker_internals.h" #include "postmaster/postmaster.h" #include "replication/slot.h" +#include "storage/bufmgr.h" #include "storage/proc.h" #include "storage/procarray.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/inet.h" +#include "utils/pg_lsn.h" #include "utils/timestamp.h" +#include "storage/he3db_logindex.h" #define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) @@ -2381,3 +2384,98 @@ pg_stat_get_replication_slot(PG_FUNCTION_ARGS) /* Returns the record as Datum */ PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); } + +/* + * Returns statistics of WAL activity + */ +Datum +pg_stat_get_he3walwrite(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_HE3WALWRITE_COLS 4 + TupleDesc tupdesc; + 
Datum values[PG_STAT_GET_HE3WALWRITE_COLS]; + bool nulls[PG_STAT_GET_HE3WALWRITE_COLS]; + XLogRecPtr writtenlsn, flushlsn; + uint64 writtenTimes; + int parallels; + + /* Initialise values and NULL flags arrays */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + /* Initialise attributes information in the tuple descriptor */ + tupdesc = CreateTemplateTupleDesc(PG_STAT_GET_HE3WALWRITE_COLS); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "write_lsn", + PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "flush_lsn", + PG_LSNOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "writekv_totaltimes", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "writekv_parallels", + INT4OID, -1, 0); + + BlessTupleDesc(tupdesc); + + /* Get statistics about WAL Write */ + if (EnableHotStandby && *isPromoteIsTriggered == false) + PG_RETURN_NULL(); + + He3DBGetWalWriteStats(&writtenlsn, &flushlsn, &writtenTimes, ¶llels); + + /* Fill values and NULLs */ + values[0] = LSNGetDatum(writtenlsn); + values[1] = LSNGetDatum(flushlsn); + values[2] = UInt64GetDatum(writtenTimes); + values[3] = Int32GetDatum(parallels); + + /* Returns the record as Datum */ + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} + +/* + * Returns statistics of logindex + */ +Datum +pg_stat_get_he3_logindex(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_HE3_LOGINDEX_COLS 5 + TupleDesc tupdesc; + Datum values[PG_STAT_GET_HE3_LOGINDEX_COLS]; + bool nulls[PG_STAT_GET_HE3_LOGINDEX_COLS]; + uint64 memtable_total; + uint64 memtable_used; + uint64 memtable_active_index; + uint64 memtable_start_index; + uint64 page_total; + + /* Initialise values and NULL flags arrays */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + + /* Initialise attributes information in the tuple descriptor */ + tupdesc = CreateTemplateTupleDesc(PG_STAT_GET_HE3_LOGINDEX_COLS); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "memtable_total", + 
INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "memtable_used", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "memtable_start_index", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 4, "memtable_active_index", + INT8OID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 5, "page_total", + INT8OID, -1, 0); + + BlessTupleDesc(tupdesc); + + He3DBGetLogindexStats(&memtable_total, &memtable_used, &memtable_active_index, &memtable_start_index, &page_total); + + /* Fill values and NULLs */ + values[0] = UInt64GetDatum(memtable_total); + values[1] = UInt64GetDatum(memtable_used); + values[2] = UInt64GetDatum(memtable_start_index); + values[3] = UInt64GetDatum(memtable_active_index); + values[4] = UInt64GetDatum(page_total); + + /* Returns the record as Datum */ + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls))); +} \ No newline at end of file diff --git a/src/backend/utils/cache/relmapper.c b/src/backend/utils/cache/relmapper.c index a6e38ad..63d8aa1 100644 --- a/src/backend/utils/cache/relmapper.c +++ b/src/backend/utils/cache/relmapper.c @@ -1032,11 +1032,18 @@ relmap_redo(XLogReaderState *record) * There shouldn't be anyone else updating relmaps during WAL replay, * but grab the lock to interlock against load_relmap_file(). 
*/ - LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE); - write_relmap_file((xlrec->dbid == InvalidOid), &newmap, - false, true, false, - xlrec->dbid, xlrec->tsid, dbpath); - LWLockRelease(RelationMappingLock); + if (EnableHotStandby && he3share) + { + CacheInvalidateRelmap(xlrec->dbid); + } + else + { + LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE); + write_relmap_file((xlrec->dbid == InvalidOid), &newmap, + false, true, false, + xlrec->dbid, xlrec->tsid, dbpath); + LWLockRelease(RelationMappingLock); + } pfree(dbpath); } diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c index 7e925d0..1664ee5 100644 --- a/src/backend/utils/cache/syscache.c +++ b/src/backend/utils/cache/syscache.c @@ -74,7 +74,6 @@ #include "catalog/pg_ts_template.h" #include "catalog/pg_type.h" #include "catalog/pg_user_mapping.h" -#include "catalog/pg_hot_data.h" #include "lib/qunique.h" #include "utils/catcache.h" #include "utils/rel.h" @@ -476,17 +475,6 @@ static const struct cachedesc cacheinfo[] = { }, 4 }, - {HotDataRelationId, /* HOTDATADATNAMERELNAME */ - HotDataDatnameRelnameIndexId, - 2, - { - Anum_pg_hot_data_datname, - Anum_pg_hot_data_relname, - 0, - 0 - }, - 4 - }, {IndexRelationId, /* INDEXRELID */ IndexRelidIndexId, 1, diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 381d9e5..f248840 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -112,6 +112,7 @@ bool IsPostmasterEnvironment = false; bool IsUnderPostmaster = false; bool IsBinaryUpgrade = false; bool IsBackgroundWorker = false; +bool IsParallelFlushWorker = false; bool ExitOnAnyError = false; @@ -136,6 +137,7 @@ int NBuffers = 1000; int MaxConnections = 90; int max_worker_processes = 8; int max_parallel_workers = 8; +int max_parallel_flush_process = 32; int MaxBackends = 0; int VacuumCostPageHit = 1; /* GUC parameters for vacuum */ diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c 
index 6574247..b338fb0 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -104,7 +104,6 @@ InitPostmasterChild(void) set_stack_base(); InitProcessGlobals(); - ufs_init_client(); /* * make sure stderr is in binary mode before anything can possibly be * written to it, in case it's actually the syslogger pipe, so the pipe @@ -277,6 +276,15 @@ GetBackendTypeDesc(BackendType backendType) case B_LOGGER: backendDesc = "logger"; break; + case B_PARALLEL_FLUSH: + backendDesc = "parallel flush"; + break; + case B_CLEAN_LOGINDEX: + backendDesc = "clean logindex"; + break; + case B_SECONDBUFFER: + backendDesc = "second buffer"; + break; } return backendDesc; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 692c41e..63ba72a 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -518,7 +518,7 @@ InitializeMaxBackends(void) /* the extra unit accounts for the autovacuum launcher */ MaxBackends = MaxConnections + autovacuum_max_workers + 1 + - max_worker_processes + max_wal_senders; + max_worker_processes + max_parallel_flush_process + max_wal_senders; /* internal error because the values were all checked previously */ if (MaxBackends > MAX_BACKENDS) @@ -753,7 +753,6 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, } else if (!IsUnderPostmaster) { - ufs_init_client(); InitializeSessionUserIdStandalone(); am_superuser = true; if (!ThereIsAtLeastOneRole()) diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index e61f2fa..01d98e3 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -74,6 +74,7 @@ #include "postmaster/postmaster.h" #include "postmaster/syslogger.h" #include "postmaster/walwriter.h" +#include "postmaster/secondbuffer.h" #include "replication/logicallauncher.h" #include "replication/reorderbuffer.h" #include "replication/slot.h" @@ -233,6 +234,7 @@ static bool 
check_recovery_target_name(char **newval, void **extra, GucSource so static void assign_recovery_target_name(const char *newval, void *extra); static bool check_recovery_target_lsn(char **newval, void **extra, GucSource source); static void assign_recovery_target_lsn(const char *newval, void *extra); +static void assign_walsender_target_lsn(const char *newval, void *extra); static bool check_primary_slot_name(char **newval, void **extra, GucSource source); static bool check_default_with_oids(bool *newval, void **extra, GucSource source); @@ -605,6 +607,13 @@ char *pgstat_temp_directory; char *application_name; bool push_standby = false; +bool he3share = true; +bool mpush = false; +bool he3_point_in_time_recovery; +bool he3mirror = false; +bool pgmirror = false; +char *client_application_name = NULL; + int tcp_keepalives_idle; int tcp_keepalives_interval; @@ -626,6 +635,11 @@ int ssl_renegotiation_limit; int huge_pages; int huge_page_size; +/* he3db logindex mem-table size (unit MB), according to this value we can calculate + * the number of mem table. + */ +int he3db_logindex_mem_size; + /* * These variables are all dummies that don't do anything, except in some * cases provide the value for SHOW to display. 
The real state is elsewhere @@ -644,6 +658,7 @@ static char *timezone_string; static char *log_timezone_string; static char *timezone_abbreviations_string; static char *data_directory; +//static char *lmdb_directory; static char *session_authorization_string; static int max_function_args; static int max_index_keys; @@ -660,6 +675,7 @@ static char *recovery_target_string; static char *recovery_target_xid_string; static char *recovery_target_name_string; static char *recovery_target_lsn_string; +static char *walSendLsnStr; /* should be static, but commands/variable.c needs to get at this */ @@ -748,6 +764,8 @@ const char *const config_group_names[] = gettext_noop("Write-Ahead Log / Archive Recovery"), /* WAL_RECOVERY_TARGET */ gettext_noop("Write-Ahead Log / Recovery Target"), + /* WAL_SEND_LSN */ + gettext_noop("Write-Ahead Log / Wal Send Lsn"), /* REPLICATION_SENDING */ gettext_noop("Replication / Sending Servers"), /* REPLICATION_PRIMARY */ @@ -2120,6 +2138,41 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + + { + {"he3share", PGC_SIGHUP, WAL_ARCHIVE_RECOVERY, + gettext_noop("Sets storage is shared if he3share is configured true."), + }, + &he3share, + true, + NULL, NULL, NULL + }, + + { + {"mpush", PGC_SIGHUP, WAL_ARCHIVE_RECOVERY, + gettext_noop("Sets push_standby is belong to master if mpush is configured true."), + }, + &mpush, + false, + NULL, NULL, NULL + }, + + { + {"he3_point_in_time_recovery", PGC_SIGHUP, WAL_ARCHIVE_RECOVERY, + gettext_noop("Sets whether we are in he3 pitr"), + }, + &he3_point_in_time_recovery, + false, + NULL, NULL, NULL + }, + { + {"he3mirror", PGC_SIGHUP, WAL_ARCHIVE_RECOVERY, + gettext_noop("Sets he3db as replica if he3mirror is configured true."), + }, + &he3mirror, + false, + NULL, NULL, NULL + }, /* End-of-list marker */ { @@ -2280,7 +2333,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &wal_receiver_status_interval, - 10, 0, INT_MAX / 1000, + 1, 0, INT_MAX / 1000, NULL, NULL, NULL 
}, @@ -2342,6 +2395,17 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"second_buffers", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Sets the number of second buffers used by the server."), + NULL, + GUC_UNIT_BLOCKS + }, + &SNBuffers, + 1024, 16, INT_MAX / 2, + NULL, NULL, NULL + }, + { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, gettext_noop("Sets the maximum number of temporary buffers used by each session."), @@ -3551,6 +3615,17 @@ static struct config_int ConfigureNamesInt[] = check_client_connection_check_interval, NULL, NULL }, + { + {"he3db_logindex_mem_size", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Set the size for logindex memory table"), + NULL, + GUC_UNIT_MB + }, + &he3db_logindex_mem_size, + 512, 0, INT_MAX / 2, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL @@ -3924,6 +3999,15 @@ static struct config_string ConfigureNamesString[] = "", check_recovery_target_lsn, assign_recovery_target_lsn, NULL }, + { + {"wal_send_lsn", PGC_SIGHUP, WAL_SEND_LSN, + gettext_noop("Sets the LSN of the wal send log location up to which mirror start"), + NULL + }, + &walSendLsnStr, + "", + check_recovery_target_lsn, assign_walsender_target_lsn, NULL + }, { {"promote_trigger_file", PGC_SIGHUP, REPLICATION_STANDBY, @@ -3946,6 +4030,17 @@ static struct config_string ConfigureNamesString[] = NULL, NULL, NULL }, + { + {"he3_meta_conninfo", PGC_SIGHUP, CONN_AUTH_AUTH, + gettext_noop("Sets the connection string to be used to connect to the meta server."), + NULL, + GUC_SUPERUSER_ONLY + }, + &he3_meta_conninfo, + "", + NULL, NULL, NULL + }, + { {"primary_slot_name", PGC_SIGHUP, REPLICATION_STANDBY, gettext_noop("Sets the name of the replication slot to use on the sending server."), @@ -4351,6 +4446,30 @@ static struct config_string ConfigureNamesString[] = NULL, NULL, NULL }, + { + //TODO lmdb + {"lmdb_page_directory", PGC_POSTMASTER, FILE_LOCATIONS, + gettext_noop("Sets the lmdb 
page directory."), + NULL, + GUC_SUPERUSER_ONLY | GUC_DISALLOW_IN_AUTO_FILE + }, + &lmdb_page_directory, + "/tmp/pagedb", + NULL, NULL, NULL + }, + + { + //TODO lmdb + {"lmdb_wal_directory", PGC_POSTMASTER, FILE_LOCATIONS, + gettext_noop("Sets the lmdb wal directory."), + NULL, + GUC_SUPERUSER_ONLY | GUC_DISALLOW_IN_AUTO_FILE + }, + &lmdb_wal_directory, + "/tmp/waldb", + NULL, NULL, NULL + }, + { {"config_file", PGC_POSTMASTER, FILE_LOCATIONS, gettext_noop("Sets the server's main configuration file."), @@ -11977,7 +12096,7 @@ static bool check_maxconnections(int *newval, void **extra, GucSource source) { if (*newval + autovacuum_max_workers + 1 + - max_worker_processes + max_wal_senders > MAX_BACKENDS) + max_worker_processes + max_parallel_flush_process + max_wal_senders > MAX_BACKENDS) return false; return true; } @@ -11986,7 +12105,7 @@ static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource source) { if (MaxConnections + *newval + 1 + - max_worker_processes + max_wal_senders > MAX_BACKENDS) + max_worker_processes + max_parallel_flush_process + max_wal_senders > MAX_BACKENDS) return false; return true; } @@ -11995,7 +12114,7 @@ static bool check_max_wal_senders(int *newval, void **extra, GucSource source) { if (MaxConnections + autovacuum_max_workers + 1 + - max_worker_processes + *newval > MAX_BACKENDS) + max_worker_processes + max_parallel_flush_process + *newval > MAX_BACKENDS) return false; return true; } @@ -12026,8 +12145,8 @@ check_autovacuum_work_mem(int *newval, void **extra, GucSource source) static bool check_max_worker_processes(int *newval, void **extra, GucSource source) { - if (MaxConnections + autovacuum_max_workers + 1 + - *newval + max_wal_senders > MAX_BACKENDS) + if (MaxConnections + autovacuum_max_workers + 1 + + *newval + max_parallel_flush_process + max_wal_senders > MAX_BACKENDS) return false; return true; } @@ -12490,6 +12609,16 @@ check_recovery_target_lsn(char **newval, void **extra, GucSource source) return true; 
} +static void assign_walsender_target_lsn(const char *newval, void *extra) +{ + if (newval && strcmp(newval, "") != 0) + { + walsenderLsn = *((XLogRecPtr *) extra); + } else { + walsenderLsn = 0; + } +} + static void assign_recovery_target_lsn(const char *newval, void *extra) { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 4557ba7..0a0d927 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -93,7 +93,7 @@ # - Authentication - #authentication_timeout = 1min # 1s-600s -#password_encryption = scram-sha-256 # scram-sha-256 or md5 + #db_user_namespace = off # GSSAPI using Kerberos diff --git a/src/bin/Makefile b/src/bin/Makefile index 2fe0ae6..2de5827 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -23,13 +23,12 @@ SUBDIRS = \ pg_controldata \ pg_ctl \ pg_dump \ + pg_waldump \ pg_resetwal \ - pg_rewind \ pg_test_fsync \ pg_test_timing \ pg_upgrade \ pg_verifybackup \ - pg_waldump \ pgbench \ psql \ scripts diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index f911f98..674c1ee 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -235,6 +235,8 @@ main(int argc, char *argv[]) dbState(ControlFile->state)); printf(_("pg_control last modified: %s\n"), pgctime_str); + printf(_("Latest checkpoint location for PG: %X/%X\n"), + LSN_FORMAT_ARGS(ControlFile->checkPointFile)); printf(_("Latest checkpoint location: %X/%X\n"), LSN_FORMAT_ARGS(ControlFile->checkPoint)); printf(_("Latest checkpoint's REDO location: %X/%X\n"), diff --git a/src/bin/pg_produce_wal/Makefile b/src/bin/pg_produce_wal/Makefile new file mode 100644 index 0000000..5060530 --- /dev/null +++ b/src/bin/pg_produce_wal/Makefile @@ -0,0 +1,54 @@ +# src/bin/pg_waldump/Makefile + +PGFILEDESC = "pg_produce_wal - decode and display WAL" +PGAPPICON=win32 + +subdir = src/bin/pg_produce_wal 
+top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + $(RMGRDESCOBJS) \ + $(WIN32RES) \ + xlogreader.o \ + pg_mirror.o + +override CPPFLAGS := -DFRONTEND -DPG_NOREPLAY -I$(libpq_srcdir) $(CPPFLAGS) + +librust_log = -DFRONTEND -L$(top_builddir)/src/backend/storage/file -lrust_log -lstdc++ -lm -ldl -lpthread -lfuse3 -Wl,-gc-section +LIBS += $(librust_log) + +all: pg_produce_wal + +pg_produce_wal: pg_produce_wal.o $(OBJS) | submake-libpgport submake-libpq + $(CC) $(CFLAGS) pg_produce_wal.o $(OBJS) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) + +xlogreader.c: % : $(top_srcdir)/src/backend/access/transam/% + rm -f $@ && $(LN_S) $< . + +pg_mirror.c: % : $(top_srcdir)/src/backend/access/transam/% + rm -f $@ && $(LN_S) $< . + +#xlog.c: % : $(top_srcdir)/src/backend/access/transam/% +# rm -f $@ && $(LN_S) $< . + +#$(RMGRDESCSOURCES): % : $(top_srcdir)/src/backend/access/rmgrdesc/% +# rm -f $@ && $(LN_S) $< . + +install: all installdirs + $(INSTALL_PROGRAM) pg_produce_wal$(X) '$(DESTDIR)$(bindir)/pg_produce_wal$(X)' +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)' + +uninstall: + rm -f '$(DESTDIR)$(bindir)/pg_produce_wal$(X)' + +clean distclean maintainer-clean: + rm -f pg_produce_wal$(X) $(OBJS) xlogreader.c pg_mirror.c + rm -rf tmp_check + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) diff --git a/src/bin/pg_produce_wal/pg_produce_wal.c b/src/bin/pg_produce_wal/pg_produce_wal.c new file mode 100644 index 0000000..545dc16 --- /dev/null +++ b/src/bin/pg_produce_wal/pg_produce_wal.c @@ -0,0 +1,458 @@ +#define FRONTEND 1 + +#include "postgres.h" + +#include + +#include "access/transam.h" +#include "access/xlog.h" +#include "access/pg_mirror.h" +#include "access/xlog_internal.h" +#include "catalog/pg_control.h" +#include "common/controldata_utils.h" +#include "common/logging.h" +#include "getopt_long.h" +#include "pg_getopt.h" +#include "access/heapam_xlog.h" +#include "catalog/pg_control.h" +#include "access/nbtxlog.h" +#include 
"access/gistxlog.h" +#include "access/spgxlog.h" +#include "access/brin_xlog.h" +#include "common/file_perm.h" + +typedef struct XLogDumpPrivate +{ + TimeLineID timeline; + XLogRecPtr startptr; + XLogRecPtr endptr; + bool endptr_reached; +} XLogDumpPrivate; + + +static void +usage(const char *progname) +{ + printf(_("%s displays control information of a PostgreSQL database cluster.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s [OPTION] [DATADIR]\n"), progname); + printf(_("\nOptions:\n")); + printf(_(" [-D, --pgdata=]DATADIR data directory\n")); + printf(_(" -V, --version output version information, then exit\n")); + printf(_(" -?, --help show this help, then exit\n")); + printf(_("\nIf no data directory (DATADIR) is specified, " + "the environment variable PGDATA\nis used.\n\n")); + printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT); + printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL); +} + + +static const char * +dbState(DBState state) +{ + switch (state) + { + case DB_STARTUP: + return _("starting up"); + case DB_SHUTDOWNED: + return _("shut down"); + case DB_SHUTDOWNED_IN_RECOVERY: + return _("shut down in recovery"); + case DB_SHUTDOWNING: + return _("shutting down"); + case DB_IN_CRASH_RECOVERY: + return _("in crash recovery"); + case DB_IN_ARCHIVE_RECOVERY: + return _("in archive recovery"); + case DB_IN_PRODUCTION: + return _("in production"); + } + return _("unrecognized status code"); +} + +static const char * +wal_level_str(WalLevel wal_level) +{ + switch (wal_level) + { + case WAL_LEVEL_MINIMAL: + return "minimal"; + case WAL_LEVEL_REPLICA: + return "replica"; + case WAL_LEVEL_LOGICAL: + return "logical"; + } + return _("unrecognized wal_level"); +} + +/* pg_waldump's XLogReaderRoutine->batch_read callback */ +static int +WALDumpBatchRead(XLogReaderState *state, XLogRecPtr targetPtr, + int reqLen, char *readBuff) +{ + XLogDumpPrivate *private = state->private_data; + int count; + + if (private->endptr != InvalidXLogRecPtr) + 
{ + if (targetPtr >= private->endptr) + { + private->endptr_reached = true; + return -1; + } + } + + count = He3DBWALRead(state, targetPtr, SizeOfXLogRecord, readBuff); + + return count; +} + +#define UsableBytesInPage_tmp (XLOG_BLCKSZ - SizeOfXLogShortPHD) +#define DEFAULT_XLOG_SEG_SIZE (16*1024*1024) + +static uint64 UsableBytesInSegment_tmp = + (DEFAULT_XLOG_SEG_SIZE / XLOG_BLCKSZ * UsableBytesInPage_tmp) - + (SizeOfXLogLongPHD - SizeOfXLogShortPHD); + +static XLogRecPtr +XLogBytePosToRecPtr_tmp(uint64 bytepos) +{ + /* + * original logic, we abandon it. + */ + if(0) { + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / UsableBytesInSegment_tmp; + bytesleft = bytepos % UsableBytesInSegment_tmp; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / UsableBytesInPage_tmp; + bytesleft = bytesleft % UsableBytesInPage_tmp; + + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, DEFAULT_XLOG_SEG_SIZE, result); + + return result; + } + + return bytepos; + +} + + +/* + * Like XLogBytePosToRecPtr, but if the position is at a page boundary, + * returns a pointer to the beginning of the page (ie. before page header), + * not to where the first xlog record on that page would go to. This is used + * when converting a pointer to the end of a record. + */ +static XLogRecPtr +XLogBytePosToEndRecPtr_tmp(uint64 bytepos) +{ + /* + * original logic, we abandon it. 
+ */ + if(0){ + uint64 fullsegs; + uint64 fullpages; + uint64 bytesleft; + uint32 seg_offset; + XLogRecPtr result; + + fullsegs = bytepos / UsableBytesInSegment_tmp; + bytesleft = bytepos % UsableBytesInSegment_tmp; + + if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) + { + /* fits on first page of segment */ + if (bytesleft == 0) + seg_offset = 0; + else + seg_offset = bytesleft + SizeOfXLogLongPHD; + } + else + { + /* account for the first page on segment with long header */ + seg_offset = XLOG_BLCKSZ; + bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD; + + fullpages = bytesleft / UsableBytesInPage_tmp; + bytesleft = bytesleft % UsableBytesInPage_tmp; + + if (bytesleft == 0) + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft; + else + seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD; + } + + XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result); + + return result; + } + + return bytepos; +} + +static int +BasicOpenFilePerm_tmp(const char *fileName, int fileFlags, mode_t fileMode) +{ + int fd; + +tryAgain: + fd = open(fileName, fileFlags, fileMode); + + if (fd >= 0) + return fd; /* success! 
*/ + + if (errno == EMFILE || errno == ENFILE) + { + int save_errno = errno; + + printf("out of file descriptors %d",errno); + } + + return -1; /* failure */ +} + +static int64_t +XLogFileInit_tmp(char* prefix,XLogSegNo logsegno, bool *use_existent, bool use_lock) +{ + char path[MAXPGPATH]; + char tmppath[MAXPGPATH]; + int64_t fd; + int save_errno; + char buff[XLOG_BLCKSZ]={0}; + int n = snprintf(path,sizeof(path),"%s/",prefix); + XLogFilePath(&path[n], 1, logsegno, DEFAULT_XLOG_SEG_SIZE); + + /* + * Try to use existent file (checkpoint maker may have created it already) + */ + if (*use_existent) + { + fd = BasicOpenFilePerm_tmp(path, O_RDWR | PG_BINARY | SYNC_METHOD_FSYNC,PG_FILE_MODE_OWNER); + if (fd < 0) + { + if (errno != ENOENT) + printf("open file failed %s\n",path); + } + else + return fd; + } else { + fd = BasicOpenFilePerm_tmp(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,PG_FILE_MODE_OWNER); + off_t offset = 0; + while(offset 1) + { + if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) + { + usage(progname); + exit(0); + } + if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0) + { + puts("pg_controldata (PostgreSQL) " PG_VERSION); + exit(0); + } + } + + while ((c = getopt_long(argc, argv, "D:", long_options, NULL)) != -1) + { + switch (c) + { + case 'D': + DataDir = optarg; + break; + default: + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + } + + if (DataDir == NULL) + { + if (optind < argc) + DataDir = argv[optind++]; + else + DataDir = getenv("PGDATA"); + } + + /* Complain if any arguments remain */ + if (optind < argc) + { + pg_log_error("too many command-line arguments (first is \"%s\")", + argv[optind]); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), + progname); + exit(1); + } + + if (DataDir == NULL) + { + pg_log_error("no data directory specified"); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + + /* get a 
copy of the control file */ + ControlFile = get_controlfile(DataDir, &crc_ok); + if (!crc_ok) + printf(_("WARNING: Calculated CRC checksum does not match value stored in file.\n" + "Either the file is corrupt, or it has a different layout than this program\n" + "is expecting. The results below are untrustworthy.\n\n")); + setControlFile(ControlFile); + + /* set wal segment size */ + WalSegSz = ControlFile->xlog_seg_size; + + if (!IsValidWalSegSize(WalSegSz)) + { + printf(_("WARNING: invalid WAL segment size\n")); + printf(ngettext("The WAL segment size stored in the file, %d byte, is not a power of two\n" + "between 1 MB and 1 GB. The file is corrupt and the results below are\n" + "untrustworthy.\n\n", + "The WAL segment size stored in the file, %d bytes, is not a power of two\n" + "between 1 MB and 1 GB. The file is corrupt and the results below are\n" + "untrustworthy.\n\n", + WalSegSz), + WalSegSz); + } + + /* + * This slightly-chintzy coding will work as long as the control file + * timestamps are within the range of time_t; that should be the case in + * all foreseeable circumstances, so we don't bother importing the + * backend's timezone library into pg_controldata. 
+ * + * Use variable for format to suppress overly-anal-retentive gcc warning + * about %c + */ + time_tmp = (time_t) ControlFile->time; + strftime(pgctime_str, sizeof(pgctime_str), strftime_fmt, + localtime(&time_tmp)); + time_tmp = (time_t) ControlFile->checkPointCopy.time; + strftime(ckpttime_str, sizeof(ckpttime_str), strftime_fmt, + localtime(&time_tmp)); + + memset(&private, 0, sizeof(XLogDumpPrivate)); + private.timeline = 1; + private.startptr = ControlFile->checkPoint; + private.endptr = InvalidXLogRecPtr; + private.endptr_reached = false; + /* we have everything we need, start reading */ + XLogReaderState *xlogreader_state; + xlogreader_state = + XLogReaderAllocate(WalSegSz, NULL, + XL_ROUTINE(.batch_read = WALDumpBatchRead), + &private); + if (!xlogreader_state) + printf("out of memory"); + + xlogreader_state->currTLI = ControlFile->checkPointCopy.ThisTimeLineID; + /* first find a valid recptr to start from */ + XLogRecPtr first_record; + int ret = -1; + ret = He3DBWALRead(xlogreader_state, + private.startptr, + SizeOfXLogRecord, + xlogreader_state->readBuf); + if (ret < SizeOfXLogRecord) { + printf("He3DBReadWalInternal Failed\n"); + return -1; + } + XLogRecord* record = (XLogRecord*)xlogreader_state->readBuf; + char DStr[1024]={0}; + int dLen = 0; + uint64 startLsn = 0,endLsn = 0; + int mtrLen = ArrayXlogHe3ToPg(record,record->xl_tot_len,DStr,&dLen,&startLsn,&endLsn); + ControlFile->checkPoint = startLsn; + ControlFile->checkPointCopy.redo = startLsn; + update_controlfile(DataDir,ControlFile,true); + XLogSegNo segno; + XLByteToSeg(ControlFile->checkPointCopy.redo, segno, WalSegSz); + int64_t recvFile = -1; + XLogSegNo recvSegNo = 0; + TimeLineID recvFileTLI = 1; + //ThisTimeLineID = 1; + /* Close the current segment if it's completed */ + if (recvFile < 0) + { + bool use_existent = false; + + /* Create/use new log file */ + XLByteToSeg(ControlFile->checkPoint, recvSegNo, DEFAULT_XLOG_SEG_SIZE); + recvFile = XLogFileInit_tmp(DataDir,recvSegNo, 
&use_existent, true); + recvFileTLI = 1; + } + int startoff = 0; + int byteswritten; + /* Calculate the start offset of the received logs */ + //startoff = XLogSegmentOffset(ControlFile->checkPoint, DEFAULT_XLOG_SEG_SIZE); + //int segbytes; + //if (startoff + endLsn - ControlFile->checkPoint > DEFAULT_XLOG_SEG_SIZE) + // segbytes = DEFAULT_XLOG_SEG_SIZE - startoff; + //else + // segbytes = endLsn - ControlFile->checkPoint; + + /* OK to write the logs */ + //errno = 0; + + byteswritten = pg_pwrite(recvFile, DStr, dLen, (off_t) startoff); + fsync(recvFile); + close(recvFile); + return 0; +} + diff --git a/src/bin/pg_rewind/Makefile b/src/bin/pg_rewind/Makefile index 14d21fd..c63b17e 100644 --- a/src/bin/pg_rewind/Makefile +++ b/src/bin/pg_rewind/Makefile @@ -32,6 +32,9 @@ OBJS = \ EXTRA_CLEAN = xlogreader.c +librust_log = -L$(top_builddir)/src/backend/storage/file -lrust_log -lstdc++ -lm -ldl -lpthread -lfuse3 -Wl,-gc-section +LIBS += $(librust_log) + all: pg_rewind pg_rewind: $(OBJS) | submake-libpq submake-libpgport diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 59ebac7..ba40947 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -71,7 +71,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, private.tliIndex = tliIndex; private.restoreCommand = restoreCommand; xlogreader = XLogReaderAllocate(WalSegSz, datadir, - XL_ROUTINE(.page_read = &SimpleXLogPageRead), + XL_ROUTINE(.page_read = &He3DBWALRead), &private); if (xlogreader == NULL) pg_fatal("out of memory"); @@ -129,7 +129,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, private.tliIndex = tliIndex; private.restoreCommand = restoreCommand; xlogreader = XLogReaderAllocate(WalSegSz, datadir, - XL_ROUTINE(.page_read = &SimpleXLogPageRead), + XL_ROUTINE(.page_read = &He3DBWALRead), &private); if (xlogreader == NULL) pg_fatal("out of memory"); @@ -189,7 +189,7 @@ findLastCheckpoint(const char *datadir, 
XLogRecPtr forkptr, int tliIndex, private.tliIndex = tliIndex; private.restoreCommand = restoreCommand; xlogreader = XLogReaderAllocate(WalSegSz, datadir, - XL_ROUTINE(.page_read = &SimpleXLogPageRead), + XL_ROUTINE(.page_read = &He3DBWALRead), &private); if (xlogreader == NULL) pg_fatal("out of memory"); @@ -246,116 +246,116 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, } /* XLogReader callback function, to read a WAL page */ -static int -SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, - int reqLen, XLogRecPtr targetRecPtr, char *readBuf) -{ - XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; - uint32 targetPageOff; - XLogRecPtr targetSegEnd; - XLogSegNo targetSegNo; - int r; +// static int +// SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, +// int reqLen, XLogRecPtr targetRecPtr, char *readBuf) +// { +// XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; +// uint32 targetPageOff; +// XLogRecPtr targetSegEnd; +// XLogSegNo targetSegNo; +// int r; - XLByteToSeg(targetPagePtr, targetSegNo, WalSegSz); - XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, WalSegSz, targetSegEnd); - targetPageOff = XLogSegmentOffset(targetPagePtr, WalSegSz); +// XLByteToSeg(targetPagePtr, targetSegNo, WalSegSz); +// XLogSegNoOffsetToRecPtr(targetSegNo + 1, 0, WalSegSz, targetSegEnd); +// targetPageOff = XLogSegmentOffset(targetPagePtr, WalSegSz); - /* - * See if we need to switch to a new segment because the requested record - * is not in the currently open one. - */ - if (xlogreadfd >= 0 && - !XLByteInSeg(targetPagePtr, xlogreadsegno, WalSegSz)) - { - close(xlogreadfd); - xlogreadfd = -1; - } +// /* +// * See if we need to switch to a new segment because the requested record +// * is not in the currently open one. 
+// */ +// if (xlogreadfd >= 0 && +// !XLByteInSeg(targetPagePtr, xlogreadsegno, WalSegSz)) +// { +// close(xlogreadfd); +// xlogreadfd = -1; +// } - XLByteToSeg(targetPagePtr, xlogreadsegno, WalSegSz); +// XLByteToSeg(targetPagePtr, xlogreadsegno, WalSegSz); - if (xlogreadfd < 0) - { - char xlogfname[MAXFNAMELEN]; +// if (xlogreadfd < 0) +// { +// char xlogfname[MAXFNAMELEN]; - /* - * Since incomplete segments are copied into next timelines, switch to - * the timeline holding the required segment. Assuming this scan can - * be done both forward and backward, consider also switching timeline - * accordingly. - */ - while (private->tliIndex < targetNentries - 1 && - targetHistory[private->tliIndex].end < targetSegEnd) - private->tliIndex++; - while (private->tliIndex > 0 && - targetHistory[private->tliIndex].begin >= targetSegEnd) - private->tliIndex--; +// /* +// * Since incomplete segments are copied into next timelines, switch to +// * the timeline holding the required segment. Assuming this scan can +// * be done both forward and backward, consider also switching timeline +// * accordingly. +// */ +// while (private->tliIndex < targetNentries - 1 && +// targetHistory[private->tliIndex].end < targetSegEnd) +// private->tliIndex++; +// while (private->tliIndex > 0 && +// targetHistory[private->tliIndex].begin >= targetSegEnd) +// private->tliIndex--; - XLogFileName(xlogfname, targetHistory[private->tliIndex].tli, - xlogreadsegno, WalSegSz); +// XLogFileName(xlogfname, targetHistory[private->tliIndex].tli, +// xlogreadsegno, WalSegSz); - snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", - xlogreader->segcxt.ws_dir, xlogfname); +// snprintf(xlogfpath, MAXPGPATH, "%s/" XLOGDIR "/%s", +// xlogreader->segcxt.ws_dir, xlogfname); - xlogreadfd = open(xlogfpath, O_RDONLY | PG_BINARY, 0); +// xlogreadfd = open(xlogfpath, O_RDONLY | PG_BINARY, 0); - if (xlogreadfd < 0) - { - /* - * If we have no restore_command to execute, then exit. 
- */ - if (private->restoreCommand == NULL) - { - pg_log_error("could not open file \"%s\": %m", xlogfpath); - return -1; - } +// if (xlogreadfd < 0) +// { +// /* +// * If we have no restore_command to execute, then exit. +// */ +// if (private->restoreCommand == NULL) +// { +// pg_log_error("could not open file \"%s\": %m", xlogfpath); +// return -1; +// } - /* - * Since we have restore_command, then try to retrieve missing WAL - * file from the archive. - */ - xlogreadfd = RestoreArchivedFile(xlogreader->segcxt.ws_dir, - xlogfname, - WalSegSz, - private->restoreCommand); +// /* +// * Since we have restore_command, then try to retrieve missing WAL +// * file from the archive. +// */ +// xlogreadfd = RestoreArchivedFile(xlogreader->segcxt.ws_dir, +// xlogfname, +// WalSegSz, +// private->restoreCommand); - if (xlogreadfd < 0) - return -1; - else - pg_log_debug("using file \"%s\" restored from archive", - xlogfpath); - } - } +// if (xlogreadfd < 0) +// return -1; +// else +// pg_log_debug("using file \"%s\" restored from archive", +// xlogfpath); +// } +// } - /* - * At this point, we have the right segment open. - */ - Assert(xlogreadfd != -1); +// /* +// * At this point, we have the right segment open. 
+// */ +// Assert(xlogreadfd != -1); - /* Read the requested page */ - if (lseek(xlogreadfd, (off_t) targetPageOff, SEEK_SET) < 0) - { - pg_log_error("could not seek in file \"%s\": %m", xlogfpath); - return -1; - } +// /* Read the requested page */ +// if (lseek(xlogreadfd, (off_t) targetPageOff, SEEK_SET) < 0) +// { +// pg_log_error("could not seek in file \"%s\": %m", xlogfpath); +// return -1; +// } - r = read(xlogreadfd, readBuf, XLOG_BLCKSZ); - if (r != XLOG_BLCKSZ) - { - if (r < 0) - pg_log_error("could not read file \"%s\": %m", xlogfpath); - else - pg_log_error("could not read file \"%s\": read %d of %zu", - xlogfpath, r, (Size) XLOG_BLCKSZ); +// r = read(xlogreadfd, readBuf, XLOG_BLCKSZ); +// if (r != XLOG_BLCKSZ) +// { +// if (r < 0) +// pg_log_error("could not read file \"%s\": %m", xlogfpath); +// else +// pg_log_error("could not read file \"%s\": read %d of %zu", +// xlogfpath, r, (Size) XLOG_BLCKSZ); - return -1; - } +// return -1; +// } - Assert(targetSegNo == xlogreadsegno); +// Assert(targetSegNo == xlogreadsegno); - xlogreader->seg.ws_tli = targetHistory[private->tliIndex].tli; - return XLOG_BLCKSZ; -} +// xlogreader->seg.ws_tli = targetHistory[private->tliIndex].tli; +// return XLOG_BLCKSZ; +// } /* * Extract information on which blocks the current record modifies. 
diff --git a/src/bin/pg_waldump/Makefile b/src/bin/pg_waldump/Makefile index e4a35bc..4b760fa 100644 --- a/src/bin/pg_waldump/Makefile +++ b/src/bin/pg_waldump/Makefile @@ -20,6 +20,8 @@ override CPPFLAGS := -DFRONTEND -DPG_NOREPLAY $(CPPFLAGS) RMGRDESCSOURCES = $(sort $(notdir $(wildcard $(top_srcdir)/src/backend/access/rmgrdesc/*desc.c))) RMGRDESCOBJS = $(patsubst %.c,%.o,$(RMGRDESCSOURCES)) +librust_log = -L$(top_builddir)/src/backend/storage/file -lrust_log -lstdc++ -lm -ldl -lpthread -lfuse3 -Wl,-gc-section +LIBS += $(librust_log) all: pg_waldump diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index f8b8afe..a48f6d4 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -72,6 +72,7 @@ typedef struct XLogDumpStats #define fatal_error(...) do { pg_log_fatal(__VA_ARGS__); exit(EXIT_FAILURE); } while(0) + static void print_rmgr_list(void) { @@ -126,252 +127,25 @@ split_path(const char *path, char **dir, char **fname) } } -/* - * Open the file in the valid target directory. - * - * return a read only fd - */ + +/* pg_waldump's XLogReaderRoutine->batch_read callback */ static int -open_file_in_directory(const char *directory, const char *fname) -{ - int fd = -1; - char fpath[MAXPGPATH]; - - Assert(directory != NULL); - - snprintf(fpath, MAXPGPATH, "%s/%s", directory, fname); - fd = open(fpath, O_RDONLY | PG_BINARY, 0); - - if (fd < 0 && errno != ENOENT) - fatal_error("could not open file \"%s\": %m", fname); - return fd; -} - -/* - * Try to find fname in the given directory. Returns true if it is found, - * false otherwise. If fname is NULL, search the complete directory for any - * file with a valid WAL file name. If file is successfully opened, set the - * wal segment size. 
- */ -static bool -search_directory(const char *directory, const char *fname) -{ - int fd = -1; - DIR *xldir; - - /* open file if valid filename is provided */ - if (fname != NULL) - fd = open_file_in_directory(directory, fname); - - /* - * A valid file name is not passed, so search the complete directory. If - * we find any file whose name is a valid WAL file name then try to open - * it. If we cannot open it, bail out. - */ - else if ((xldir = opendir(directory)) != NULL) - { - struct dirent *xlde; - - while ((xlde = readdir(xldir)) != NULL) - { - if (IsXLogFileName(xlde->d_name)) - { - fd = open_file_in_directory(directory, xlde->d_name); - fname = xlde->d_name; - break; - } - } - - closedir(xldir); - } - - /* set WalSegSz if file is successfully opened */ - if (fd >= 0) - { - PGAlignedXLogBlock buf; - int r; - - r = read(fd, buf.data, XLOG_BLCKSZ); - if (r == XLOG_BLCKSZ) - { - XLogLongPageHeader longhdr = (XLogLongPageHeader) buf.data; - - WalSegSz = longhdr->xlp_seg_size; - - if (!IsValidWalSegSize(WalSegSz)) - fatal_error(ngettext("WAL segment size must be a power of two between 1 MB and 1 GB, but the WAL file \"%s\" header specifies %d byte", - "WAL segment size must be a power of two between 1 MB and 1 GB, but the WAL file \"%s\" header specifies %d bytes", - WalSegSz), - fname, WalSegSz); - } - else - { - if (errno != 0) - fatal_error("could not read file \"%s\": %m", - fname); - else - fatal_error("could not read file \"%s\": read %d of %zu", - fname, r, (Size) XLOG_BLCKSZ); - } - close(fd); - return true; - } - - return false; -} - -/* - * Identify the target directory. - * - * Try to find the file in several places: - * if directory != NULL: - * directory / - * directory / XLOGDIR / - * else - * . - * XLOGDIR / - * $PGDATA / XLOGDIR / - * - * The valid target directory is returned. 
- */ -static char * -identify_target_directory(char *directory, char *fname) -{ - char fpath[MAXPGPATH]; - - if (directory != NULL) - { - if (search_directory(directory, fname)) - return pg_strdup(directory); - - /* directory / XLOGDIR */ - snprintf(fpath, MAXPGPATH, "%s/%s", directory, XLOGDIR); - if (search_directory(fpath, fname)) - return pg_strdup(fpath); - } - else - { - const char *datadir; - - /* current directory */ - if (search_directory(".", fname)) - return pg_strdup("."); - /* XLOGDIR */ - if (search_directory(XLOGDIR, fname)) - return pg_strdup(XLOGDIR); - - datadir = getenv("PGDATA"); - /* $PGDATA / XLOGDIR */ - if (datadir != NULL) - { - snprintf(fpath, MAXPGPATH, "%s/%s", datadir, XLOGDIR); - if (search_directory(fpath, fname)) - return pg_strdup(fpath); - } - } - - /* could not locate WAL file */ - if (fname) - fatal_error("could not locate WAL file \"%s\"", fname); - else - fatal_error("could not find any WAL file"); - - return NULL; /* not reached */ -} - -/* pg_waldump's XLogReaderRoutine->segment_open callback */ -static void -WALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, - TimeLineID *tli_p) -{ - TimeLineID tli = *tli_p; - char fname[MAXPGPATH]; - int tries; - - XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); - - /* - * In follow mode there is a short period of time after the server has - * written the end of the previous file before the new file is available. - * So we loop for 5 seconds looking for the file to appear before giving - * up. 
- */ - for (tries = 0; tries < 10; tries++) - { - state->seg.ws_file = open_file_in_directory(state->segcxt.ws_dir, fname); - if (state->seg.ws_file >= 0) - return; - if (errno == ENOENT) - { - int save_errno = errno; - - /* File not there yet, try again */ - pg_usleep(500 * 1000); - - errno = save_errno; - continue; - } - /* Any other error, fall through and fail */ - break; - } - - fatal_error("could not find file \"%s\": %m", fname); -} - -/* - * pg_waldump's XLogReaderRoutine->segment_close callback. Same as - * wal_segment_close - */ -static void -WALDumpCloseSegment(XLogReaderState *state) -{ - close(state->seg.ws_file); - /* need to check errno? */ - state->seg.ws_file = -1; -} - -/* pg_waldump's XLogReaderRoutine->page_read callback */ -static int -WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, - XLogRecPtr targetPtr, char *readBuff) +WALDumpBatchRead(XLogReaderState *state, XLogRecPtr targetPtr, + int reqLen, char *readBuff) { XLogDumpPrivate *private = state->private_data; - int count = XLOG_BLCKSZ; - WALReadError errinfo; + int count; if (private->endptr != InvalidXLogRecPtr) { - if (targetPagePtr + XLOG_BLCKSZ <= private->endptr) - count = XLOG_BLCKSZ; - else if (targetPagePtr + reqLen <= private->endptr) - count = private->endptr - targetPagePtr; - else + if (targetPtr >= private->endptr) { private->endptr_reached = true; return -1; } } - if (!WALRead(state, readBuff, targetPagePtr, count, private->timeline, - &errinfo)) - { - WALOpenSegment *seg = &errinfo.wre_seg; - char fname[MAXPGPATH]; - - XLogFileName(fname, seg->ws_tli, seg->ws_segno, - state->segcxt.ws_segsize); - - if (errinfo.wre_errno != 0) - { - errno = errinfo.wre_errno; - fatal_error("could not read from file %s, offset %u: %m", - fname, errinfo.wre_off); - } - else - fatal_error("could not read from file %s, offset %u: read %d of %zu", - fname, errinfo.wre_off, errinfo.wre_read, - (Size) errinfo.wre_req); - } + count = He3DBWALRead(state, targetPtr, 
SizeOfXLogRecord, readBuff); return count; } @@ -939,131 +713,29 @@ main(int argc, char **argv) goto bad_argument; } - if (waldir != NULL) - { - /* validate path points to directory */ - if (!verify_directory(waldir)) - { - pg_log_error("could not open directory \"%s\": %m", waldir); - goto bad_argument; - } - } - - /* parse files as start/end boundaries, extract path if not specified */ - if (optind < argc) - { - char *directory = NULL; - char *fname = NULL; - int fd; - XLogSegNo segno; - - split_path(argv[optind], &directory, &fname); - - if (waldir == NULL && directory != NULL) - { - waldir = directory; - - if (!verify_directory(waldir)) - fatal_error("could not open directory \"%s\": %m", waldir); - } - - waldir = identify_target_directory(waldir, fname); - fd = open_file_in_directory(waldir, fname); - if (fd < 0) - fatal_error("could not open file \"%s\"", fname); - close(fd); - - /* parse position from file */ - XLogFromFileName(fname, &private.timeline, &segno, WalSegSz); - - if (XLogRecPtrIsInvalid(private.startptr)) - XLogSegNoOffsetToRecPtr(segno, 0, WalSegSz, private.startptr); - else if (!XLByteInSeg(private.startptr, segno, WalSegSz)) - { - pg_log_error("start WAL location %X/%X is not inside file \"%s\"", - LSN_FORMAT_ARGS(private.startptr), - fname); - goto bad_argument; - } - - /* no second file specified, set end position */ - if (!(optind + 1 < argc) && XLogRecPtrIsInvalid(private.endptr)) - XLogSegNoOffsetToRecPtr(segno + 1, 0, WalSegSz, private.endptr); - - /* parse ENDSEG if passed */ - if (optind + 1 < argc) - { - XLogSegNo endsegno; - - /* ignore directory, already have that */ - split_path(argv[optind + 1], &directory, &fname); - - fd = open_file_in_directory(waldir, fname); - if (fd < 0) - fatal_error("could not open file \"%s\"", fname); - close(fd); - - /* parse position from file */ - XLogFromFileName(fname, &private.timeline, &endsegno, WalSegSz); - - if (endsegno < segno) - fatal_error("ENDSEG %s is before STARTSEG %s", - argv[optind + 
1], argv[optind]); - - if (XLogRecPtrIsInvalid(private.endptr)) - XLogSegNoOffsetToRecPtr(endsegno + 1, 0, WalSegSz, - private.endptr); - - /* set segno to endsegno for check of --end */ - segno = endsegno; - } - - - if (!XLByteInSeg(private.endptr, segno, WalSegSz) && - private.endptr != (segno + 1) * WalSegSz) - { - pg_log_error("end WAL location %X/%X is not inside file \"%s\"", - LSN_FORMAT_ARGS(private.endptr), - argv[argc - 1]); - goto bad_argument; - } - } - else - waldir = identify_target_directory(waldir, NULL); - - /* we don't know what to print */ - if (XLogRecPtrIsInvalid(private.startptr)) - { - pg_log_error("no start WAL location given"); - goto bad_argument; - } - - /* done with argument parsing, do the actual work */ /* we have everything we need, start reading */ xlogreader_state = - XLogReaderAllocate(WalSegSz, waldir, - XL_ROUTINE(.page_read = WALDumpReadPage, - .segment_open = WALDumpOpenSegment, - .segment_close = WALDumpCloseSegment), + XLogReaderAllocate(WalSegSz, NULL, + XL_ROUTINE(.batch_read = WALDumpBatchRead), &private); if (!xlogreader_state) fatal_error("out of memory"); + xlogreader_state->currTLI = private.timeline; /* first find a valid recptr to start from */ first_record = XLogFindNextRecord(xlogreader_state, private.startptr); - if (first_record == InvalidXLogRecPtr) - fatal_error("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(private.startptr)); + // if (first_record == InvalidXLogRecPtr) + // fatal_error("could not find a valid record after %X/%X", + // LSN_FORMAT_ARGS(private.startptr)); /* * Display a message that we're skipping data if `from` wasn't a pointer * to the start of a record and also wasn't a pointer to the beginning of * a segment (e.g. we were used in file mode). 
*/ - if (first_record != private.startptr && - XLogSegmentOffset(private.startptr, WalSegSz) != 0) + if (first_record != private.startptr ) printf(ngettext("first record is after %X/%X, at %X/%X, skipping over %u byte\n", "first record is after %X/%X, at %X/%X, skipping over %u bytes\n", (first_record - private.startptr)), @@ -1074,7 +746,7 @@ main(int argc, char **argv) for (;;) { /* try to read the next record */ - record = XLogReadRecord(xlogreader_state, &errormsg); + record = He3DBXLogReadRecord(xlogreader_state, &errormsg); if (!record) { if (!config.follow || private.endptr_reached) diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h index eaed3fa..845f812 100644 --- a/src/include/access/brin_xlog.h +++ b/src/include/access/brin_xlog.h @@ -54,6 +54,18 @@ typedef struct xl_brin_createidx } xl_brin_createidx; #define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16)) +typedef struct xl_old_brin_insert +{ + BlockNumber heapBlk; + + /* extra information needed to update the revmap */ + BlockNumber pagesPerRange; + + OffsetNumber offnum; +} xl_old_brin_insert; + +#define SizeOfOldBrinInsert (offsetof(xl_old_brin_insert, offnum) + sizeof(OffsetNumber)) + /* * This is what we need to know about a BRIN tuple insert * @@ -95,6 +107,16 @@ typedef struct xl_brin_update #define SizeOfBrinUpdate (offsetof(xl_brin_update, insert) + SizeOfBrinInsert) +typedef struct xl_old_brin_update +{ + /* offset number of old tuple on old page */ + OffsetNumber oldOffnum; + + xl_old_brin_insert insert; +} xl_old_brin_update; + +#define SizeOfOldBrinUpdate (offsetof(xl_old_brin_update, insert) + SizeOfOldBrinInsert) + /* * This is what we need to know about a BRIN tuple samepage update * diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h index babfa7c..076613a 100644 --- a/src/include/access/gistxlog.h +++ b/src/include/access/gistxlog.h @@ -59,6 +59,20 @@ typedef struct gistxlogDelete #define SizeOfGistxlogDelete 
(offsetof(gistxlogDelete, ntodelete) + sizeof(uint16)) +typedef struct gistoldxlogPageSplit +{ + BlockNumber origrlink; /* rightlink of the page before split */ + GistNSN orignsn; /* NSN of the page before split */ + bool origleaf; /* was splitted page a leaf page? */ + + uint16 npage; /* # of pages in the split */ + bool markfollowright; /* set F_FOLLOW_RIGHT flags */ + + /* + * follow: 1. gistxlogPage and array of IndexTupleData per page + */ +} gistoldxlogPageSplit; + /* * Backup Blk 0: If this operation completes a page split, by inserting a * downlink for the split page, the left half of the split diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 8f54f7b..5d060c1 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -192,6 +192,22 @@ typedef struct xl_multi_insert_tuple #define SizeOfMultiInsertTuple (offsetof(xl_multi_insert_tuple, t_hoff) + sizeof(uint8)) +typedef struct xl_old_heap_update +{ + TransactionId old_xmax; /* xmax of the old tuple */ + OffsetNumber old_offnum; /* old tuple's offset */ + uint8 old_infobits_set; /* infomask bits to set on old tuple */ + uint8 flags; + TransactionId new_xmax; /* xmax of the new tuple */ + OffsetNumber new_offnum; /* new tuple's offset */ + /* + * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags + * are set, xl_heap_header and tuple data for the old tuple follow. 
+ */ +} xl_old_heap_update; + +#define SizeOfOldHeapUpdate (offsetof(xl_old_heap_update, new_offnum) + sizeof(OffsetNumber)) + /* * This is what we need to know about update|hot_update * @@ -346,6 +362,14 @@ typedef struct xl_heap_freeze_page #define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, ntuples) + sizeof(uint16)) +typedef struct xl_old_heap_visible +{ + TransactionId cutoff_xid; + uint8 flags; +} xl_old_heap_visible; + +#define SizeOfOldHeapVisible (offsetof(xl_old_heap_visible, flags) + sizeof(uint8)) + /* * This is what we need to know about setting a visibility map bit * diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index f7679ab..0c7ed90 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -86,6 +86,16 @@ typedef struct xl_btree_insert #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) +typedef struct xl_old_btree_split +{ + uint32 level; /* tree level of page being split */ + OffsetNumber firstrightoff; /* first origpage item on rightpage */ + OffsetNumber newitemoff; /* new item's offset */ + uint16 postingoff; /* offset inside orig posting tuple */ +} xl_old_btree_split; + +#define SizeOfOldBtreeSplit (offsetof(xl_old_btree_split, postingoff) + sizeof(uint16)) + /* * On insert with split, we save all the items going into the right sibling * so that we can restore it completely from the log record. 
This way takes diff --git a/src/include/access/pagehashqueue.h b/src/include/access/pagehashqueue.h new file mode 100644 index 0000000..a1904a6 --- /dev/null +++ b/src/include/access/pagehashqueue.h @@ -0,0 +1,47 @@ +#ifndef LIB_PAGE_HASH_QUEUE_H +#define LIB_PAGE_HASH_QUEUE_H +#include +#include "c.h" +#include "postgres.h" +#include "storage/buf_internals.h" +//max Page Num +#define G_QUEUE_LEN 2048 +#define PARALLEL_NUM 8 +typedef struct lsn_list_t { + XLogRecPtr lsn; + XLogRecPtr endlsn; + struct lsn_list_t *next; +} lsn_list_t; + +typedef struct page_head_list_t { + BufferTag tag; + uint32_t count; + lsn_list_t *head; + lsn_list_t *tail; +} page_head_list_t; +extern uint32_t addFileKey(BufferTag* onePage); +extern void cleanMap(void); +extern uint32_t hashMapSize(void); +extern BufferTag* QueuePushPage(void); +extern void SortPageQueue(void); +extern void InitBufferPoolHashMap(void); +extern Size PageHashQueueShmemSize(void); +extern uint32_t AssignLatchPos(void); +extern void OwnFlushLatch(uint32_t pos); +extern void WakeupOneFlushWork(uint32_t pos); +extern void ResetFlushLatch(uint32_t pos); +extern Latch* GetCurrentLatch(uint32_t pos); +extern void ProcFlushBufferToDisk(BufferTag*tag); +extern uint32_t CompletedTaskNum(void); +extern void SignalStartFlushWork(void); +extern int StartPageFlushWorker(void); +extern Size PageHashMapSize(void); +extern void PageHashQueueShmemInit(void); +Size LogindexHashAllShmemSize(void); +void InitLogindexHashBrucket(void); +void pushSlaveReplayQueue(int pageNum); +void CleanLogIndexMain(int argc, char *argv[]); + +extern XLogRecPtr *g_redoStartLsn; + +#endif diff --git a/src/include/access/pg_mirror.h b/src/include/access/pg_mirror.h new file mode 100644 index 0000000..4c1ff47 --- /dev/null +++ b/src/include/access/pg_mirror.h @@ -0,0 +1,8 @@ +#ifndef PG_MIRROR_H +#define PG_MIRROR_H +#include "c.h" +#include "catalog/pg_control.h" +extern int ArrayXlogHe3ToPg(char*sBuf,int sLen, char*dBuf,int* dLen,uint64 *startLsn,uint64 
*endLsn); +extern void readControlFile(char*pathstr); +extern void setControlFile(ControlFileData *cfile); +#endif \ No newline at end of file diff --git a/src/include/access/pthreadpool.h b/src/include/access/pthreadpool.h new file mode 100644 index 0000000..1a59227 --- /dev/null +++ b/src/include/access/pthreadpool.h @@ -0,0 +1,11 @@ +#ifndef PTHREADPOOL_H +#define PTHREADPOOL_H +#include "access/ringbuffer.h" +extern int initPthreadPool(void); +extern int WalTaskPool(wal_batch_t*data); +extern void WalTaskFree(void); +extern void WalTaskImmediateFree(void); +extern bool IsFreePthreadPool(void); + +#endif + diff --git a/src/include/access/pushpage.h b/src/include/access/pushpage.h index 69ed4ba..affc3f5 100644 --- a/src/include/access/pushpage.h +++ b/src/include/access/pushpage.h @@ -7,23 +7,31 @@ #include #include #include -#include "access/hiredis.h" #include "common/relpath.h" #include "storage/block.h" #include "catalog/pg_control.h" extern clock_t start_time; extern XLogRecPtr PushPtr; +extern XLogRecPtr CheckPointPtr; +extern XLogRecPtr FileCheckPointPtr; extern XLogRecPtr PrePushPtr; extern pid_t startupPid; extern CheckPoint GlobalCheckPoint; extern uint8 GlobalState; extern XLogRecPtr ApplyLsn; +//this for cut logindex +extern XLogRecPtr PrevPushPoint; +extern XLogRecPtr LastPushPoint; extern XLogRecPtr QueryMinLsn(XLogRecPtr lsn); -extern XLogRecPtr QueryPushLsn(); +// extern XLogRecPtr QueryPushLsn(); extern XLogRecPtr QueryPushChkpointLsn(); +extern bool ReConnectPrimaryDB(void); + + +// extern XLogRecPtr QueryReplyLsn(XLogRecPtr lsn); typedef struct DirtyPage { XLogRecPtr startlsn; @@ -59,8 +67,6 @@ extern QDataType QueuePop(); extern bool QueueEmpty(); -extern bool pushRedisList(const char*str); - extern XLogRecPtr QueueHeadEndLsn(); #endif diff --git a/src/include/access/ringbuffer.h b/src/include/access/ringbuffer.h new file mode 100644 index 0000000..737ef5c --- /dev/null +++ b/src/include/access/ringbuffer.h @@ -0,0 +1,161 @@ +#include 
+#include +#include +#include +#include "c.h" +#include "postgres.h" +#include "storage/spin.h" +#include "storage/s_lock.h" +#include "access/xlogdefs.h" +#include "utils/palloc.h" +#include "storage/buf_internals.h" + +/** + * @file + * Prototypes and structures for the ring buffer module. + */ + +#ifndef RINGBUFFER_H +#define RINGBUFFER_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define RING_BUFFER_ASSERT(x) assert(x) +/** + * Checks if the buffer_size is a power of two. + * Due to the design only RING_BUFFER_SIZE-1 items + * can be contained in the buffer. + * buffer_size must be a power of two. +*/ +#define RING_BUFFER_IS_POWER_OF_TWO(buffer_size) ((buffer_size & (buffer_size - 1)) == 0) + +/** + * The type which is used to hold the size + * and the indicies of the buffer. + */ +typedef size_t ring_buffer_size_t; + +/** + * Used as a modulo operator + * as a % b = (a & (b − 1)) + * where \c a is a positive index in the buffer and + * \c b is the (power of two) size of the buffer. + */ +#define RING_BUFFER_MASK(rb) (rb->buffer_mask) + +/** + * Simplifies the use of struct ring_buffer_t. + */ +typedef struct ring_buffer_t ring_buffer_t; +typedef enum BufferStatus{ + UNKOWNSTATUS, + STARTSTATUS, + COMPLETEDSTATUS +}BufferStatus; + +typedef struct wal_batch_t { + XLogRecPtr startLsn; + XLogRecPtr endLsn; + XLogRecPtr checkPointLsn; + int dataLen; + pg_atomic_uint32 status; + char* data; +} wal_batch_t; + +extern ring_buffer_t* gRingBufferManger; +extern const int spaceNum; +/** + * Structure which holds a ring buffer. + * The buffer contains a buffer array + * as well as metadata for the ring buffer. + */ +struct ring_buffer_t { + slock_t mutex; + /** Buffer memory. */ + wal_batch_t *buffer; + /** Buffer mask. */ + ring_buffer_size_t buffer_mask; + /** Index of tail. */ + ring_buffer_size_t tail_index; + /** Index of head. 
*/ + ring_buffer_size_t head_index; + /*push standby hash query max index*/ + int maxIdx; +}; + +Size WalReadBufferShmemSize(void); +void InitRingBufferSpace(void); +/** + * Initializes the ring buffer pointed to by buffer. + * This function can also be used to empty/reset the buffer. + * @param buffer The ring buffer to initialize. + * @param buf The buffer allocated for the ringbuffer. + * @param buf_size The size of the allocated ringbuffer. + */ +void ring_buffer_init(ring_buffer_t *buffer, wal_batch_t *buf, size_t buf_size); + +/** + * Adds a byte to a ring buffer. + * @param buffer The buffer in which the data should be placed. + * @param data The byte to place. + */ +wal_batch_t *ring_buffer_queue(ring_buffer_t *buffer, wal_batch_t data); +/** + * Returns the oldest byte in a ring buffer. + * @param buffer The buffer from which the data should be returned. + * @param data A pointer to the location at which the data should be placed. + * @return 1 if data was returned; 0 otherwise. + */ +uint8_t ring_buffer_dequeue(ring_buffer_t *buffer, wal_batch_t *data); + +/** + * Peeks a ring buffer, i.e. returns an element without removing it. + * @param buffer The buffer from which the data should be returned. + * @param data A pointer to the location at which the data should be placed. + * @param index The index to peek. + * @return 1 if data was returned; 0 otherwise. + */ +uint8_t ring_buffer_peek(ring_buffer_t *buffer, wal_batch_t **data, ring_buffer_size_t index); + + +/** + * Returns whether a ring buffer is empty. + * @param buffer The buffer for which it should be returned whether it is empty. + * @return 1 if empty; 0 otherwise. + */ +inline uint8_t ring_buffer_is_empty(ring_buffer_t *buffer) { + return (buffer->head_index == buffer->tail_index); +} + +/** + * Returns whether a ring buffer is full. + * @param buffer The buffer for which it should be returned whether it is full. + * @return 1 if full; 0 otherwise. 
+ */ +inline uint8_t ring_buffer_is_full(ring_buffer_t *buffer) { + return ((buffer->head_index - buffer->tail_index) & RING_BUFFER_MASK(buffer)) == RING_BUFFER_MASK(buffer); +} + +/** + * Returns the number of items in a ring buffer. + * @param buffer The buffer for which the number of items should be returned. + * @return The number of items in the ring buffer. + */ +inline ring_buffer_size_t ring_buffer_num_items(ring_buffer_t *buffer) { + return ((buffer->head_index - buffer->tail_index) & RING_BUFFER_MASK(buffer)); +} + +uint8_t ring_buffer_will_full(ring_buffer_t *buffer); + +uint8_t ring_buffer_dequeue_arr(ring_buffer_t *buffer, uint32 size); + +int walRecordQuery(char**buffer,int* curpos,int* maxspace,uint64 lsn); + +#ifdef __cplusplus +} +#endif + +#endif /* RINGBUFFER_H */ diff --git a/src/include/access/spgxlog.h b/src/include/access/spgxlog.h index 71acb47..1012bb3 100644 --- a/src/include/access/spgxlog.h +++ b/src/include/access/spgxlog.h @@ -39,6 +39,19 @@ typedef struct spgxlogState bool isBuild; } spgxlogState; +typedef struct spgoldxlogAddLeaf +{ + bool newPage; /* init dest page? */ + bool storesNulls; /* page is in the nulls tree? */ + OffsetNumber offnumLeaf; /* offset where leaf tuple gets placed */ + OffsetNumber offnumHeadLeaf; /* offset of head tuple in chain, if any */ + + OffsetNumber offnumParent; /* where the parent downlink is, if any */ + uint16 nodeI; + + /* new leaf tuple follows (unaligned!) */ +} spgoldxlogAddLeaf; + /* * Backup Blk 0: destination page for leaf tuple * Backup Blk 1: parent page (if any) @@ -59,6 +72,35 @@ typedef struct spgxlogAddLeaf /* new leaf tuple follows (unaligned!) */ } spgxlogAddLeaf; +typedef struct spgoldxlogMoveLeafs +{ + uint16 nMoves; /* number of tuples moved from source page */ + bool newPage; /* init dest page? */ + bool replaceDead; /* are we replacing a DEAD source tuple? */ + bool storesNulls; /* pages are in the nulls tree? 
*/ + + /* where the parent downlink is */ + OffsetNumber offnumParent; + uint16 nodeI; + + spgxlogState stateSrc; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nMoves + * array of inserted tuple numbers, length nMoves + 1 or 1 + * list of leaf tuples, length nMoves + 1 or 1 (unaligned!) + * + * Note: if replaceDead is true then there is only one inserted tuple + * number and only one leaf tuple in the data, because we are not copying + * the dead tuple from the source + *---------- + */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgoldxlogMoveLeafs; + +#define SizeOfOldSpgxlogMoveLeafs offsetof(spgoldxlogMoveLeafs, offsets) + /* * Backup Blk 0: source leaf page * Backup Blk 1: destination leaf page @@ -96,6 +138,44 @@ typedef struct spgxlogMoveLeafs #define SizeOfSpgxlogMoveLeafs offsetof(spgxlogMoveLeafs, offsets) +typedef struct spgoldxlogAddNode +{ + /* + * Offset of the original inner tuple, in the original page (on backup + * block 0). + */ + OffsetNumber offnum; + + /* + * Offset of the new tuple, on the new page (on backup block 1). Invalid, + * if we overwrote the old tuple in the original page). + */ + OffsetNumber offnumNew; + bool newPage; /* init new page? */ + + /*---- + * Where is the parent downlink? parentBlk indicates which page it's on, + * and offnumParent is the offset within the page. The possible values for + * parentBlk are: + * + * 0: parent == original page + * 1: parent == new page + * 2: parent == different page (blk ref 2) + * -1: parent not updated + *---- + */ + int8 parentBlk; + OffsetNumber offnumParent; /* offset within the parent page */ + + uint16 nodeI; + + spgxlogState stateSrc; + + /* + * updated inner tuple follows (unaligned!) 
+ */ +} spgoldxlogAddNode; + /* * Backup Blk 0: original page * Backup Blk 1: where new tuple goes, if not same place @@ -162,6 +242,42 @@ typedef struct spgxlogSplitTuple */ } spgxlogSplitTuple; +typedef struct spgoldxlogPickSplit +{ + bool isRootSplit; + + uint16 nDelete; /* n to delete from Src */ + uint16 nInsert; /* n to insert on Src and/or Dest */ + bool initSrc; /* re-init the Src page? */ + bool initDest; /* re-init the Dest page? */ + + /* where to put new inner tuple */ + OffsetNumber offnumInner; + bool initInner; /* re-init the Inner page? */ + + bool storesNulls; /* pages are in the nulls tree? */ + + /* where the parent downlink is, if any */ + bool innerIsParent; /* is parent the same as inner page? */ + OffsetNumber offnumParent; + uint16 nodeI; + + spgxlogState stateSrc; + + /*---------- + * data follows: + * array of deleted tuple numbers, length nDelete + * array of inserted tuple numbers, length nInsert + * array of page selector bytes for inserted tuples, length nInsert + * new inner tuple (unaligned!) + * list of leaf tuples, length nInsert (unaligned!) 
+ *---------- + */ + OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; +} spgoldxlogPickSplit; + +#define SizeOfOldSpgxlogPickSplit offsetof(spgoldxlogPickSplit, offsets) + /* * Buffer references in the rdata array are: * Backup Blk 0: Src page (only if not root) diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 794e7a7..4cf6435 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -31,6 +31,8 @@ extern int sync_method; extern PGDLLIMPORT TimeLineID ThisTimeLineID; /* current TLI */ +extern PGDLLIMPORT TimeLineID ThisTimeLineID2; +extern bool startup_shutdown_requested; /* * Prior to 8.4, all activity during recovery was carried out by the startup @@ -41,6 +43,8 @@ extern PGDLLIMPORT TimeLineID ThisTimeLineID; /* current TLI */ */ extern bool InRecovery; +extern XLogRecPtr localApplyLSN; + /* * Like InRecovery, standbyState is only valid in the startup process. * In all other processes it will have the value STANDBY_DISABLED (so @@ -105,6 +109,7 @@ extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; extern bool reachedConsistency; extern int group_total_len; extern int grouo_rec_count; +extern int grouo_rec_cur_count; extern XLogRecord *grouphead[XLR_MAX_BLOCK_ID + 1]; extern int grouplens[XLR_MAX_BLOCK_ID + 1]; extern XLogRecData groupRecData[XLR_MAX_BLOCK_ID + 1]; @@ -139,11 +144,14 @@ extern char *PrimarySlotName; extern bool wal_receiver_create_temp_slot; extern bool track_wal_io_timing; +extern char *he3_meta_conninfo; + /* indirectly set via GUC system */ extern TransactionId recoveryTargetXid; extern char *recovery_target_time_string; extern const char *recoveryTargetName; extern XLogRecPtr recoveryTargetLSN; +extern XLogRecPtr walsenderLsn; extern RecoveryTargetType recoveryTarget; extern char *PromoteTriggerFile; extern RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal; @@ -279,7 +287,11 @@ typedef struct CheckpointStatsData } CheckpointStatsData; extern CheckpointStatsData CheckpointStats; - +typedef struct 
walRecord_t{ + char* buf; + int count; + int cap; +} walRecord_t; /* * GetWALAvailability return codes */ @@ -296,17 +308,21 @@ typedef enum WALAvailability struct XLogRecData; extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags, + int num_fpi, char **links, RelFileNode *rel_fnode, BlockNumber *blkno); +extern XLogRecPtr He3DBXLogInsertRecord(struct XLogRecData *rdata, XLogRecPtr fpw_lsn, uint8 flags, int num_fpi); -extern bool -data_buffer_for_replay(XLogReaderState *record); - extern void XLogFlush(XLogRecPtr RecPtr); +extern void He3DBXLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); +extern bool He3DBXLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); -extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock); -extern int XLogFileOpen(XLogSegNo segno); +extern int64_t XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock); +extern int64_t XLogFileOpen(XLogSegNo segno); +extern int64_t XLogFileCreate(XLogSegNo segno); extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern XLogSegNo XLogGetLastRemovedSegno(void); @@ -334,9 +350,11 @@ extern TimestampTz GetLatestXTime(void); extern TimestampTz GetCurrentChunkReplayStartTime(void); extern void FlushNewRecoveryPoint(XLogRecPtr lsn); -extern void UpdateControlFile(); -extern void PushUpdateControlFile(); +extern void UpdateControlFile(void); +extern void PushUpdateControlFile(void); extern void PushCheckPointGuts(XLogRecPtr checkPointRedo, int flags); +extern XLogRecPtr GetXLogPushToDisk(void); +extern void SetXLogPushToDisk(XLogRecPtr pushToDiskLsn); extern uint64 GetSystemIdentifier(void); extern char *GetMockAuthenticationNonce(void); @@ -373,6 +391,14 @@ extern void XLogRequestWalReceiverReply(void); extern void assign_max_wal_size(int newval, void *extra); extern void assign_checkpoint_completion_target(double newval, void *extra); +extern void pushTikv(int onePageListLen,int 
pageNum,bool flag); +extern XLogRecData *DecodeXLogRecordAssemble(XLogReaderState *state, OldXLogRecord *record, + XLogRecPtr RedoRecPtr, bool doPageWrites, + XLogRecPtr *fpw_lsn, int *num_fpi); +extern XLogRecPtr producerHe3dbXLog(XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags, + int num_fpi,XLogRecPtr filelsn); /* * Routines to start, stop, and get status of a base backup. @@ -403,6 +429,8 @@ extern XLogRecPtr do_pg_stop_backup(char *labelfile, bool waitforarchive, extern void do_pg_abort_backup(int code, Datum arg); extern void register_persistent_abort_backup_handler(void); extern SessionBackupState get_backup_status(void); +extern void pushXlogToTikv(char*data,int len); +extern void He3DBGetWalWriteStats(XLogRecPtr *writtenlsn, XLogRecPtr *flushlsn, uint64 *totaltimes, int *parallels); /* File path names (all relative to $PGDATA) */ #define RECOVERY_SIGNAL_FILE "recovery.signal" diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index dcf41e9..82dc955 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -332,5 +332,6 @@ extern bool ArchiveRecoveryRequested; extern bool InArchiveRecovery; extern bool StandbyMode; extern char *recoveryRestoreCommand; +extern bool IsPrivatePgControl; #endif /* XLOG_INTERNAL_H */ diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index d6e69a2..1990d46 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -34,16 +34,19 @@ #ifndef XLOGREADER_H #define XLOGREADER_H +#include "postgres.h" + #ifndef FRONTEND #include "access/transam.h" #endif #include "access/xlogrecord.h" #include "storage/buf.h" +#include "utils/guc.h" /* WALOpenSegment represents a WAL segment being read. 
*/ typedef struct WALOpenSegment { - int ws_file; /* segment file descriptor */ + int64_t ws_file; /* segment file descriptor */ XLogSegNo ws_segno; /* segment number */ TimeLineID ws_tli; /* timeline ID of the currently open file */ } WALOpenSegment; @@ -63,6 +66,10 @@ typedef int (*XLogPageReadCB) (XLogReaderState *xlogreader, int reqLen, XLogRecPtr targetRecPtr, char *readBuf); +typedef int (*XLogBatchReadCB) (XLogReaderState *xlogreader, + XLogRecPtr startPtr, + int reqLen, + char *readBuf); typedef void (*WALSegmentOpenCB) (XLogReaderState *xlogreader, XLogSegNo nextSegNo, TimeLineID *tli_p); @@ -90,7 +97,9 @@ typedef struct XLogReaderRoutine * The callback shall set ->seg.ws_tli to the TLI of the file the page was * read from. */ - XLogPageReadCB page_read; + XLogPageReadCB page_read; + + XLogBatchReadCB batch_read; /* * Callback to open the specified WAL segment for reading. ->seg.ws_file @@ -211,16 +220,18 @@ struct XLogReaderState uint32 readLen; /* last read XLOG position for data currently in readBuf */ - WALSegmentContext segcxt; - WALOpenSegment seg; - uint32 segoff; + uint32 bufoff; + /* last read XLOG position for data currently in readBuf */ + WALSegmentContext segcxt; + WALOpenSegment seg; + uint32 segoff; /* * beginning of prior page read, and its TLI. Doesn't necessarily * correspond to what's in readBuf; used for timeline sanity checks. */ - XLogRecPtr latestPagePtr; - TimeLineID latestPageTLI; + XLogRecPtr latestPagePtr; + TimeLineID latestPageTLI; /* beginning of the WAL record being read. 
*/ XLogRecPtr currRecPtr; @@ -266,6 +277,9 @@ struct XLogReaderState void *tag; Buffer buffer; bool isreplay; + bool streamStart; + bool insertTikv; + bool localWalComplete; }; /* Get a new XLogReader */ @@ -288,12 +302,20 @@ extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr); extern struct XLogRecord *XLogReadRecord(XLogReaderState *state, char **errormsg); +extern struct XLogRecord *He3DBXLogReadRecord(XLogReaderState *state, + char **errormsg); + extern struct XLogRecord *He3DBXLogListReadRecord(XLogReaderState *state, char **errormsg, char *pageXlogBuf); +extern struct XLogRecord *StartupXLogReadRecord(XLogReaderState *state, char **errormsg); + /* Validate a page */ extern bool XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr, char *phdr); + +extern bool He3DBValidXLogRecord( XLogRecord *record); + /* * Error information from WALRead that both backend and frontend caller can * process. Currently only errors from pg_pread can be reported. 
@@ -311,6 +333,9 @@ extern bool WALRead(XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count, TimeLineID tli, WALReadError *errinfo); +extern int He3DBWALRead(XLogReaderState *state, + XLogRecPtr startptr, int count, char *buf); + /* Functions for decoding an XLogRecord */ extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 33d280f..90ee976 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -17,6 +17,21 @@ #include "storage/block.h" #include "storage/relfilenode.h" +typedef struct OldXLogRecord +{ + uint32 xl_tot_len; /* total len of entire record */ + TransactionId xl_xid; /* xact id */ + XLogRecPtr xl_prev; /* ptr to previous record in log */ + uint8 xl_info; /* flag bits, see below */ + RmgrId xl_rmid; /* resource manager for this record */ + /* 2 bytes of padding here, initialize to zero */ + pg_crc32c xl_crc; /* CRC for this record */ + + /* XLogRecordBlockHeaders and XLogRecordDataHeader follow, no padding */ + +} OldXLogRecord; +#define SizeOfOldXLogRecord (offsetof(OldXLogRecord, xl_crc) + sizeof(pg_crc32c)) + /* * The overall layout of an XLOG record is: * Fixed-size header (XLogRecord struct) diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 19354c8..ecbf3c2 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -13,6 +13,7 @@ #include "access/xlogreader.h" #include "storage/bufmgr.h" +#include "access/xlogutils.h" extern bool XLogHaveInvalidPages(void); @@ -59,6 +60,10 @@ extern void FreeFakeRelcacheEntry(Relation fakerel); extern int read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *cur_page); +extern int read_local_xlog_batch(XLogReaderState *state, + XLogRecPtr startRecPtr, + int reqLen, + char *cur_page); extern void wal_segment_open(XLogReaderState *state, XLogSegNo 
nextSegNo, TimeLineID *tli_p); diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index a107ba0..3b5e747 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -230,6 +230,9 @@ typedef struct ControlFileData /* CRC of all above ... MUST BE LAST! */ pg_crc32c crc; + + /* he3mirror: last check point record ptr for pg_wal file, do not need crc check */ + XLogRecPtr checkPointFile; } ControlFileData; /* diff --git a/src/include/catalog/pg_hot_data.h b/src/include/catalog/pg_hot_data.h deleted file mode 100644 index 0f22a32..0000000 --- a/src/include/catalog/pg_hot_data.h +++ /dev/null @@ -1,66 +0,0 @@ -/*------------------------------------------------------------------------- - * - * pg_hot_data.h - * definition of the "hot_data" system catalog (pg_hot_data) - * - * - * Portions Copyright (c) 2022, He3DB Global Development Group - * - * src/include/catalog/pg_hot_data.h - * - * NOTES - * The Catalog.pm module reads this file and derives schema - * information. - * - *------------------------------------------------------------------------- - */ -#ifndef PG_HOT_DATA_H -#define PG_HOT_DATA_H - -#include "catalog/genbki.h" -#include "catalog/pg_hot_data_d.h" - -/* ---------------- - * pg_hot_data definition. 
cpp turns this into - * typedef struct FormData_pg_hot_data - * ---------------- - */ -CATALOG(pg_hot_data,4790,HotDataRelationId) BKI_SHARED_RELATION BKI_ROWTYPE_OID(4793,HotDataRelation_Rowtype_Id) BKI_SCHEMA_MACRO -{ - /* database name */ - NameData datname; - - /* relation name */ - NameData relname; - - /* caching rules */ - char crules; - - /* client name */ - NameData clientname; - - /* client addr */ - NameData clientaddr; - -#ifdef CATALOG_VARLEN /* variable-length fields start here */ - /* cache rules schedule time */ - timestamptz crulessettime; - - /* hot data cache time */ - timestamptz cachetime; -#endif -}FormData_pg_hot_data; - -/* ---------------- - * Form_pg_hot_data corresponds to a pointer to a tuple with - * the format of pg_hot_data relation. - * ---------------- - */ -typedef FormData_pg_hot_data *Form_pg_hot_data; - -DECLARE_UNIQUE_INDEX(pg_hot_data_datname_relname_index, 4791, on pg_hot_data using btree(datname name_ops, relname name_ops)); -#define HotDataDatnameRelnameIndexId 4791 - -extern void PrecacheHotData(); - -#endif \ No newline at end of file diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 79669bf..cab0d06 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5582,6 +5582,22 @@ proargnames => '{wal_records,wal_fpi,wal_bytes,wal_buffers_full,wal_write,wal_sync,wal_write_time,wal_sync_time,stats_reset}', prosrc => 'pg_stat_get_wal' }, +{ oid => '6206', descr => 'statistics: information about He3DB WAL Write', + proname => 'pg_stat_get_he3walwrite', proisstrict => 'f', provolatile => 's', + proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{pg_lsn,pg_lsn,int8,int4}', + proargmodes => '{o,o,o,o}', + proargnames => '{write_lsn,flush_lsn,writekv_totaltimes,writekv_parallels}', + prosrc => 'pg_stat_get_he3walwrite' }, + +{ oid => '6207', descr => 'statistics: information about He3DB LogIndex', + proname => 'pg_stat_get_he3_logindex', 
proisstrict => 'f', provolatile => 's', + proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{int8,int8,int8,int8,int8}', + proargmodes => '{o,o,o,o,o}', + proargnames => '{memtable_total,memtable_used,memtable_start_index,memtable_active_index,page_total}', + prosrc => 'pg_stat_get_he3_logindex' }, + { oid => '2306', descr => 'statistics: information about SLRU caches', proname => 'pg_stat_get_slru', prorows => '100', proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r', diff --git a/src/include/grpc-c/grpc-c.h b/src/include/grpc-c/grpc-c.h deleted file mode 100644 index ee41e12..0000000 --- a/src/include/grpc-c/grpc-c.h +++ /dev/null @@ -1,523 +0,0 @@ -/* - * - * BSD 3-Clause License - * - * Copyright (c) 2016, lixiangyun - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - */ -#ifndef __GRPC_C_H__ -#define __GRPC_C_H__ - -#include -#include -#include -#include -#include -#include -#include - -#ifdef __cplusplus -#if __cplusplus -extern "C"{ -#endif -#endif /* __cplusplus */ - -#define GRPC_C_DAEMON_SOCK_PATH "unix:/var/run/grpc_c_" - -#define GRPC_C_BUFSIZ 1024 - -/* - * Tracing levels and related functions - */ -#define GRPC_C_TRACE_TCP (1 << 0) -#define GRPC_C_TRACE_CHANNEL (1 << 1) -#define GRPC_C_TRACE_SURFACE (1 << 2) -#define GRPC_C_TRACE_HTTP (1 << 3) -#define GRPC_C_TRACE_FLOWCTL (1 << 4) -#define GRPC_C_TRACE_BATCH (1 << 5) -#define GRPC_C_TRACE_CONNECTIVITY_STATE (1 << 6) -#define GRPC_C_TRACE_SECURE_ENDPOINT (1 << 7) -#define GRPC_C_TRACE_TRANSPORT_SECURITY (1 << 8) -#define GRPC_C_TRACE_ROUND_ROBIN (1 << 9) -#define GRPC_C_TRACE_HTTP_WRITE_STATE (1 << 10) -#define GRPC_C_TRACE_API (1 << 11) -#define GRPC_C_TRACE_CHANNEL_STACK_BUILDER (1 << 12) -#define GRPC_C_TRACE_HTTP1 (1 << 13) -#define GRPC_C_TRACE_COMPRESSION (1 << 14) -#define GRPC_C_TRACE_QUEUE_PLUCK (1 << 15) -#define GRPC_C_TRACE_QUEUE_TIMEOUT (1 << 16) -#define GRPC_C_TRACE_OP_FAILURE (1 << 17) -#define GRPC_C_TRACE_ALL (~0) - -void grpc_c_trace_enable(int flags, int severity); -void grpc_c_trace_disable(int flags); - - -/* - * Write return status. 
This should eventually become an enum and - * writer->write should return that type instead of int - */ -#define GRPC_C_WRITE_OK 0 -#define GRPC_C_WRITE_FAIL 1 -#define GRPC_C_WRITE_PENDING 2 - -/* - * GRPC-C return status codes - */ -#define GRPC_C_OK 0 -#define GRPC_C_ERR_FAIL 1 -#define GRPC_C_ERR_TMOUT 2 -#define GRPC_C_ERR_NOMEM 3 -#define GRPC_C_ERR_NORECV 4 - -/* - * List of all the possible states for grpc_c client and server - */ -typedef enum grpc_c_state_s { - GRPC_C_STATE_NEW, /* Waiting to call */ - GRPC_C_STATE_RUN, /* Called RPC handler */ - GRPC_C_STATE_DONE, /* Finished */ - GRPC_C_STATE_FREE, /* Context is free */ -} grpc_c_state_t; - - -/* - * Types of events that we use with tag when batching gRPC operations - */ -typedef enum grpc_c_event_type_s { - GRPC_C_EVENT_SEND_METADATA, - GRPC_C_EVENT_RECV_METADATA, - GRPC_C_EVENT_READ, - GRPC_C_EVENT_WRITE, - GRPC_C_EVENT_WRITE_DONE, - GRPC_C_EVENT_FINISH, - GRPC_C_EVENT_RECV_CLOSE, - GRPC_C_EVENT_SERVER_SHUTDOWN, - GRPC_C_EVENT_SERVER_REGISTER, - GRPC_C_EVENT_CLIENT_CONNECT, -} grpc_c_event_type_t; - - -typedef struct grpc_c_list_s { - struct grpc_c_list_s * next; - struct grpc_c_list_s * prev; -}grpc_c_list_t; - - -/* - * Structure definition for return status of RPC - */ -typedef struct grpc_c_status_s { - int code; - char message[GRPC_C_BUFSIZ]; -} grpc_c_status_t; - - -/* - * Forward declarations - */ -typedef struct grpc_c_thread_pool_s grpc_c_thread_pool_t; - -typedef struct grpc_c_event_s grpc_c_event_t; -typedef struct grpc_c_server_s grpc_c_server_t; -typedef struct grpc_c_client_s grpc_c_client_t; -typedef struct grpc_c_context_s grpc_c_context_t; -typedef struct grpc_c_method_funcs_s grpc_c_method_funcs_t; -typedef struct grpc_c_method_s grpc_c_method_t; - -typedef grpc_metadata_array grpc_c_metadata_array_t; - -typedef void (*grpc_c_event_callback_t)(grpc_c_event_t *event, int success); - -typedef size_t (*grpc_c_method_data_pack_t)(void *input, grpc_byte_buffer **buffer); - -typedef void 
*(*grpc_c_method_data_unpack_t)(grpc_c_context_t *context, grpc_byte_buffer *input); - -/* - * Structure definition for method functions - */ -struct grpc_c_method_funcs_s { - grpc_c_method_data_pack_t input_packer; /* Input packer */ - grpc_c_method_data_unpack_t input_unpacker; /* Input unpacker */ - grpc_c_method_data_pack_t output_packer; /* Output packer */ - grpc_c_method_data_unpack_t output_unpacker; /* Output unpacker */ -}; - -/* - * Event structure to be used as tag when batching gRPC operations - */ -struct grpc_c_event_s { - grpc_c_event_type_t type; /* Type of this event */ - void * data; /* Data associated with this event */ - grpc_c_event_callback_t callback; -}; - -/* - * Signature for client callback - */ -typedef void (* grpc_c_client_callback_t)(grpc_c_context_t *context, void * tag, int success); - -/* - * Service implementation - */ -typedef void (* grpc_c_service_callback_t)(grpc_c_context_t *context); - - -/* - * Definition for RPC method structure - */ -struct grpc_c_method_s { - grpc_c_list_t list; - void * method_tag; /* Tag returned by grpc_server_register_method() */ - char * method_url; /* URL for this RPC */ - int client_streaming; /* Flag to indicate if client is streaming */ - int server_streaming; /* Flag to indicate if server is streaming */ - void * handler; - grpc_c_method_funcs_t * funcs; -}; - -typedef struct grpc_c_stream_write_s { - grpc_c_event_t event; - int stream; - int count; - int write_done; - int write_wait; - gpr_mu lock; - gpr_cv cv; - int write_result; - grpc_byte_buffer * payload; -}grpc_c_stream_write_t; - - -typedef struct grpc_c_stream_read_s { - grpc_c_event_t event; - int stream; - int count; - int read_wait; - gpr_mu lock; - gpr_cv cv; - uint32_t flags; - grpc_byte_buffer * payload; -}grpc_c_stream_read_t; - - -typedef struct grpc_c_stream_status_s { - int is_client; - grpc_c_event_t event; - int result; - gpr_cv cv; - gpr_mu lock; - grpc_c_metadata_array_t trailing_metadata; - grpc_status_code status_code; 
- grpc_slice status_details; -}grpc_c_stream_status_t; - -typedef struct grpc_c_initial_metadata_s { - int is_send; - int done_once; - grpc_c_metadata_array_t metadata; - grpc_c_event_t event; - int result; - gpr_mu lock; -}grpc_c_initial_metadata_t; - - -typedef struct grpc_c_recv_close_s { - int client_cancel; /* Boolean indicating if client has cancelled the call */ - grpc_c_event_t event; /* Recv close grpc-c event in case of server context */ - gpr_mu lock; - gpr_cv cv; - int result; -}grpc_c_recv_close_t; - - -/* - * Structure definition for grpc_c client - */ -struct grpc_c_client_s { - grpc_channel * channel; /* Underlying grpc channel to host */ - grpc_completion_queue *queue; /* Completion queue associated with this client */ - - gpr_slice host; /* Hostname of remote providing RPC service */ - grpc_c_state_t state; /* Channel connectivity state */ - grpc_c_thread_pool_t * thread_pool; - - int connect_status; /* Connection status */ - grpc_c_event_t connect_event; - int timeout; /* Connection timeout flag */ - - gpr_mu lock; /* Mutex lock */ - gpr_cv shutdown_cv; /* Shutdown condition variable */ - int shutdown; /* Client shutdown flag */ -}; - -/* - * Structure definition for grpc-c context - */ -struct grpc_c_context_s { - char * method_url; - gpr_timespec deadline; /* Deadline for operations in this context */ - - grpc_c_state_t state; /* Current state of client/server */ - grpc_call * call; /* grpc_call for this RPC */ - gpr_mu lock; /* Mutex for access to this cq */ - gpr_cv shutdown; - grpc_c_method_funcs_t *method_funcs; /* Pointer to method functions like input/output packer,unpacker, free and method callbacks */ - - grpc_c_stream_status_t *status; - grpc_c_stream_read_t *reader; - grpc_c_stream_write_t *writer; - - grpc_c_initial_metadata_t * send_init_metadata; - grpc_c_initial_metadata_t * recv_init_metadata; - - int is_client; - - union grpc_c_ctx_data { - struct grpc_c_context_client_s { - grpc_c_client_t *client_t; - void *tag; - 
grpc_c_client_callback_t callback; /* Client callback */ - }client; - struct grpc_c_context_server_s { - grpc_c_method_t *method; - grpc_c_event_t event; /* grpc-c event this context belongs to */ - grpc_c_server_t *server_t; - grpc_c_service_callback_t callback; /* RPC handler */ - }server; - } type; - - grpc_c_list_t list; /* List of context objects */ -}; - - -/* - * User provided memory alloc and free functions - */ -typedef void *(* grpc_c_memory_alloc_func_t)(grpc_c_context_t *context, size_t size); - -typedef void (* grpc_c_memory_free_func_t)(grpc_c_context_t *context, void *data); - -void grpc_c_set_memory_function(grpc_c_memory_alloc_func_t , grpc_c_memory_free_func_t ); - -void * grpc_malloc(size_t size); - -void grpc_free(void *data); - -void * grpc_realloc(void * ptr,size_t size); - -ProtobufCAllocator * grpc_c_get_protobuf_c_allocator (grpc_c_context_t *context, ProtobufCAllocator *allocator); - -/* - * Server structure definition - */ -struct grpc_c_server_s { - char * hostname; /* Server hostname */ - grpc_server * server; /* Grpc server */ - - grpc_completion_queue * queue; /* Server completion queue */ - grpc_c_thread_pool_t * thread_pool; - - grpc_c_list_t method_list_head; - grpc_c_list_t contexts_list_head; - - gpr_mu lock; /* Mutex lock */ - - gpr_cv shutdown_cv; /* Shutdown condition variable */ - int shutdown; /* Server shutting down */ - - grpc_c_event_t shutdown_event; /* Event signalling server shutdown */ -}; - - -/* - * Initialize libgrpc-c to be used with given underlying libgrpc. Second - * parameter is used to pass data to underlying library if it needs any - */ -int grpc_c_init(void); - -/* - * Shutsdown initialized grpc-c library. To be called towards end of program - */ -int grpc_c_shutdown(void); - -/* - * Control log output level. 
- */ -void grpc_c_log_output_level(int level); - -/* - * User using Interface - */ -int grpc_c_read(grpc_c_context_t *context, void **content, uint32_t flags, long timeout); - -int grpc_c_write(grpc_c_context_t *context, void *output, uint32_t flags, long timeout); - -int grpc_c_write_done(grpc_c_context_t *context, uint32_t flags, long timeout); - -int grpc_c_finish(grpc_c_context_t *context, grpc_c_status_t *status, uint32_t flags); - -/* - * Initialize a client with client_id and server address - */ -grpc_c_client_t * grpc_c_client_init( const char *address, - grpc_channel_credentials *channel_creds, - grpc_channel_args *channel_args); - -/* - * Stop client. - */ -int grpc_c_client_stop(grpc_c_client_t *client); - -/* - * Waits for all callbacks to get done in a threaded client - */ -void grpc_c_client_wait (grpc_c_client_t *client); - -/* - * Destroy and free client object - */ -void grpc_c_client_free (grpc_c_client_t *client); - -/* - * Main function for sync nostreaming RPC call from client - */ -int grpc_c_client_request_sync( grpc_c_client_t *client, - grpc_c_metadata_array_t *array, uint32_t flags, - const char *method_url, - void *input, void **output, - grpc_c_status_t *status, - grpc_c_method_funcs_t * funcs, - long timeout); - -/* - * Main function for async nostreaming RPC call from client - */ -int grpc_c_client_request_async( grpc_c_client_t *client, - grpc_c_metadata_array_t *mdarray, uint32_t flags, - const char *method_url, - void *input, - grpc_c_client_callback_t *cb, void *tag, - grpc_c_method_funcs_t * funcs, - long timeout); - -/* - * Main function for streaming RPC call from client - */ -int grpc_c_client_request_stream( grpc_c_client_t *client, - grpc_c_metadata_array_t *mdarray, uint32_t flags, - const char *method_url, - grpc_c_context_t **context, - int client_streaming, int server_streaming, - grpc_c_method_funcs_t * funcs, - long timeout); - -/* - * Create a server object with given tcp/ip address - */ -grpc_c_server_t * 
grpc_c_server_create( const char *addr, grpc_server_credentials *creds, grpc_channel_args *args); - - -/* - * Start server - */ -int grpc_c_server_start(grpc_c_server_t *server); - -/* - * Makes a threaded server block - */ -void grpc_c_server_wait(grpc_c_server_t *server); - -/* - * stop server - */ -int grpc_c_server_stop(grpc_c_server_t *server); - -/* - * free grpc-c server - */ -void grpc_c_server_destroy(grpc_c_server_t *server); - - -/* - * Register a method along with corresponding method functions - */ -int grpc_c_register_method( grpc_c_server_t *server, const char *method_url, - int client_streaming, int server_streaming, - grpc_c_service_callback_t handler, - grpc_c_method_funcs_t * funcs); - -/* - * Initialize a metadata array - */ -void grpc_c_metadata_array_init(grpc_c_metadata_array_t *array); - -/* - * Destroy a metadata array - */ -void grpc_c_metadata_array_destroy(grpc_c_metadata_array_t *array); - -/* - * Insert provided key value pair to given metadata array and storage list - */ -int grpc_c_metadata_array_add(grpc_c_metadata_array_t *mdarray, - const char *key, const char *value); - - -/* - * Extract the value from initial metadata by key. Return NULL if not found - */ -int grpc_c_get_initial_metadata_by_key(grpc_c_context_t *context, const char *key, char *value, size_t len); - -/* - * Extract the value from trailing metadata by key. Return NULL if not found - */ -int grpc_c_get_trailing_metadata_by_key(grpc_c_context_t *context, const char *key, char *value, size_t len); - -/* - * Adds given key value pair to initial metadata array of given context. - * Returns 0 on success and 1 on failure - */ -int grpc_c_add_initial_metadata(grpc_c_context_t *context, const char *key, const char *value); - -/* - * Adds given key value pair to trailing metadata array of given context. 
- * Returns 0 on success and 1 on failure - */ -int grpc_c_add_trailing_metadata(grpc_c_context_t *context, const char *key, const char *value); - - -#ifdef __cplusplus -#if __cplusplus -} -#endif -#endif /* __cplusplus */ - - -#endif /* GRPC_C_GRPC_C_H */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 35f4cd2..2f948df 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -165,6 +165,7 @@ extern PGDLLIMPORT bool IsPostmasterEnvironment; extern PGDLLIMPORT bool IsUnderPostmaster; extern PGDLLIMPORT bool IsBackgroundWorker; extern PGDLLIMPORT bool IsBinaryUpgrade; +extern PGDLLIMPORT bool IsParallelFlushWorker; extern PGDLLIMPORT bool ExitOnAnyError; @@ -175,6 +176,7 @@ extern PGDLLIMPORT int NBuffers; extern PGDLLIMPORT int MaxBackends; extern PGDLLIMPORT int MaxConnections; extern PGDLLIMPORT int max_worker_processes; +extern PGDLLIMPORT int max_parallel_flush_process; extern PGDLLIMPORT int max_parallel_workers; extern PGDLLIMPORT int MyProcPid; @@ -336,6 +338,9 @@ typedef enum BackendType B_ARCHIVER, B_STATS_COLLECTOR, B_LOGGER, + B_PARALLEL_FLUSH, + B_CLEAN_LOGINDEX, + B_SECONDBUFFER, } BackendType; extern BackendType MyBackendType; @@ -436,6 +441,8 @@ typedef enum WalWriterProcess, WalReceiverProcess, + CleanLogIndexProcess, + SecondBufferProcess, NUM_AUXPROCTYPES /* Must be last! 
*/ } AuxProcType; diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index c430b1b..32da7c2 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -41,5 +41,6 @@ extern Size CheckpointerShmemSize(void); extern void CheckpointerShmemInit(void); extern bool FirstCallSinceLastCheckpoint(void); +extern pid_t He3DBQueryCkpPid(void); #endif /* _BGWRITER_H */ diff --git a/src/include/postmaster/secondbuffer.h b/src/include/postmaster/secondbuffer.h new file mode 100644 index 0000000..8230622 --- /dev/null +++ b/src/include/postmaster/secondbuffer.h @@ -0,0 +1,137 @@ +#include "postgres.h" +#include "utils/hfs.h" + +#include + + +#define MAXREADERS 512 +#define MAPSIE (uint64)1<<40 +#define DEFAULTPAGEPATH "/tmp/pagedb" +#define DEFAULTWALPATH "/tmp/waldb" +#define PAGE 1 +#define WAL 2 +#define BLKSZ 8192 + +#define DROP 1 +#define TRUNCATE 2 +#define EVICT 3 +#define SDLEN 1024 +#define SDNUM 128 + +#define SecondBufferTableHashPartition(hashcode) \ + ((hashcode) % NUM_LOCK_PARTITIONS) +#define SecondBufferMappingPartitionLock(hashcode) \ + (&SecondBufferMainLWLockArray[SecondBufferTableHashPartition(hashcode)].lock) + +extern char *lmdb_page_directory; +extern char *lmdb_wal_directory; +extern Size SNBuffers; + +/* +for secondbufferhash code +*/ +typedef struct SdPageKey +{ + uint32 dbid; + uint32 relid; + uint32 forkno; + uint32 blkno; +} SdPageKey; + +typedef struct SdPageKeyEntity +{ + SdPageKey spk; + struct SdPageKeyEntity *next; +} SdPageKeyEntity; + +typedef struct SdPageKeyList +{ + SdPageKeyEntity *head; + SdPageKeyEntity *tail; +} SdPageKeyList; + +typedef struct LdPageKey +{ + SdPageKey sk; +} LdPageKey; + +typedef struct WalLdPageKey +{ + SdPageKey sk; + uint64 pageLsn; + uint8 partition; +} WalLdPageKey; + +typedef struct OriginDPageKey +{ + PageKey pk; + int opration; +} OriginDPageKey; + +typedef struct SdPageValue +{ + SdPageKey pk; + bool canDelete; + uint8 pagecontent[BLKSZ]; +} 
SdPageValue; + + + +typedef struct DPageKey +{ + PageKey pk; + bool pagedeleted; + uint8_t operation; +} DPageKey; + + + +typedef struct kvStruct { + LdPageKey lpk; + uint8_t *buf; + int32 length; + uint64_t lsn; +} kvStruct; + +//extern SingleKeyArray *MultiKeyArrays; + +extern MDB_env *pageEnv; +extern MDB_env *walEnv; + +extern MDB_dbi pageDbi; +extern MDB_dbi walDbi; + +extern MDB_txn *pageTxn; +extern MDB_txn *walTxn; +extern MDB_cursor *cursor; + +// MDB_stat mst; +// MDB_cursor_op op; + +extern void InitSecondBufferMeta(void); +extern void InitSecondBufferHash(void); + +extern void InitDPageKeyArray(void); + +extern void InitPageDBEnv(void); +extern void InitWalDBEnv(void); + +extern void storeWalInLocalBuffer(kvStruct *ks,int32 length); +extern void ReceivePageFromDataBuffer(PageKey *pk, uint8_t *buffer); // when evict one page out databuffer, we should call this to store the page. +extern void GetPageFromCurrentNode(PageKey pk,Bufrd *bufrd); // async delete old version page and wal. we should call this when move page from ld/sdb to db. 
+extern Bufrd GetWalFromLd(PageKey *pk); +extern Bufrd GetWalFromLocalBuffer(WalLdPageKey *pk, uint64_t replyLsn); +extern void AddOneItemToDPArray(OriginDPageKey pk); +extern void SecondBufferMain(void); +extern void ClosePageDBEnv(void); +extern void CloseWalDBEnv(void); + +extern void CreateSecondBufferLWLocks(void); +extern Size SecondBufferLWLockShmemSize(void); +extern Size SecondBufferShmemSize(void); +extern uint64_t SwapLsnFromLittleToBig(uint64_t lsn); +extern uint64_t SwapLsnFromBigToLittle(uint64_t lsn); +extern void SendInvalWal(WalLdPageKey *walkey); +extern void SendInvalPage(LdPageKey *ldKey); +extern void *CleanWalsInLmdb(void *arg); +extern void *CleanPagesInLmdb(void *arg); diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h index bf6adf1..4447adc 100644 --- a/src/include/postmaster/startup.h +++ b/src/include/postmaster/startup.h @@ -18,5 +18,5 @@ extern void PreRestoreCommand(void); extern void PostRestoreCommand(void); extern bool IsPromoteSignaled(void); extern void ResetPromoteSignaled(void); - +extern bool ProcHasReleaseFlag(void); #endif /* _STARTUP_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index a44fcc0..afc64fd 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -55,6 +55,7 @@ * Note: BM_TAG_VALID essentially means that there is a buffer hashtable * entry associated with the buffer's tag. */ + #define BM_LOCKED (1U << 22) /* buffer header is locked */ #define BM_DIRTY (1U << 23) /* data needs writing */ #define BM_VALID (1U << 24) /* data is valid */ @@ -132,6 +133,11 @@ typedef struct buftag #define BufMappingPartitionLockByIndex(i) \ (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock) +#define LOGIndexPartitionLock(hashcode) \ + (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \ + BufTableHashPartition(hashcode)].lock) + + /* * BufferDesc -- shared descriptor/state data for a single shared buffer. 
* @@ -184,12 +190,15 @@ typedef struct BufferDesc BufferTag tag; /* ID of page contained in buffer */ int buf_id; /* buffer's index number (from 0) */ + bool isPreCacheEscape; /* escape from clock algorithm */ + /* state of the tag, containing flags, refcount and usagecount */ pg_atomic_uint32 state; int wait_backend_pid; /* backend PID of pin-count waiter */ int freeNext; /* link in freelist chain */ LWLock content_lock; /* to lock access to buffer contents */ + bool pageIsVaild; } BufferDesc; extern BufferDesc **bulk_io_in_progress_buf; diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 0bd0d6f..3bf69ed 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -79,8 +79,15 @@ extern int bgwriter_flush_after; extern bool bulk_io_is_in_progress; extern int bulk_io_in_progress_count; -extern bool isPreCache; +extern bool isPreCacheTable; +extern bool isPreCacheIndex; +extern bool isPreCacheIndexDone; extern bool needPreCacheEscape; +extern bool needUnpreCacheEscape; +extern bool isPreCacheAction; +extern Oid preCacheNodeOid; +extern uint16 *preCacheNodesCountPtr; +extern Oid *preCacheNodesPtr; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -254,7 +261,7 @@ extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation); /* in freelist.c */ extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); extern void FreeAccessStrategy(BufferAccessStrategy strategy); - +extern bool PinBufferForPush(void *buf, BufferAccessStrategy strategy); /* inline functions */ @@ -305,4 +312,8 @@ TestForOldSnapshot(Snapshot snapshot, Relation relation, Page page) */ #define PAGEXLOG_BLCKSZ 49152 +/* Max preCacheNodes */ +#define NPreCacheNodes 128 + +extern bool *isPromoteIsTriggered; #endif /* BUFMGR_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 1595e29..c82e76f 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -86,21 +86,23 @@ extern int 
max_safe_fds; /* Operations on virtual Files --- equivalent to Unix kernel file ops */ extern File PathNameOpenFile(const char *fileName, int fileFlags); -extern File He3DBPathNameOpenFile(const char *fileName, int fileFlags); +//extern File He3DBPathNameOpenFile(const char *fileName, int fileFlags); extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); -extern File He3DBPathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); +//extern File He3DBPathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); extern File OpenTemporaryFile(bool interXact); extern void FileClose(File file); +//extern void He3DBFileClose(File file); extern int FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info); extern int FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); -extern int He3DBFileRead(File file, char **buffer, off_t offset, uint32 wait_event_info, XLogRecPtr lsn); +extern int MasterFileRead(char *buffer, uint32_t dbid, uint32_t relid, uint32_t forkno, uint32_t blockno); +// extern int He3DBFileRead(File file, char **buffer, off_t offset, uint32 wait_event_info, XLogRecPtr lsn, BufferTag pageTag); extern int FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); extern int He3DBFileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info); extern int FileSync(File file, uint32 wait_event_info); extern off_t FileSize(File file); -extern off_t He3DBFileSize(File file); +//extern off_t He3DBFileSize(File file); extern int FileTruncate(File file, off_t offset, uint32 wait_event_info); -extern int He3FileTruncate(File file, off_t offset, uint32 wait_event_info,bool isTemp); +//extern int He3FileTruncate(File file, off_t offset, uint32 wait_event_info,bool isTemp); extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info); extern char *FilePathName(File file); extern int 
FileGetRawDesc(File file); @@ -137,9 +139,9 @@ extern int CloseTransientFile(int fd); /* If you've really really gotta have a plain kernel FD, use this */ extern int BasicOpenFile(const char *fileName, int fileFlags); -extern int He3DBBasicOpenFile(const char *fileName, int fileFlags); +extern int64_t He3DBBasicOpenFile(const char *fileName, int fileFlags); extern int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); -extern int He3DBBasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); +//extern int64_t He3DBBasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); /* Use these for other cases, and also for long-lived BasicOpenFile FDs */ extern bool AcquireExternalFD(void); @@ -184,8 +186,9 @@ extern void SyncDataDirectory(void); extern int data_sync_elevel(int elevel); /* He3DB: He3FS */ -extern ssize_t he3fs_pread(int fd, void **buf, off_t offset, XLogRecPtr lsn, uint16 type); -extern ssize_t he3fs_pwrite(int fd, const void *buf, size_t size, off_t offset); +//extern ssize_t he3fs_pread(int64_t fd, void **buf, off_t offset, XLogRecPtr lsn, uint16 type, uint32_t dbid, uint32_t relid, uint32_t segno, uint32_t forkno);// +//extern ssize_t he3fs_pwrite(int64_t fd, const void *buf, size_t size, off_t offset); +//extern ssize_t he3fs_xlogread(int64_t fd, void *buf, off_t Offset, size_t size); /* Filename components */ #define PG_TEMP_FILES_DIR "pgsql_tmp" diff --git a/src/include/storage/filecache.h b/src/include/storage/filecache.h new file mode 100644 index 0000000..35a73c4 --- /dev/null +++ b/src/include/storage/filecache.h @@ -0,0 +1,19 @@ +#include "c.h" +#include "storage/block.h" +#include "storage/relfilenode.h" + +#define MAX_CACHE_RELATION 8192 + +typedef struct CachedRelInfo { + RelFileNode cached_reln; + BlockNumber cached_nblocks[MAX_FORKNUM + 1]; +} CachedRelInfo; + + + +extern Size FileCacheSize(void); +extern void InitCacheRel(void); + +extern CachedRelInfo *FindCacheRel(const RelFileNode 
*reln); +extern CachedRelInfo *SetupRelCache(const RelFileNode *reln, ForkNumber forkno, BlockNumber nblocks); +extern void RemoveCacheRel(const RelFileNode *reln); \ No newline at end of file diff --git a/src/include/storage/he3db_logindex.h b/src/include/storage/he3db_logindex.h new file mode 100644 index 0000000..ef548a7 --- /dev/null +++ b/src/include/storage/he3db_logindex.h @@ -0,0 +1,122 @@ +#ifndef HE3DB_LOGINDEX_H +#define HE3DB_LOGINDEX_H + +#include "access/xlog.h" +#include "common/hashfn.h" +#include "port/atomics.h" +#include "storage/lockdefs.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/s_lock.h" +#include "storage/buf_internals.h" +#include "utils/pg_lsn.h" + +#define LOG_INDEX_MEM_ITEM_SEG_LSN_NUM 10 +#define LOG_INDEX_MEM_TBL_SEG_NUM 4096 +#define LOG_INDEX_MEM_TBL_PAGE_NUM (LOG_INDEX_MEM_TBL_SEG_NUM/2) +#define LOG_INDEX_TABLE_INVALID_ID 0 +#define LOG_INDEX_TBL_INVALID_SEG 0 + +#define LOG_INDEX_MEM_TBL_STATE_FREE (0x00) +#define LOG_INDEX_MEM_TBL_STATE_ACTIVE (0x01) +#define LOG_INDEX_MEM_TBL_STATE_INACTIVE (0x02) +#define LOG_INDEX_MEM_TBL_STATE_FLUSHED (0x04) + +#define LOG_INDEX_MEM_TBL_HASH_PAGE(tag) \ + (tag_hash(tag, sizeof(BufferTag)) % LOG_INDEX_MEM_TBL_PAGE_NUM) + +#define LOG_INDEX_SAME_TABLE_LSN_PREFIX(table, lsn) ((table)->meta.prefix_lsn == ((lsn) >> 32)) + +#define LOG_INDEX_MEM_TBL_SET_PREFIX_LSN(table, lsn) \ + { \ + (table)->meta.prefix_lsn = ((lsn) >> 32) ; \ + } +#define LOG_INDEX_INSERT_LSN_INFO(lsn_seg, number, lsn) \ + { \ + (lsn_seg)->suffix_lsn[(number)] = ((lsn << 32) >> 32); \ + } +#define LOG_INDEX_COMBINE_LSN(table, suffix) \ + ((((XLogRecPtr)((table)->meta.prefix_lsn)) << 32) | (suffix)) + +// metadata of log index mem table; size:37 +typedef struct LogIndexMemMeta +{ + uint64 id; + pg_atomic_uint32 state; + uint16 page_free_head; // free location for LogIndexMemItemHead + uint16 lsn_free_head; // free location for LogIndexMemItemSeg + XLogRecPtr min_lsn; + XLogRecPtr max_lsn; + 
uint32 prefix_lsn; + slock_t meta_lock; +} LogIndexMemMeta; + +// log index value, prefix of page head; size: 20+2+2+1=25 +typedef struct LogIndexMemItemHead +{ + BufferTag tag; + uint16 next_item; + uint16 next_seg; + uint16 tail_seg; + slock_t head_lock; +} LogIndexMemItemHead; + +// save page suffix lsn; size: 2+1+4*10=43 +typedef struct LogIndexMemItemSeg +{ + uint16 prev_seg; + uint16 next_seg; + uint8 number; + uint32 suffix_lsn[LOG_INDEX_MEM_ITEM_SEG_LSN_NUM]; +} LogIndexMemItemSeg; + +// log index mem table; size: 37+25*2048+43*4096+2*2048=231461≈226kB +typedef struct LogIndexMemTBL +{ + LogIndexMemMeta meta; + uint16 hash[LOG_INDEX_MEM_TBL_PAGE_NUM]; + LogIndexMemItemHead page_head[LOG_INDEX_MEM_TBL_PAGE_NUM]; + LogIndexMemItemSeg seg_item[LOG_INDEX_MEM_TBL_SEG_NUM]; +} LogIndexMemTBL; + +// list of log index mem tables +typedef struct LogIndexMemList +{ + uint64 table_start_index; // first mem_table index, will change by remove unless inactive table + uint64 active_table_index; // current mem_table index + uint64 table_cap; + //slock_t lock; + LogIndexMemTBL mem_table[FLEXIBLE_ARRAY_MEMBER]; +} LogIndexMemList; + +// lsn listNode +// typedef struct LsnNode { +// XLogRecPtr lsn; +// struct LsnNode * next; +// } LsnNode; + +typedef union He3DBBufTag{ + BufferTag tag; + XLogRecPtr lsn; +} He3DBBufTag; + +// page tag listNode +typedef struct TagNode { + He3DBBufTag tag; + struct TagNode * next; +} TagNode; + +extern int he3db_logindex_mem_size; +extern Size He3dbLogIndexShmemSize(void); +extern uint64 GetMemTblSize(void); +extern void He3dbLogIndexTblListInit(void); +extern void InsertLogIndexByPage(const BufferTag *page, XLogRecPtr lsn); +extern void CleanLogIndexByPage(XLogRecPtr consistLsn); +extern LsnNode *GetLogIndexByPage(const BufferTag *page, XLogRecPtr start_lsn, XLogRecPtr end_lsn); +extern void FreeLsnNode(LsnNode *head); +extern TagNode *GetBufTagByLsnRange(XLogRecPtr start_lsn, XLogRecPtr end_lsn); +extern void FreeTagNode(TagNode *head); 
+extern bool CheckBufTagExistByLsnRange(const BufferTag *page, XLogRecPtr start_lsn, XLogRecPtr end_lsn); +extern void He3DBGetLogindexStats(uint64 *memtable_total, uint64 *memtable_used, uint64 *memtable_active_index, + uint64 *memtable_start_index, uint64 *page_total); +#endif /* HE3DB_LOGINDEX_H */ \ No newline at end of file diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index a5286fa..5413e80 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -22,6 +22,7 @@ #include "storage/lockdefs.h" #include "storage/lwlock.h" #include "storage/shmem.h" +#include "storage/s_lock.h" #include "utils/timestamp.h" /* struct PGPROC is declared in proc.h, but must forward-reference it */ @@ -29,21 +30,20 @@ typedef struct PGPROC PGPROC; typedef struct PROC_QUEUE { - SHM_QUEUE links; /* head of list of PGPROC objects */ - int size; /* number of entries in list */ + SHM_QUEUE links; /* head of list of PGPROC objects */ + int size; /* number of entries in list */ } PROC_QUEUE; /* GUC variables */ -extern int max_locks_per_xact; +extern int max_locks_per_xact; #ifdef LOCK_DEBUG -extern int Trace_lock_oidmin; +extern int Trace_lock_oidmin; extern bool Trace_locks; extern bool Trace_userlocks; -extern int Trace_lock_table; +extern int Trace_lock_table; extern bool Debug_deadlocks; -#endif /* LOCK_DEBUG */ - +#endif /* LOCK_DEBUG */ /* * Top-level transactions are identified by VirtualTransactionIDs comprising @@ -63,33 +63,34 @@ extern bool Debug_deadlocks; */ typedef struct { - BackendId backendId; /* backendId from PGPROC */ - LocalTransactionId localTransactionId; /* lxid from PGPROC */ + BackendId backendId; /* backendId from PGPROC */ + LocalTransactionId localTransactionId; /* lxid from PGPROC */ } VirtualTransactionId; -#define InvalidLocalTransactionId 0 +#define InvalidLocalTransactionId 0 #define LocalTransactionIdIsValid(lxid) ((lxid) != InvalidLocalTransactionId) #define VirtualTransactionIdIsValid(vxid) \ 
(LocalTransactionIdIsValid((vxid).localTransactionId)) #define VirtualTransactionIdIsRecoveredPreparedXact(vxid) \ ((vxid).backendId == InvalidBackendId) #define VirtualTransactionIdEquals(vxid1, vxid2) \ - ((vxid1).backendId == (vxid2).backendId && \ + ((vxid1).backendId == (vxid2).backendId && \ (vxid1).localTransactionId == (vxid2).localTransactionId) #define SetInvalidVirtualTransactionId(vxid) \ - ((vxid).backendId = InvalidBackendId, \ + ((vxid).backendId = InvalidBackendId, \ (vxid).localTransactionId = InvalidLocalTransactionId) -#define GET_VXID_FROM_PGPROC(vxid, proc) \ +#define GET_VXID_FROM_PGPROC(vxid, proc) \ ((vxid).backendId = (proc).backendId, \ (vxid).localTransactionId = (proc).lxid) /* MAX_LOCKMODES cannot be larger than the # of bits in LOCKMASK */ -#define MAX_LOCKMODES 10 +#define MAX_LOCKMODES 10 +#define BLKSZ 8192 +#define LSNSZ 64 #define LOCKBIT_ON(lockmode) (1 << (lockmode)) #define LOCKBIT_OFF(lockmode) (~(1 << (lockmode))) - /* * This data structure defines the locking semantics associated with a * "lock method". 
The semantics specify the meaning of each lock mode @@ -112,7 +113,7 @@ typedef struct */ typedef struct LockMethodData { - int numLockModes; + int numLockModes; const LOCKMASK *conflictTab; const char *const *lockModeNames; const bool *trace_flag; @@ -127,8 +128,8 @@ typedef const LockMethodData *LockMethod; typedef uint16 LOCKMETHODID; /* These identify the known lock methods */ -#define DEFAULT_LOCKMETHOD 1 -#define USER_LOCKMETHOD 2 +#define DEFAULT_LOCKMETHOD 1 +#define USER_LOCKMETHOD 2 /* * LOCKTAG is the key information needed to look up a LOCK item in the @@ -139,20 +140,20 @@ typedef uint16 LOCKMETHODID; */ typedef enum LockTagType { - LOCKTAG_RELATION, /* whole relation */ - LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ - LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */ - LOCKTAG_PAGE, /* one page of a relation */ - LOCKTAG_TUPLE, /* one physical tuple */ - LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */ - LOCKTAG_VIRTUALTRANSACTION, /* virtual transaction (ditto) */ - LOCKTAG_SPECULATIVE_TOKEN, /* speculative insertion Xid and token */ - LOCKTAG_OBJECT, /* non-relation database object */ - LOCKTAG_USERLOCK, /* reserved for old contrib/userlock code */ - LOCKTAG_ADVISORY /* advisory user locks */ + LOCKTAG_RELATION, /* whole relation */ + LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ + LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */ + LOCKTAG_PAGE, /* one page of a relation */ + LOCKTAG_TUPLE, /* one physical tuple */ + LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */ + LOCKTAG_VIRTUALTRANSACTION, /* virtual transaction (ditto) */ + LOCKTAG_SPECULATIVE_TOKEN, /* speculative insertion Xid and token */ + LOCKTAG_OBJECT, /* non-relation database object */ + LOCKTAG_USERLOCK, /* reserved for old contrib/userlock code */ + LOCKTAG_ADVISORY /* advisory user locks */ } LockTagType; -#define LOCKTAG_LAST_TYPE LOCKTAG_ADVISORY +#define LOCKTAG_LAST_TYPE LOCKTAG_ADVISORY 
extern const char *const LockTagTypeNames[]; @@ -166,14 +167,92 @@ extern const char *const LockTagTypeNames[]; */ typedef struct LOCKTAG { - uint32 locktag_field1; /* a 32-bit ID field */ - uint32 locktag_field2; /* a 32-bit ID field */ - uint32 locktag_field3; /* a 32-bit ID field */ - uint16 locktag_field4; /* a 16-bit ID field */ - uint8 locktag_type; /* see enum LockTagType */ - uint8 locktag_lockmethodid; /* lockmethod indicator */ + uint32 locktag_field1; /* a 32-bit ID field */ + uint32 locktag_field2; /* a 32-bit ID field */ + uint32 locktag_field3; /* a 32-bit ID field */ + uint16 locktag_field4; /* a 16-bit ID field */ + uint8 locktag_type; /* see enum LockTagType */ + uint8 locktag_lockmethodid; /* lockmethod indicator */ } LOCKTAG; +#define WAL_LIST_LEN 16000 +/* +for walloghash code +*/ +// typedef struct WalLoc +// { +// uint64 Inode; +// uint64 Offset; +// int32 Length; +// uint64 Lsn; +// uint64 LinkedInode; +// } WalLoc; + +/* +for walloghash code +*/ +typedef struct WalLoc +{ + uint64 Lsn; + uint32 tl; +} WalLoc; + +typedef struct Prefix +{ + uint32 dbNode; + uint32 relNode; + int32 forkno; + uint32 blockno; +} Prefix; + +typedef struct WalList +{ + Prefix px; + uint32 cap; + uint32 len; + unsigned char wals[WAL_LIST_LEN]; + slock_t append_lck; +} WalList; + +// /* +// for secondbufferhash code +// */ +// typedef struct SdPageKey +// { +// uint32 dbid; +// uint32 relid; +// uint32 forkno; +// uint32 blkno; +// } SdPageKey; + +// typedef struct SdPageValue +// { +// SdPageKey pk; +// uint8_t page[BLKSZ]; +// uint8_t pageLsn[LSNSZ]; +// } SdPageValue; + + +//**************for fs meta************ + +typedef struct FSKey +{ + uint32 cap; + uint32 len; + unsigned char key[100]; +} FSKey; + + +typedef struct FSValue +{ + FSKey key; + uint32 cap; + uint32 len; + unsigned char value[200]; +} FSValue; +//************************************** + + /* * These macros define how we map logical IDs of lockable objects into * the physical fields of LOCKTAG. 
Use these to set up LOCKTAG values, @@ -181,78 +260,78 @@ typedef struct LOCKTAG */ /* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */ -#define SET_LOCKTAG_RELATION(locktag,dboid,reloid) \ - ((locktag).locktag_field1 = (dboid), \ - (locktag).locktag_field2 = (reloid), \ - (locktag).locktag_field3 = 0, \ - (locktag).locktag_field4 = 0, \ - (locktag).locktag_type = LOCKTAG_RELATION, \ +#define SET_LOCKTAG_RELATION(locktag, dboid, reloid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_RELATION, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) /* same ID info as RELATION */ -#define SET_LOCKTAG_RELATION_EXTEND(locktag,dboid,reloid) \ - ((locktag).locktag_field1 = (dboid), \ - (locktag).locktag_field2 = (reloid), \ - (locktag).locktag_field3 = 0, \ - (locktag).locktag_field4 = 0, \ - (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ +#define SET_LOCKTAG_RELATION_EXTEND(locktag, dboid, reloid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) /* ID info for frozen IDs is DB OID */ -#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \ - ((locktag).locktag_field1 = (dboid), \ - (locktag).locktag_field2 = 0, \ - (locktag).locktag_field3 = 0, \ - (locktag).locktag_field4 = 0, \ +#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag, dboid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) /* ID info for a page is RELATION info + BlockNumber */ -#define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ - 
((locktag).locktag_field1 = (dboid), \ - (locktag).locktag_field2 = (reloid), \ - (locktag).locktag_field3 = (blocknum), \ - (locktag).locktag_field4 = 0, \ - (locktag).locktag_type = LOCKTAG_PAGE, \ +#define SET_LOCKTAG_PAGE(locktag, dboid, reloid, blocknum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_PAGE, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) /* ID info for a tuple is PAGE info + OffsetNumber */ -#define SET_LOCKTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum) \ - ((locktag).locktag_field1 = (dboid), \ - (locktag).locktag_field2 = (reloid), \ - (locktag).locktag_field3 = (blocknum), \ - (locktag).locktag_field4 = (offnum), \ - (locktag).locktag_type = LOCKTAG_TUPLE, \ +#define SET_LOCKTAG_TUPLE(locktag, dboid, reloid, blocknum, offnum) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (reloid), \ + (locktag).locktag_field3 = (blocknum), \ + (locktag).locktag_field4 = (offnum), \ + (locktag).locktag_type = LOCKTAG_TUPLE, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) /* ID info for a transaction is its TransactionId */ -#define SET_LOCKTAG_TRANSACTION(locktag,xid) \ - ((locktag).locktag_field1 = (xid), \ - (locktag).locktag_field2 = 0, \ - (locktag).locktag_field3 = 0, \ - (locktag).locktag_field4 = 0, \ +#define SET_LOCKTAG_TRANSACTION(locktag, xid) \ + ((locktag).locktag_field1 = (xid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ (locktag).locktag_type = LOCKTAG_TRANSACTION, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) /* ID info for a virtual transaction is its VirtualTransactionId */ -#define SET_LOCKTAG_VIRTUALTRANSACTION(locktag,vxid) \ - ((locktag).locktag_field1 = (vxid).backendId, \ +#define SET_LOCKTAG_VIRTUALTRANSACTION(locktag, vxid) \ + ((locktag).locktag_field1 = (vxid).backendId, \ 
(locktag).locktag_field2 = (vxid).localTransactionId, \ - (locktag).locktag_field3 = 0, \ - (locktag).locktag_field4 = 0, \ - (locktag).locktag_type = LOCKTAG_VIRTUALTRANSACTION, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_VIRTUALTRANSACTION, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) /* * ID info for a speculative insert is TRANSACTION info + * its speculative insert counter. */ -#define SET_LOCKTAG_SPECULATIVE_INSERTION(locktag,xid,token) \ - ((locktag).locktag_field1 = (xid), \ - (locktag).locktag_field2 = (token), \ - (locktag).locktag_field3 = 0, \ - (locktag).locktag_field4 = 0, \ - (locktag).locktag_type = LOCKTAG_SPECULATIVE_TOKEN, \ +#define SET_LOCKTAG_SPECULATIVE_INSERTION(locktag, xid, token) \ + ((locktag).locktag_field1 = (xid), \ + (locktag).locktag_field2 = (token), \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_SPECULATIVE_TOKEN, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) /* @@ -262,23 +341,22 @@ typedef struct LOCKTAG * pg_description, but notice that we are constraining SUBID to 16 bits. * Also, we use DB OID = 0 for shared objects such as tablespaces. 
*/ -#define SET_LOCKTAG_OBJECT(locktag,dboid,classoid,objoid,objsubid) \ - ((locktag).locktag_field1 = (dboid), \ - (locktag).locktag_field2 = (classoid), \ - (locktag).locktag_field3 = (objoid), \ - (locktag).locktag_field4 = (objsubid), \ - (locktag).locktag_type = LOCKTAG_OBJECT, \ +#define SET_LOCKTAG_OBJECT(locktag, dboid, classoid, objoid, objsubid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = (classoid), \ + (locktag).locktag_field3 = (objoid), \ + (locktag).locktag_field4 = (objsubid), \ + (locktag).locktag_type = LOCKTAG_OBJECT, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) -#define SET_LOCKTAG_ADVISORY(locktag,id1,id2,id3,id4) \ - ((locktag).locktag_field1 = (id1), \ - (locktag).locktag_field2 = (id2), \ - (locktag).locktag_field3 = (id3), \ - (locktag).locktag_field4 = (id4), \ - (locktag).locktag_type = LOCKTAG_ADVISORY, \ +#define SET_LOCKTAG_ADVISORY(locktag, id1, id2, id3, id4) \ + ((locktag).locktag_field1 = (id1), \ + (locktag).locktag_field2 = (id2), \ + (locktag).locktag_field3 = (id3), \ + (locktag).locktag_field4 = (id4), \ + (locktag).locktag_type = LOCKTAG_ADVISORY, \ (locktag).locktag_lockmethodid = USER_LOCKMETHOD) - /* * Per-locked-object lock information: * @@ -300,22 +378,21 @@ typedef struct LOCKTAG typedef struct LOCK { /* hash key */ - LOCKTAG tag; /* unique identifier of lockable object */ + LOCKTAG tag; /* unique identifier of lockable object */ /* data */ - LOCKMASK grantMask; /* bitmask for lock types already granted */ - LOCKMASK waitMask; /* bitmask for lock types awaited */ - SHM_QUEUE procLocks; /* list of PROCLOCK objects assoc. 
with lock */ - PROC_QUEUE waitProcs; /* list of PGPROC objects waiting on lock */ - int requested[MAX_LOCKMODES]; /* counts of requested locks */ - int nRequested; /* total of requested[] array */ - int granted[MAX_LOCKMODES]; /* counts of granted locks */ - int nGranted; /* total of granted[] array */ + LOCKMASK grantMask; /* bitmask for lock types already granted */ + LOCKMASK waitMask; /* bitmask for lock types awaited */ + SHM_QUEUE procLocks; /* list of PROCLOCK objects assoc. with lock */ + PROC_QUEUE waitProcs; /* list of PGPROC objects waiting on lock */ + int requested[MAX_LOCKMODES]; /* counts of requested locks */ + int nRequested; /* total of requested[] array */ + int granted[MAX_LOCKMODES]; /* counts of granted locks */ + int nGranted; /* total of granted[] array */ } LOCK; -#define LOCK_LOCKMETHOD(lock) ((LOCKMETHODID) (lock).tag.locktag_lockmethodid) -#define LOCK_LOCKTAG(lock) ((LockTagType) (lock).tag.locktag_type) - +#define LOCK_LOCKMETHOD(lock) ((LOCKMETHODID)(lock).tag.locktag_lockmethodid) +#define LOCK_LOCKTAG(lock) ((LockTagType)(lock).tag.locktag_type) /* * We may have several different backends holding or awaiting locks @@ -354,21 +431,21 @@ typedef struct LOCK typedef struct PROCLOCKTAG { /* NB: we assume this struct contains no padding! 
*/ - LOCK *myLock; /* link to per-lockable-object information */ - PGPROC *myProc; /* link to PGPROC of owning backend */ + LOCK *myLock; /* link to per-lockable-object information */ + PGPROC *myProc; /* link to PGPROC of owning backend */ } PROCLOCKTAG; typedef struct PROCLOCK { /* tag */ - PROCLOCKTAG tag; /* unique identifier of proclock object */ + PROCLOCKTAG tag; /* unique identifier of proclock object */ /* data */ - PGPROC *groupLeader; /* proc's lock group leader, or proc itself */ - LOCKMASK holdMask; /* bitmask for lock types currently held */ - LOCKMASK releaseMask; /* bitmask for lock types to be released */ - SHM_QUEUE lockLink; /* list link in LOCK's list of proclocks */ - SHM_QUEUE procLink; /* list link in PGPROC's list of proclocks */ + PGPROC *groupLeader; /* proc's lock group leader, or proc itself */ + LOCKMASK holdMask; /* bitmask for lock types currently held */ + LOCKMASK releaseMask; /* bitmask for lock types to be released */ + SHM_QUEUE lockLink; /* list link in LOCK's list of proclocks */ + SHM_QUEUE procLink; /* list link in PGPROC's list of proclocks */ } PROCLOCK; #define PROCLOCK_LOCKMETHOD(proclock) \ @@ -399,8 +476,8 @@ typedef struct PROCLOCK */ typedef struct LOCALLOCKTAG { - LOCKTAG lock; /* identifies the lockable object */ - LOCKMODE mode; /* lock mode for this table entry */ + LOCKTAG lock; /* identifies the lockable object */ + LOCKMODE mode; /* lock mode for this table entry */ } LOCALLOCKTAG; typedef struct LOCALLOCKOWNER @@ -412,29 +489,28 @@ typedef struct LOCALLOCKOWNER * Must use a forward struct reference to avoid circularity. 
*/ struct ResourceOwnerData *owner; - int64 nLocks; /* # of times held by this owner */ + int64 nLocks; /* # of times held by this owner */ } LOCALLOCKOWNER; typedef struct LOCALLOCK { /* tag */ - LOCALLOCKTAG tag; /* unique identifier of locallock entry */ + LOCALLOCKTAG tag; /* unique identifier of locallock entry */ /* data */ - uint32 hashcode; /* copy of LOCKTAG's hash value */ - LOCK *lock; /* associated LOCK object, if any */ - PROCLOCK *proclock; /* associated PROCLOCK object, if any */ - int64 nLocks; /* total number of times lock is held */ - int numLockOwners; /* # of relevant ResourceOwners */ - int maxLockOwners; /* allocated size of array */ + uint32 hashcode; /* copy of LOCKTAG's hash value */ + LOCK *lock; /* associated LOCK object, if any */ + PROCLOCK *proclock; /* associated PROCLOCK object, if any */ + int64 nLocks; /* total number of times lock is held */ + int numLockOwners; /* # of relevant ResourceOwners */ + int maxLockOwners; /* allocated size of array */ LOCALLOCKOWNER *lockOwners; /* dynamically resizable array */ - bool holdsStrongLockCount; /* bumped FastPathStrongRelationLocks */ - bool lockCleared; /* we read all sinval msgs for lock */ + bool holdsStrongLockCount; /* bumped FastPathStrongRelationLocks */ + bool lockCleared; /* we read all sinval msgs for lock */ } LOCALLOCK; #define LOCALLOCK_LOCKMETHOD(llock) ((llock).tag.lock.locktag_lockmethodid) -#define LOCALLOCK_LOCKTAG(llock) ((LockTagType) (llock).tag.lock.locktag_type) - +#define LOCALLOCK_LOCKTAG(llock) ((LockTagType)(llock).tag.lock.locktag_type) /* * These structures hold information passed from lmgr internals to the lock @@ -443,69 +519,68 @@ typedef struct LOCALLOCK typedef struct LockInstanceData { - LOCKTAG locktag; /* tag for locked object */ - LOCKMASK holdMask; /* locks held by this PGPROC */ - LOCKMODE waitLockMode; /* lock awaited by this PGPROC, if any */ - BackendId backend; /* backend ID of this PGPROC */ - LocalTransactionId lxid; /* local transaction ID of 
this PGPROC */ - TimestampTz waitStart; /* time at which this PGPROC started waiting - * for lock */ - int pid; /* pid of this PGPROC */ - int leaderPid; /* pid of group leader; = pid if no group */ - bool fastpath; /* taken via fastpath? */ + LOCKTAG locktag; /* tag for locked object */ + LOCKMASK holdMask; /* locks held by this PGPROC */ + LOCKMODE waitLockMode; /* lock awaited by this PGPROC, if any */ + BackendId backend; /* backend ID of this PGPROC */ + LocalTransactionId lxid; /* local transaction ID of this PGPROC */ + TimestampTz waitStart; /* time at which this PGPROC started waiting + * for lock */ + int pid; /* pid of this PGPROC */ + int leaderPid; /* pid of group leader; = pid if no group */ + bool fastpath; /* taken via fastpath? */ } LockInstanceData; typedef struct LockData { - int nelements; /* The length of the array */ - LockInstanceData *locks; /* Array of per-PROCLOCK information */ + int nelements; /* The length of the array */ + LockInstanceData *locks; /* Array of per-PROCLOCK information */ } LockData; typedef struct BlockedProcData { - int pid; /* pid of a blocked PGPROC */ + int pid; /* pid of a blocked PGPROC */ /* Per-PROCLOCK information about PROCLOCKs of the lock the pid awaits */ /* (these fields refer to indexes in BlockedProcsData.locks[]) */ - int first_lock; /* index of first relevant LockInstanceData */ - int num_locks; /* number of relevant LockInstanceDatas */ + int first_lock; /* index of first relevant LockInstanceData */ + int num_locks; /* number of relevant LockInstanceDatas */ /* PIDs of PGPROCs that are ahead of "pid" in the lock's wait queue */ /* (these fields refer to indexes in BlockedProcsData.waiter_pids[]) */ - int first_waiter; /* index of first preceding waiter */ - int num_waiters; /* number of preceding waiters */ + int first_waiter; /* index of first preceding waiter */ + int num_waiters; /* number of preceding waiters */ } BlockedProcData; typedef struct BlockedProcsData { - BlockedProcData *procs; /* 
Array of per-blocked-proc information */ - LockInstanceData *locks; /* Array of per-PROCLOCK information */ - int *waiter_pids; /* Array of PIDs of other blocked PGPROCs */ - int nprocs; /* # of valid entries in procs[] array */ - int maxprocs; /* Allocated length of procs[] array */ - int nlocks; /* # of valid entries in locks[] array */ - int maxlocks; /* Allocated length of locks[] array */ - int npids; /* # of valid entries in waiter_pids[] array */ - int maxpids; /* Allocated length of waiter_pids[] array */ + BlockedProcData *procs; /* Array of per-blocked-proc information */ + LockInstanceData *locks; /* Array of per-PROCLOCK information */ + int *waiter_pids; /* Array of PIDs of other blocked PGPROCs */ + int nprocs; /* # of valid entries in procs[] array */ + int maxprocs; /* Allocated length of procs[] array */ + int nlocks; /* # of valid entries in locks[] array */ + int maxlocks; /* Allocated length of locks[] array */ + int npids; /* # of valid entries in waiter_pids[] array */ + int maxpids; /* Allocated length of waiter_pids[] array */ } BlockedProcsData; - /* Result codes for LockAcquire() */ typedef enum { - LOCKACQUIRE_NOT_AVAIL, /* lock not available, and dontWait=true */ - LOCKACQUIRE_OK, /* lock successfully acquired */ - LOCKACQUIRE_ALREADY_HELD, /* incremented count for lock already held */ - LOCKACQUIRE_ALREADY_CLEAR /* incremented count for lock already clear */ + LOCKACQUIRE_NOT_AVAIL, /* lock not available, and dontWait=true */ + LOCKACQUIRE_OK, /* lock successfully acquired */ + LOCKACQUIRE_ALREADY_HELD, /* incremented count for lock already held */ + LOCKACQUIRE_ALREADY_CLEAR /* incremented count for lock already clear */ } LockAcquireResult; /* Deadlock states identified by DeadLockCheck() */ typedef enum { - DS_NOT_YET_CHECKED, /* no deadlock check has run yet */ - DS_NO_DEADLOCK, /* no deadlock detected */ - DS_SOFT_DEADLOCK, /* deadlock avoided by queue rearrangement */ - DS_HARD_DEADLOCK, /* deadlock, no way out but ERROR */ - 
DS_BLOCKED_BY_AUTOVACUUM /* no deadlock; queue blocked by autovacuum - * worker */ + DS_NOT_YET_CHECKED, /* no deadlock check has run yet */ + DS_NO_DEADLOCK, /* no deadlock detected */ + DS_SOFT_DEADLOCK, /* deadlock avoided by queue rearrangement */ + DS_HARD_DEADLOCK, /* deadlock, no way out but ERROR */ + DS_BLOCKED_BY_AUTOVACUUM /* no deadlock; queue blocked by autovacuum + * worker */ } DeadLockState; /* @@ -516,9 +591,10 @@ typedef enum */ #define LockHashPartition(hashcode) \ ((hashcode) % NUM_LOCK_PARTITIONS) -#define LockHashPartitionLock(hashcode) \ +#define LockHashPartitionLock(hashcode) \ (&MainLWLockArray[LOCK_MANAGER_LWLOCK_OFFSET + \ - LockHashPartition(hashcode)].lock) + LockHashPartition(hashcode)] \ + .lock) #define LockHashPartitionLockByIndex(i) \ (&MainLWLockArray[LOCK_MANAGER_LWLOCK_OFFSET + (i)].lock) @@ -601,7 +677,7 @@ extern void RememberSimpleDeadLock(PGPROC *proc1, PGPROC *proc2); extern void InitDeadLockChecking(void); -extern int LockWaiterCount(const LOCKTAG *locktag); +extern int LockWaiterCount(const LOCKTAG *locktag); #ifdef LOCK_DEBUG extern void DumpLocks(PGPROC *proc); @@ -613,4 +689,4 @@ extern void VirtualXactLockTableInsert(VirtualTransactionId vxid); extern void VirtualXactLockTableCleanup(void); extern bool VirtualXactLock(VirtualTransactionId vxid, bool wait); -#endif /* LOCK_H_ */ +#endif /* LOCK_H_ */ diff --git a/src/include/storage/md.h b/src/include/storage/md.h index c4e5de5..697fee6 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -31,7 +31,7 @@ extern void mdextend(SMgrRelation reln, ForkNumber forknum, extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer, XLogRecPtr lsn); + char *buffer); extern int he3db_mdread_pagexlog(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char **buffer, XLogRecPtr lsn); extern int he3db_mdread(SMgrRelation reln, ForkNumber 
forknum, BlockNumber blocknum, diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index 8ed4d87..b2bb3d1 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -40,7 +40,9 @@ typedef enum PMSIGNAL_BACKGROUND_WORKER_CHANGE, /* background worker state change */ PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ - + PMSIGNAL_PARALLEL_FLUSH_WORKER, + PMSIGNAL_CLEAN_LOGINDEX_WORKER, + PMSIGNAL_SECONDBUFFER_WORKER, NUM_PMSIGNALS /* Must be last value of enum! */ } PMSignalReason; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index cfabfdb..01cffb9 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -349,6 +349,8 @@ typedef struct PROC_HDR PGPROC *bgworkerFreeProcs; /* Head of list of walsender free PGPROC structures */ PGPROC *walsenderFreeProcs; + /* Head of list of parallel flush's free PGPROC structures */ + PGPROC *parallelFlushFreeProcs; /* First pgproc waiting for group XID clear */ pg_atomic_uint32 procArrayGroupFirst; /* First pgproc waiting for group transaction status update */ @@ -381,7 +383,7 @@ extern PGPROC *PreparedXactProcs; * operation. Startup process and WAL receiver also consume 2 slots, but WAL * writer is launched only after startup has exited, so we only need 5 slots. 
*/ -#define NUM_AUXILIARY_PROCS 5 +#define NUM_AUXILIARY_PROCS 10 /* configurable options */ extern PGDLLIMPORT int DeadlockTimeout; diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 0332963..9ff1a65 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -93,14 +93,17 @@ extern void smgrextend(SMgrRelation reln, ForkNumber forknum, extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void smgrread(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char **buffer, XLogRecPtr lsn); + BlockNumber blocknum, char *buffer); extern int he3dbsmgrread(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char **buffer, XLogRecPtr lsn); + BlockNumber blocknum, char **buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); +extern void he3dbsmgrwrite(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync, XLogRecPtr lsn); extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); +extern BlockNumber startupsmgrnblocks(SMgrRelation reln, ForkNumber forknum); extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks); diff --git a/src/include/utils/backend_status.h b/src/include/utils/backend_status.h index 8042b81..5486647 100644 --- a/src/include/utils/backend_status.h +++ b/src/include/utils/backend_status.h @@ -10,6 +10,7 @@ #ifndef BACKEND_STATUS_H #define BACKEND_STATUS_H +#include "access/xlogdefs.h" #include "datatype/timestamp.h" #include "libpq/pqcomm.h" #include "miscadmin.h" /* for BackendType */ @@ -317,5 +318,7 @@ extern PgBackendStatus *pgstat_fetch_stat_beentry(int beid); extern LocalPgBackendStatus 
*pgstat_fetch_stat_local_beentry(int beid); extern char *pgstat_clip_activity(const char *raw_activity); +extern XLogRecPtr He3DBQueryMinLsnFromAllStanby(); + #endif /* BACKEND_STATUS_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 0c927d0..6d983dd 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -270,9 +270,16 @@ extern PGDLLIMPORT char *ConfigFileName; extern char *HbaFileName; extern char *IdentFileName; extern char *external_pid_file; +extern char *client_application_name; extern PGDLLIMPORT char *application_name; extern PGDLLIMPORT bool push_standby; +extern PGDLLIMPORT bool he3_point_in_time_recovery; +extern PGDLLIMPORT bool he3mirror; +extern PGDLLIMPORT bool pgmirror; +extern PGDLLIMPORT bool he3share; +extern PGDLLIMPORT bool mpush; + extern int tcp_keepalives_idle; extern int tcp_keepalives_interval; diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 6b40f1e..548904d 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -69,6 +69,7 @@ enum config_group WAL_ARCHIVING, WAL_ARCHIVE_RECOVERY, WAL_RECOVERY_TARGET, + WAL_SEND_LSN, REPLICATION_SENDING, REPLICATION_PRIMARY, REPLICATION_STANDBY, diff --git a/src/include/utils/hfs.h b/src/include/utils/hfs.h new file mode 100644 index 0000000..50c4576 --- /dev/null +++ b/src/include/utils/hfs.h @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include "utils/pg_lsn.h" +#include "storage/relfilenode.h" + +typedef struct +{ + uint8_t *buf; + size_t count; + size_t cap; +} Bufrd; + +typedef struct +{ + int64_t fd; + int32_t error; + +} IOResult; + +typedef struct XLogKey +{ + // uint32_t timeLine; + uint64_t lsn; +} XLogKey; + + +typedef struct XLogItem +{ + XLogKey xlogKey; + char *begin; + int length; + struct XLogItem *next; +} XLogItem; + +typedef struct PageKey +{ + RelFileNode relfileNode; + uint32 forkNo; + uint32 blkNo; + uint64 pageLsn; + uint64 replyLsn; +} PageKey; + + + +extern IOResult 
openfs(const char *pathname, int flags); + +extern int closefs(int64_t fd); + +extern int64_t lseekfs(int64_t fd, int64_t offset, int32_t whence); + +extern int64_t truncatefs(int64_t fd, uint64_t offset); + +extern int writefs(int64_t fd, const void *buf, size_t count, int64_t offset); + +extern Bufrd dataRead(int64_t fd, + int64_t offset, + int64_t lastLsn, + uint32_t dbid, + uint32_t relid, + uint32_t segno, + uint32_t forkno); + +extern void free_dataRead(uint8_t *buf, size_t count, size_t cap); + +extern Bufrd readfs(int64_t fd, int64_t offset, uint32_t size); +extern int batchRead(uint8_t *buf, uint32_t timeline, uint64_t startPtr,uint64_t endPtr, bool needStore); +extern int batchReadForTools(uint8_t *buf, uint32_t timeline, uint64_t startPtr,uint64_t endPtr, bool needStore); +extern uint8_t kvwrite(XLogItem *xlogItem); +extern uint8_t flushwals(XLogItem *xlogItem, uint32_t timeline); +extern uint8_t kvflush(XLogRecPtr lsn); +extern Bufrd ReadWalsByPage(uint32_t dbid, + uint32_t relid, + uint32_t forkno, + uint32_t blkno, + uint32_t timeline, + LsnNode* lsnhead); +extern void InsertConsistToKV(uint64_t lsn); +extern uint64_t GetConsistLsn(uint64_t lsn); +extern void DelConsistLsns(uint64_t lsn); +extern void DelRangeWals(uint32_t timeline, uint64_t startPtr,uint64_t endPtr); +//extern void ReceivePageFromDataBuffer(PageKey *pk, uint8_t *buffer); //when evict one page out databuffer, we should call this to store the page. 
+extern uint8_t EvictOnePageOutOfMemory(PageKey pageKey, char *value); + +//GetPageFromCurrentNode(PageKey *pk); +extern Bufrd MoveOnePageToMemory(PageKey pageKey); + +//extern Bufrd GetWalsFromDisk(PageKey pageKey); + + +extern void RemoveBufferFromLocal(uint32_t dbid, uint32_t relid, uint32_t forkno, uint32_t blkno); diff --git a/src/include/utils/pg_lsn.h b/src/include/utils/pg_lsn.h index eeeac5c..a898968 100644 --- a/src/include/utils/pg_lsn.h +++ b/src/include/utils/pg_lsn.h @@ -18,6 +18,11 @@ #include "access/xlogdefs.h" #include "fmgr.h" +typedef struct LsnNode { + XLogRecPtr lsn; + struct LsnNode * next; +} LsnNode; + #define DatumGetLSN(X) ((XLogRecPtr) DatumGetInt64(X)) #define LSNGetDatum(X) (Int64GetDatum((int64) (X))) @@ -27,3 +32,4 @@ extern XLogRecPtr pg_lsn_in_internal(const char *str, bool *have_error); #endif /* PG_LSN_H */ + diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index e922edb..d74a348 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -63,7 +63,6 @@ enum SysCacheIdentifier FOREIGNSERVERNAME, FOREIGNSERVEROID, FOREIGNTABLEREL, - HOTDATADATNAMERELNAME, INDEXRELID, LANGNAME, LANGOID, diff --git a/src/include/utils/ufs.h b/src/include/utils/ufs.h deleted file mode 100644 index b8fa215..0000000 --- a/src/include/utils/ufs.h +++ /dev/null @@ -1,27 +0,0 @@ -#include -#include -#include -#include "fs.grpc-c.h" - -/*typedef struct { - size_t len; - uint8_t* data -} ProtobufCBinaryData; -*/ -typedef struct { - int fd; - int errNo; -} fdInfo; -extern void ufs_init_client(); -//extern void shutdown(); -extern fdInfo he3Open(char *fileName,uint32_t fileFlages,uint16_t fileMode,uint8_t state); -extern uint64_t he3Lseek(uint64_t fd,uint64_t offset,uint64_t whence); -extern int walRead(uint64_t fd,ProtobufCBinaryData buf, uint64_t offset); -extern int he3Write(uint64_t fd, ProtobufCBinaryData buf, uint64_t offset); -extern int walLocalRead(uint64_t fd, ProtobufCBinaryData buf, uint64_t offset); 
-extern int walRestoreRead(uint64_t fd, ProtobufCBinaryData buf, uint64_t offset); -extern ProtobufCBinaryData dataRead(uint64_t fd, uint64_t offset, uint64_t lastLsn); -extern int he3Unlink(char* filename); -extern int he3Close(uint64_t fd); -extern int he3Truncate(uint64_t fd, uint64_t offset); -extern int he3Fsync(uint64_t fd); diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 6c6ec2e..4261e43 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -47,7 +47,10 @@ typedef enum WAIT_EVENT_SYSLOGGER_MAIN, WAIT_EVENT_WAL_RECEIVER_MAIN, WAIT_EVENT_WAL_SENDER_MAIN, - WAIT_EVENT_WAL_WRITER_MAIN + WAIT_EVENT_WAL_WRITER_MAIN, + WAIT_EVENT_PAGEFLUSH_MAIN, + WAIT_EVENT_CLEAN_LOGINDEX_MAIN, + WAIT_EVENT_SECONDBUFFER_MAIN } WaitEventActivity; /* ---------- diff --git a/src/test/regress/expected/misc_functions.out b/src/test/regress/expected/misc_functions.out index e845042..7fb5ef5 100644 --- a/src/test/regress/expected/misc_functions.out +++ b/src/test/regress/expected/misc_functions.out @@ -155,44 +155,12 @@ SELECT * FROM pg_log_backend_memory_contexts(pg_backend_pid()); select setting as segsize from pg_settings where name = 'wal_segment_size' \gset -select count(*) > 0 as ok from pg_ls_waldir(); - ok ----- - t -(1 row) - --- Test ProjectSet as well as FunctionScan -select count(*) > 0 as ok from (select pg_ls_waldir()) ss; - ok ----- - t -(1 row) - -- Test not-run-to-completion cases. 
select * from pg_ls_waldir() limit 0; name | size | modification ------+------+-------------- (0 rows) -select count(*) > 0 as ok from (select * from pg_ls_waldir() limit 1) ss; - ok ----- - t -(1 row) - -select (w).size = :segsize as ok -from (select pg_ls_waldir() w) ss where length((w).name) = 24 limit 1; - ok ----- - t -(1 row) - -select count(*) >= 0 as ok from pg_ls_archive_statusdir(); - ok ----- - t -(1 row) - select * from (select pg_ls_dir('.') a) a where a = 'base' limit 1; a ------ diff --git a/src/test/regress/sql/misc_functions.sql b/src/test/regress/sql/misc_functions.sql index a398349..9d2f7ea 100644 --- a/src/test/regress/sql/misc_functions.sql +++ b/src/test/regress/sql/misc_functions.sql @@ -49,16 +49,8 @@ select setting as segsize from pg_settings where name = 'wal_segment_size' \gset -select count(*) > 0 as ok from pg_ls_waldir(); --- Test ProjectSet as well as FunctionScan -select count(*) > 0 as ok from (select pg_ls_waldir()) ss; -- Test not-run-to-completion cases. select * from pg_ls_waldir() limit 0; -select count(*) > 0 as ok from (select * from pg_ls_waldir() limit 1) ss; -select (w).size = :segsize as ok -from (select pg_ls_waldir() w) ss where length((w).name) = 24 limit 1; - -select count(*) >= 0 as ok from pg_ls_archive_statusdir(); select * from (select pg_ls_dir('.') a) a where a = 'base' limit 1;