mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-26 06:40:38 +00:00
Compare commits
3 Commits
issue_56
...
compute_no
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a267dfa41f | ||
|
|
1b9eb9430c | ||
|
|
9a4fbf365c |
6
.github/workflows/testing.yml
vendored
6
.github/workflows/testing.yml
vendored
@@ -4,7 +4,6 @@ on: [push]
|
||||
|
||||
jobs:
|
||||
regression-check:
|
||||
timeout-minutes: 10
|
||||
name: run regression test suite
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -77,7 +76,10 @@ jobs:
|
||||
target
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
|
||||
|
||||
- name: Build
|
||||
# That build is only to build dependencies and can be skipped if Cargo.lock
|
||||
# wasn't changed. Next steps need their own build
|
||||
- name: Install cargo deps
|
||||
if: steps.cache_cargo.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
cargo build
|
||||
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,3 @@
|
||||
/target
|
||||
/tmp_check
|
||||
/tmp_install
|
||||
/tmp_check_cli
|
||||
.vscode
|
||||
|
||||
409
Cargo.lock
generated
409
Cargo.lock
generated
@@ -162,9 +162,9 @@ checksum = "e91831deabf0d6d7ec49552e489aed63b7456a7a3c46cff62adad428110b0af0"
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.49"
|
||||
version = "0.1.48"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589652ce7ccb335d1e7ecb3be145425702b290dbcb7029bbeaae263fc1d87b48"
|
||||
checksum = "36ea56748e10732c49404c153638a15ec3d6211ec5ff35d9bb20e13b93576adf"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -241,30 +241,6 @@ version = "0.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.53.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c72a978d268b1d70b0e963217e60fdabd9523a941457a6c42a7315d15c7e89e5"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cexpr",
|
||||
"cfg-if 0.1.10",
|
||||
"clang-sys",
|
||||
"clap",
|
||||
"env_logger",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
"log",
|
||||
"peeking_take_while",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.2.1"
|
||||
@@ -347,15 +323,6 @@ version = "1.0.67"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd"
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27"
|
||||
dependencies = [
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "0.1.10"
|
||||
@@ -381,17 +348,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clang-sys"
|
||||
version = "0.29.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a"
|
||||
dependencies = [
|
||||
"glob",
|
||||
"libc",
|
||||
"libloading",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "2.33.3"
|
||||
@@ -422,28 +378,6 @@ version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc"
|
||||
|
||||
[[package]]
|
||||
name = "control_plane"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"fs_extra",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.3",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_derive",
|
||||
"tar",
|
||||
"tokio-postgres",
|
||||
"toml",
|
||||
"walkeeper",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.1"
|
||||
@@ -477,9 +411,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.1"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4"
|
||||
checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"crossbeam-utils",
|
||||
@@ -594,19 +528,6 @@ dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"humantime",
|
||||
"log",
|
||||
"regex",
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "event-listener"
|
||||
version = "2.5.1"
|
||||
@@ -628,18 +549,6 @@ dependencies = [
|
||||
"instant",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "filetime"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"libc",
|
||||
"redox_syscall 0.2.6",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
@@ -681,17 +590,11 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs_extra"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394"
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a9d5813545e459ad3ca1bff9915e9ad7f1a47dc6a91b627ce321d5863b7dd253"
|
||||
checksum = "7f55667319111d593ba876406af7c409c0ebb44dc4be6132a783ccf163ea14c1"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
@@ -704,9 +607,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ce79c6a52a299137a6013061e0cf0e688fce5d7f1bc60125f520912fdb29ec25"
|
||||
checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
@@ -714,15 +617,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "098cd1c6dda6ca01650f1a37a794245eb73181d0d4d4e955e2f3c37db7af1815"
|
||||
checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10f6cb7042eda00f0049b1d2080aa4b93442997ee507eb3828e8bd7577f94c9d"
|
||||
checksum = "891a4b7b96d84d5940084b2a37632dd65deeae662c114ceaa2c879629c9c0ad1"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
@@ -731,9 +634,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "365a1a1fb30ea1c03a830fdb2158f5236833ac81fa0ad12fe35b29cddc35cb04"
|
||||
checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59"
|
||||
|
||||
[[package]]
|
||||
name = "futures-lite"
|
||||
@@ -752,9 +655,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "668c6733a182cd7deb4f1de7ba3bf2120823835b3bcfbeacf7d2c4a773c1bb8b"
|
||||
checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7"
|
||||
dependencies = [
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
@@ -764,21 +667,21 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c5629433c555de3d82861a7a4e3794a4c40040390907cfbfd7143a92a426c23"
|
||||
checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3"
|
||||
|
||||
[[package]]
|
||||
name = "futures-task"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba7aa51095076f3ba6d9a1f702f74bd05ec65f555d70d2033d55ba8d69f581bc"
|
||||
checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80"
|
||||
|
||||
[[package]]
|
||||
name = "futures-util"
|
||||
version = "0.3.14"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c144ad54d60f23927f0a6b6d816e4271278b64f005ad65e4e35291d2de9c025"
|
||||
checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
@@ -826,12 +729,6 @@ dependencies = [
|
||||
"wasi 0.10.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "glob"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
|
||||
|
||||
[[package]]
|
||||
name = "gloo-timers"
|
||||
version = "0.2.1"
|
||||
@@ -900,9 +797,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "0.2.4"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11"
|
||||
checksum = "7245cd7449cc792608c3c8a9eaf69bd4eabbabf802713748fd739c98b82f0747"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
@@ -922,9 +819,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "httparse"
|
||||
version = "1.3.6"
|
||||
version = "1.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc35c995b9d93ec174cf9a27d425c7892722101e14993cd227fdb51d70cf9589"
|
||||
checksum = "615caabe2c3160b313d52ccc905335f4ed5f10881dd63dc5699d47e90be85691"
|
||||
|
||||
[[package]]
|
||||
name = "httpdate"
|
||||
@@ -932,15 +829,6 @@ version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f"
|
||||
dependencies = [
|
||||
"quick-error",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "0.14.5"
|
||||
@@ -958,7 +846,7 @@ dependencies = [
|
||||
"httpdate",
|
||||
"itoa",
|
||||
"pin-project",
|
||||
"socket2",
|
||||
"socket2 0.4.0",
|
||||
"tokio",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
@@ -1012,7 +900,6 @@ dependencies = [
|
||||
name = "integration_tests"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"control_plane",
|
||||
"lazy_static",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
@@ -1057,27 +944,11 @@ version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
|
||||
[[package]]
|
||||
name = "lazycell"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.93"
|
||||
version = "0.2.92"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"
|
||||
|
||||
[[package]]
|
||||
name = "libloading"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"winapi",
|
||||
]
|
||||
checksum = "56d855069fafbb9b344c0f962150cd2c1187975cb1c22c1522c240d8c4986714"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
@@ -1191,17 +1062,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a19900e7eee95eb2b3c2e26d12a874cc80aaf750e31be6fcbe743ead369fa45d"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"socket2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "5.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"version_check",
|
||||
"socket2 0.4.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1315,14 +1176,12 @@ dependencies = [
|
||||
"crc32c",
|
||||
"crossbeam-channel",
|
||||
"daemonize",
|
||||
"fs2",
|
||||
"futures",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.3",
|
||||
"regex",
|
||||
"rust-s3",
|
||||
@@ -1331,14 +1190,11 @@ dependencies = [
|
||||
"slog-scope",
|
||||
"slog-stdlog",
|
||||
"slog-term",
|
||||
"tar",
|
||||
"termion",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tui",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1367,17 +1223,11 @@ dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"instant",
|
||||
"libc",
|
||||
"redox_syscall 0.2.6",
|
||||
"redox_syscall 0.2.5",
|
||||
"smallvec",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.1.0"
|
||||
@@ -1455,8 +1305,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.1"
|
||||
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=a0d067b66447951d1276a53fb09886539c3fa094#a0d067b66447951d1276a53fb09886539c3fa094"
|
||||
version = "0.19.0"
|
||||
source = "git+https://github.com/kelvich/rust-postgres?branch=replication_rebase#f3425d991f75cb7b464a37e6b3d5d05f8bf51c02"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -1468,8 +1318,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.1"
|
||||
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=a0d067b66447951d1276a53fb09886539c3fa094#a0d067b66447951d1276a53fb09886539c3fa094"
|
||||
version = "0.6.0"
|
||||
source = "git+https://github.com/kelvich/rust-postgres?branch=replication_rebase#f3425d991f75cb7b464a37e6b3d5d05f8bf51c02"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"byteorder",
|
||||
@@ -1485,28 +1335,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.1"
|
||||
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=a0d067b66447951d1276a53fb09886539c3fa094#a0d067b66447951d1276a53fb09886539c3fa094"
|
||||
version = "0.2.0"
|
||||
source = "git+https://github.com/kelvich/rust-postgres?branch=replication_rebase#f3425d991f75cb7b464a37e6b3d5d05f8bf51c02"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
"postgres-protocol",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bindgen",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"crc32c",
|
||||
"hex",
|
||||
"rand 0.8.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.10"
|
||||
@@ -1534,12 +1370,6 @@ dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-error"
|
||||
version = "1.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.9"
|
||||
@@ -1638,9 +1468,9 @@ checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.6"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8270314b5ccceb518e7e578952f0b72b88222d02e8f77f5ecf7abbb673539041"
|
||||
checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
@@ -1651,7 +1481,7 @@ version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8440d8acb4fd3d277125b4bd01a6f38aee8d814b3b5fc09b3f2b825d37d3fe8f"
|
||||
dependencies = [
|
||||
"redox_syscall 0.2.6",
|
||||
"redox_syscall 0.2.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1672,7 +1502,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64"
|
||||
dependencies = [
|
||||
"getrandom 0.2.2",
|
||||
"redox_syscall 0.2.6",
|
||||
"redox_syscall 0.2.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1703,9 +1533,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.11.3"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124"
|
||||
checksum = "bf12057f289428dbf5c591c74bf10392e4a8003f993405a902f20117019022d4"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
@@ -1761,7 +1591,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "rust-s3"
|
||||
version = "0.27.0-beta1"
|
||||
source = "git+https://github.com/hlinnaka/rust-s3?rev=7f15a24ec7daa0a5d9516da706212745f9042818#7f15a24ec7daa0a5d9516da706212745f9042818"
|
||||
source = "git+https://github.com/hlinnaka/rust-s3#7f15a24ec7daa0a5d9516da706212745f9042818"
|
||||
dependencies = [
|
||||
"async-std",
|
||||
"async-trait",
|
||||
@@ -1789,12 +1619,6 @@ dependencies = [
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.2.3"
|
||||
@@ -1816,15 +1640,6 @@ version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.19"
|
||||
@@ -1944,12 +1759,6 @@ dependencies = [
|
||||
"opaque-debug",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2"
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.3.0"
|
||||
@@ -2036,6 +1845,17 @@ version = "1.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.3.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "socket2"
|
||||
version = "0.4.0"
|
||||
@@ -2070,9 +1890,9 @@ checksum = "1e81da0851ada1f3e9d4312c704aa4f8806f0f9d69faaf8df2f3464b4a9437c2"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.69"
|
||||
version = "1.0.68"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48fe99c6bd8b1cc636890bcc071842de909d902c81ac7dab53ba33c421ab8ffb"
|
||||
checksum = "3ce15dd3ed8aa2f8eeac4716d6ef5ab58b6b9256db41d7e1a0224c2788e8fd87"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -2085,17 +1905,6 @@ version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
|
||||
|
||||
[[package]]
|
||||
name = "tar"
|
||||
version = "0.4.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0bcfbd6a598361fda270d82469fff3d65089dc33e175c9a131f7b4cd395f228"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"libc",
|
||||
"xattr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.2.0"
|
||||
@@ -2105,7 +1914,7 @@ dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"libc",
|
||||
"rand 0.8.3",
|
||||
"redox_syscall 0.2.6",
|
||||
"redox_syscall 0.2.5",
|
||||
"remove_dir_all",
|
||||
"winapi",
|
||||
]
|
||||
@@ -2121,15 +1930,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termion"
|
||||
version = "1.5.6"
|
||||
@@ -2138,7 +1938,7 @@ checksum = "077185e2eac69c3f8379a4298e1e07cd36beb962290d4a51199acf0fdc10607e"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"numtoa",
|
||||
"redox_syscall 0.2.6",
|
||||
"redox_syscall 0.2.5",
|
||||
"redox_termios",
|
||||
]
|
||||
|
||||
@@ -2208,9 +2008,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.5.0"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83f0c8e7c0addab50b663055baf787d0af7f413a46e6e7fb9559a4e4db7137a5"
|
||||
checksum = "134af885d758d645f0f0505c9a8b3f9bf8a348fd822e112ab5248138348f1722"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"bytes",
|
||||
@@ -2249,8 +2049,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.1"
|
||||
source = "git+https://github.com/zenithdb/rust-postgres.git?rev=a0d067b66447951d1276a53fb09886539c3fa094#a0d067b66447951d1276a53fb09886539c3fa094"
|
||||
version = "0.7.0"
|
||||
source = "git+https://github.com/kelvich/rust-postgres?branch=replication_rebase#f3425d991f75cb7b464a37e6b3d5d05f8bf51c02"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -2261,10 +2061,10 @@ dependencies = [
|
||||
"parking_lot",
|
||||
"percent-encoding",
|
||||
"phf",
|
||||
"pin-project-lite",
|
||||
"pin-project",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"socket2",
|
||||
"socket2 0.3.19",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
]
|
||||
@@ -2282,9 +2082,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-util"
|
||||
version = "0.6.6"
|
||||
version = "0.6.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e"
|
||||
checksum = "5143d049e85af7fbc36f5454d990e62c2df705b3589f123b71f441b6b59f443f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
@@ -2294,15 +2094,6 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "0.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-service"
|
||||
version = "0.3.1"
|
||||
@@ -2356,9 +2147,9 @@ checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-bidi"
|
||||
version = "0.3.5"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eeb8be209bb1c96b7c177c7420d26e04eccacb0eeae6b980e35fcb74678107e0"
|
||||
checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5"
|
||||
dependencies = [
|
||||
"matches",
|
||||
]
|
||||
@@ -2441,17 +2232,6 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d5b2c62b4012a3e1eca5a7e077d13b3bf498c4073e33ccd58626607748ceeca"
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
|
||||
dependencies = [
|
||||
"same-file",
|
||||
"winapi",
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "walkeeper"
|
||||
version = "0.1.0"
|
||||
@@ -2459,24 +2239,31 @@ dependencies = [
|
||||
"anyhow",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"crossbeam-channel",
|
||||
"daemonize",
|
||||
"fs2",
|
||||
"futures",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"pageserver",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"rand 0.8.3",
|
||||
"regex",
|
||||
"rust-s3",
|
||||
"slog",
|
||||
"slog-async",
|
||||
"slog-scope",
|
||||
"slog-stdlog",
|
||||
"slog-term",
|
||||
"termion",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"tui",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2588,15 +2375,6 @@ dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "3.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wildmatch"
|
||||
version = "1.1.0"
|
||||
@@ -2619,15 +2397,6 @@ version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
@@ -2643,32 +2412,8 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xattr"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "244c3741f4240ef46274860397c7c74e50eb23624996930e484c16679633a54c"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xml-rs"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b07db065a5cf61a7e4ba64f29e67db906fb1787316516c4e6e5ff0fea1efcd8a"
|
||||
|
||||
[[package]]
|
||||
name = "zenith"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"control_plane",
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zenith_utils"
|
||||
version = "0.1.0"
|
||||
|
||||
@@ -3,8 +3,4 @@ members = [
|
||||
"integration_tests",
|
||||
"pageserver",
|
||||
"walkeeper",
|
||||
"zenith",
|
||||
"control_plane",
|
||||
"postgres_ffi",
|
||||
"zenith_utils",
|
||||
]
|
||||
|
||||
48
README.md
48
README.md
@@ -2,54 +2,6 @@
|
||||
|
||||
Zenith substitutes PostgreSQL storage layer and redistributes data across a cluster of nodes
|
||||
|
||||
## Running local installation
|
||||
|
||||
1. Build zenith and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/libzenith/zenith.git
|
||||
cd zenith
|
||||
./pgbuild.sh # builds postgres and installs it to ./tmp_install
|
||||
cargo build
|
||||
```
|
||||
|
||||
2. Start pageserver and postggres on top of it (should be called from repo root):
|
||||
```sh
|
||||
# Create ~/.zenith with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
>./target/debug/zenith init
|
||||
|
||||
# start pageserver
|
||||
> ./target/debug/zenith pageserver start
|
||||
Starting pageserver at '127.0.0.1:64000'
|
||||
|
||||
# create and configure postgres data dir
|
||||
> ./target/debug/zenith pg create
|
||||
Creating new postgres: path=/Users/user/code/zenith/tmp_check_cli/compute/pg1 port=55432
|
||||
Database initialized
|
||||
|
||||
# start it
|
||||
> ./target/debug/zenith pg start pg1
|
||||
|
||||
# look up status and connection info
|
||||
> ./target/debug/zenith pg list
|
||||
NODE ADDRESS STATUS
|
||||
pg1 127.0.0.1:55432 running
|
||||
```
|
||||
|
||||
3. Now it is possible to connect to postgres and run some queries:
|
||||
```
|
||||
> psql -p55432 -h 127.0.0.1 postgres
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
CREATE TABLE
|
||||
postgres=# insert into t values(1,1);
|
||||
INSERT 0 1
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
```sh
|
||||
|
||||
188
cli-v2-story.md
188
cli-v2-story.md
@@ -1,188 +0,0 @@
|
||||
Create a new Zenith repository in the current directory:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
|
||||
The files belonging to this database system will be owned by user "heikki".
|
||||
This user must also own the server process.
|
||||
|
||||
The database cluster will be initialized with locale "en_GB.UTF-8".
|
||||
The default database encoding has accordingly been set to "UTF8".
|
||||
The default text search configuration will be set to "english".
|
||||
|
||||
Data page checksums are disabled.
|
||||
|
||||
creating directory tmp ... ok
|
||||
creating subdirectories ... ok
|
||||
selecting dynamic shared memory implementation ... posix
|
||||
selecting default max_connections ... 100
|
||||
selecting default shared_buffers ... 128MB
|
||||
selecting default time zone ... Europe/Helsinki
|
||||
creating configuration files ... ok
|
||||
running bootstrap script ... ok
|
||||
performing post-bootstrap initialization ... ok
|
||||
syncing data to disk ... ok
|
||||
|
||||
initdb: warning: enabling "trust" authentication for local connections
|
||||
You can change this by editing pg_hba.conf or using the option -A, or
|
||||
--auth-local and --auth-host, the next time you run initdb.
|
||||
new zenith repository was created in .zenith
|
||||
|
||||
Initially, there is only one branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
|
||||
main
|
||||
|
||||
Start a local Postgres instance on the branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432
|
||||
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432
|
||||
2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432"
|
||||
2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status"
|
||||
2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0
|
||||
2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required
|
||||
2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
|
||||
Run some commands against it:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);"
|
||||
CREATE TABLE
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');"
|
||||
INSERT 0 1
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
Create a new branch called 'experimental'. We create it from the
|
||||
current end of the 'main' branch, but you could specify a different
|
||||
LSN as the start point instead.
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
|
||||
branching at end of WAL: 0/161F478
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
|
||||
experimental
|
||||
main
|
||||
|
||||
Start another Postgres instance off the 'experimental' branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433
|
||||
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433
|
||||
2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
|
||||
2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80
|
||||
2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0
|
||||
2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
|
||||
2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
|
||||
Insert some a row on the 'experimental' branch:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')"
|
||||
INSERT 0 1
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
inserted on experimental
|
||||
(2 rows)
|
||||
|
||||
See that the other Postgres instance is still running on 'main' branch on port 5432:
|
||||
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
(1 row)
|
||||
|
||||
|
||||
|
||||
|
||||
Everything is stored in the .zenith directory:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
|
||||
total 12
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines
|
||||
|
||||
The 'datadirs' directory contains the datadirs of the running instances:
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
|
||||
total 8
|
||||
drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
|
||||
drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
|
||||
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
|
||||
total 124
|
||||
drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem
|
||||
-rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf
|
||||
-rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical
|
||||
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase
|
||||
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION
|
||||
lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
|
||||
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact
|
||||
-rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf
|
||||
-rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
|
||||
-rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts
|
||||
-rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid
|
||||
|
||||
Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
|
||||
datadir is ephemeral, you can delete it at any time, and it can be reconstructed
|
||||
from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
|
||||
the repository, the 'datadirs' are not included. (They are like git working trees)
|
||||
|
||||
~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
|
||||
~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
|
||||
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
|
||||
Creating data directory from snapshot at 0/15FFB08...
|
||||
waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
|
||||
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433
|
||||
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433
|
||||
2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
|
||||
2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
|
||||
2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress
|
||||
2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80
|
||||
2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0
|
||||
2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
|
||||
2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections
|
||||
done
|
||||
server started
|
||||
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
|
||||
t
|
||||
-----------------------------
|
||||
inserted on the main branch
|
||||
inserted on experimental
|
||||
(2 rows)
|
||||
|
||||
1
control_plane/.gitignore
vendored
1
control_plane/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
tmp_check/
|
||||
@@ -1,27 +0,0 @@
|
||||
[package]
|
||||
name = "control_plane"
|
||||
version = "0.1.0"
|
||||
authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
rand = "0.8.3"
|
||||
tar = "0.4.33"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
|
||||
serde = ""
|
||||
serde_derive = ""
|
||||
toml = ""
|
||||
lazy_static = ""
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
hex = "0.4.3"
|
||||
bytes = "1.0.1"
|
||||
fs_extra = "1.2.0"
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
@@ -1,459 +0,0 @@
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::io::{Read, Write};
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use tar;
|
||||
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::storage::{PageServerNode, WalProposerNode};
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
//
|
||||
pub struct ComputeControlPlane {
|
||||
base_port: u16,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
pub nodes: BTreeMap<String, Arc<PostgresNode>>,
|
||||
env: LocalEnv,
|
||||
}
|
||||
|
||||
impl ComputeControlPlane {
|
||||
// Load current nodes with ports from data directories on disk
|
||||
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
|
||||
// TODO: since pageserver do not have config file yet we believe here that
|
||||
// it is running on default port. Change that when pageserver will have config.
|
||||
let pageserver = Arc::new(PageServerNode::from_env(&env));
|
||||
|
||||
let pgdatadirspath = env.repo_path.join("pgdatadirs");
|
||||
let nodes: Result<BTreeMap<_, _>> = fs::read_dir(&pgdatadirspath)
|
||||
.with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
|
||||
.into_iter()
|
||||
.map(|f| {
|
||||
PostgresNode::from_dir_entry(f?, &env, &pageserver)
|
||||
.map(|node| (node.name.clone(), Arc::new(node)))
|
||||
})
|
||||
.collect();
|
||||
let nodes = nodes?;
|
||||
|
||||
Ok(ComputeControlPlane {
|
||||
base_port: 55431,
|
||||
pageserver,
|
||||
nodes,
|
||||
env,
|
||||
})
|
||||
}
|
||||
|
||||
fn get_port(&mut self) -> u16 {
|
||||
1 + self
|
||||
.nodes
|
||||
.iter()
|
||||
.map(|(_name, node)| node.address.port())
|
||||
.max()
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
|
||||
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
base_port: 65431,
|
||||
pageserver: Arc::clone(pageserver),
|
||||
nodes: BTreeMap::new(),
|
||||
env: local_env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Connect to a page server, get base backup, and untar it to initialize a
|
||||
/// new data directory
|
||||
pub fn new_from_page_server(
|
||||
&mut self,
|
||||
is_test: bool,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
let node_id = self.nodes.len() as u32 + 1;
|
||||
|
||||
let node = Arc::new(PostgresNode {
|
||||
name: format!("pg{}", node_id),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
is_test,
|
||||
timelineid,
|
||||
});
|
||||
|
||||
node.init_from_page_server()?;
|
||||
self.nodes.insert(node.name.clone(), Arc::clone(&node));
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
|
||||
let node = self.new_from_page_server(true, timelineid);
|
||||
assert!(node.is_ok());
|
||||
let node = node.unwrap();
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
node.connstr()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
|
||||
let node = self.new_from_page_server(true, timelineid).unwrap();
|
||||
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
"synchronous_standby_names = 'safekeeper_proxy'\n",
|
||||
);
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_node(&mut self, timelineid: ZTimelineId) -> Result<Arc<PostgresNode>> {
|
||||
let node = self.new_from_page_server(false, timelineid).unwrap();
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
node.connstr()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct PostgresNode {
|
||||
pub address: SocketAddr,
|
||||
name: String,
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
timelineid: ZTimelineId,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
fn from_dir_entry(
|
||||
entry: std::fs::DirEntry,
|
||||
env: &LocalEnv,
|
||||
pageserver: &Arc<PageServerNode>,
|
||||
) -> Result<PostgresNode> {
|
||||
if !entry.file_type()?.is_dir() {
|
||||
anyhow::bail!(
|
||||
"PostgresNode::from_dir_entry failed: '{}' is not a directory",
|
||||
entry.path().display()
|
||||
);
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
|
||||
}
|
||||
|
||||
// parse data directory name
|
||||
let fname = entry.file_name();
|
||||
let name = fname.to_str().unwrap().to_string();
|
||||
|
||||
// find out tcp port in config file
|
||||
let cfg_path = entry.path().join("postgresql.conf");
|
||||
let config = fs::read_to_string(cfg_path.clone()).with_context(|| {
|
||||
format!(
|
||||
"failed to read config file in {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
)
|
||||
})?;
|
||||
|
||||
let err_msg = format!(
|
||||
"failed to find port definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let port: u16 = CONF_PORT_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or(anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or(anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or(anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// FIXME: What timeline is this server on? Would have to parse the postgresql.conf
|
||||
// file for that, too. It's currently not needed for anything, but it would be
|
||||
// nice to list the timeline in "zenith pg list"
|
||||
let timelineid_buf = [0u8; 16];
|
||||
let timelineid = ZTimelineId::from(timelineid_buf);
|
||||
|
||||
// ok now
|
||||
Ok(PostgresNode {
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
name,
|
||||
env: env.clone(),
|
||||
pageserver: Arc::clone(pageserver),
|
||||
is_test: false,
|
||||
timelineid,
|
||||
})
|
||||
}
|
||||
|
||||
// Connect to a page server, get base backup, and untar it to initialize a
|
||||
// new data directory
|
||||
pub fn init_from_page_server(&self) -> Result<()> {
|
||||
let pgdata = self.pgdata();
|
||||
|
||||
println!(
|
||||
"Extracting base backup to create postgres instance: path={} port={}",
|
||||
pgdata.display(),
|
||||
self.address.port()
|
||||
);
|
||||
|
||||
// initialize data directory
|
||||
if self.is_test {
|
||||
fs::remove_dir_all(&pgdata).ok();
|
||||
}
|
||||
|
||||
let sql = format!("basebackup {}", self.timelineid);
|
||||
let mut client = self
|
||||
.pageserver
|
||||
.page_server_psql_client()
|
||||
.with_context(|| "connecting to page server failed")?;
|
||||
|
||||
fs::create_dir_all(&pgdata)
|
||||
.with_context(|| format!("could not create data directory {}", pgdata.display()))?;
|
||||
fs::set_permissions(pgdata.as_path(), fs::Permissions::from_mode(0o700)).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"could not set permissions in data directory {}",
|
||||
pgdata.display()
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
// FIXME: The compute node should be able to stream the WAL it needs from the WAL safekeepers or archive.
|
||||
// But that's not implemented yet. For now, 'pg_wal' is included in the base backup tarball that
|
||||
// we receive from the Page Server, so we don't need to create the empty 'pg_wal' directory here.
|
||||
//fs::create_dir_all(pgdata.join("pg_wal"))?;
|
||||
|
||||
let mut copyreader = client
|
||||
.copy_out(sql.as_str())
|
||||
.with_context(|| "page server 'basebackup' command failed")?;
|
||||
|
||||
// FIXME: Currently, we slurp the whole tarball into memory, and then extract it,
|
||||
// but we really should do this:
|
||||
//let mut ar = tar::Archive::new(copyreader);
|
||||
let mut buf = vec![];
|
||||
copyreader
|
||||
.read_to_end(&mut buf)
|
||||
.with_context(|| "reading base backup from page server failed")?;
|
||||
let mut ar = tar::Archive::new(buf.as_slice());
|
||||
ar.unpack(&pgdata)
|
||||
.with_context(|| "extracting page backup failed")?;
|
||||
|
||||
// listen for selected port
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n",
|
||||
address = self.address.ip(),
|
||||
port = self.address.port()
|
||||
),
|
||||
);
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeepr or
|
||||
// page server yet. But this will do for now.
|
||||
self.append_conf("postgresql.conf", &format!("wal_keep_size='10TB'\n"));
|
||||
|
||||
// Connect it to the page server.
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"page_server_connstring = 'host={} port={}'\n\
|
||||
zenith_timeline='{}'\n",
|
||||
self.pageserver.address().ip(),
|
||||
self.pageserver.address().port(),
|
||||
self.timelineid
|
||||
),
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pgdata(&self) -> PathBuf {
|
||||
self.env.repo_path.join("pgdatadirs").join(&self.name)
|
||||
}
|
||||
|
||||
pub fn status(&self) -> &str {
|
||||
let timeout = Duration::from_millis(300);
|
||||
let has_pidfile = self.pgdata().join("postmaster.pid").exists();
|
||||
let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok();
|
||||
|
||||
match (has_pidfile, can_connect) {
|
||||
(true, true) => "running",
|
||||
(false, false) => "stopped",
|
||||
(true, false) => "crashed",
|
||||
(false, true) => "running, no pidfile",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_conf(&self, config: &str, opts: &str) {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata().join(config).to_str().unwrap())
|
||||
.unwrap()
|
||||
.write_all(opts.as_bytes())
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str]) -> Result<()> {
|
||||
let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl");
|
||||
|
||||
let pg_ctl = Command::new(pg_ctl_path)
|
||||
.args(
|
||||
[
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata().to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata().join("log").to_str().unwrap(),
|
||||
],
|
||||
args,
|
||||
]
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.with_context(|| "pg_ctl failed")?;
|
||||
if !pg_ctl.success() {
|
||||
anyhow::bail!("pg_ctl failed");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"])
|
||||
}
|
||||
|
||||
pub fn restart(&self) -> Result<()> {
|
||||
self.pg_ctl(&["restart"])
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"])
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!(
|
||||
"host={} port={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
self.whoami()
|
||||
)
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
client.query(sql, &[]).unwrap()
|
||||
}
|
||||
|
||||
pub fn open_psql(&self, db: &str) -> Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls).unwrap()
|
||||
}
|
||||
|
||||
pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
|
||||
let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["--ztimelineid", &self.timelineid.to_string()])
|
||||
.args(&["-s", wal_acceptors])
|
||||
.args(&["-h", &self.address.ip().to_string()])
|
||||
.args(&["-p", &self.address.port().to_string()])
|
||||
.arg("-v")
|
||||
.stderr(
|
||||
OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(self.pgdata().join("safekeeper_proxy.log"))
|
||||
.unwrap(),
|
||||
)
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO
|
||||
pub fn pg_bench() {}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
if self.is_test {
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
//
|
||||
// Local control plane.
|
||||
//
|
||||
// Can start, cofigure and stop postgres instances running as a local processes.
|
||||
//
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
|
||||
pub mod compute;
|
||||
pub mod local_env;
|
||||
pub mod storage;
|
||||
@@ -1,389 +0,0 @@
|
||||
//
|
||||
// This module is responsible for locating and loading paths in a local setup.
|
||||
//
|
||||
// Now it also provides init method which acts like a stub for proper installation
|
||||
// script which will use local paths.
|
||||
//
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use rand::Rng;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
use anyhow::Result;
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
use pageserver::ZTimelineId;
|
||||
use walkeeper::xlog_utils;
|
||||
|
||||
//
|
||||
// This data structure represents deserialized zenith config, which should be
|
||||
// located in ~/.zenith
|
||||
//
|
||||
// TODO: should we also support ZENITH_CONF env var?
|
||||
//
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct LocalEnv {
|
||||
// Path to the Repository. Here page server and compute nodes will create and store their data.
|
||||
pub repo_path: PathBuf,
|
||||
|
||||
// System identifier, from the PostgreSQL control file
|
||||
pub systemid: u64,
|
||||
|
||||
// Path to postgres distribution. It's expected that "bin", "include",
|
||||
// "lib", "share" from postgres distribution are there. If at some point
|
||||
// in time we will be able to run against vanilla postgres we may split that
|
||||
// to four separate paths and match OS-specific installation layout.
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Path to pageserver binary.
|
||||
pub zenith_distrib_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
// postgres installation
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
}
|
||||
|
||||
fn zenith_repo_dir() -> PathBuf {
|
||||
// Find repository path
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init() -> Result<()> {
|
||||
// check if config already exists
|
||||
let repo_path = zenith_repo_dir();
|
||||
if repo_path.exists() {
|
||||
anyhow::bail!(
|
||||
"{} already exists. Perhaps already initialized?",
|
||||
repo_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// Now we can run init only from crate directory, so check that current dir is our crate.
|
||||
// Use 'pageserver/Cargo.toml' existence as evidendce.
|
||||
let cargo_path = env::current_dir()?;
|
||||
if !cargo_path.join("pageserver/Cargo.toml").exists() {
|
||||
anyhow::bail!(
|
||||
"Current dirrectory does not look like a zenith repo. \
|
||||
Please, run 'init' from zenith repo root."
|
||||
);
|
||||
}
|
||||
|
||||
// ok, now check that expected binaries are present
|
||||
|
||||
// check postgres
|
||||
let pg_distrib_dir = cargo_path.join("tmp_install");
|
||||
let pg_path = pg_distrib_dir.join("bin/postgres");
|
||||
if !pg_path.exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find postres binary at {}. \
|
||||
Perhaps './pgbuild.sh' is needed to build it first.",
|
||||
pg_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// check pageserver
|
||||
let zenith_distrib_dir = cargo_path.join("target/debug/");
|
||||
let pageserver_path = zenith_distrib_dir.join("pageserver");
|
||||
if !pageserver_path.exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find pageserver binary at {}. Please build it.",
|
||||
pageserver_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// ok, we are good to go
|
||||
let mut conf = LocalEnv {
|
||||
repo_path: repo_path.clone(),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir,
|
||||
systemid: 0,
|
||||
};
|
||||
init_repo(&mut conf)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
|
||||
let repopath = &local_env.repo_path;
|
||||
fs::create_dir(&repopath)
|
||||
.with_context(|| format!("could not create directory {}", repopath.display()))?;
|
||||
fs::create_dir(repopath.join("pgdatadirs"))?;
|
||||
fs::create_dir(repopath.join("timelines"))?;
|
||||
fs::create_dir(repopath.join("refs"))?;
|
||||
fs::create_dir(repopath.join("refs").join("branches"))?;
|
||||
fs::create_dir(repopath.join("refs").join("tags"))?;
|
||||
println!("created directory structure in {}", repopath.display());
|
||||
|
||||
// Create initial timeline
|
||||
let tli = create_timeline(&local_env, None)?;
|
||||
let timelinedir = repopath.join("timelines").join(tli.to_string());
|
||||
println!("created initial timeline {}", timelinedir.display());
|
||||
|
||||
// Run initdb
|
||||
//
|
||||
// FIXME: we create it temporarily in "tmp" directory, and move it into
|
||||
// the repository. Use "tempdir()" or something? Or just create it directly
|
||||
// in the repo?
|
||||
let initdb_path = local_env.pg_bin_dir().join("initdb");
|
||||
let _initdb = Command::new(initdb_path)
|
||||
.args(&["-D", "tmp"])
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap())
|
||||
.stdout(Stdio::null())
|
||||
.status()
|
||||
.with_context(|| "failed to execute initdb")?;
|
||||
println!("initdb succeeded");
|
||||
|
||||
// Read control file to extract the LSN and system id
|
||||
let controlfile =
|
||||
postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?;
|
||||
let systemid = controlfile.system_identifier;
|
||||
let lsn = controlfile.checkPoint;
|
||||
let lsnstr = format!("{:016X}", lsn);
|
||||
|
||||
// Move the initial WAL file
|
||||
fs::rename(
|
||||
"tmp/pg_wal/000000010000000000000001",
|
||||
timelinedir
|
||||
.join("wal")
|
||||
.join("000000010000000000000001.partial"),
|
||||
)?;
|
||||
println!("moved initial WAL file");
|
||||
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all("tmp/pg_wal")?;
|
||||
println!("removed tmp/pg_wal");
|
||||
|
||||
force_crash_recovery(&PathBuf::from("tmp"))?;
|
||||
println!("updated pg_control");
|
||||
|
||||
let target = timelinedir.join("snapshots").join(&lsnstr);
|
||||
fs::rename("tmp", &target)?;
|
||||
println!("moved 'tmp' to {}", target.display());
|
||||
|
||||
// Create 'main' branch to refer to the initial timeline
|
||||
let data = tli.to_string();
|
||||
fs::write(repopath.join("refs").join("branches").join("main"), data)?;
|
||||
println!("created main branch");
|
||||
|
||||
// Also update the system id in the LocalEnv
|
||||
local_env.systemid = systemid;
|
||||
|
||||
// write config
|
||||
let toml = toml::to_string(&local_env)?;
|
||||
fs::write(repopath.join("config"), toml)?;
|
||||
|
||||
println!(
|
||||
"new zenith repository was created in {}",
|
||||
repopath.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// If control file says the cluster was shut down cleanly, modify it, to mark
|
||||
// it as crashed. That forces crash recovery when you start the cluster.
|
||||
//
|
||||
// FIXME:
|
||||
// We currently do this to the initial snapshot in "zenith init". It would
|
||||
// be more natural to do this when the snapshot is restored instead, but we
|
||||
// currently don't have any code to create new snapshots, so it doesn't matter
|
||||
// Or better yet, use a less hacky way of putting the cluster into recovery.
|
||||
// Perhaps create a backup label file in the data directory when it's restored.
|
||||
fn force_crash_recovery(datadir: &Path) -> Result<()> {
|
||||
// Read in the control file
|
||||
let controlfilepath = datadir.to_path_buf().join("global").join("pg_control");
|
||||
let mut controlfile =
|
||||
postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?;
|
||||
|
||||
controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION;
|
||||
|
||||
fs::write(
|
||||
controlfilepath.as_path(),
|
||||
postgres_ffi::encode_pg_control(controlfile),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// check that config file is present
|
||||
pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
toml::from_str(config.as_str()).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
// local env for tests
|
||||
pub fn test_env(testname: &str) -> LocalEnv {
|
||||
fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check");
|
||||
|
||||
let repo_path = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_check/")
|
||||
.join(testname);
|
||||
|
||||
// Remove remnants of old test repo
|
||||
let _ = fs::remove_dir_all(&repo_path);
|
||||
|
||||
let mut local_env = LocalEnv {
|
||||
repo_path,
|
||||
pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
|
||||
zenith_distrib_dir: cargo_bin_dir(),
|
||||
systemid: 0,
|
||||
};
|
||||
init_repo(&mut local_env).expect("could not initialize zenith repository");
|
||||
return local_env;
|
||||
}
|
||||
|
||||
// Find the directory where the cargo-built binaries were put (i.e. target/debug/)
pub fn cargo_bin_dir() -> PathBuf {
    let mut dir = std::env::current_exe().unwrap();
    dir.pop();
    // Test binaries live one level deeper, in target/<profile>/deps/.
    if dir.ends_with("deps") {
        dir.pop();
    }
    dir
}
|
||||
|
||||
// A point in the WAL history: a timeline plus an LSN on it.
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
    // Timeline the LSN refers to.
    pub timelineid: ZTimelineId,
    // WAL position on that timeline, as a raw 64-bit LSN.
    pub lsn: u64,
}
|
||||
|
||||
fn create_timeline(local_env: &LocalEnv, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
// Create initial timeline
|
||||
let mut tli_buf = [0u8; 16];
|
||||
rand::thread_rng().fill(&mut tli_buf);
|
||||
let timelineid = ZTimelineId::from(tli_buf);
|
||||
|
||||
let timelinedir = repopath.join("timelines").join(timelineid.to_string());
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("snapshots"))?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!(
|
||||
"{}@{:X}/{:X}",
|
||||
ancestor.timelineid,
|
||||
ancestor.lsn >> 32,
|
||||
ancestor.lsn & 0xffffffff
|
||||
);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
// Parse an LSN in the 16-hex-digit format used in filenames.
//
// For example: 00000000015D3DD8
//
fn parse_lsn(s: &str) -> std::result::Result<u64, std::num::ParseIntError> {
    u64::from_str_radix(s, 16)
}
|
||||
|
||||
// Create a new branch in the repository (for the "zenith branch" subcommand)
|
||||
pub fn create_branch(
|
||||
local_env: &LocalEnv,
|
||||
branchname: &str,
|
||||
startpoint: PointInTime,
|
||||
) -> Result<()> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
// create a new timeline for it
|
||||
let newtli = create_timeline(local_env, Some(startpoint))?;
|
||||
let newtimelinedir = repopath.join("timelines").join(newtli.to_string());
|
||||
|
||||
let data = newtli.to_string();
|
||||
fs::write(
|
||||
repopath.join("refs").join("branches").join(branchname),
|
||||
data,
|
||||
)?;
|
||||
|
||||
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
|
||||
// TODO: be smarter and avoid the copying...
|
||||
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?;
|
||||
let copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), ©_opts)?;
|
||||
|
||||
let oldtimelinedir = repopath
|
||||
.join("timelines")
|
||||
.join(startpoint.timelineid.to_string());
|
||||
let mut copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
copy_opts.content_only = true;
|
||||
fs_extra::dir::copy(
|
||||
oldtimelinedir.join("wal"),
|
||||
newtimelinedir.join("wal"),
|
||||
©_opts,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Find the end of valid WAL in a wal directory
|
||||
pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u64> {
|
||||
let repopath = &local_env.repo_path;
|
||||
let waldir = repopath
|
||||
.join("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("wal");
|
||||
|
||||
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true);
|
||||
|
||||
return Ok(lsn);
|
||||
}
|
||||
|
||||
// Find the latest snapshot for a timeline
|
||||
fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
let snapshotsdir = repopath
|
||||
.join("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots");
|
||||
let paths = fs::read_dir(&snapshotsdir)?;
|
||||
let mut maxsnapshot: u64 = 0;
|
||||
let mut snapshotdir: Option<PathBuf> = None;
|
||||
for path in paths {
|
||||
let path = path?;
|
||||
let filename = path.file_name().to_str().unwrap().to_owned();
|
||||
if let Ok(lsn) = parse_lsn(&filename) {
|
||||
maxsnapshot = std::cmp::max(lsn, maxsnapshot);
|
||||
snapshotdir = Some(path.path());
|
||||
}
|
||||
}
|
||||
if maxsnapshot == 0 {
|
||||
// TODO: check ancestor timeline
|
||||
anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
|
||||
}
|
||||
|
||||
Ok((maxsnapshot, snapshotdir.unwrap()))
|
||||
}
|
||||
@@ -1,413 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::compute::PostgresNode;
|
||||
use crate::local_env::LocalEnv;
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
//
// Collection of several example deployments useful for tests.
//
// I'm intentionally modelling storage and compute control planes as separate
// entities as it is closer to the actual setup.
//
pub struct TestStorageControlPlane {
    // WAL safekeeper processes (empty when running without safekeepers).
    pub wal_acceptors: Vec<WalAcceptorNode>,
    // The single page server of this deployment.
    pub pageserver: Arc<PageServerNode>,
    // Set to true once the test is finished and the deployment should shut down.
    pub test_done: AtomicBool,
    // Path to the zenith repository this deployment runs in.
    pub repopath: PathBuf,
}
|
||||
|
||||
impl TestStorageControlPlane {
|
||||
// Peek into the repository, to grab the timeline ID of given branch
|
||||
pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId {
|
||||
let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname);
|
||||
|
||||
ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
|
||||
}
|
||||
|
||||
// postgres <-> page_server
|
||||
//
|
||||
// Initialize a new repository and configure a page server to run in it
|
||||
//
|
||||
pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let pserver = Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
});
|
||||
pserver.start().unwrap();
|
||||
|
||||
TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath: repopath,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn one_page_server_no_start(local_env: &LocalEnv) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let pserver = Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
});
|
||||
|
||||
TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath: repopath,
|
||||
}
|
||||
}
|
||||
|
||||
// postgres <-> {wal_acceptor1, wal_acceptor2, ...}
|
||||
pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let mut cplane = TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
}),
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath: repopath,
|
||||
};
|
||||
cplane.pageserver.start().unwrap();
|
||||
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)),
|
||||
env: local_env.clone(),
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
self.test_done.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn is_running(&self) -> bool {
|
||||
self.test_done.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TestStorageControlPlane {
    // Signal shutdown (set test_done) when the control plane goes out of scope.
    fn drop(&mut self) {
        self.stop();
    }
}
|
||||
|
||||
//
// Control routines for pageserver.
//
// Used in CLI and tests.
//
pub struct PageServerNode {
    // If true, the spawned process is stopped when this struct is dropped.
    kill_on_exit: bool,
    // Address to listen on; None means the default (127.0.0.1:64000).
    listen_address: Option<SocketAddr>,
    pub env: LocalEnv,
}
|
||||
|
||||
impl PageServerNode {
|
||||
pub fn from_env(env: &LocalEnv) -> PageServerNode {
|
||||
PageServerNode {
|
||||
kill_on_exit: false,
|
||||
listen_address: None, // default
|
||||
env: env.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn address(&self) -> SocketAddr {
|
||||
match self.listen_address {
|
||||
Some(addr) => addr,
|
||||
None => "127.0.0.1:64000".parse().unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
self.env.repo_path.clone()
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
self.env.repo_path.join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
println!(
|
||||
"Starting pageserver at '{}' in {}",
|
||||
self.address(),
|
||||
self.repo_path().display()
|
||||
);
|
||||
|
||||
let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver"));
|
||||
cmd.args(&["-l", self.address().to_string().as_str()])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.env("ZENITH_REPO_DIR", self.repo_path())
|
||||
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
anyhow::bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the page server to start up. Wait until it is
|
||||
// open for business.
|
||||
for retries in 1..15 {
|
||||
let client = self.page_server_psql_client();
|
||||
if client.is_ok() {
|
||||
break;
|
||||
} else {
|
||||
println!("page server not responding yet, retrying ({})...", retries);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
let pidfile = self.pid_file();
|
||||
let pid = read_pidfile(&pidfile)?;
|
||||
|
||||
let status = Command::new("kill")
|
||||
.arg(&pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
|
||||
// await for pageserver stop
|
||||
for _ in 0..5 {
|
||||
let stream = TcpStream::connect(self.address());
|
||||
if let Err(_e) = stream {
|
||||
return Ok(());
|
||||
}
|
||||
println!("Stopping pageserver on {}", self.address());
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
// ok, we failed to stop pageserver, let's panic
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to stop pageserver with pid {}", pid);
|
||||
} else {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address().ip(),
|
||||
self.address().port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{}'", sql);
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(
|
||||
&self,
|
||||
) -> std::result::Result<postgres::Client, postgres::Error> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address().ip(),
|
||||
self.address().port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
    // Best-effort shutdown of a spawned pageserver. The result is ignored
    // because panicking inside drop (e.g. during unwinding) would abort.
    fn drop(&mut self) {
        if self.kill_on_exit {
            let _ = self.stop();
        }
    }
}
|
||||
|
||||
//
// Control routines for WalAcceptor.
//
// Now used only in test setups.
//
pub struct WalAcceptorNode {
    // Address the acceptor listens on.
    listen: SocketAddr,
    // Data directory of this acceptor.
    data_dir: PathBuf,
    env: LocalEnv,
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let status = Command::new(self.env.zenith_distrib_dir.join("wal_acceptor"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.args(&["--systemid", &self.env.systemid.to_string()])
|
||||
// Tell page server it can receive WAL from this WAL safekeeper
|
||||
// FIXME: If there are multiple safekeepers, they will all inform
|
||||
// the page server. Only the last "notification" will stay in effect.
|
||||
// So it's pretty random which safekeeper the page server will connect to
|
||||
.args(&["--pageserver", "127.0.0.1:64000"])
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> std::result::Result<(), io::Error> {
|
||||
println!("Stopping wal acceptor on {}", self.listen);
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
let pid = read_pidfile(&pidfile)?;
|
||||
// Ignores any failures when running this command
|
||||
let _status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////

// Handle to a running WAL proposer process, identified only by its pid.
pub struct WalProposerNode {
    pub pid: u32,
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
let status = Command::new("kill")
|
||||
.arg(self.pid.to_string())
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
    // NOTE(review): stop() panics if the kill command fails, and a panic in
    // drop during unwinding aborts the process — consider best-effort here.
    fn drop(&mut self) {
        self.stop();
    }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub fn regress_check(pg: &PostgresNode) {
|
||||
pg.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
|
||||
let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress");
|
||||
fs::create_dir_all(regress_run_path.clone()).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let _regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", pg.env.pg_bin_dir().to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", pg.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGHOST", pg.address.ip().to_string())
|
||||
.env("PGPORT", pg.address.port().to_string())
|
||||
.env("PGUSER", pg.whoami())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
}
|
||||
|
||||
/// Read a PID file
///
/// This should contain an unsigned integer, but we return it as a String
/// because our callers only want to pass it back into a subcommand.
fn read_pidfile(pidfile: &Path) -> std::result::Result<String, io::Error> {
    match fs::read_to_string(pidfile) {
        Ok(pid) => Ok(pid),
        Err(err) => {
            // Log the failure for the operator before handing it back up.
            eprintln!("failed to read pidfile {:?}: {:?}", pidfile, err);
            Err(err)
        }
    }
}
|
||||
1
integration_tests/.gitignore
vendored
1
integration_tests/.gitignore
vendored
@@ -1 +0,0 @@
|
||||
tmp_check/
|
||||
@@ -9,9 +9,8 @@ edition = "2018"
|
||||
[dependencies]
|
||||
lazy_static = "1.4.0"
|
||||
rand = "0.8.3"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
|
||||
844
integration_tests/tests/control_plane/mod.rs
Normal file
844
integration_tests/tests/control_plane/mod.rs
Normal file
@@ -0,0 +1,844 @@
|
||||
//
|
||||
// Local control plane.
|
||||
//
|
||||
// Can start, configure and stop postgres instances running as local processes.
|
||||
//
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
|
||||
use std::fs::File;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::{
|
||||
io::Write,
|
||||
net::{IpAddr, Ipv4Addr, SocketAddr},
|
||||
};
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use postgres;
|
||||
|
||||
lazy_static! {
    // postgres binaries/libraries would be there if postgres was built by
    // 'make postgres' here in the repo
    pub static ref PG_BIN_DIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("../tmp_install/bin");
    pub static ref PG_LIB_DIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("../tmp_install/lib");

    // Directory containing the cargo-built zenith binaries (target/<profile>/).
    pub static ref BIN_DIR : PathBuf = cargo_bin_dir();

    // Scratch directory for test data.
    pub static ref TEST_WORKDIR : PathBuf = Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("tmp_check");
}
|
||||
|
||||
// Find the directory where the cargo-built binaries were put (i.e. target/debug/)
pub fn cargo_bin_dir() -> PathBuf {
    // IDIOM: `.ok().unwrap()` was redundant; expect() also documents why
    // failure is unexpected.
    let mut pathbuf = std::env::current_exe().expect("could not determine current executable path");

    pathbuf.pop();
    // Test binaries are placed one level deeper, in target/<profile>/deps/.
    if pathbuf.ends_with("deps") {
        pathbuf.pop();
    }

    pathbuf
}
|
||||
|
||||
//
// I'm intentionally modelling storage and compute control planes as separate
// entities as it is closer to the actual setup.
//
pub struct StorageControlPlane {
    // Running WAL acceptor (safekeeper) nodes.
    pub wal_acceptors: Vec<WalAcceptorNode>,
    // Running page server nodes.
    pub page_servers: Vec<PageServerNode>,
}
|
||||
|
||||
impl StorageControlPlane {
|
||||
// postgres <-> page_server
|
||||
pub fn one_page_server(froms3: bool) -> StorageControlPlane {
|
||||
let mut cplane = StorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
page_servers: Vec::new(),
|
||||
};
|
||||
|
||||
let pserver = PageServerNode {
|
||||
page_service_addr: "127.0.0.1:65200".parse().unwrap(),
|
||||
data_dir: TEST_WORKDIR.join("pageserver"),
|
||||
};
|
||||
pserver.init();
|
||||
if froms3 {
|
||||
pserver.start_froms3();
|
||||
} else {
|
||||
pserver.start();
|
||||
}
|
||||
|
||||
cplane.page_servers.push(pserver);
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn fault_tolerant(redundancy: usize) -> StorageControlPlane {
|
||||
let mut cplane = StorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
page_servers: Vec::new(),
|
||||
};
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: TEST_WORKDIR.join(format!("wal_acceptor_{}", i)),
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
wa.stop();
|
||||
}
|
||||
}
|
||||
|
||||
// // postgres <-> wal_acceptor x3 <-> page_server
|
||||
// fn local(&mut self) -> StorageControlPlane {
|
||||
// }
|
||||
|
||||
pub fn page_server_addr(&self) -> &SocketAddr {
|
||||
&self.page_servers[0].page_service_addr
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string().to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
let addr = &self.page_servers[0].page_service_addr;
|
||||
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
addr.ip(),
|
||||
addr.port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Pageserver query: '{}'", sql);
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StorageControlPlane {
    // Stop all WAL acceptors when the control plane goes out of scope.
    fn drop(&mut self) {
        self.stop();
    }
}
|
||||
|
||||
pub struct PageServerNode {
    // Address of the page service (libpq) endpoint.
    page_service_addr: SocketAddr,
    // Data directory of the page server.
    data_dir: PathBuf,
}
|
||||
|
||||
impl PageServerNode {
|
||||
// TODO: method to force redo on a specific relation
|
||||
|
||||
// TODO: make wal-redo-postgres workable without data directory?
|
||||
pub fn init(&self) {
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
|
||||
let datadir_path = self.data_dir.join("wal_redo_pgdata");
|
||||
fs::remove_dir_all(datadir_path.to_str().unwrap()).ok();
|
||||
|
||||
let initdb = Command::new(PG_BIN_DIR.join("initdb"))
|
||||
.args(&["-D", datadir_path.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!("Starting pageserver at '{}'", self.page_service_addr);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("pageserver"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.page_service_addr.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.arg("--skip-recovery")
|
||||
.env_clear()
|
||||
.env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to postres-wal-redo binary
|
||||
.status()
|
||||
.expect("failed to start pageserver");
|
||||
|
||||
if !status.success() {
|
||||
panic!("pageserver start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_froms3(&self) {
|
||||
println!("Starting pageserver at '{}'", self.page_service_addr);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("pageserver"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.page_service_addr.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("PATH", PG_BIN_DIR.to_str().unwrap()) // path to postres-wal-redo binary
|
||||
.env("S3_ENDPOINT", "https://127.0.0.1:9000")
|
||||
.env("S3_REGION", "us-east-1")
|
||||
.env("S3_ACCESSKEY", "minioadmin")
|
||||
.env("S3_SECRET", "minioadmin")
|
||||
.status()
|
||||
.expect("failed to start pageserver");
|
||||
|
||||
if !status.success() {
|
||||
panic!("pageserver start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
let pidfile = self.data_dir.join("pageserver.pid");
|
||||
let pid = fs::read_to_string(pidfile).unwrap();
|
||||
let status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
    // NOTE(review): stop() panics if the kill command fails; a panic in drop
    // during unwinding aborts the process — consider best-effort here.
    fn drop(&mut self) {
        self.stop();
        // fs::remove_dir_all(self.data_dir.clone()).unwrap();
    }
}
|
||||
|
||||
pub struct WalAcceptorNode {
    // Address the acceptor listens on.
    listen: SocketAddr,
    // Data directory of this acceptor.
    data_dir: PathBuf,
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let status = Command::new(BIN_DIR.join("wal_acceptor"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
if let Ok(pid) = fs::read_to_string(pidfile) {
|
||||
let _status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
    // Best-effort stop when the node handle goes out of scope.
    fn drop(&mut self) {
        self.stop();
        // fs::remove_dir_all(self.data_dir.clone()).unwrap();
    }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
// ComputeControlPlane
//
// Manages a set of local compute (postgres) nodes that talk to one
// storage control plane.
pub struct ComputeControlPlane<'a> {
    // Directory with postgres binaries (initdb, pg_ctl, ...).
    pg_bin_dir: PathBuf,
    // Directory under which per-node data directories are created.
    work_dir: PathBuf,
    // Last TCP port handed out to a node; see get_port().
    last_assigned_port: u16,
    // Storage control plane the compute nodes connect to.
    storage_cplane: &'a StorageControlPlane,
    // All compute nodes created so far, in creation order.
    nodes: Vec<Arc<PostgresNode>>,
}
|
||||
|
||||
impl ComputeControlPlane<'_> {
|
||||
pub fn local(storage_cplane: &StorageControlPlane) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
pg_bin_dir: PG_BIN_DIR.to_path_buf(),
|
||||
work_dir: TEST_WORKDIR.to_path_buf(),
|
||||
last_assigned_port: 65431,
|
||||
storage_cplane: storage_cplane,
|
||||
nodes: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: check port availability and
|
||||
fn get_port(&mut self) -> u16 {
|
||||
let port = self.last_assigned_port + 1;
|
||||
self.last_assigned_port += 1;
|
||||
port
|
||||
}
|
||||
|
||||
pub fn new_vanilla_node<'a>(&mut self) -> &Arc<PostgresNode> {
|
||||
// allocate new node entry with generated port
|
||||
let node_id = self.nodes.len() + 1;
|
||||
let node = PostgresNode {
|
||||
_node_id: node_id,
|
||||
port: self.get_port(),
|
||||
ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
|
||||
pgdata: self.work_dir.join(format!("compute/pg{}", node_id)),
|
||||
pg_bin_dir: self.pg_bin_dir.clone(),
|
||||
};
|
||||
self.nodes.push(Arc::new(node));
|
||||
let node = self.nodes.last().unwrap();
|
||||
|
||||
// initialize data directory
|
||||
fs::remove_dir_all(node.pgdata.to_str().unwrap()).ok();
|
||||
let initdb_path = self.pg_bin_dir.join("initdb");
|
||||
println!("initdb_path: {}", initdb_path.to_str().unwrap());
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", node.pgdata.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
|
||||
// // allow local replication connections
|
||||
// node.append_conf("pg_hba.conf", format!("\
|
||||
// host replication all {}/32 sspi include_realm=1 map=regress\n\
|
||||
// ", node.ip).as_str());
|
||||
|
||||
// listen for selected port
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n\
|
||||
",
|
||||
address = node.ip,
|
||||
port = node.port
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
// Init compute node without files, only datadir structure
|
||||
// use initdb --compute-node flag and GUC 'computenode_mode'
|
||||
// to distinguish the node
|
||||
pub fn new_minimal_node<'a>(&mut self) -> &Arc<PostgresNode> {
|
||||
// allocate new node entry with generated port
|
||||
let node_id = self.nodes.len() + 1;
|
||||
let node = PostgresNode {
|
||||
_node_id: node_id,
|
||||
port: self.get_port(),
|
||||
ip: IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
|
||||
pgdata: self.work_dir.join(format!("compute/pg{}", node_id)),
|
||||
pg_bin_dir: self.pg_bin_dir.clone(),
|
||||
};
|
||||
self.nodes.push(Arc::new(node));
|
||||
let node = self.nodes.last().unwrap();
|
||||
|
||||
// initialize data directory
|
||||
fs::remove_dir_all(node.pgdata.to_str().unwrap()).ok();
|
||||
let initdb_path = self.pg_bin_dir.join("initdb");
|
||||
println!("initdb_path: {}", initdb_path.to_str().unwrap());
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", node.pgdata.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.arg("--no-instructions")
|
||||
.arg("--compute-node")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.success() {
|
||||
panic!("initdb failed");
|
||||
}
|
||||
|
||||
// // allow local replication connections
|
||||
// node.append_conf("pg_hba.conf", format!("\
|
||||
// host replication all {}/32 sspi include_realm=1 map=regress\n\
|
||||
// ", node.ip).as_str());
|
||||
|
||||
// listen for selected port
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
max_wal_senders = 10\n\
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
max_connections = 100\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n\
|
||||
computenode_mode = true\n\
|
||||
",
|
||||
address = node.ip,
|
||||
port = node.port
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_node_wo_data(&mut self) -> Arc<PostgresNode> {
|
||||
let storage_cplane = self.storage_cplane;
|
||||
let node = self.new_minimal_node();
|
||||
|
||||
let pserver = storage_cplane.page_server_addr();
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
page_server_connstring = 'host={} port={}'\n\
|
||||
",
|
||||
pserver.ip(),
|
||||
pserver.port()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node.clone()
|
||||
}
|
||||
|
||||
pub fn new_node(&mut self) -> Arc<PostgresNode> {
|
||||
let storage_cplane = self.storage_cplane;
|
||||
let node = self.new_vanilla_node();
|
||||
|
||||
let pserver = storage_cplane.page_server_addr();
|
||||
|
||||
// Configure that node to take pages from pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"\
|
||||
page_server_connstring = 'host={} port={}'\n\
|
||||
",
|
||||
pserver.ip(),
|
||||
pserver.port()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
|
||||
node.clone()
|
||||
}
|
||||
|
||||
pub fn new_master_node(&mut self) -> Arc<PostgresNode> {
|
||||
let node = self.new_vanilla_node();
|
||||
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
"synchronous_standby_names = 'safekeeper_proxy'\n\
|
||||
",
|
||||
);
|
||||
node.clone()
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct WalProposerNode {
|
||||
pid: u32,
|
||||
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
let status = Command::new("kill")
|
||||
.arg(self.pid.to_string())
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct PostgresNode {
|
||||
_node_id: usize,
|
||||
pub port: u16,
|
||||
pub ip: IpAddr,
|
||||
pgdata: PathBuf,
|
||||
pg_bin_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
pub fn append_conf(&self, config: &str, opts: &str) {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata.join(config).to_str().unwrap())
|
||||
.unwrap()
|
||||
.write_all(opts.as_bytes())
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str], check_ok: bool) {
|
||||
let pg_ctl_path = self.pg_bin_dir.join("pg_ctl");
|
||||
let pg_ctl = Command::new(pg_ctl_path)
|
||||
.args(
|
||||
[
|
||||
&[
|
||||
"-D",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
"-l",
|
||||
self.pgdata.join("log").to_str().unwrap(),
|
||||
],
|
||||
args,
|
||||
]
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.status()
|
||||
.expect("failed to execute pg_ctl");
|
||||
|
||||
if check_ok && !pg_ctl.success() {
|
||||
panic!("pg_ctl failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self, storage_cplane: &StorageControlPlane) {
|
||||
if storage_cplane.page_servers.len() != 0 {
|
||||
let _res =
|
||||
storage_cplane.page_server_psql(format!("callmemaybe {}", self.connstr()).as_str());
|
||||
}
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"], true);
|
||||
}
|
||||
|
||||
pub fn restart(&self) {
|
||||
self.pg_ctl(&["restart"], true);
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"], true);
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
format!("host={} port={} user={}", self.ip, self.port, self.whoami())
|
||||
}
|
||||
|
||||
// XXX: cache that in control plane
|
||||
pub fn whoami(&self) -> String {
|
||||
let output = Command::new("whoami")
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.ip,
|
||||
self.port,
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
client.query(sql, &[]).unwrap()
|
||||
}
|
||||
|
||||
pub fn open_psql(&self, db: &str) -> Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.ip,
|
||||
self.port,
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls).unwrap()
|
||||
}
|
||||
|
||||
pub fn get_pgdata(&self) -> Option<&str> {
|
||||
self.pgdata.to_str()
|
||||
}
|
||||
|
||||
// Request from pageserver stub controlfile, respective xlog
|
||||
// and a bunch of files needed to start computenode
|
||||
//
|
||||
// NOTE this "file" request is a crutch.
|
||||
// It asks pageserver to write requested page to the provided filepath
|
||||
// and thus only works locally.
|
||||
// TODO receive pages via some libpq protocol.
|
||||
// The problem I've met is that nonrelfiles are not valid utf8 and cannot be
|
||||
// handled by simple_query(). that expects test.
|
||||
// And reqular query() uses prepared queries.
|
||||
|
||||
// TODO pass sysid as parameter
|
||||
pub fn setup_compute_node(&self, sysid: u64, storage_cplane: &StorageControlPlane) {
|
||||
let mut query;
|
||||
//Request pg_control from pageserver
|
||||
query = format!(
|
||||
"file {}/global/pg_control,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
1664, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
42, //forknum pg_control
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
//Request pg_xact and pg_multixact from pageserver
|
||||
//We need them for initial pageserver startup and authentication
|
||||
//TODO figure out which block number we really need
|
||||
query = format!(
|
||||
"file {}/pg_xact/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
44, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
query = format!(
|
||||
"file {}/pg_multixact/offsets/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
45, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
query = format!(
|
||||
"file {}/pg_multixact/members/0000,{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
sysid as u64, //sysid
|
||||
0, //tablespace
|
||||
0, //dboid
|
||||
0, //reloid
|
||||
46, //forknum
|
||||
0, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
|
||||
//Request a few shared catalogs needed for authentication
|
||||
//Without them we cannot setup connection with pageserver to request further pages
|
||||
let reloids = [1260, 1261, 1262, 2396];
|
||||
for reloid in reloids.iter() {
|
||||
//FIXME request all blocks from file, not just 10
|
||||
for blkno in 0..10 {
|
||||
query = format!(
|
||||
"file {}/global/{},{},{},{},{},{},{},{}",
|
||||
self.pgdata.to_str().unwrap(),
|
||||
reloid, //suse it as filename
|
||||
sysid as u64, //sysid
|
||||
1664, //tablespace
|
||||
0, //dboid
|
||||
reloid, //reloid
|
||||
0, //forknum
|
||||
blkno, //blkno
|
||||
0 //lsn
|
||||
);
|
||||
storage_cplane.page_server_psql(query.as_str());
|
||||
}
|
||||
}
|
||||
|
||||
fs::create_dir(format!("{}/base/13006", self.pgdata.to_str().unwrap())).unwrap();
|
||||
fs::create_dir(format!("{}/base/13007", self.pgdata.to_str().unwrap())).unwrap();
|
||||
|
||||
//FIXME figure out what wal file we need to successfully start
|
||||
let walfilepath = format!(
|
||||
"{}/pg_wal/000000010000000000000001",
|
||||
self.pgdata.to_str().unwrap()
|
||||
);
|
||||
fs::copy(
|
||||
"/home/anastasia/zenith/zenith/tmp_check/pgdata/pg_wal/000000010000000000000001",
|
||||
walfilepath,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
println!("before resetwal ");
|
||||
|
||||
let pg_resetwal_path = self.pg_bin_dir.join("pg_resetwal");
|
||||
|
||||
// Now it does nothing, just prints existing content of pg_control.
|
||||
// TODO update values with most recent lsn, xid, oid requested from pageserver
|
||||
let pg_resetwal = Command::new(pg_resetwal_path)
|
||||
.args(&["-D", self.pgdata.to_str().unwrap()])
|
||||
.arg("-n") //dry run
|
||||
//.arg("-f")
|
||||
//.args(&["--next-transaction-id", "100500"])
|
||||
//.args(&["--next-oid", "17000"])
|
||||
//.args(&["--next-transaction-id", "100500"])
|
||||
.status()
|
||||
.expect("failed to execute pg_resetwal");
|
||||
|
||||
if !pg_resetwal.success() {
|
||||
panic!("pg_resetwal failed");
|
||||
}
|
||||
|
||||
println!("setup done");
|
||||
}
|
||||
|
||||
pub fn start_proxy(&self, wal_acceptors: String) -> WalProposerNode {
|
||||
let proxy_path = PG_BIN_DIR.join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["-s", &wal_acceptors])
|
||||
.args(&["-h", &self.ip.to_string()])
|
||||
.args(&["-p", &self.port.to_string()])
|
||||
.arg("-v")
|
||||
.stderr(File::create(TEST_WORKDIR.join("safepkeeper_proxy.log")).unwrap())
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn push_to_s3(&self) {
|
||||
println!("Push to s3 node at '{}'", self.pgdata.to_str().unwrap());
|
||||
|
||||
let zenith_push_path = self.pg_bin_dir.join("zenith_push");
|
||||
println!("zenith_push_path: {}", zenith_push_path.to_str().unwrap());
|
||||
|
||||
let status = Command::new(zenith_push_path)
|
||||
.args(&["-D", self.pgdata.to_str().unwrap()])
|
||||
.env_clear()
|
||||
.env("S3_ENDPOINT", "https://127.0.0.1:9000")
|
||||
.env("S3_REGION", "us-east-1")
|
||||
.env("S3_ACCESSKEY", "minioadmin")
|
||||
.env("S3_SECRET", "minioadmin")
|
||||
// .env("S3_BUCKET", "zenith-testbucket")
|
||||
.status()
|
||||
.expect("failed to push node to s3");
|
||||
|
||||
if !status.success() {
|
||||
panic!("zenith_push failed");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO
|
||||
pub fn pg_bench() {}
|
||||
pub fn pg_regress() {}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
// destructor to clean up state after test is done
|
||||
// XXX: we may detect failed test by setting some flag in catch_unwind()
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
// fs::remove_dir_all(self.pgdata.clone()).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn regress_check(pg: &PostgresNode) {
|
||||
pg.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
|
||||
let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress");
|
||||
fs::create_dir_all(regress_run_path.clone()).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let _regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", PG_BIN_DIR.to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", PG_LIB_DIR.to_str().unwrap())
|
||||
.env("PGPORT", pg.port.to_string())
|
||||
.env("PGUSER", pg.whoami())
|
||||
.env("PGHOST", pg.ip.to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
}
|
||||
@@ -1,11 +1,7 @@
|
||||
// test node resettlement to an empty datadir
|
||||
|
||||
// TODO
|
||||
/*
|
||||
#[test]
|
||||
fn test_resettlement() {}
|
||||
|
||||
// test seq scan of everythin after restart
|
||||
#[test]
|
||||
fn test_cold_seqscan() {}
|
||||
*/
|
||||
|
||||
@@ -1,8 +1,5 @@
|
||||
// TODO
|
||||
/*
|
||||
#[test]
|
||||
fn test_actions() {}
|
||||
|
||||
#[test]
|
||||
fn test_regress() {}
|
||||
*/
|
||||
|
||||
@@ -1,24 +1,29 @@
|
||||
// mod control_plane;
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::PointInTime;
|
||||
use control_plane::storage::TestStorageControlPlane;
|
||||
#[allow(dead_code)]
|
||||
mod control_plane;
|
||||
use std::thread::sleep;
|
||||
use std::time::Duration;
|
||||
|
||||
use control_plane::ComputeControlPlane;
|
||||
use control_plane::StorageControlPlane;
|
||||
|
||||
// XXX: force all redo at the end
|
||||
// -- restart + seqscan won't read deleted stuff
|
||||
// -- pageserver api endpoint to check all rels
|
||||
#[test]
|
||||
fn test_redo_cases() {
|
||||
let local_env = local_env::test_env("test_redo_cases");
|
||||
|
||||
//Handcrafted cases with wal records that are (were) problematic for redo.
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_redo_cases() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -27,7 +32,7 @@ fn test_redo_cases() {
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
@@ -35,9 +40,9 @@ fn test_redo_cases() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
assert_eq!(count, 5050);
|
||||
|
||||
// check 'create table as'
|
||||
//check 'create table as'
|
||||
node.safe_psql("postgres", "CREATE TABLE t2 AS SELECT * FROM t");
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
@@ -45,56 +50,45 @@ fn test_redo_cases() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
assert_eq!(count, 5050);
|
||||
}
|
||||
|
||||
// Runs pg_regress on a compute node
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_regress() {
|
||||
let local_env = local_env::test_env("test_regress");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
control_plane::storage::regress_check(&node);
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
control_plane::regress_check(&node);
|
||||
}
|
||||
|
||||
// Run two postgres instances on one pageserver, on different timelines
|
||||
// Run two postgres instances on one pageserver
|
||||
#[test]
|
||||
fn test_pageserver_two_timelines() {
|
||||
let local_env = local_env::test_env("test_pageserver_two_timelines");
|
||||
|
||||
#[ignore]
|
||||
fn test_pageserver_multitenancy() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
// Allocate postgres instance, but don't start
|
||||
let node1 = compute_cplane.new_node();
|
||||
let node2 = compute_cplane.new_node();
|
||||
node1.start(&storage_cplane);
|
||||
node2.start(&storage_cplane);
|
||||
|
||||
// Create new branch at the end of 'main'
|
||||
let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
|
||||
local_env::create_branch(
|
||||
&local_env,
|
||||
"experimental",
|
||||
PointInTime {
|
||||
timelineid: maintli,
|
||||
lsn: startpoint,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
let experimentaltli = storage_cplane.get_branch_timeline("experimental");
|
||||
|
||||
// Launch postgres instances on both branches
|
||||
let node1 = compute_cplane.new_test_node(maintli);
|
||||
let node2 = compute_cplane.new_test_node(experimentaltli);
|
||||
node1.start().unwrap();
|
||||
node2.start().unwrap();
|
||||
// XXX: add some extension func to postgres to check walsender conn
|
||||
// XXX: or better just drop that
|
||||
println!("await pageserver connection...");
|
||||
sleep(Duration::from_secs(3));
|
||||
|
||||
// check node1
|
||||
node1.safe_psql(
|
||||
@@ -103,7 +97,7 @@ fn test_pageserver_two_timelines() {
|
||||
);
|
||||
node1.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
|
||||
);
|
||||
let count: i64 = node1
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
@@ -111,7 +105,7 @@ fn test_pageserver_two_timelines() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
assert_eq!(count, 5050);
|
||||
|
||||
// check node2
|
||||
node2.safe_psql(
|
||||
@@ -120,7 +114,7 @@ fn test_pageserver_two_timelines() {
|
||||
);
|
||||
node2.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(100000,200000), 'payload'",
|
||||
"INSERT INTO t SELECT generate_series(100,200), 'payload'",
|
||||
);
|
||||
let count: i64 = node2
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
@@ -128,5 +122,89 @@ fn test_pageserver_two_timelines() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 15000150000);
|
||||
assert_eq!(count, 15150);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
// Start pageserver using s3 base image
|
||||
//
|
||||
// Requires working minio with hardcoded setup:
|
||||
// .env("S3_ENDPOINT", "https://127.0.0.1:9000")
|
||||
// .env("S3_REGION", "us-east-1")
|
||||
// .env("S3_ACCESSKEY", "minioadmin")
|
||||
// .env("S3_SECRET", "minioadmin")
|
||||
// .env("S3_BUCKET", "zenith-testbucket")
|
||||
// TODO use env variables in test
|
||||
fn test_pageserver_recovery() {
|
||||
//This test expects that image is already uploaded to s3
|
||||
//To upload it use zenith_push before test (see node.push_to_s3() for details)
|
||||
let storage_cplane = StorageControlPlane::one_page_server(true);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
//Wait while daemon uploads pages from s3
|
||||
sleep(Duration::from_secs(15));
|
||||
|
||||
let node_restored = compute_cplane.new_node_wo_data();
|
||||
|
||||
//TODO 6947041219207877724 is a hardcoded sysid for my cluster. Get it somewhere
|
||||
node_restored.setup_compute_node(6947041219207877724, &storage_cplane);
|
||||
|
||||
node_restored.start(&storage_cplane);
|
||||
|
||||
let rows = node_restored.safe_psql("postgres", "SELECT relname from pg_class;");
|
||||
|
||||
assert_eq!(rows.len(), 395);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore]
|
||||
//Scenario for future test. Not implemented yet
|
||||
fn test_pageserver_node_switch() {
|
||||
//Create pageserver
|
||||
let storage_cplane = StorageControlPlane::one_page_server(false);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
|
||||
//Create reqular node
|
||||
let node = compute_cplane.new_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5050);
|
||||
|
||||
//Push all node files to s3
|
||||
//TODO upload them directly to pageserver
|
||||
node.push_to_s3();
|
||||
//Upload data from s3 to pageserver
|
||||
//storage_cplane.upload_from_s3() //Not implemented yet
|
||||
|
||||
//Shut down the node
|
||||
node.stop();
|
||||
|
||||
//Create new node without files
|
||||
let node_restored = compute_cplane.new_node_wo_data();
|
||||
|
||||
// Setup minimal set of files needed to start node and setup pageserver connection
|
||||
// TODO 6947041219207877724 is a hardcoded sysid. Get it from node
|
||||
node_restored.setup_compute_node(6947041219207877724, &storage_cplane);
|
||||
|
||||
//Start compute node without files
|
||||
node_restored.start(&storage_cplane);
|
||||
|
||||
//Ensure that is has table created on initial node
|
||||
let rows = node_restored.safe_psql("postgres", "SELECT key from t;");
|
||||
assert_eq!(rows.len(), 5050);
|
||||
}
|
||||
|
||||
@@ -1,9 +1,8 @@
|
||||
// Restart acceptors one by one while compute is under the load.
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::PointInTime;
|
||||
use control_plane::storage::TestStorageControlPlane;
|
||||
use pageserver::ZTimelineId;
|
||||
#[allow(dead_code)]
|
||||
mod control_plane;
|
||||
use control_plane::ComputeControlPlane;
|
||||
use control_plane::StorageControlPlane;
|
||||
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
@@ -12,20 +11,18 @@ use std::{thread, time};
|
||||
|
||||
#[test]
|
||||
fn test_acceptors_normal_work() {
|
||||
let local_env = local_env::test_env("test_acceptors_normal_work");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -46,97 +43,24 @@ fn test_acceptors_normal_work() {
|
||||
// check wal files equality
|
||||
}
|
||||
|
||||
// Run page server and multiple safekeepers, and multiple compute nodes running
|
||||
// against different timelines.
|
||||
#[test]
|
||||
fn test_many_timelines() {
|
||||
// Initialize a new repository, and set up WAL safekeepers and page server.
|
||||
const REDUNDANCY: usize = 3;
|
||||
const N_TIMELINES: usize = 5;
|
||||
let local_env = local_env::test_env("test_many_timelines");
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// Create branches
|
||||
let mut timelines: Vec<ZTimelineId> = Vec::new();
|
||||
let maintli = storage_cplane.get_branch_timeline("main"); // main branch
|
||||
timelines.push(maintli);
|
||||
let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
|
||||
for i in 1..N_TIMELINES {
|
||||
// additional branches
|
||||
let branchname = format!("experimental{}", i);
|
||||
local_env::create_branch(
|
||||
&local_env,
|
||||
&branchname,
|
||||
PointInTime {
|
||||
timelineid: maintli,
|
||||
lsn: startpoint,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
let tli = storage_cplane.get_branch_timeline(&branchname);
|
||||
timelines.push(tli);
|
||||
}
|
||||
|
||||
// start postgres on each timeline
|
||||
let mut nodes = Vec::new();
|
||||
for tli in timelines {
|
||||
let node = compute_cplane.new_test_node(tli);
|
||||
nodes.push(node.clone());
|
||||
node.start().unwrap();
|
||||
node.start_proxy(&wal_acceptors);
|
||||
}
|
||||
|
||||
// create schema
|
||||
for node in &nodes {
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
}
|
||||
|
||||
// Populate data
|
||||
for node in &nodes {
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
}
|
||||
|
||||
// Check data
|
||||
for node in &nodes {
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
}
|
||||
}
|
||||
|
||||
// Majority is always alive
|
||||
#[test]
|
||||
fn test_acceptors_restarts() {
|
||||
let local_env = local_env::test_env("test_acceptors_restarts");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
const FAULT_PROBABILITY: f32 = 0.01;
|
||||
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
let mut failed_node: Option<usize> = None;
|
||||
|
||||
// check basic work with table
|
||||
@@ -156,7 +80,7 @@ fn test_acceptors_restarts() {
|
||||
} else {
|
||||
let node: usize = rng.gen_range(0..REDUNDANCY);
|
||||
failed_node = Some(node);
|
||||
storage_cplane.wal_acceptors[node].stop().unwrap();
|
||||
storage_cplane.wal_acceptors[node].stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -169,7 +93,7 @@ fn test_acceptors_restarts() {
|
||||
assert_eq!(count, 500500);
|
||||
}
|
||||
|
||||
fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
|
||||
fn start_acceptor(cplane: &Arc<StorageControlPlane>, no: usize) {
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
thread::sleep(time::Duration::from_secs(1));
|
||||
@@ -181,23 +105,20 @@ fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
|
||||
// them again and check that nothing was losed. Repeat.
|
||||
// N_CRASHES env var
|
||||
#[test]
|
||||
fn test_acceptors_unavailability() {
|
||||
let local_env = local_env::test_env("test_acceptors_unavailability");
|
||||
|
||||
fn test_acceptors_unavalability() {
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 2;
|
||||
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -208,7 +129,7 @@ fn test_acceptors_unavailability() {
|
||||
psql.execute("INSERT INTO t values (1, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
storage_cplane.wal_acceptors[0].stop().unwrap();
|
||||
storage_cplane.wal_acceptors[0].stop();
|
||||
let cp = Arc::new(storage_cplane);
|
||||
start_acceptor(&cp, 0);
|
||||
let now = SystemTime::now();
|
||||
@@ -218,7 +139,7 @@ fn test_acceptors_unavailability() {
|
||||
psql.execute("INSERT INTO t values (3, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
cp.wal_acceptors[1].stop().unwrap();
|
||||
cp.wal_acceptors[1].stop();
|
||||
start_acceptor(&cp, 1);
|
||||
psql.execute("INSERT INTO t values (4, 'payload')", &[])
|
||||
.unwrap();
|
||||
@@ -236,16 +157,16 @@ fn test_acceptors_unavailability() {
|
||||
assert_eq!(count, 15);
|
||||
}
|
||||
|
||||
fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
|
||||
fn simulate_failures(cplane: &Arc<StorageControlPlane>) {
|
||||
let mut rng = rand::thread_rng();
|
||||
let n_acceptors = cplane.wal_acceptors.len();
|
||||
let failure_period = time::Duration::from_secs(1);
|
||||
while cplane.is_running() {
|
||||
loop {
|
||||
thread::sleep(failure_period);
|
||||
let mask: u32 = rng.gen_range(0..(1 << n_acceptors));
|
||||
for i in 0..n_acceptors {
|
||||
if (mask & (1 << i)) != 0 {
|
||||
cplane.wal_acceptors[i].stop().unwrap();
|
||||
cplane.wal_acceptors[i].stop();
|
||||
}
|
||||
}
|
||||
thread::sleep(failure_period);
|
||||
@@ -260,34 +181,29 @@ fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
|
||||
// Race condition test
|
||||
#[test]
|
||||
fn test_race_conditions() {
|
||||
let local_env = local_env::test_env("test_race_conditions");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
|
||||
let storage_cplane = Arc::new(TestStorageControlPlane::fault_tolerant(
|
||||
&local_env, REDUNDANCY,
|
||||
));
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let storage_cplane = StorageControlPlane::fault_tolerant(REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&storage_cplane);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
// start postgre
|
||||
let node = compute_cplane.new_master_node();
|
||||
node.start(&storage_cplane);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let _proxy = node.start_proxy(wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
|
||||
let cp = storage_cplane.clone();
|
||||
let failures_thread = thread::spawn(move || {
|
||||
simulate_failures(cp);
|
||||
let cplane = Arc::new(storage_cplane);
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
simulate_failures(&cp);
|
||||
});
|
||||
|
||||
let mut psql = node.open_psql("postgres");
|
||||
@@ -302,7 +218,5 @@ fn test_race_conditions() {
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 500500);
|
||||
|
||||
storage_cplane.stop();
|
||||
failures_thread.join().unwrap();
|
||||
cplane.stop();
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@ rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3.13"
|
||||
lazy_static = "1.4.0"
|
||||
slog-stdlog = "4.1.0"
|
||||
@@ -25,18 +26,11 @@ clap = "2.33.0"
|
||||
termion = "1.5.6"
|
||||
tui = "0.14.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres-protocol = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
walkdir = "2"
|
||||
thiserror = "1.0"
|
||||
hex = "0.4.3"
|
||||
tar = "0.4.33"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use tar::Builder;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::ZTimelineId;
|
||||
|
||||
pub fn send_snapshot_tarball(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
snapshotlsn: u64,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut ar = Builder::new(write);
|
||||
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn);
|
||||
let walpath = format!("timelines/{}/wal", timelineid);
|
||||
|
||||
debug!("sending tarball of snapshot in {}", snappath);
|
||||
//ar.append_dir_all("", &snappath)?;
|
||||
|
||||
for entry in WalkDir::new(&snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
// Shared catalogs are exempt
|
||||
if relpath.starts_with("global/") {
|
||||
trace!("sending shared catalog {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
trace!("sending {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else {
|
||||
trace!("not sending {}", relpath.display());
|
||||
// FIXME: send all files for now
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: also send all the WAL
|
||||
for entry in std::fs::read_dir(&walpath)? {
|
||||
let entry = entry?;
|
||||
let fullpath = &entry.path();
|
||||
let relpath = fullpath.strip_prefix(&walpath).unwrap();
|
||||
|
||||
if !entry.path().is_file() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let archive_fname = relpath.to_str().unwrap().clone();
|
||||
let archive_fname = archive_fname
|
||||
.strip_suffix(".partial")
|
||||
.unwrap_or(&archive_fname);
|
||||
let archive_path = "pg_wal/".to_owned() + archive_fname;
|
||||
ar.append_path_with_name(fullpath, archive_path)?;
|
||||
}
|
||||
|
||||
ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(fname)
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
return Ok((relnode, forknum, segno));
|
||||
}
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
/*
|
||||
* Relation data files can be in one of the following directories:
|
||||
*
|
||||
* global/
|
||||
* shared relations
|
||||
*
|
||||
* base/<db oid>/
|
||||
* regular relations, default tablespace
|
||||
*
|
||||
* pg_tblspc/<tblspc oid>/<tblspc version>/
|
||||
* within a non-default tablespace (the name of the directory
|
||||
* depends on version)
|
||||
*
|
||||
* And the relation data files themselves have a filename like:
|
||||
*
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
|
||||
|
||||
return Ok(());
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split("/");
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let _dbnode = u32::from_str_radix(dbnode_str, 10)?;
|
||||
let fname = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
if s.next().is_some() {
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
};
|
||||
|
||||
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
|
||||
|
||||
return Ok(());
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
return Err(FilePathError::new("tablespaces not supported"));
|
||||
} else {
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
}
|
||||
}
|
||||
|
||||
fn is_rel_file_path(path: &str) -> bool {
|
||||
return parse_rel_file_path(path).is_ok();
|
||||
}
|
||||
43
pageserver/src/bin/cli/main.rs
Normal file
43
pageserver/src/bin/cli/main.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings};
|
||||
|
||||
pub mod pg;
|
||||
pub mod snapshot;
|
||||
pub mod storage;
|
||||
mod subcommand;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let cli_commands = subcommand::ClapCommands {
|
||||
commands: vec![
|
||||
Box::new(pg::PgCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("pg"),
|
||||
}),
|
||||
Box::new(storage::StorageCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("storage"),
|
||||
}),
|
||||
Box::new(snapshot::SnapshotCmd {
|
||||
clap_cmd: clap::SubCommand::with_name("snapshot"),
|
||||
}),
|
||||
],
|
||||
};
|
||||
|
||||
let matches = App::new("zenith")
|
||||
.about("Zenith CLI")
|
||||
.version("1.0")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommands(cli_commands.generate())
|
||||
.get_matches();
|
||||
|
||||
if let Some(subcommand) = matches.subcommand_name() {
|
||||
println!("'git {}' was used", subcommand);
|
||||
}
|
||||
|
||||
match matches.subcommand() {
|
||||
("pg", Some(sub_args)) => cli_commands.commands[0].run(sub_args.clone())?,
|
||||
("storage", Some(sub_args)) => cli_commands.commands[1].run(sub_args.clone())?,
|
||||
("snapshot", Some(sub_args)) => cli_commands.commands[2].run(sub_args.clone())?,
|
||||
("", None) => println!("No subcommand"),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
105
pageserver/src/bin/cli/pg.rs
Normal file
105
pageserver/src/bin/cli/pg.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct PgCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for PgCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith compute nodes")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list").about("List existing compute nodes"))
|
||||
.subcommand(
|
||||
App::new("create")
|
||||
.about(
|
||||
"Create (init) new data directory using given storage and start postgres",
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("storage")
|
||||
.short("s")
|
||||
.long("storage")
|
||||
.takes_value(true)
|
||||
.help("Name of the storage node to use"),
|
||||
)
|
||||
//TODO should it be just name of uploaded snapshot or some path?
|
||||
.arg(
|
||||
Arg::with_name("snapshot")
|
||||
.long("snapshot")
|
||||
.takes_value(true)
|
||||
.help("Name of the snapshot to use"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("nostart")
|
||||
.long("no-start")
|
||||
.takes_value(false)
|
||||
.help("Don't start postgres on the created node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("destroy")
|
||||
.about("Stop postgres and destroy node's data directory")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("start")
|
||||
.about("Start postgres on the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("replica")
|
||||
.long("replica")
|
||||
.takes_value(false)
|
||||
.help("Start the compute node as replica"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("stop")
|
||||
.about("Stop postgres on the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
.subcommand(
|
||||
App::new("show")
|
||||
.about("Show info about the given node")
|
||||
.arg(
|
||||
Arg::with_name("name")
|
||||
.short("n")
|
||||
.long("name")
|
||||
.takes_value(true)
|
||||
.help("Name of the compute node"),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run PgCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
27
pageserver/src/bin/cli/snapshot.rs
Normal file
27
pageserver/src/bin/cli/snapshot.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct SnapshotCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for SnapshotCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith snapshots")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("create").arg(Arg::with_name("pgdata").required(true)))
|
||||
.subcommand(App::new("destroy"))
|
||||
.subcommand(App::new("start"))
|
||||
.subcommand(App::new("stop"))
|
||||
.subcommand(App::new("show"))
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run SnapshotCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
25
pageserver/src/bin/cli/storage.rs
Normal file
25
pageserver/src/bin/cli/storage.rs
Normal file
@@ -0,0 +1,25 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, AppSettings};
|
||||
|
||||
use crate::subcommand;
|
||||
|
||||
pub struct StorageCmd<'a> {
|
||||
pub clap_cmd: clap::App<'a, 'a>,
|
||||
}
|
||||
|
||||
impl subcommand::SubCommand for StorageCmd<'_> {
|
||||
fn gen_clap_command(&self) -> clap::App {
|
||||
let c = self.clap_cmd.clone();
|
||||
c.about("Operations with zenith storage nodes")
|
||||
.setting(AppSettings::SubcommandRequiredElseHelp)
|
||||
.subcommand(App::new("list"))
|
||||
.subcommand(App::new("attach"))
|
||||
.subcommand(App::new("detach"))
|
||||
.subcommand(App::new("show"))
|
||||
}
|
||||
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()> {
|
||||
println!("Run StorageCmd with args {:?}", args);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
29
pageserver/src/bin/cli/subcommand.rs
Normal file
29
pageserver/src/bin/cli/subcommand.rs
Normal file
@@ -0,0 +1,29 @@
|
||||
use anyhow::Result;
|
||||
|
||||
/// All subcommands need to implement this interface.
|
||||
pub trait SubCommand {
|
||||
/// Generates the cli-config that Clap requires for the subcommand.
|
||||
fn gen_clap_command(&self) -> clap::App;
|
||||
|
||||
/// Runs the body of the subcommand.
|
||||
fn run(&self, args: clap::ArgMatches) -> Result<()>;
|
||||
}
|
||||
|
||||
/// A struct which holds a vector of heap-allocated `Box`es of trait objects all of which must
|
||||
/// implement the `SubCommand` trait, but other than that, can be of any type.
|
||||
pub struct ClapCommands {
|
||||
pub commands: Vec<Box<dyn SubCommand>>,
|
||||
}
|
||||
|
||||
impl ClapCommands {
|
||||
/// Generates a vector of `clap::Apps` that can be passed into clap's `.subcommands()` method in
|
||||
/// order to generate the full CLI.
|
||||
pub fn generate(&self) -> Vec<clap::App> {
|
||||
let mut v: Vec<clap::App> = Vec::new();
|
||||
|
||||
for command in self.commands.iter() {
|
||||
v.push(command.gen_clap_command());
|
||||
}
|
||||
v
|
||||
}
|
||||
}
|
||||
@@ -4,63 +4,72 @@
|
||||
|
||||
use log::*;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::thread;
|
||||
use std::{fs::File, fs::OpenOptions, str::FromStr};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
use slog_scope;
|
||||
use slog_stdlog;
|
||||
|
||||
use pageserver::page_service;
|
||||
use pageserver::restore_s3;
|
||||
use pageserver::tui;
|
||||
//use pageserver::walreceiver;
|
||||
use pageserver::walreceiver;
|
||||
use pageserver::PageServerConf;
|
||||
|
||||
fn zenith_repo_dir() -> String {
|
||||
// Find repository path
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => String::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
fn main() -> Result<(), io::Error> {
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("interactive")
|
||||
.short("i")
|
||||
.long("interactive")
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(Arg::with_name("datadir")
|
||||
.short("D")
|
||||
.long("dir")
|
||||
.takes_value(true)
|
||||
.help("Path to the page server data directory"))
|
||||
.arg(Arg::with_name("wal_producer")
|
||||
.short("w")
|
||||
.long("wal-producer")
|
||||
.takes_value(true)
|
||||
.help("connect to the WAL sender (postgres or wal_acceptor) on connstr (default: 'host=127.0.0.1 port=65432 user=zenith')"))
|
||||
.arg(Arg::with_name("listen")
|
||||
.short("l")
|
||||
.long("listen")
|
||||
.takes_value(true)
|
||||
.help("listen for incoming page requests on ip:port (default: 127.0.0.1:5430)"))
|
||||
.arg(Arg::with_name("interactive")
|
||||
.short("i")
|
||||
.long("interactive")
|
||||
.takes_value(false)
|
||||
.help("Interactive mode"))
|
||||
.arg(Arg::with_name("daemonize")
|
||||
.short("d")
|
||||
.long("daemonize")
|
||||
.takes_value(false)
|
||||
.help("Run in the background"))
|
||||
.arg(Arg::with_name("skip_recovery")
|
||||
.long("skip-recovery")
|
||||
.takes_value(false)
|
||||
.help("Skip S3 recovery procedy and start empty"))
|
||||
.get_matches();
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
data_dir: PathBuf::from("./"),
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
wal_producer_connstr: None,
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
skip_recovery: false,
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
conf.data_dir = PathBuf::from(dir);
|
||||
}
|
||||
|
||||
if arg_matches.is_present("daemonize") {
|
||||
conf.daemonize = true;
|
||||
}
|
||||
@@ -70,21 +79,31 @@ fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
if conf.daemonize && conf.interactive {
|
||||
eprintln!("--daemonize is not allowed with --interactive: choose one");
|
||||
exit(1);
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"--daemonize is not allowed with --interactive: choose one",
|
||||
));
|
||||
}
|
||||
|
||||
if arg_matches.is_present("skip_recovery") {
|
||||
conf.skip_recovery = true;
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("wal_producer") {
|
||||
conf.wal_producer_connstr = Some(String::from_str(addr).unwrap());
|
||||
}
|
||||
|
||||
if let Some(addr) = arg_matches.value_of("listen") {
|
||||
conf.listen_addr = addr.parse()?;
|
||||
conf.listen_addr = addr.parse().unwrap();
|
||||
}
|
||||
|
||||
start_pageserver(&conf)
|
||||
start_pageserver(conf)
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
fn start_pageserver(conf: PageServerConf) -> Result<(), io::Error> {
|
||||
// Initialize logger
|
||||
let _scope_guard = init_logging(&conf)?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
let _scope_guard = init_logging(&conf);
|
||||
let _log_guard = slog_stdlog::init().unwrap();
|
||||
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
@@ -108,25 +127,22 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
let repodir = PathBuf::from(zenith_repo_dir());
|
||||
|
||||
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let log_filename = repodir.join("pageserver.log");
|
||||
// that we will see any accidental manual fpritf's or backtraces.
|
||||
let stdout = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
.open(conf.data_dir.join("pageserver.log"))
|
||||
.unwrap();
|
||||
let stderr = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
.open(conf.data_dir.join("pageserver.log"))
|
||||
.unwrap();
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file(repodir.clone().join("pageserver.pid"))
|
||||
.working_directory(repodir)
|
||||
.pid_file(conf.data_dir.join("pageserver.pid"))
|
||||
.working_directory(conf.data_dir.clone())
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
@@ -134,37 +150,58 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(e) => error!("Error, {}", e),
|
||||
}
|
||||
} else {
|
||||
// change into the repository directory. In daemon mode, Daemonize
|
||||
// does this for us.
|
||||
let repodir = zenith_repo_dir();
|
||||
std::env::set_current_dir(&repodir)?;
|
||||
info!("Changed current directory to repository in {}", &repodir);
|
||||
}
|
||||
|
||||
let mut threads = Vec::new();
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
info!("starting...");
|
||||
|
||||
// Before opening up for connections, restore the latest base backup from S3.
|
||||
// (We don't persist anything to local disk at the moment, so we need to do
|
||||
// this at every startup)
|
||||
// TODO move it to a separate function
|
||||
if !conf.skip_recovery {
|
||||
restore_s3::restore_main(&conf);
|
||||
}
|
||||
|
||||
// Create directory for wal-redo datadirs
|
||||
match fs::create_dir("wal-redo") {
|
||||
match fs::create_dir(conf.data_dir.join("wal-redo")) {
|
||||
Ok(_) => {}
|
||||
Err(e) => match e.kind() {
|
||||
io::ErrorKind::AlreadyExists => {}
|
||||
_ => {
|
||||
anyhow::bail!("Failed to create wal-redo data directory: {}", e);
|
||||
panic!("Failed to create wal-redo data directory: {}", e);
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
// Launch the WAL receiver thread if pageserver was started with --wal-producer
|
||||
// option. It will try to connect to the WAL safekeeper, and stream the WAL. If
|
||||
// the connection is lost, it will reconnect on its own. We just fire and forget
|
||||
// it here.
|
||||
//
|
||||
// All other wal receivers are started on demand by "callmemaybe" command
|
||||
// sent to pageserver.
|
||||
let conf_copy = conf.clone();
|
||||
if let Some(wal_producer) = conf.wal_producer_connstr {
|
||||
let conf = conf_copy.clone();
|
||||
let walreceiver_thread = thread::Builder::new()
|
||||
.name("static WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
walreceiver::thread_main(conf, &wal_producer);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(walreceiver_thread);
|
||||
}
|
||||
|
||||
// GetPage@LSN requests are served by another thread. (It uses async I/O,
|
||||
// but the code in page_service sets up it own thread pool for that)
|
||||
let conf_copy = conf.clone();
|
||||
let conf = conf_copy.clone();
|
||||
let page_server_thread = thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
page_service::thread_main(&conf_copy);
|
||||
page_service::thread_main(conf);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(page_server_thread);
|
||||
@@ -181,27 +218,23 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
fn init_logging(conf: &PageServerConf) -> slog_scope::GlobalLoggerGuard {
|
||||
if conf.interactive {
|
||||
Ok(tui::init_logging())
|
||||
tui::init_logging()
|
||||
} else if conf.daemonize {
|
||||
let log = zenith_repo_dir() + "/pageserver.log";
|
||||
let log_file = File::create(&log).map_err(|err| {
|
||||
// We failed to initialize logging, so we can't log this message with error!
|
||||
eprintln!("Could not create log file {:?}: {}", log, err);
|
||||
err
|
||||
})?;
|
||||
let log = conf.data_dir.join("pageserver.log");
|
||||
let log_file = File::create(log).unwrap_or_else(|_| panic!("Could not create log file"));
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
if record.level().is_at_least(slog::Level::Debug) {
|
||||
if record.level().is_at_least(slog::Level::Info) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
slog_scope::set_global_logger(logger)
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build().fuse();
|
||||
@@ -219,6 +252,6 @@ fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard,
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
slog_scope::set_global_logger(logger)
|
||||
}
|
||||
}
|
||||
|
||||
218
pageserver/src/controlfile.rs
Normal file
218
pageserver/src/controlfile.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::prelude::*;
|
||||
use std::io::SeekFrom;
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use log::*;
|
||||
|
||||
type XLogRecPtr = u64;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone)]
|
||||
/*
|
||||
* Body of CheckPoint XLOG records. This is declared here because we keep
|
||||
* a copy of the latest one in pg_control for possible disaster recovery.
|
||||
* Changing this struct requires a PG_CONTROL_VERSION bump.
|
||||
*/
|
||||
pub struct CheckPoint {
|
||||
pub redo: XLogRecPtr, /* next RecPtr available when we began to
|
||||
* create CheckPoint (i.e. REDO start point) */
|
||||
pub ThisTimeLineID: u32, /* current TLI */
|
||||
pub PrevTimeLineID: u32, /* previous TLI, if this record begins a new
|
||||
* timeline (equals ThisTimeLineID otherwise) */
|
||||
pub fullPageWrites: bool, /* current full_page_writes */
|
||||
pub nextXid: u64, /* next free transaction ID */
|
||||
pub nextOid: u32, /* next free OID */
|
||||
pub nextMulti: u32, /* next free MultiXactId */
|
||||
pub nextMultiOffset: u32, /* next free MultiXact offset */
|
||||
pub oldestXid: u32, /* cluster-wide minimum datfrozenxid */
|
||||
pub oldestXidDB: u32, /* database with minimum datfrozenxid */
|
||||
pub oldestMulti: u32, /* cluster-wide minimum datminmxid */
|
||||
pub oldestMultiDB: u32, /* database with minimum datminmxid */
|
||||
pub time: u64, /* time stamp of checkpoint */
|
||||
pub oldestCommitTsXid: u32, /* oldest Xid with valid commit
|
||||
* timestamp */
|
||||
pub newestCommitTsXid: u32, /* newest Xid with valid commit
|
||||
* timestamp */
|
||||
|
||||
/*
|
||||
* Oldest XID still running. This is only needed to initialize hot standby
|
||||
* mode from an online checkpoint, so we only bother calculating this for
|
||||
* online checkpoints and only when wal_level is replica. Otherwise it's
|
||||
* set to InvalidTransactionId.
|
||||
*/
|
||||
pub oldestActiveXid: u32,
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ControlFileDataZenith {
|
||||
pub system_identifier: u64,
|
||||
pg_control_version: u32, /* PG_CONTROL_VERSION */
|
||||
catalog_version_no: u32, /* see catversion.h */
|
||||
|
||||
state: i32, /* see enum above */
|
||||
time: i64, /* time stamp of last pg_control update */
|
||||
pub checkPoint: XLogRecPtr,
|
||||
checkPointCopy: CheckPoint, /* copy of last check point record */
|
||||
unloggedLSN: XLogRecPtr, /* current fake LSN value, for unlogged rels */
|
||||
minRecoveryPoint: XLogRecPtr,
|
||||
minRecoveryPointTLI: u32,
|
||||
backupStartPoint: XLogRecPtr,
|
||||
backupEndPoint: XLogRecPtr,
|
||||
backupEndRequired: bool,
|
||||
}
|
||||
|
||||
impl ControlFileDataZenith {
|
||||
pub fn new() -> ControlFileDataZenith {
|
||||
ControlFileDataZenith {
|
||||
system_identifier: 0,
|
||||
pg_control_version: 0,
|
||||
catalog_version_no: 0,
|
||||
state: 0,
|
||||
time: 0,
|
||||
checkPoint: 0,
|
||||
checkPointCopy: {
|
||||
CheckPoint {
|
||||
redo: 0,
|
||||
ThisTimeLineID: 0,
|
||||
PrevTimeLineID: 0,
|
||||
fullPageWrites: false,
|
||||
nextXid: 0,
|
||||
nextOid: 0,
|
||||
nextMulti: 0,
|
||||
nextMultiOffset: 0,
|
||||
oldestXid: 0,
|
||||
oldestXidDB: 0,
|
||||
oldestMulti: 0,
|
||||
oldestMultiDB: 0,
|
||||
time: 0,
|
||||
oldestCommitTsXid: 0,
|
||||
newestCommitTsXid: 0,
|
||||
oldestActiveXid: 0,
|
||||
}
|
||||
},
|
||||
unloggedLSN: 0,
|
||||
minRecoveryPoint: 0,
|
||||
minRecoveryPointTLI: 0,
|
||||
backupStartPoint: 0,
|
||||
backupEndPoint: 0,
|
||||
backupEndRequired: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_pg_control(mut buf: Bytes) -> ControlFileDataZenith {
|
||||
info!("decode pg_control");
|
||||
|
||||
let controlfile: ControlFileDataZenith = ControlFileDataZenith {
|
||||
system_identifier: buf.get_u64_le(),
|
||||
pg_control_version: buf.get_u32_le(),
|
||||
catalog_version_no: buf.get_u32_le(),
|
||||
state: buf.get_i32_le(),
|
||||
time: {
|
||||
buf.advance(4);
|
||||
buf.get_i64_le()
|
||||
},
|
||||
checkPoint: buf.get_u64_le(),
|
||||
checkPointCopy: {
|
||||
CheckPoint {
|
||||
redo: buf.get_u64_le(),
|
||||
ThisTimeLineID: buf.get_u32_le(),
|
||||
PrevTimeLineID: buf.get_u32_le(),
|
||||
fullPageWrites: buf.get_u8() != 0,
|
||||
nextXid: {
|
||||
buf.advance(7);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
nextOid: buf.get_u32_le(),
|
||||
nextMulti: buf.get_u32_le(),
|
||||
nextMultiOffset: buf.get_u32_le(),
|
||||
oldestXid: buf.get_u32_le(),
|
||||
oldestXidDB: buf.get_u32_le(),
|
||||
oldestMulti: buf.get_u32_le(),
|
||||
oldestMultiDB: buf.get_u32_le(),
|
||||
time: {
|
||||
buf.advance(4);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
oldestCommitTsXid: buf.get_u32_le(),
|
||||
newestCommitTsXid: buf.get_u32_le(),
|
||||
oldestActiveXid: buf.get_u32_le(),
|
||||
}
|
||||
},
|
||||
unloggedLSN: buf.get_u64_le(),
|
||||
minRecoveryPoint: buf.get_u64_le(),
|
||||
minRecoveryPointTLI: buf.get_u32_le(),
|
||||
backupStartPoint: {
|
||||
buf.advance(4);
|
||||
buf.get_u64_le()
|
||||
},
|
||||
backupEndPoint: buf.get_u64_le(),
|
||||
backupEndRequired: buf.get_u8() != 0,
|
||||
};
|
||||
|
||||
return controlfile;
|
||||
}
|
||||
|
||||
pub fn parse_controlfile(b: Bytes) {
|
||||
let controlfile = decode_pg_control(b);
|
||||
|
||||
info!(
|
||||
"controlfile {:X}/{:X}",
|
||||
controlfile.checkPoint >> 32,
|
||||
controlfile.checkPoint
|
||||
);
|
||||
info!("controlfile {:?}", controlfile);
|
||||
}
|
||||
|
||||
const MAX_MAPPINGS: usize = 62;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct RelMapping {
|
||||
mapoid: u32, /* OID of a catalog */
|
||||
mapfilenode: u32, /* its filenode number */
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RelMapFile {
|
||||
magic: i32, /* always RELMAPPER_FILEMAGIC */
|
||||
num_mappings: i32, /* number of valid RelMapping entries */
|
||||
mappings: [u8; MAX_MAPPINGS * 8],
|
||||
crc: u32, /* CRC of all above */
|
||||
pad: i32, /* to make the struct size be 512 exactly */
|
||||
}
|
||||
|
||||
pub fn decode_filemapping(mut buf: Bytes) -> RelMapFile {
|
||||
info!("decode filemap");
|
||||
|
||||
let file: RelMapFile = RelMapFile {
|
||||
magic: buf.get_i32_le(), /* always RELMAPPER_FILEMAGIC */
|
||||
num_mappings: buf.get_i32_le(), /* number of valid RelMapping entries */
|
||||
mappings: {
|
||||
let mut arr = [0 as u8; MAX_MAPPINGS * 8];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
arr
|
||||
},
|
||||
crc: buf.get_u32_le(), /* CRC of all above */
|
||||
pad: buf.get_i32_le(),
|
||||
};
|
||||
|
||||
info!("decode filemap {:?}", file);
|
||||
file
|
||||
}
|
||||
|
||||
pub fn write_buf_to_file(filepath: String, buf: Bytes, blkno: u32) {
|
||||
info!("write_buf_to_file {}", filepath.clone());
|
||||
|
||||
let mut buffer = File::create(filepath.clone()).unwrap();
|
||||
buffer.seek(SeekFrom::Start(8192 * blkno as u64)).unwrap();
|
||||
|
||||
buffer.write_all(&buf).unwrap();
|
||||
|
||||
info!("DONE write_buf_to_file {}", filepath);
|
||||
}
|
||||
@@ -1,12 +1,12 @@
|
||||
use std::fmt;
|
||||
use std::net::SocketAddr;
|
||||
use std::str::FromStr;
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod controlfile;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
#[allow(dead_code)]
|
||||
pub mod pg_constants;
|
||||
pub mod restore_local_repo;
|
||||
pub mod restore_s3;
|
||||
pub mod tui;
|
||||
pub mod tui_event;
|
||||
mod tui_logger;
|
||||
@@ -14,47 +14,13 @@ pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PageServerConf {
|
||||
pub data_dir: PathBuf,
|
||||
pub daemonize: bool,
|
||||
pub interactive: bool,
|
||||
pub wal_producer_connstr: Option<String>,
|
||||
pub listen_addr: SocketAddr,
|
||||
}
|
||||
|
||||
// Zenith Timeline ID is a 32-byte random ID.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct ZTimelineId([u8; 16]);
|
||||
|
||||
impl FromStr for ZTimelineId {
|
||||
type Err = hex::FromHexError;
|
||||
|
||||
fn from_str(s: &str) -> Result<ZTimelineId, Self::Err> {
|
||||
let timelineid = hex::decode(s)?;
|
||||
|
||||
let mut buf: [u8; 16] = [0u8; 16];
|
||||
buf.copy_from_slice(timelineid.as_slice());
|
||||
Ok(ZTimelineId(buf))
|
||||
}
|
||||
}
|
||||
|
||||
impl ZTimelineId {
|
||||
pub fn from(b: [u8; 16]) -> ZTimelineId {
|
||||
ZTimelineId(b)
|
||||
}
|
||||
|
||||
pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZTimelineId {
|
||||
let mut arr = [0u8; 16];
|
||||
buf.copy_to_slice(&mut arr);
|
||||
ZTimelineId::from(arr)
|
||||
}
|
||||
|
||||
pub fn as_arr(&self) -> [u8; 16] {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ZTimelineId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(&hex::encode(self.0))
|
||||
}
|
||||
pub skip_recovery: bool,
|
||||
}
|
||||
|
||||
@@ -6,24 +6,26 @@
|
||||
// per-entry mutex.
|
||||
//
|
||||
|
||||
use crate::restore_local_repo::restore_timeline;
|
||||
use crate::ZTimelineId;
|
||||
use crate::{walredo, PageServerConf};
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use core::ops::Bound::Included;
|
||||
use crossbeam_channel::unbounded;
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use rand::Rng;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::{convert::TryInto, ops::AddAssign};
|
||||
|
||||
use std::error::Error;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{convert::TryInto, ops::AddAssign};
|
||||
// use tokio::sync::RwLock;
|
||||
use bytes::Bytes;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use rand::Rng;
|
||||
|
||||
use crate::{controlfile, walredo, PageServerConf};
|
||||
|
||||
use crossbeam_channel::unbounded;
|
||||
use crossbeam_channel::{Receiver, Sender};
|
||||
|
||||
// Timeout when waiting or WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
static TIMEOUT: Duration = Duration::from_secs(60);
|
||||
@@ -106,56 +108,34 @@ struct PageCacheShared {
|
||||
first_valid_lsn: u64,
|
||||
last_valid_lsn: u64,
|
||||
last_record_lsn: u64,
|
||||
|
||||
controldata: controlfile::ControlFileDataZenith,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
pub static ref PAGECACHES: Mutex<HashMap<ZTimelineId, Arc<PageCache>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
pub static ref PAGECACHES: Mutex<HashMap<u64, Arc<PageCache>>> = Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
// Get Page Cache for given timeline. It is assumed to already exist.
|
||||
pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option<Arc<PageCache>> {
|
||||
let pcaches = PAGECACHES.lock().unwrap();
|
||||
|
||||
match pcaches.get(&timelineid) {
|
||||
Some(pcache) => Some(pcache.clone()),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_or_restore_pagecache(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
) -> anyhow::Result<Arc<PageCache>> {
|
||||
pub fn get_pagecache(conf: PageServerConf, sys_id: u64) -> Arc<PageCache> {
|
||||
let mut pcaches = PAGECACHES.lock().unwrap();
|
||||
|
||||
match pcaches.get(&timelineid) {
|
||||
Some(pcache) => Ok(pcache.clone()),
|
||||
None => {
|
||||
let pcache = init_page_cache();
|
||||
if !pcaches.contains_key(&sys_id) {
|
||||
pcaches.insert(sys_id, Arc::new(init_page_cache()));
|
||||
|
||||
restore_timeline(conf, &pcache, timelineid)?;
|
||||
|
||||
let result = Arc::new(pcache);
|
||||
|
||||
pcaches.insert(timelineid, result.clone());
|
||||
|
||||
// Initialize the WAL redo thread
|
||||
//
|
||||
// Now join_handle is not saved any where and we won'try restart tharead
|
||||
// if it is dead. We may later stop that treads after some inactivity period
|
||||
// and restart them on demand.
|
||||
let conf_copy = conf.clone();
|
||||
let _walredo_thread = thread::Builder::new()
|
||||
.name("WAL redo thread".into())
|
||||
.spawn(move || {
|
||||
walredo::wal_redo_main(&conf_copy, timelineid);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
// Initialize the WAL redo thread
|
||||
//
|
||||
// Now join_handle is not saved any where and we won'try restart tharead
|
||||
// if it is dead. We may later stop that treads after some inactivity period
|
||||
// and restart them on demand.
|
||||
let _walredo_thread = thread::Builder::new()
|
||||
.name("WAL redo thread".into())
|
||||
.spawn(move || {
|
||||
walredo::wal_redo_main(conf, sys_id);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
pcaches.get(&sys_id).unwrap().clone()
|
||||
}
|
||||
|
||||
fn init_page_cache() -> PageCache {
|
||||
@@ -169,6 +149,7 @@ fn init_page_cache() -> PageCache {
|
||||
first_valid_lsn: 0,
|
||||
last_valid_lsn: 0,
|
||||
last_record_lsn: 0,
|
||||
controldata: controlfile::ControlFileDataZenith::new(),
|
||||
}),
|
||||
valid_lsn_condvar: Condvar::new(),
|
||||
|
||||
@@ -199,7 +180,7 @@ fn init_page_cache() -> PageCache {
|
||||
// stored directly in the cache entry in that you still need to run the WAL redo
|
||||
// routine to generate the page image.
|
||||
//
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Debug)]
|
||||
pub struct CacheKey {
|
||||
pub tag: BufferTag,
|
||||
pub lsn: u64,
|
||||
@@ -227,7 +208,7 @@ pub struct CacheEntryContent {
|
||||
impl CacheEntry {
|
||||
fn new(key: CacheKey) -> CacheEntry {
|
||||
CacheEntry {
|
||||
key,
|
||||
key: key,
|
||||
content: Mutex::new(CacheEntryContent {
|
||||
page_image: None,
|
||||
wal_record: None,
|
||||
@@ -238,7 +219,7 @@ impl CacheEntry {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq, Hash, Clone, Copy)]
|
||||
#[derive(Eq, PartialEq, Hash, Clone, Copy, Debug)]
|
||||
pub struct RelTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
@@ -260,87 +241,176 @@ pub struct WALRecord {
|
||||
pub lsn: u64, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub rec: Bytes,
|
||||
// Remember the offset of main_data in rec,
|
||||
// so that we don't have to parse the record again.
|
||||
// If record has no main_data, this offset equals rec.len().
|
||||
pub main_data_offset: usize,
|
||||
}
|
||||
|
||||
// Public interface functions
|
||||
|
||||
impl PageCache {
|
||||
pub fn get_nonrel_page(&self, tag: BufferTag, _reqlsn: u64) -> Result<Bytes, Box<dyn Error>> {
|
||||
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// Now we don't have versioning for non-rel pages.
|
||||
// Also at bootstrap we don't know lsn for some files.
|
||||
// So always request the very latest version
|
||||
// let lsn = reqlsn;
|
||||
|
||||
let lsn = u64::MAX;
|
||||
|
||||
let minkey = CacheKey { tag: tag, lsn: 0 };
|
||||
// Look up to the largest lsn
|
||||
let maxkey = CacheKey { tag: tag, lsn: lsn };
|
||||
|
||||
let entry_rc: Arc<CacheEntry>;
|
||||
{
|
||||
let shared = self.shared.lock().unwrap();
|
||||
|
||||
let pagecache = &shared.pagecache;
|
||||
info!("got pagecache {}", pagecache.len());
|
||||
|
||||
let mut entries = pagecache.range((Included(&minkey), Included(&maxkey)));
|
||||
|
||||
let entry_opt = entries.next_back();
|
||||
|
||||
if entry_opt.is_none() {
|
||||
return Err(format!(
|
||||
"not found non-rel page with LSN {} for {}/{}/{}.{} blk {}",
|
||||
lsn, tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum
|
||||
))?;
|
||||
}
|
||||
|
||||
info!(
|
||||
"found non-rel page with LSN {} for {}/{}/{}.{} blk {}",
|
||||
lsn, tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum
|
||||
);
|
||||
|
||||
let (_key, entry) = entry_opt.unwrap();
|
||||
entry_rc = entry.clone();
|
||||
|
||||
// Now that we have a reference to the cache entry, drop the lock on the map.
|
||||
// It's important to do this before waiting on the condition variable below,
|
||||
// and better to do it as soon as possible to maximize concurrency.
|
||||
}
|
||||
|
||||
// Lock the cache entry and dig the page image out of it.
|
||||
let page_img: Bytes;
|
||||
{
|
||||
let entry_content = entry_rc.content.lock().unwrap();
|
||||
|
||||
if let Some(img) = &entry_content.page_image {
|
||||
assert!(!entry_content.apply_pending);
|
||||
page_img = img.clone();
|
||||
} else if entry_content.wal_record.is_some() {
|
||||
return Err("non-rel WAL redo is not implemented yet".into());
|
||||
//
|
||||
// If this page needs to be reconstructed by applying some WAL,
|
||||
// send a request to the WAL redo thread.
|
||||
//
|
||||
// if !entry_content.apply_pending {
|
||||
// assert!(!entry_content.apply_pending);
|
||||
// entry_content.apply_pending = true;
|
||||
|
||||
// let s = &self.walredo_sender;
|
||||
// s.send(entry_rc.clone())?;
|
||||
// }
|
||||
|
||||
// while entry_content.apply_pending {
|
||||
// entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap();
|
||||
//}
|
||||
|
||||
// We should now have a page image. If we don't, it means that WAL redo
|
||||
// failed to reconstruct it. WAL redo should've logged that error already.
|
||||
// page_img = match &entry_content.page_image {
|
||||
// Some(p) => p.clone(),
|
||||
// None => {
|
||||
// error!("could not apply WAL to reconstruct page image for GetPage@LSN request");
|
||||
// return Err("could not apply WAL to reconstruct page image".into());
|
||||
// }
|
||||
// };
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
return Err(format!("no page image or WAL record for requested page"))?;
|
||||
}
|
||||
}
|
||||
|
||||
trace!(
|
||||
"Returning page for {}/{}/{}.{} blk {}",
|
||||
tag.spcnode,
|
||||
tag.dbnode,
|
||||
tag.relnode,
|
||||
tag.forknum,
|
||||
tag.blknum
|
||||
);
|
||||
|
||||
return Ok(page_img);
|
||||
}
|
||||
|
||||
//
|
||||
// GetPage@LSN
|
||||
//
|
||||
// Returns an 8k page image
|
||||
//
|
||||
pub fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: u64) -> anyhow::Result<Bytes> {
|
||||
pub fn get_page_at_lsn(&self, tag: BufferTag, reqlsn: u64) -> Result<Bytes, Box<dyn Error>> {
|
||||
let mut lsn = reqlsn;
|
||||
|
||||
if tag.forknum > 40 {
|
||||
info!(
|
||||
"get_page_at_lsn got request for page with LSN {} for {}/{}/{}.{} blk {}",
|
||||
lsn, tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum
|
||||
);
|
||||
|
||||
return self.get_nonrel_page(tag, lsn);
|
||||
}
|
||||
|
||||
if reqlsn == 0 {
|
||||
let c = self.get_controldata();
|
||||
lsn = c.checkPoint;
|
||||
|
||||
info!("update reqlsn get_page_at_lsn got request for page with LSN {} for {}/{}/{}.{} blk {}", lsn,
|
||||
tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum);
|
||||
}
|
||||
|
||||
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let mut lsn = req_lsn;
|
||||
//When invalid LSN is requested, it means "don't wait, return latest version of the page"
|
||||
//This is necessary for bootstrap.
|
||||
//TODO should we use last_valid_lsn here instead of maxvalue?
|
||||
if lsn == 0
|
||||
{
|
||||
lsn = 0xffff_ffff_ffff_eeee;
|
||||
}
|
||||
// Look up cache entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let minkey = CacheKey { tag, lsn: 0 };
|
||||
let maxkey = CacheKey { tag, lsn };
|
||||
|
||||
let minkey = CacheKey { tag: tag, lsn: 0 };
|
||||
let maxkey = CacheKey { tag: tag, lsn: lsn };
|
||||
let entry_rc: Arc<CacheEntry>;
|
||||
{
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
let mut waited = false;
|
||||
|
||||
// There is a a race at postgres instance start
|
||||
// when we request a page before walsender established connection
|
||||
// and was able to stream the page. Just don't wait and return what we have.
|
||||
if req_lsn == 0
|
||||
{
|
||||
// When server just started and created checkpoint lsn,
|
||||
// but we have not yet established connection,
|
||||
// requested lsn will be larger than the one we have
|
||||
while lsn > shared.last_valid_lsn + 500 {
|
||||
// TODO: Wait for the WAL receiver to catch up
|
||||
waited = true;
|
||||
trace!(
|
||||
"walsender hasn't started yet. Don't wait. last_valid_lsn {}, requested {}",
|
||||
shared.last_valid_lsn, lsn);
|
||||
}
|
||||
"not caught up yet: {}, requested {}",
|
||||
shared.last_valid_lsn,
|
||||
lsn
|
||||
);
|
||||
let wait_result = self
|
||||
.valid_lsn_condvar
|
||||
.wait_timeout(shared, TIMEOUT)
|
||||
.unwrap();
|
||||
|
||||
if req_lsn != 0
|
||||
{
|
||||
while lsn > shared.last_valid_lsn {
|
||||
// TODO: Wait for the WAL receiver to catch up
|
||||
waited = true;
|
||||
trace!(
|
||||
"not caught up yet: {}, requested {}",
|
||||
shared.last_valid_lsn,
|
||||
shared = wait_result.0;
|
||||
if wait_result.1.timed_out() {
|
||||
return Err(format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive",
|
||||
lsn
|
||||
);
|
||||
let wait_result = self
|
||||
.valid_lsn_condvar
|
||||
.wait_timeout(shared, TIMEOUT)
|
||||
.unwrap();
|
||||
|
||||
shared = wait_result.0;
|
||||
if wait_result.1.timed_out() {
|
||||
bail!(
|
||||
"Timed out while waiting for WAL record at LSN {:X}/{:X} to arrive",
|
||||
lsn >> 32,
|
||||
lsn & 0xffff_ffff
|
||||
);
|
||||
}
|
||||
))?;
|
||||
}
|
||||
}
|
||||
|
||||
if waited {
|
||||
trace!("caught up now, continuing");
|
||||
}
|
||||
|
||||
if lsn < shared.first_valid_lsn {
|
||||
bail!(
|
||||
"LSN {:X}/{:X} has already been removed",
|
||||
lsn >> 32,
|
||||
lsn & 0xffff_ffff
|
||||
);
|
||||
return Err(format!("LSN {} has already been removed", lsn))?;
|
||||
}
|
||||
|
||||
let pagecache = &shared.pagecache;
|
||||
@@ -350,9 +420,9 @@ impl PageCache {
|
||||
let entry_opt = entries.next_back();
|
||||
|
||||
if entry_opt.is_none() {
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
return Ok(Bytes::from_static(&ZERO_PAGE));
|
||||
/* return Err("could not find page image")?; */
|
||||
//static ZERO_PAGE:[u8; 8192] = [0 as u8; 8192];
|
||||
//return Ok(Bytes::from_static(&ZERO_PAGE));
|
||||
return Err("could not find page image")?;
|
||||
}
|
||||
let (_key, entry) = entry_opt.unwrap();
|
||||
entry_rc = entry.clone();
|
||||
@@ -395,12 +465,12 @@ impl PageCache {
|
||||
error!(
|
||||
"could not apply WAL to reconstruct page image for GetPage@LSN request"
|
||||
);
|
||||
bail!("could not apply WAL to reconstruct page image");
|
||||
return Err("could not apply WAL to reconstruct page image".into());
|
||||
}
|
||||
};
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
return Err(format!("no page image or WAL record for requested page"))?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -480,8 +550,10 @@ impl PageCache {
|
||||
// Adds a WAL record to the page cache
|
||||
//
|
||||
pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
|
||||
let lsn = rec.lsn;
|
||||
let key = CacheKey { tag, lsn };
|
||||
let key = CacheKey {
|
||||
tag: tag,
|
||||
lsn: rec.lsn,
|
||||
};
|
||||
|
||||
let entry = CacheEntry::new(key.clone());
|
||||
entry.content.lock().unwrap().wal_record = Some(rec);
|
||||
@@ -499,17 +571,13 @@ impl PageCache {
|
||||
*rel_entry = tag.blknum + 1;
|
||||
}
|
||||
|
||||
//trace!("put_wal_record lsn: {}", lsn);
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
|
||||
let oldentry = shared.pagecache.insert(key, Arc::new(entry));
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
if !oldentry.is_none() {
|
||||
error!(
|
||||
"overwriting WAL record with LSN {:X}/{:X} in page cache",
|
||||
lsn >> 32,
|
||||
lsn & 0xffffffff
|
||||
);
|
||||
error!("overwriting WAL record in page cache");
|
||||
}
|
||||
|
||||
self.num_wal_records.fetch_add(1, Ordering::Relaxed);
|
||||
@@ -519,7 +587,7 @@ impl PageCache {
|
||||
// Memorize a full image of a page version
|
||||
//
|
||||
pub fn put_page_image(&self, tag: BufferTag, lsn: u64, img: Bytes) {
|
||||
let key = CacheKey { tag, lsn };
|
||||
let key = CacheKey { tag: tag, lsn: lsn };
|
||||
|
||||
let entry = CacheEntry::new(key.clone());
|
||||
entry.content.lock().unwrap().page_image = Some(img);
|
||||
@@ -531,8 +599,10 @@ impl PageCache {
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
assert!(oldentry.is_none());
|
||||
|
||||
//debug!("inserted page image for {}/{}/{}_{} blk {} at {}",
|
||||
// tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn);
|
||||
debug!(
|
||||
"inserted page image for {}/{}/{}_{} blk {} at {}",
|
||||
tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn
|
||||
);
|
||||
|
||||
self.num_page_images.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
@@ -542,22 +612,12 @@ impl PageCache {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
|
||||
// Can't move backwards.
|
||||
let oldlsn = shared.last_valid_lsn;
|
||||
if lsn >= oldlsn {
|
||||
assert!(lsn >= shared.last_valid_lsn);
|
||||
|
||||
shared.last_valid_lsn = lsn;
|
||||
self.valid_lsn_condvar.notify_all();
|
||||
shared.last_valid_lsn = lsn;
|
||||
self.valid_lsn_condvar.notify_all();
|
||||
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
} else {
|
||||
warn!(
|
||||
"attempted to move last valid LSN backwards (was {:X}/{:X}, new {:X}/{:X})",
|
||||
oldlsn >> 32,
|
||||
oldlsn & 0xffffffff,
|
||||
lsn >> 32,
|
||||
lsn & 0xffffffff
|
||||
);
|
||||
}
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
@@ -575,7 +635,7 @@ impl PageCache {
|
||||
self.valid_lsn_condvar.notify_all();
|
||||
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
self.last_record_lsn.store(lsn, Ordering::Relaxed);
|
||||
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
//
|
||||
@@ -615,6 +675,16 @@ impl PageCache {
|
||||
return shared.last_record_lsn;
|
||||
}
|
||||
|
||||
pub fn set_controldata(&self, c: controlfile::ControlFileDataZenith) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
shared.controldata = c;
|
||||
}
|
||||
|
||||
pub fn get_controldata(&self) -> controlfile::ControlFileDataZenith {
|
||||
let shared = self.shared.lock().unwrap();
|
||||
return shared.controldata.clone();
|
||||
}
|
||||
|
||||
//
|
||||
// Simple test function for the WAL redo code:
|
||||
//
|
||||
@@ -663,20 +733,19 @@ impl PageCache {
|
||||
}
|
||||
}
|
||||
|
||||
/// Remember a relation's size in blocks.
|
||||
///
|
||||
/// If 'to' is larger than the previously remembered size, the remembered size is increased to 'to'.
|
||||
/// But if it's smaller, there is no change.
|
||||
pub fn relsize_inc(&self, rel: &RelTag, to: u32) {
|
||||
// FIXME: Shouldn't relation size also be tracked with an LSN?
|
||||
// If a replica is lagging behind, it needs to get the size as it was on
|
||||
// the replica's current replay LSN.
|
||||
// FIXME: Shouldn't relation size also be tracked with an LSN?
|
||||
// If a replica is lagging behind, it needs to get the size as it was on
|
||||
// the replica's current replay LSN.
|
||||
pub fn relsize_inc(&self, rel: &RelTag, to: Option<u32>) {
|
||||
let mut shared = self.shared.lock().unwrap();
|
||||
let entry = shared.relsize_cache.entry(*rel).or_insert(0);
|
||||
|
||||
if to >= *entry {
|
||||
*entry = to;
|
||||
if let Some(to) = to {
|
||||
if to >= *entry {
|
||||
*entry = to + 1;
|
||||
}
|
||||
}
|
||||
trace!("relsize_inc {:?} to {}", rel, entry);
|
||||
}
|
||||
|
||||
pub fn relsize_get(&self, rel: &RelTag) -> u32 {
|
||||
|
||||
@@ -7,43 +7,31 @@
|
||||
// *status* -- show actual info about this pageserver,
|
||||
// *pagestream* -- enter mode where smgr and pageserver talk with their
|
||||
// custom protocol.
|
||||
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
|
||||
// *callmemaybe $url* -- ask pageserver to start walreceiver on $url
|
||||
//
|
||||
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::io;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio::runtime;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::task;
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::page_cache;
|
||||
use crate::restore_local_repo;
|
||||
use crate::walreceiver;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
|
||||
use crate::controlfile;
|
||||
|
||||
type Result<T> = std::result::Result<T, io::Error>;
|
||||
|
||||
#[derive(Debug)]
|
||||
enum FeMessage {
|
||||
StartupMessage(FeStartupMessage),
|
||||
Query(FeQueryMessage), // Simple query
|
||||
Parse(FeParseMessage), // Extended query protocol
|
||||
Describe(FeDescribeMessage),
|
||||
Bind(FeBindMessage),
|
||||
Execute(FeExecuteMessage),
|
||||
Close(FeCloseMessage),
|
||||
Sync,
|
||||
Query(FeQueryMessage),
|
||||
Terminate,
|
||||
|
||||
//
|
||||
@@ -63,14 +51,8 @@ enum BeMessage {
|
||||
AuthenticationOk,
|
||||
ReadyForQuery,
|
||||
RowDescription,
|
||||
ParseComplete,
|
||||
ParameterDescription,
|
||||
NoData,
|
||||
BindComplete,
|
||||
CloseComplete,
|
||||
DataRow,
|
||||
CommandComplete,
|
||||
ControlFile,
|
||||
|
||||
//
|
||||
// All that messages are actually CopyData from libpq point of view.
|
||||
@@ -164,176 +146,6 @@ struct FeQueryMessage {
|
||||
body: Bytes,
|
||||
}
|
||||
|
||||
// We only support the simple case of Parse on unnamed prepared statement and
|
||||
// no params
|
||||
#[derive(Debug)]
|
||||
struct FeParseMessage {
|
||||
query_string: Bytes,
|
||||
}
|
||||
|
||||
fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes> {
|
||||
let mut result = BytesMut::new();
|
||||
|
||||
loop {
|
||||
if !buf.has_remaining() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"no null-terminator in string",
|
||||
));
|
||||
}
|
||||
|
||||
let byte = buf.get_u8();
|
||||
|
||||
if byte == 0 {
|
||||
break;
|
||||
}
|
||||
result.put_u8(byte);
|
||||
}
|
||||
return Ok(result.freeze());
|
||||
}
|
||||
|
||||
impl FeParseMessage {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
let query_string = read_null_terminated(&mut buf)?;
|
||||
let nparams = buf.get_i16();
|
||||
|
||||
// FIXME: the rust-postgres driver uses a named prepared statement
|
||||
// for copy_out(). We're not prepared to handle that correctly. For
|
||||
// now, just ignore the statement name, assuming that the client never
|
||||
// uses more than one prepared statement at a time.
|
||||
/*
|
||||
if pstmt_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented in Parse",
|
||||
));
|
||||
}
|
||||
*/
|
||||
|
||||
if nparams != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"query params not implemented",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Parse(FeParseMessage { query_string }))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FeDescribeMessage {
|
||||
kind: u8, // 'S' to describe a prepared statement; or 'P' to describe a portal.
|
||||
// we only support unnamed prepared stmt or portal
|
||||
}
|
||||
|
||||
impl FeDescribeMessage {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let kind = buf.get_u8();
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
/*
|
||||
if pstmt_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented in Describe",
|
||||
));
|
||||
}
|
||||
*/
|
||||
|
||||
if kind != b'S' {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"only prepared statmement Describe is implemented",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Describe(FeDescribeMessage { kind }))
|
||||
}
|
||||
}
|
||||
|
||||
// we only support unnamed prepared stmt or portal
|
||||
#[derive(Debug)]
|
||||
struct FeExecuteMessage {
|
||||
/// max # of rows
|
||||
maxrows: i32,
|
||||
}
|
||||
|
||||
impl FeExecuteMessage {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let maxrows = buf.get_i32();
|
||||
|
||||
if portal_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named portals not implemented",
|
||||
));
|
||||
}
|
||||
|
||||
if maxrows != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"row limit in Execute message not supported",
|
||||
));
|
||||
}
|
||||
|
||||
Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
|
||||
}
|
||||
}
|
||||
|
||||
// we only support unnamed prepared stmt and portal
|
||||
#[derive(Debug)]
|
||||
struct FeBindMessage {}
|
||||
|
||||
impl FeBindMessage {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
if portal_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named portals not implemented",
|
||||
));
|
||||
}
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
/*
|
||||
if pstmt_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented",
|
||||
));
|
||||
}
|
||||
*/
|
||||
|
||||
Ok(FeMessage::Bind(FeBindMessage {}))
|
||||
}
|
||||
}
|
||||
|
||||
// we only support unnamed prepared stmt and portal
|
||||
#[derive(Debug)]
|
||||
struct FeCloseMessage {}
|
||||
|
||||
impl FeCloseMessage {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let _kind = buf.get_u8();
|
||||
let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
// FIXME: we do nothing with Close
|
||||
|
||||
Ok(FeMessage::Close(FeCloseMessage {}))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeMessage {
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
|
||||
if buf.len() < 5 {
|
||||
@@ -362,16 +174,10 @@ impl FeMessage {
|
||||
let mut body = buf.split_to(total_len);
|
||||
body.advance(5);
|
||||
|
||||
let mut body = body.freeze();
|
||||
|
||||
match tag {
|
||||
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body: body }))),
|
||||
b'P' => Ok(Some(FeParseMessage::parse(body)?)),
|
||||
b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
|
||||
b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
|
||||
b'B' => Ok(Some(FeBindMessage::parse(body)?)),
|
||||
b'C' => Ok(Some(FeCloseMessage::parse(body)?)),
|
||||
b'S' => Ok(Some(FeMessage::Sync)),
|
||||
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage {
|
||||
body: body.freeze(),
|
||||
}))),
|
||||
b'X' => Ok(Some(FeMessage::Terminate)),
|
||||
b'd' => {
|
||||
let smgr_tag = body.get_u8();
|
||||
@@ -410,33 +216,26 @@ impl FeMessage {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub fn thread_main(conf: &PageServerConf) {
|
||||
pub fn thread_main(conf: PageServerConf) {
|
||||
// Create a new thread pool
|
||||
//
|
||||
// FIXME: It would be nice to keep this single-threaded for debugging purposes,
|
||||
// but that currently leads to a deadlock: if a GetPage@LSN request arrives
|
||||
// for an LSN that hasn't been received yet, the thread gets stuck waiting for
|
||||
// the WAL to arrive. If the WAL receiver hasn't been launched yet, i.e
|
||||
// we haven't received a "callmemaybe" request yet to tell us where to get the
|
||||
// WAL, we will not have a thread available to process the "callmemaybe"
|
||||
// request when it does arrive. Using a thread pool alleviates the problem so
|
||||
// that it doesn't happen in the tests anymore, but in principle it could still
|
||||
// happen if we receive enough GetPage@LSN requests to consume all of the
|
||||
// available threads.
|
||||
//let runtime = runtime::Builder::new_current_thread().enable_all().build().unwrap();
|
||||
let runtime = runtime::Runtime::new().unwrap();
|
||||
// FIXME: keep it single-threaded for now, make it easier to debug with gdb,
|
||||
// and we're not concerned with performance yet.
|
||||
//let runtime = runtime::Runtime::new().unwrap();
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
info!("Starting page server on {}", conf.listen_addr);
|
||||
|
||||
let runtime_ref = Arc::new(runtime);
|
||||
|
||||
runtime_ref.clone().block_on(async {
|
||||
runtime.block_on(async {
|
||||
let listener = TcpListener::bind(conf.listen_addr).await.unwrap();
|
||||
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().await.unwrap();
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let mut conn_handler = Connection::new(conf.clone(), socket, &runtime_ref);
|
||||
let mut conn_handler = Connection::new(conf.clone(), socket);
|
||||
|
||||
task::spawn(async move {
|
||||
if let Err(err) = conn_handler.run().await {
|
||||
@@ -453,17 +252,15 @@ struct Connection {
|
||||
buffer: BytesMut,
|
||||
init_done: bool,
|
||||
conf: PageServerConf,
|
||||
runtime: Arc<Runtime>,
|
||||
}
|
||||
|
||||
impl Connection {
|
||||
pub fn new(conf: PageServerConf, socket: TcpStream, runtime: &Arc<Runtime>) -> Connection {
|
||||
pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection {
|
||||
Connection {
|
||||
stream: BufWriter::new(socket),
|
||||
buffer: BytesMut::with_capacity(10 * 1024),
|
||||
init_done: false,
|
||||
conf,
|
||||
runtime: Arc::clone(runtime),
|
||||
conf: conf,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -511,33 +308,6 @@ impl Connection {
|
||||
self.stream.write_u8(b'I').await?;
|
||||
}
|
||||
|
||||
BeMessage::ParseComplete => {
|
||||
self.stream.write_u8(b'1').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::BindComplete => {
|
||||
self.stream.write_u8(b'2').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::CloseComplete => {
|
||||
self.stream.write_u8(b'3').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::NoData => {
|
||||
self.stream.write_u8(b'n').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::ParameterDescription => {
|
||||
self.stream.write_u8(b't').await?;
|
||||
self.stream.write_i32(6).await?;
|
||||
// we don't support params, so always 0
|
||||
self.stream.write_i16(0).await?;
|
||||
}
|
||||
|
||||
BeMessage::RowDescription => {
|
||||
// XXX
|
||||
let mut b = Bytes::from("data\0");
|
||||
@@ -548,7 +318,7 @@ impl Connection {
|
||||
.await?;
|
||||
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_all(&mut b).await?;
|
||||
self.stream.write_buf(&mut b).await?;
|
||||
self.stream.write_i32(0).await?; /* table oid */
|
||||
self.stream.write_i16(0).await?; /* attnum */
|
||||
self.stream.write_i32(25).await?; /* TEXTOID */
|
||||
@@ -567,19 +337,7 @@ impl Connection {
|
||||
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_i32(b.len() as i32).await?;
|
||||
self.stream.write_all(&mut b).await?;
|
||||
}
|
||||
|
||||
BeMessage::ControlFile => {
|
||||
// TODO pass checkpoint and xid info in this message
|
||||
let mut b = Bytes::from("hello pg_control");
|
||||
|
||||
self.stream.write_u8(b'D').await?;
|
||||
self.stream.write_i32(4 + 2 + 4 + b.len() as i32).await?;
|
||||
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_i32(b.len() as i32).await?;
|
||||
self.stream.write_all(&mut b).await?;
|
||||
self.stream.write_buf(&mut b).await?;
|
||||
}
|
||||
|
||||
BeMessage::CommandComplete => {
|
||||
@@ -587,7 +345,7 @@ impl Connection {
|
||||
|
||||
self.stream.write_u8(b'C').await?;
|
||||
self.stream.write_i32(4 + b.len() as i32).await?;
|
||||
self.stream.write_all(&mut b).await?;
|
||||
self.stream.write_buf(&mut b).await?;
|
||||
}
|
||||
|
||||
BeMessage::ZenithStatusResponse(resp) => {
|
||||
@@ -614,7 +372,7 @@ impl Connection {
|
||||
self.stream.write_u8(102).await?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8).await?;
|
||||
self.stream.write_u32(resp.n_blocks).await?;
|
||||
self.stream.write_all(&mut resp.page.clone()).await?;
|
||||
self.stream.write_buf(&mut resp.page.clone()).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -627,18 +385,15 @@ impl Connection {
|
||||
}
|
||||
|
||||
async fn run(&mut self) -> Result<()> {
|
||||
let mut unnamed_query_string = Bytes::new();
|
||||
loop {
|
||||
let msg = self.read_message().await?;
|
||||
info!("got message {:?}", msg);
|
||||
match msg {
|
||||
match self.read_message().await? {
|
||||
Some(FeMessage::StartupMessage(m)) => {
|
||||
trace!("got message {:?}", m);
|
||||
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
|
||||
let mut b = Bytes::from("N");
|
||||
self.stream.write_all(&mut b).await?;
|
||||
self.stream.write_buf(&mut b).await?;
|
||||
self.stream.flush().await?;
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
@@ -651,28 +406,7 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
Some(FeMessage::Query(m)) => {
|
||||
self.process_query(m.body).await?;
|
||||
}
|
||||
Some(FeMessage::Parse(m)) => {
|
||||
unnamed_query_string = m.query_string;
|
||||
self.write_message(&BeMessage::ParseComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Describe(_)) => {
|
||||
self.write_message_noflush(&BeMessage::ParameterDescription)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::NoData).await?;
|
||||
}
|
||||
Some(FeMessage::Bind(_)) => {
|
||||
self.write_message(&BeMessage::BindComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Close(_)) => {
|
||||
self.write_message(&BeMessage::CloseComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Execute(_)) => {
|
||||
self.process_query(unnamed_query_string.clone()).await?;
|
||||
}
|
||||
Some(FeMessage::Sync) => {
|
||||
self.write_message(&BeMessage::ReadyForQuery).await?;
|
||||
self.process_query(&m).await?;
|
||||
}
|
||||
Some(FeMessage::Terminate) => {
|
||||
break;
|
||||
@@ -681,8 +415,7 @@ impl Connection {
|
||||
info!("connection closed");
|
||||
break;
|
||||
}
|
||||
x => {
|
||||
error!("unexpected message type : {:?}", x);
|
||||
_ => {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "unexpected message"));
|
||||
}
|
||||
}
|
||||
@@ -691,62 +424,87 @@ impl Connection {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn process_query(&mut self, query_string: Bytes) -> Result<()> {
|
||||
debug!("process query {:?}", query_string);
|
||||
async fn process_query(&mut self, q: &FeQueryMessage) -> Result<()> {
|
||||
trace!("got query {:?}", q.body);
|
||||
|
||||
// remove null terminator, if any
|
||||
let mut query_string = query_string.clone();
|
||||
if query_string.last() == Some(&0) {
|
||||
query_string.truncate(query_string.len() - 1);
|
||||
}
|
||||
|
||||
if query_string.starts_with(b"controlfile") {
|
||||
self.handle_controlfile().await
|
||||
} else if query_string.starts_with(b"pagestream ") {
|
||||
let (_l, r) = query_string.split_at("pagestream ".len());
|
||||
let timelineid_str = String::from_utf8(r.to_vec()).unwrap();
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
|
||||
|
||||
self.handle_pagerequests(timelineid).await
|
||||
} else if query_string.starts_with(b"basebackup ") {
|
||||
let (_l, r) = query_string.split_at("basebackup ".len());
|
||||
if q.body.starts_with(b"file") {
|
||||
let (_l, r) = q.body.split_at("file ".len());
|
||||
//TODO parse it correctly
|
||||
let r = r.to_vec();
|
||||
let timelineid_str = String::from(String::from_utf8(r).unwrap().trim_end());
|
||||
info!("got basebackup command: \"{}\"", timelineid_str);
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
|
||||
let str = String::from_utf8(r).unwrap().to_string();
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(timelineid).await?;
|
||||
let mut split = str.split(',');
|
||||
let mut s;
|
||||
|
||||
let filepath = split.next().unwrap();
|
||||
let sysid = {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u64>().unwrap()
|
||||
};
|
||||
|
||||
let buf_tag = page_cache::BufferTag {
|
||||
spcnode: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u32>().unwrap()
|
||||
},
|
||||
dbnode: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u32>().unwrap()
|
||||
},
|
||||
relnode: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u32>().unwrap()
|
||||
},
|
||||
forknum: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u8>().unwrap()
|
||||
},
|
||||
blknum: {
|
||||
s = split.next().unwrap();
|
||||
s.parse::<u32>().unwrap()
|
||||
},
|
||||
};
|
||||
|
||||
//TODO PARSE LSN
|
||||
//let lsn = { s = split.next().unwrap(); s.parse::<u64>().unwrap()};
|
||||
let lsn: u64 = 0;
|
||||
info!(
|
||||
"process file query sysid {} -- {:?} lsn {}",
|
||||
sysid, buf_tag, lsn
|
||||
);
|
||||
|
||||
self.handle_file(filepath.to_string(), sysid, buf_tag, lsn.into())
|
||||
.await
|
||||
} else if q.body.starts_with(b"pagestream ") {
|
||||
let (_l, r) = q.body.split_at("pagestream ".len());
|
||||
let mut r = r.to_vec();
|
||||
r.pop();
|
||||
let sysid = String::from_utf8(r).unwrap().trim().to_string();
|
||||
let sysid: u64 = sysid.parse().unwrap(); // XXX
|
||||
|
||||
self.handle_pagerequests(sysid).await
|
||||
} else if q.body.starts_with(b"callmemaybe ") {
|
||||
let (_l, r) = q.body.split_at("callmemaybe ".len());
|
||||
let mut r = r.to_vec();
|
||||
r.pop();
|
||||
let connstr = String::from_utf8(r).unwrap().trim().to_string();
|
||||
|
||||
let conf_copy = self.conf.clone();
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
walreceiver::thread_main(conf_copy, &connstr);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
// generic ack:
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::DataRow).await?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
} else if query_string.starts_with(b"callmemaybe ") {
|
||||
let query_str = String::from_utf8(query_string.to_vec())
|
||||
.unwrap()
|
||||
.to_string();
|
||||
|
||||
// callmemaybe <zenith timelineid as hex string> <connstr>
|
||||
let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap();
|
||||
let caps = re.captures(&query_str);
|
||||
let caps = caps.unwrap();
|
||||
|
||||
let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str().clone()).unwrap();
|
||||
let connstr: String = String::from(caps.get(2).unwrap().as_str());
|
||||
|
||||
// Check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid)));
|
||||
}
|
||||
|
||||
walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr);
|
||||
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
} else if query_string.starts_with(b"status") {
|
||||
} else if q.body.starts_with(b"status") {
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::DataRow).await?;
|
||||
@@ -763,25 +521,35 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_controlfile(&mut self) -> Result<()> {
|
||||
async fn handle_file(
|
||||
&mut self,
|
||||
filepath: String,
|
||||
sysid: u64,
|
||||
buf_tag: page_cache::BufferTag,
|
||||
lsn: u64,
|
||||
) -> Result<()> {
|
||||
let pcache = page_cache::get_pagecache(self.conf.clone(), sysid);
|
||||
|
||||
match pcache.get_page_at_lsn(buf_tag, lsn) {
|
||||
Ok(p) => {
|
||||
info!("info succeeded get_page_at_lsn: {}", lsn);
|
||||
|
||||
controlfile::write_buf_to_file(filepath, p, buf_tag.blknum);
|
||||
}
|
||||
Err(e) => {
|
||||
info!("page not found and it's ok. get_page_at_lsn: {}", e);
|
||||
}
|
||||
};
|
||||
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::ControlFile).await?;
|
||||
self.write_message_noflush(&BeMessage::DataRow).await?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
}
|
||||
|
||||
async fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
// Check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested pagestream on timeline {} which does not exist in page server", timelineid)));
|
||||
}
|
||||
let pcache = pcache.unwrap();
|
||||
|
||||
async fn handle_pagerequests(&mut self, sysid: u64) -> Result<()> {
|
||||
/* switch client to COPYBOTH */
|
||||
self.stream.write_u8(b'W').await?;
|
||||
self.stream.write_i32(4 + 1 + 2).await?;
|
||||
@@ -789,11 +557,13 @@ impl Connection {
|
||||
self.stream.write_i16(0).await?; /* numAttributes */
|
||||
self.stream.flush().await?;
|
||||
|
||||
let pcache = page_cache::get_pagecache(self.conf.clone(), sysid);
|
||||
|
||||
loop {
|
||||
let message = self.read_message().await?;
|
||||
|
||||
if let Some(m) = &message {
|
||||
info!("query({:?}): {:?}", timelineid, m);
|
||||
info!("query({}): {:?}", sysid, m);
|
||||
};
|
||||
|
||||
if message.is_none() {
|
||||
@@ -842,9 +612,10 @@ impl Connection {
|
||||
|
||||
let n_blocks = pcache.relsize_get(&tag);
|
||||
|
||||
trace!("ZenithNblocksRequest {:?} = {}", tag, n_blocks);
|
||||
self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks,
|
||||
n_blocks: n_blocks,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
@@ -858,11 +629,26 @@ impl Connection {
|
||||
};
|
||||
|
||||
let msg = match pcache.get_page_at_lsn(buf_tag, req.lsn) {
|
||||
Ok(p) => BeMessage::ZenithReadResponse(ZenithReadResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
page: p,
|
||||
}),
|
||||
Ok(p) => {
|
||||
let mut b = BytesMut::with_capacity(8192);
|
||||
|
||||
trace!("ZenithReadResponse get_page_at_lsn succeed");
|
||||
if p.len() < 8192 {
|
||||
//add padding
|
||||
trace!("ZenithReadResponse add padding");
|
||||
let padding: [u8; 8192 - 512] = [0; 8192 - 512];
|
||||
b.extend_from_slice(&p);
|
||||
b.extend_from_slice(&padding);
|
||||
} else {
|
||||
b.extend_from_slice(&p);
|
||||
}
|
||||
|
||||
BeMessage::ZenithReadResponse(ZenithReadResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
page: b.freeze(),
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
const ZERO_PAGE: [u8; 8192] = [0; 8192];
|
||||
error!("get_page_at_lsn: {}", e);
|
||||
@@ -883,8 +669,9 @@ impl Connection {
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
trace!("ZenithCreateRequest {:?}", tag);
|
||||
|
||||
pcache.relsize_inc(&tag, 0);
|
||||
pcache.relsize_inc(&tag, None);
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
@@ -900,7 +687,9 @@ impl Connection {
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
pcache.relsize_inc(&tag, req.blkno + 1);
|
||||
trace!("ZenithExtendRequest {:?} to {}", tag, req.blkno);
|
||||
|
||||
pcache.relsize_inc(&tag, Some(req.blkno));
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
@@ -912,101 +701,4 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
// check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested basebackup on timeline {} which does not exist in page server", timelineid)));
|
||||
}
|
||||
|
||||
/* switch client to COPYOUT */
|
||||
let stream = &mut self.stream;
|
||||
stream.write_u8(b'H').await?;
|
||||
stream.write_i32(4 + 1 + 2).await?;
|
||||
stream.write_u8(0).await?; /* copy_is_binary */
|
||||
stream.write_i16(0).await?; /* numAttributes */
|
||||
stream.flush().await?;
|
||||
info!("sent CopyOut");
|
||||
|
||||
/* Send a tarball of the latest snapshot on the timeline */
|
||||
|
||||
// find latest snapshot
|
||||
let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
|
||||
|
||||
// Stream it
|
||||
let (s, mut r) = mpsc::channel(5);
|
||||
|
||||
let f_tar = task::spawn_blocking(move || {
|
||||
basebackup::send_snapshot_tarball(&mut CopyDataSink(s), timelineid, snapshotlsn)?;
|
||||
Ok(())
|
||||
});
|
||||
let f_tar2 = async {
|
||||
let joinres = f_tar.await;
|
||||
|
||||
if joinres.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
joinres.unwrap_err(),
|
||||
));
|
||||
}
|
||||
return joinres.unwrap();
|
||||
};
|
||||
|
||||
let f_pump = async move {
|
||||
loop {
|
||||
let buf = r.recv().await;
|
||||
if buf.is_none() {
|
||||
break;
|
||||
}
|
||||
let mut buf = buf.unwrap();
|
||||
|
||||
// CopyData
|
||||
stream.write_u8(b'd').await?;
|
||||
stream.write_u32((4 + buf.len()) as u32).await?;
|
||||
stream.write_all(&mut buf).await?;
|
||||
trace!("CopyData sent for {} bytes!", buf.len());
|
||||
|
||||
// FIXME: flush isn't really required, but makes it easier
|
||||
// to view in wireshark
|
||||
stream.flush().await?;
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
|
||||
tokio::try_join!(f_tar2, f_pump)?;
|
||||
|
||||
// CopyDone
|
||||
self.stream.write_u8(b'c').await?;
|
||||
self.stream.write_u32(4).await?;
|
||||
self.stream.flush().await?;
|
||||
debug!("CopyDone sent!");
|
||||
|
||||
// FIXME: I'm getting an error from the tokio copyout driver without this.
|
||||
// I think it happens when the CommandComplete, CloseComplete and ReadyForQuery
|
||||
// are sent in the same TCP packet as the CopyDone. I don't understand why.
|
||||
thread::sleep(std::time::Duration::from_secs(1));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct CopyDataSink(mpsc::Sender<Bytes>);
|
||||
|
||||
impl std::io::Write for CopyDataSink {
|
||||
fn write(&mut self, data: &[u8]) -> std::result::Result<usize, std::io::Error> {
|
||||
let buf = Bytes::copy_from_slice(data);
|
||||
|
||||
if let Err(e) = self.0.blocking_send(buf) {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, e));
|
||||
}
|
||||
|
||||
Ok(data.len())
|
||||
}
|
||||
fn flush(&mut self) -> std::result::Result<(), std::io::Error> {
|
||||
// no-op
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,45 +9,3 @@ pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
|
||||
pub const PG_XACT_FORKNUM: u32 = 44;
|
||||
pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
|
||||
pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;
|
||||
|
||||
//
|
||||
// constants from clog.h
|
||||
//
|
||||
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
|
||||
pub const CLOG_BITS_PER_XACT: u8 = 2;
|
||||
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
|
||||
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
|
||||
pub const CLOG_ZEROPAGE: u8 = 0x00;
|
||||
pub const CLOG_TRUNCATE: u8 = 0x10;
|
||||
|
||||
// From xact.h
|
||||
pub const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
pub const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
/*
|
||||
* The following flags, stored in xinfo, determine which information is
|
||||
* contained in commit/abort records.
|
||||
*/
|
||||
pub const XACT_XINFO_HAS_DBINFO: u32 = 1;
|
||||
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 2;
|
||||
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 4;
|
||||
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const RM_XLOG_ID: u8 = 0;
|
||||
pub const RM_XACT_ID: u8 = 1;
|
||||
pub const RM_CLOG_ID: u8 = 3;
|
||||
// pub const RM_MULTIXACT_ID:u8 = 6;
|
||||
|
||||
// from xlogreader.h
|
||||
pub const XLR_INFO_MASK: u8 = 0x0F;
|
||||
|
||||
@@ -1,489 +0,0 @@
|
||||
//
|
||||
// Restore chunks from local Zenith repository
|
||||
//
|
||||
// This runs once at Page Server startup. It loads all the "snapshots" and all
|
||||
// WAL from all timelines from the local zenith repository into the in-memory page
|
||||
// cache.
|
||||
//
|
||||
// This also initializes the "last valid LSN" in the page cache to the last LSN
|
||||
// seen in the WAL, so that when the WAL receiver is started, it starts
|
||||
// streaming from that LSN.
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::fmt;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::error::Error;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
use std::io::Seek;
|
||||
use std::io::SeekFrom;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::page_cache::PageCache;
|
||||
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
// FIXME: we'll probably need these elsewhere too, move to some common location
|
||||
const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
//
|
||||
// Load it all into the page cache.
|
||||
//
|
||||
pub fn restore_timeline(
|
||||
conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
) -> Result<()> {
|
||||
let timelinepath = PathBuf::from("timelines").join(timeline.to_string());
|
||||
|
||||
if !timelinepath.exists() {
|
||||
anyhow::bail!("timeline {} does not exist in the page server's repository");
|
||||
}
|
||||
|
||||
// Scan .zenith/timelines/<timeline>/snapshots
|
||||
let snapshotspath = PathBuf::from("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots");
|
||||
|
||||
let mut last_snapshot_lsn: u64 = 0;
|
||||
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let direntry = direntry?;
|
||||
let filename = direntry.file_name().to_str().unwrap().to_owned();
|
||||
|
||||
let lsn = u64::from_str_radix(&filename, 16)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
|
||||
restore_snapshot(conf, pcache, timeline, &filename)?;
|
||||
info!("restored snapshot at {}", filename);
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == 0 {
|
||||
error!(
|
||||
"could not find valid snapshot in {}",
|
||||
snapshotspath.display()
|
||||
);
|
||||
// TODO return error?
|
||||
}
|
||||
pcache.init_valid_lsn(last_snapshot_lsn);
|
||||
|
||||
restore_wal(conf, pcache, timeline, last_snapshot_lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<u64> {
|
||||
let snapshotspath = format!("timelines/{}/snapshots", timeline);
|
||||
|
||||
let mut last_snapshot_lsn = 0;
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned();
|
||||
|
||||
let lsn = u64::from_str_radix(&filename, 16)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == 0 {
|
||||
error!("could not find valid snapshot in {}", &snapshotspath);
|
||||
// TODO return error?
|
||||
}
|
||||
Ok(last_snapshot_lsn)
|
||||
}
|
||||
|
||||
fn restore_snapshot(
|
||||
conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
snapshot: &str,
|
||||
) -> Result<()> {
|
||||
let snapshotpath = PathBuf::from("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots")
|
||||
.join(snapshot);
|
||||
|
||||
// Scan 'global'
|
||||
for direntry in fs::read_dir(snapshotpath.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("pg_control") => continue,
|
||||
Some("pg_filenode.map") => continue,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
conf,
|
||||
pcache,
|
||||
timeline,
|
||||
snapshot,
|
||||
GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
}
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains database dirs, the database OID is the filename.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
for direntry in fs::read_dir(snapshotpath.join("base"))? {
|
||||
let direntry = direntry?;
|
||||
|
||||
let dboid = u32::from_str_radix(direntry.file_name().to_str().unwrap(), 10)?;
|
||||
|
||||
for direntry in fs::read_dir(direntry.path())? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
// These special files appear in the snapshot, but are not needed by the page server
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => continue,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
conf,
|
||||
pcache,
|
||||
timeline,
|
||||
snapshot,
|
||||
DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_relfile(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
_timeline: ZTimelineId,
|
||||
snapshot: &str,
|
||||
spcoid: u32,
|
||||
dboid: u32,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = u64::from_str_radix(snapshot, 16)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
if p.is_err() {
|
||||
let e = p.unwrap_err();
|
||||
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
|
||||
return Err(e)?;
|
||||
}
|
||||
let (relnode, forknum, segno) = p.unwrap();
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192);
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = page_cache::BufferTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode: relnode,
|
||||
forknum: forknum as u8,
|
||||
blknum: blknum,
|
||||
};
|
||||
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode: relnode,
|
||||
forknum: forknum as u8,
|
||||
};
|
||||
pcache.relsize_inc(&tag, blknum);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Scan WAL on a timeline, starting from given LSN, and load all the records
|
||||
// into the page cache.
|
||||
fn restore_wal(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
startpoint: u64,
|
||||
) -> Result<()> {
|
||||
let walpath = format!("timelines/{}/wal", timeline);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
|
||||
|
||||
let mut segno = XLByteToSeg(startpoint, 16 * 1024 * 1024);
|
||||
let mut offset = XLogSegmentOffset(startpoint, 16 * 1024 * 1024);
|
||||
let mut last_lsn = 0;
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, 16 * 1024 * 1024);
|
||||
let mut path = walpath.clone() + "/" + &filename;
|
||||
|
||||
// It could be as .partial
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path = path + ".partial";
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let open_result = File::open(&path);
|
||||
if let Err(e) = open_result {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
break;
|
||||
}
|
||||
return Err(e)?;
|
||||
}
|
||||
let mut file = open_result.unwrap();
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
}
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != 16 * 1024 * 1024 - offset as usize {
|
||||
// Maybe allow this for .partial files?
|
||||
error!("read only {} bytes from WAL file", nread);
|
||||
}
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
loop {
|
||||
let rec = waldecoder.poll_decode();
|
||||
if rec.is_err() {
|
||||
// Assume that an error means we've reached the end of
|
||||
// a partial WAL record. So that's ok.
|
||||
break;
|
||||
}
|
||||
if let Some((lsn, recdata)) = rec.unwrap() {
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
// Put the WAL record to the page cache. We make a separate copy of
|
||||
// it for every block it modifies. (The actual WAL record is kept in
|
||||
// a Bytes, which uses a reference counter for the underlying buffer,
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn: lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset,
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_valid_lsn(lsn);
|
||||
last_lsn = lsn;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
nrecords += 1;
|
||||
}
|
||||
|
||||
info!("restored {} records from WAL file {}", nrecords, filename);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
info!(
|
||||
"reached end of WAL at {:X}/{:X}",
|
||||
last_lsn >> 32,
|
||||
last_lsn & 0xffffffff
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// FIXME: copied from xlog_utils.rs
|
||||
/// Length of a WAL segment file name: 8 hex digits each for timeline,
/// "xlog id" and segment-within-id.
pub const XLOG_FNAME_LEN: usize = 24;
/// Byte position in the WAL stream.
pub type XLogRecPtr = u64;
/// Sequence number of a WAL segment file.
pub type XLogSegNo = u64;
/// PostgreSQL timeline identifier.
pub type TimeLineID = u32;

/// Offset of `xlogptr` within its WAL segment.
/// Assumes `wal_segsz_bytes` is a power of two, as PostgreSQL requires.
#[allow(non_snake_case)]
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
    (xlogptr as u32) & (wal_segsz_bytes as u32 - 1)
}

/// Number of the WAL segment containing `xlogptr`.
#[allow(non_snake_case)]
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
    xlogptr / wal_segsz_bytes as u64
}

/// File name of WAL segment `logSegNo` on timeline `tli`,
/// e.g. "000000010000000000000001".
#[allow(non_snake_case)]
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
    format!(
        "{:>08X}{:>08X}{:>08X}",
        tli,
        logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
        logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
    )
}

/// Number of WAL segments per 4 GiB "xlog id" (the middle component of a
/// segment file name).
#[allow(non_snake_case)]
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
    (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
}

/// Parse a WAL segment file name back into (segment number, timeline).
///
/// Panics if `fname` is not a well-formed segment name; callers should
/// validate with [`IsXLogFileName`] first.
#[allow(non_snake_case)]
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
    let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
    let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
    let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
    (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
}

/// True if `fname` looks like a completed WAL segment file name
/// (exactly 24 hex digits).
#[allow(non_snake_case)]
pub fn IsXLogFileName(fname: &str) -> bool {
    fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit())
}

/// True if `fname` looks like an in-progress WAL segment
/// ("<segment name>.partial").
#[allow(non_snake_case)]
pub fn IsPartialXLogFileName(fname: &str) -> bool {
    if let Some(basefname) = fname.strip_suffix(".partial") {
        IsXLogFileName(basefname)
    } else {
        false
    }
}
|
||||
|
||||
/// Error returned when a file name in a snapshot or data directory
/// doesn't match any recognized pattern.
#[derive(Debug, Clone)]
struct FilePathError {
    // Human-readable description of why the name was rejected.
    msg: String,
}
|
||||
|
||||
impl Error for FilePathError {
    // NOTE(review): `Error::description` is deprecated in favor of Display;
    // kept for compatibility with any callers that still use it.
    fn description(&self) -> &str {
        &self.msg
    }
}
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
/// Components parsed out of a base-image file path in a snapshot.
#[derive(Debug)]
struct ParsedBaseImageFileName {
    pub spcnode: u32, // tablespace OID
    pub dbnode: u32, // database OID
    pub relnode: u32, // relation OID (relfilenode)
    pub forknum: u32, // fork number (0 = main; see forkname_to_forknum)
    pub segno: u32, // segment number within the relation

    pub lsn: u64, // LSN the base image is valid at
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(fname)
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
return Ok((relnode, forknum, segno));
|
||||
}
|
||||
@@ -22,7 +22,7 @@ use tokio::runtime;
|
||||
|
||||
use futures::future;
|
||||
|
||||
use crate::{page_cache, PageServerConf};
|
||||
use crate::{controlfile, page_cache, pg_constants, PageServerConf};
|
||||
|
||||
struct Storage {
|
||||
region: Region,
|
||||
@@ -60,8 +60,8 @@ pub fn restore_main(conf: &PageServerConf) {
|
||||
async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
let backend = Storage {
|
||||
region: Region::Custom {
|
||||
region: env::var("S3_REGION").unwrap(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap(),
|
||||
region: env::var("S3_REGION").unwrap().into(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap().into(),
|
||||
},
|
||||
credentials: Credentials::new(
|
||||
Some(&env::var("S3_ACCESSKEY").unwrap()),
|
||||
@@ -84,8 +84,24 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
.list("relationdata/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
|
||||
// TODO: get that from backup
|
||||
let sys_id: u64 = 42;
|
||||
//Before uploading other files, slurp pg_control to set systemid
|
||||
|
||||
let control_results: Vec<s3::serde_types::ListBucketResult> = bucket
|
||||
.list(
|
||||
"relationdata/global/pg_control".to_string(),
|
||||
Some("".to_string()),
|
||||
)
|
||||
.await?;
|
||||
let object = &(&control_results[0]).contents[0];
|
||||
let (data, _) = bucket.get_object(&object.key).await.unwrap();
|
||||
let bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
let c = controlfile::decode_pg_control(bytes);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), c.system_identifier);
|
||||
pcache.set_controldata(c.clone());
|
||||
trace!("uploaded controlfile {:?}", pcache.get_controldata());
|
||||
|
||||
let sys_id: u64 = c.system_identifier;
|
||||
let mut oldest_lsn = 0;
|
||||
let mut slurp_futures: Vec<_> = Vec::new();
|
||||
|
||||
@@ -119,23 +135,47 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
panic!("no base backup found");
|
||||
}
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
//Now add nonrelation files
|
||||
let nonrelresults: Vec<s3::serde_types::ListBucketResult> = bucket
|
||||
.list("nonreldata/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
for result in nonrelresults {
|
||||
for object in result.contents {
|
||||
// Download needed non relation files, slurping them into memory
|
||||
|
||||
let key = object.key;
|
||||
let relpath = key.strip_prefix("nonreldata/").unwrap();
|
||||
trace!("list nonrelfiles {}", relpath);
|
||||
|
||||
let parsed = parse_nonrel_file_path(&relpath);
|
||||
|
||||
match parsed {
|
||||
Ok(p) => {
|
||||
let b = bucket.clone();
|
||||
let f = slurp_base_file(conf, sys_id, b, key.to_string(), p);
|
||||
|
||||
slurp_futures.push(f);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("unrecognized file: {} ({})", relpath, e);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
pcache.init_valid_lsn(oldest_lsn);
|
||||
|
||||
info!("{} files to restore...", slurp_futures.len());
|
||||
|
||||
future::join_all(slurp_futures).await;
|
||||
info!("restored!");
|
||||
info!(
|
||||
"restored! {:?} to {:?}",
|
||||
pcache.first_valid_lsn, pcache.last_valid_lsn
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
// FIXME: we'll probably need these elsewhere too, move to some common location
|
||||
const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
@@ -185,6 +225,17 @@ struct ParsedBaseImageFileName {
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
fn parse_lsn_from_filename(fname: &str) -> Result<u64, FilePathError> {
|
||||
let (_, lsn_str) = fname.split_at(fname.len() - 16);
|
||||
|
||||
let (lsnhi, lsnlo) = lsn_str.split_at(8);
|
||||
let lsn_hi = u64::from_str_radix(lsnhi, 16)?;
|
||||
let lsn_lo = u64::from_str_radix(lsnlo, 16)?;
|
||||
let lsn = lsn_hi << 32 | lsn_lo;
|
||||
|
||||
return Ok(lsn);
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
@@ -223,6 +274,46 @@ fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
|
||||
return Ok((relnode, forknum, segno, lsn));
|
||||
}
|
||||
|
||||
fn parse_nonrel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
//TODO parse segno from xact filenames too
|
||||
if let Some(fname) = path.strip_prefix("pg_xact/") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_XACT_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else if let Some(fname) = path.strip_prefix("pg_multixact/offsets") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_MXACT_OFFSETS_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else if let Some(fname) = path.strip_prefix("pg_multixact/members") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_MXACT_MEMBERS_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
} else {
|
||||
return Err(FilePathError::new("invalid non relation data file name"));
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
/*
|
||||
* Relation data files can be in one of the following directories:
|
||||
@@ -242,10 +333,36 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
if fname.contains("pg_control") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_CONTROLFILE_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
if fname.contains("pg_filenode") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: GLOBALTABLESPACE_OID,
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode,
|
||||
forknum,
|
||||
@@ -265,10 +382,23 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
};
|
||||
|
||||
if fname.contains("pg_filenode") {
|
||||
let lsn = parse_lsn_from_filename(fname.clone())?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode: 0,
|
||||
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
|
||||
segno: 0,
|
||||
lsn,
|
||||
});
|
||||
}
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: DEFAULTTABLESPACE_OID,
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum,
|
||||
@@ -302,22 +432,55 @@ async fn slurp_base_file(
|
||||
|
||||
let mut bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
// pg_filenode.map has non-standard size - 512 bytes
|
||||
if parsed.forknum == pg_constants::PG_FILENODEMAP_FORKNUM {
|
||||
let b = bytes.clone();
|
||||
controlfile::decode_filemapping(b);
|
||||
while bytes.remaining() >= 512 {
|
||||
let tag = page_cache::BufferTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
blknum: 0,
|
||||
};
|
||||
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(512));
|
||||
}
|
||||
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
blknum,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
|
||||
pcache.relsize_inc(&tag, Some(0));
|
||||
} else {
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
|
||||
let reltag = page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
};
|
||||
|
||||
blknum += 1;
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
blknum: blknum,
|
||||
};
|
||||
|
||||
pcache.put_page_image(tag, parsed.lsn, bytes.copy_to_bytes(8192));
|
||||
pcache.relsize_inc(&reltag, Some(blknum));
|
||||
|
||||
blknum += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ use tui::text::{Span, Spans, Text};
|
||||
use tui::widgets::{Block, BorderType, Borders, Paragraph, Widget};
|
||||
use tui::Terminal;
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
|
||||
lazy_static! {
|
||||
@@ -91,7 +92,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
return slog_scope::set_global_logger(logger);
|
||||
}
|
||||
|
||||
pub fn ui_main() -> Result<(), Box<dyn Error>> {
|
||||
pub fn ui_main<'b>() -> Result<(), Box<dyn Error>> {
|
||||
// Terminal initialization
|
||||
let stdout = io::stdout().into_raw_mode()?;
|
||||
let stdout = MouseTerminal::from(stdout);
|
||||
@@ -187,7 +188,6 @@ pub fn ui_main() -> Result<(), Box<dyn Error>> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct LogWidget<'a> {
|
||||
logger: &'a TuiLogger,
|
||||
title: &'a str,
|
||||
@@ -229,7 +229,7 @@ impl<'a> Widget for LogWidget<'a> {
|
||||
// Render a widget to show some metrics
|
||||
struct MetricsWidget {}
|
||||
|
||||
fn get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
fn get_metric_u64<'a>(title: &'a str, value: u64) -> Spans<'a> {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
@@ -240,7 +240,7 @@ fn get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
])
|
||||
}
|
||||
|
||||
fn get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
|
||||
fn get_metric_str<'a>(title: &'a str, value: &'a str) -> Spans<'a> {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::time::Duration;
|
||||
use termion::event::Key;
|
||||
use termion::input::TermRead;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub enum Event<I> {
|
||||
Input(I),
|
||||
Tick,
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
//
|
||||
use chrono::offset::Local;
|
||||
use chrono::DateTime;
|
||||
use slog;
|
||||
use slog::{Drain, Level, OwnedKVList, Record};
|
||||
use slog_async::AsyncRecord;
|
||||
use std::collections::VecDeque;
|
||||
@@ -80,7 +81,7 @@ impl<'b> TuiLoggerWidget<'b> {
|
||||
style_trace: None,
|
||||
style_info: None,
|
||||
show_module: true,
|
||||
logger,
|
||||
logger: logger,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -167,7 +168,7 @@ impl<'b> Widget for TuiLoggerWidget<'b> {
|
||||
Level::Debug => (self.style_debug, "DEBUG", true),
|
||||
Level::Trace => (self.style_trace, "TRACE", true),
|
||||
};
|
||||
line.push(Span::styled(txt, lvl_style.unwrap_or_default()));
|
||||
line.push(Span::styled(txt, lvl_style.unwrap_or(Style::default())));
|
||||
|
||||
if self.show_module {
|
||||
line.push(Span::raw(" "));
|
||||
|
||||
@@ -1,8 +1,16 @@
|
||||
//#![allow(non_upper_case_globals)]
|
||||
//#![allow(non_camel_case_types)]
|
||||
//#![allow(non_snake_case)]
|
||||
//#![allow(dead_code)]
|
||||
//include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
use crate::pg_constants;
|
||||
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
|
||||
use log::*;
|
||||
|
||||
const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
@@ -13,7 +21,7 @@ const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogPageHeaderData {
|
||||
struct XLogPageHeaderData {
|
||||
xlp_magic: u16, /* magic value for correctness checks */
|
||||
xlp_info: u16, /* flag bits, see below */
|
||||
xlp_tli: u32, /* TimeLineID of first record on page */
|
||||
@@ -27,7 +35,7 @@ const SizeOfXLogShortPHD: usize = 2 + 2 + 4 + 8 + 4 + 4;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogLongPageHeaderData {
|
||||
struct XLogLongPageHeaderData {
|
||||
std: XLogPageHeaderData, /* standard header fields */
|
||||
xlp_sysid: u64, /* system identifier from pg_control */
|
||||
xlp_seg_size: u32, /* just as a cross-check */
|
||||
@@ -38,7 +46,6 @@ pub struct XLogLongPageHeaderData {
|
||||
#[allow(non_upper_case_globals)]
|
||||
const SizeOfXLogLongPHD: usize = (2 + 2 + 4 + 8 + 4) + 4 + 8 + 4 + 4;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: u64,
|
||||
|
||||
@@ -51,13 +58,6 @@ pub struct WalStreamDecoder {
|
||||
recordbuf: BytesMut,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, Clone)]
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
msg: String,
|
||||
lsn: u64,
|
||||
}
|
||||
|
||||
//
|
||||
// WalRecordStream is a Stream that returns a stream of WAL records
|
||||
// FIXME: This isn't a proper rust stream
|
||||
@@ -65,7 +65,7 @@ pub struct WalDecodeError {
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: u64) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
lsn: lsn,
|
||||
|
||||
startlsn: 0,
|
||||
contlen: 0,
|
||||
@@ -80,56 +80,40 @@ impl WalStreamDecoder {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
/// Attempt to decode another WAL record from the input that has been fed to the
|
||||
/// decoder so far.
|
||||
///
|
||||
/// Returns one of the following:
|
||||
/// Ok((u64, Bytes)): a tuple containing the LSN of next record, and the record itself
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(u64, Bytes)>, WalDecodeError> {
|
||||
// Returns a tuple:
|
||||
// (end LSN, record)
|
||||
pub fn poll_decode(&mut self) -> Option<(u64, Bytes)> {
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn % WAL_SEGMENT_SIZE == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < SizeOfXLogLongPHD {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
let hdr = self.decode_XLogLongPageHeaderData();
|
||||
if hdr.std.xlp_pageaddr != self.lsn {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog segment header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.decode_XLogLongPageHeaderData();
|
||||
self.lsn += SizeOfXLogLongPHD as u64;
|
||||
|
||||
// TODO: verify the fields in the header
|
||||
|
||||
continue;
|
||||
} else if self.lsn % (XLOG_BLCKSZ as u64) == 0 {
|
||||
// parse page header
|
||||
|
||||
if self.inputbuf.remaining() < SizeOfXLogShortPHD {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
let hdr = self.decode_XLogPageHeaderData();
|
||||
if hdr.xlp_pageaddr != self.lsn {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog page header".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
// TODO: verify the remaining fields in the header
|
||||
|
||||
self.decode_XLogPageHeaderData();
|
||||
self.lsn += SizeOfXLogShortPHD as u64;
|
||||
|
||||
// TODO: verify the fields in the header
|
||||
|
||||
continue;
|
||||
} else if self.padlen > 0 {
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
// skip padding
|
||||
@@ -140,17 +124,20 @@ impl WalStreamDecoder {
|
||||
// need to have at least the xl_tot_len field
|
||||
|
||||
if self.inputbuf.remaining() < 4 {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
// read xl_tot_len FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = self.inputbuf.get_u32_le();
|
||||
if xl_tot_len < SizeOfXLogRecord {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
error!(
|
||||
"invalid xl_tot_len {} at {:X}/{:X}",
|
||||
xl_tot_len,
|
||||
self.lsn >> 32,
|
||||
self.lsn & 0xffffffff
|
||||
);
|
||||
panic!();
|
||||
}
|
||||
self.lsn += 4;
|
||||
|
||||
@@ -168,7 +155,7 @@ impl WalStreamDecoder {
|
||||
let n = min(self.contlen, pageleft) as usize;
|
||||
|
||||
if self.inputbuf.remaining() < n {
|
||||
return Ok(None);
|
||||
return None;
|
||||
}
|
||||
|
||||
self.recordbuf.put(self.inputbuf.split_to(n));
|
||||
@@ -196,7 +183,7 @@ impl WalStreamDecoder {
|
||||
}
|
||||
|
||||
let result = (self.lsn, recordbuf);
|
||||
return Ok(Some(result));
|
||||
return Some(result);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -249,6 +236,7 @@ const BLCKSZ: u16 = 8192;
|
||||
//
|
||||
// Constants from xlogrecord.h
|
||||
//
|
||||
const XLR_INFO_MASK: u8 = 0x0F;
|
||||
|
||||
const XLR_MAX_BLOCK_ID: u8 = 32;
|
||||
|
||||
@@ -269,7 +257,12 @@ const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
|
||||
#[allow(dead_code)]
|
||||
//
|
||||
// constants from clog.h
|
||||
//
|
||||
const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
|
||||
|
||||
pub struct DecodedBkpBlock {
|
||||
/* Is this block ref in use? */
|
||||
//in_use: bool,
|
||||
@@ -278,8 +271,7 @@ pub struct DecodedBkpBlock {
|
||||
pub rnode_spcnode: u32,
|
||||
pub rnode_dbnode: u32,
|
||||
pub rnode_relnode: u32,
|
||||
// Note that we have a few special forknum values for non-rel files.
|
||||
pub forknum: u8,
|
||||
pub forknum: u8, // Note that we have a few special forknum values for non-rel files. Handle them too
|
||||
pub blkno: u32,
|
||||
|
||||
/* copy of the fork_flags field from the XLogRecordBlockHeader */
|
||||
@@ -297,43 +289,44 @@ pub struct DecodedBkpBlock {
|
||||
|
||||
/* Buffer holding the rmgr-specific data associated with this block */
|
||||
has_data: bool,
|
||||
//char *data;
|
||||
data_len: u16,
|
||||
}
|
||||
|
||||
impl DecodedBkpBlock {
|
||||
pub fn new() -> DecodedBkpBlock {
|
||||
DecodedBkpBlock {
|
||||
rnode_spcnode: 0,
|
||||
rnode_dbnode: 0,
|
||||
rnode_relnode: 0,
|
||||
forknum: 0,
|
||||
blkno: 0,
|
||||
|
||||
flags: 0,
|
||||
has_image: false,
|
||||
apply_image: false,
|
||||
will_init: false,
|
||||
hole_offset: 0,
|
||||
hole_length: 0,
|
||||
bimg_len: 0,
|
||||
bimg_info: 0,
|
||||
|
||||
has_data: false,
|
||||
data_len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(non_upper_case_globals)]
|
||||
const SizeOfXLogRecord: u32 = 24;
|
||||
|
||||
pub struct DecodedWALRecord {
|
||||
pub lsn: u64, // LSN at the *end* of the record
|
||||
pub record: Bytes, // raw XLogRecord
|
||||
|
||||
pub blocks: Vec<DecodedBkpBlock>,
|
||||
pub main_data_offset: usize,
|
||||
}
|
||||
|
||||
// From pg_control.h and rmgrlist.h
|
||||
const XLOG_SWITCH: u8 = 0x40;
|
||||
const RM_XLOG_ID: u8 = 0;
|
||||
|
||||
const RM_XACT_ID: u8 = 1;
|
||||
// const RM_CLOG_ID:u8 = 3;
|
||||
//const RM_MULTIXACT_ID:u8 = 6;
|
||||
|
||||
// from xact.h
|
||||
const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
// const XLOG_XACT_PREPARE: u8 = 0x10;
|
||||
// const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
|
||||
// const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
|
||||
// const XLOG_XACT_ASSIGNMENT: u8 = 0x50;
|
||||
// const XLOG_XACT_INVALIDATIONS: u8 = 0x60;
|
||||
/* free opcode 0x70 */
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
// const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
// Is this record an XLOG_SWITCH record? They need some special processing,
|
||||
// so we need to check for that before the rest of the parsing.
|
||||
//
|
||||
@@ -350,59 +343,34 @@ fn is_xlog_switch_record(rec: &Bytes) -> bool {
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
return xl_info == pg_constants::XLOG_SWITCH && xl_rmid == pg_constants::RM_XLOG_ID;
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct RelFileNode {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
return xl_info == XLOG_SWITCH && xl_rmid == RM_XLOG_ID;
|
||||
}
|
||||
|
||||
//
|
||||
// Routines to decode a WAL record and figure out which blocks are modified
|
||||
//
|
||||
// See xlogrecord.h for details
|
||||
// The overall layout of an XLOG record is:
|
||||
// Fixed-size header (XLogRecord struct)
|
||||
// XLogRecordBlockHeader struct
|
||||
// If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows
|
||||
// If BKPIMAGE_HAS_HOLE and BKPIMAGE_IS_COMPRESSED, an
|
||||
// XLogRecordBlockCompressHeader struct follows.
|
||||
// If BKPBLOCK_SAME_REL is not set, a RelFileNode follows
|
||||
// BlockNumber follows
|
||||
// XLogRecordBlockHeader struct
|
||||
// ...
|
||||
// XLogRecordDataHeader[Short|Long] struct
|
||||
// block data
|
||||
// block data
|
||||
// ...
|
||||
// main data
|
||||
pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
let mut rnode_spcnode: u32 = 0;
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
let mut got_rnode = false;
|
||||
pub fn decode_wal_record(lsn: u64, rec: Bytes) -> DecodedWALRecord {
|
||||
trace!(
|
||||
"decoding record with LSN {:08X}/{:08X} ({} bytes)",
|
||||
lsn >> 32,
|
||||
lsn & 0xffff_ffff,
|
||||
rec.remaining()
|
||||
);
|
||||
|
||||
let mut buf = rec.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
|
||||
// FIXME: assume little-endian here
|
||||
let xl_tot_len = buf.get_u32_le();
|
||||
let xl_xid = buf.get_u32_le();
|
||||
let xl_prev = buf.get_u64_le();
|
||||
let _xl_prev = buf.get_u64_le();
|
||||
let xl_info = buf.get_u8();
|
||||
let xl_rmid = buf.get_u8();
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
trace!(
|
||||
"decode_wal_record xl_rmid = {} xl_info = {}",
|
||||
xl_rmid,
|
||||
xl_info
|
||||
);
|
||||
info!("decode_wal_record xl_rmid = {}", xl_rmid);
|
||||
|
||||
let rminfo: u8 = xl_info & !XLR_INFO_MASK;
|
||||
|
||||
let remaining = xl_tot_len - SizeOfXLogRecord;
|
||||
|
||||
@@ -410,28 +378,78 @@ pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
//TODO error
|
||||
}
|
||||
|
||||
let mut rnode_spcnode: u32 = 0;
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
let mut got_rnode = false;
|
||||
|
||||
if xl_rmid == RM_XACT_ID
|
||||
&& ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT
|
||||
|| (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED)
|
||||
{
|
||||
info!("decode_wal_record RM_XACT_ID - XLOG_XACT_COMMIT");
|
||||
|
||||
let mut blocks: Vec<DecodedBkpBlock> = Vec::new();
|
||||
|
||||
let blkno = xl_xid / CLOG_XACTS_PER_PAGE;
|
||||
|
||||
let mut blk = DecodedBkpBlock {
|
||||
rnode_spcnode: 0,
|
||||
rnode_dbnode: 0,
|
||||
rnode_relnode: 0,
|
||||
forknum: pg_constants::PG_XACT_FORKNUM as u8,
|
||||
blkno: blkno,
|
||||
|
||||
flags: 0,
|
||||
has_image: false,
|
||||
apply_image: false,
|
||||
will_init: false,
|
||||
hole_offset: 0,
|
||||
hole_length: 0,
|
||||
bimg_len: 0,
|
||||
bimg_info: 0,
|
||||
|
||||
has_data: true,
|
||||
data_len: 0,
|
||||
};
|
||||
|
||||
let fork_flags = buf.get_u8();
|
||||
blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0;
|
||||
blk.data_len = buf.get_u16_le();
|
||||
|
||||
info!(
|
||||
"decode_wal_record RM_XACT_ID blk has data with data_len {}",
|
||||
blk.data_len
|
||||
);
|
||||
|
||||
blocks.push(blk);
|
||||
return DecodedWALRecord {
|
||||
lsn: lsn,
|
||||
record: rec,
|
||||
blocks: blocks,
|
||||
};
|
||||
}
|
||||
|
||||
// Decode the headers
|
||||
|
||||
let mut max_block_id = 0;
|
||||
let mut blocks_total_len: u32 = 0;
|
||||
let mut main_data_len = 0;
|
||||
let mut datatotal: u32 = 0;
|
||||
let mut blocks: Vec<DecodedBkpBlock> = Vec::new();
|
||||
|
||||
// 2. Decode the headers.
|
||||
// XLogRecordBlockHeaders if any,
|
||||
// XLogRecordDataHeader[Short|Long]
|
||||
while buf.remaining() > datatotal as usize {
|
||||
let block_id = buf.get_u8();
|
||||
|
||||
match block_id {
|
||||
XLR_BLOCK_ID_DATA_SHORT => {
|
||||
/* XLogRecordDataHeaderShort */
|
||||
main_data_len = buf.get_u8() as u32;
|
||||
let main_data_len = buf.get_u8() as u32;
|
||||
|
||||
datatotal += main_data_len;
|
||||
}
|
||||
|
||||
XLR_BLOCK_ID_DATA_LONG => {
|
||||
/* XLogRecordDataHeaderLong */
|
||||
main_data_len = buf.get_u32_le();
|
||||
/* XLogRecordDataHeaderShort */
|
||||
let main_data_len = buf.get_u32();
|
||||
|
||||
datatotal += main_data_len;
|
||||
}
|
||||
|
||||
@@ -447,7 +465,25 @@ pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
|
||||
0..=XLR_MAX_BLOCK_ID => {
|
||||
/* XLogRecordBlockHeader */
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
let mut blk = DecodedBkpBlock {
|
||||
rnode_spcnode: 0,
|
||||
rnode_dbnode: 0,
|
||||
rnode_relnode: 0,
|
||||
forknum: 0,
|
||||
blkno: 0,
|
||||
|
||||
flags: 0,
|
||||
has_image: false,
|
||||
apply_image: false,
|
||||
will_init: false,
|
||||
hole_offset: 0,
|
||||
hole_length: 0,
|
||||
bimg_len: 0,
|
||||
bimg_info: 0,
|
||||
|
||||
has_data: false,
|
||||
data_len: 0,
|
||||
};
|
||||
let fork_flags: u8;
|
||||
|
||||
if block_id <= max_block_id {
|
||||
@@ -467,12 +503,28 @@ pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
blk.has_image = (fork_flags & BKPBLOCK_HAS_IMAGE) != 0;
|
||||
blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0;
|
||||
blk.will_init = (fork_flags & BKPBLOCK_WILL_INIT) != 0;
|
||||
|
||||
blk.data_len = buf.get_u16_le();
|
||||
|
||||
/* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */
|
||||
|
||||
/* cross-check that the HAS_DATA flag is set iff data_length > 0 */
|
||||
// TODO
|
||||
/*
|
||||
if (blk->has_data && blk->data_len == 0)
|
||||
{
|
||||
report_invalid_record(state,
|
||||
"BKPBLOCK_HAS_DATA set, but no data included at %X/%X",
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err;
|
||||
}
|
||||
if (!blk->has_data && blk->data_len != 0)
|
||||
{
|
||||
report_invalid_record(state,
|
||||
"BKPBLOCK_HAS_DATA not set, but data length is %u at %X/%X",
|
||||
(unsigned int) blk->data_len,
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err;
|
||||
}
|
||||
*/
|
||||
datatotal += blk.data_len as u32;
|
||||
blocks_total_len += blk.data_len as u32;
|
||||
|
||||
if blk.has_image {
|
||||
blk.bimg_len = buf.get_u16_le();
|
||||
@@ -491,7 +543,6 @@ pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
blk.hole_length = BLCKSZ - blk.bimg_len;
|
||||
}
|
||||
datatotal += blk.bimg_len as u32;
|
||||
blocks_total_len += blk.bimg_len as u32;
|
||||
|
||||
/*
|
||||
* cross-check that hole_offset > 0, hole_length > 0 and
|
||||
@@ -567,14 +618,20 @@ pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
rnode_spcnode = buf.get_u32_le();
|
||||
rnode_dbnode = buf.get_u32_le();
|
||||
rnode_relnode = buf.get_u32_le();
|
||||
//rnode = &blk->rnode;
|
||||
got_rnode = true;
|
||||
} else if !got_rnode {
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err; */
|
||||
} else {
|
||||
if !got_rnode {
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err;
|
||||
*/
|
||||
}
|
||||
|
||||
//blk->rnode = *rnode;
|
||||
}
|
||||
|
||||
blk.rnode_spcnode = rnode_spcnode;
|
||||
@@ -582,13 +639,8 @@ pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
blk.rnode_relnode = rnode_relnode;
|
||||
|
||||
blk.blkno = buf.get_u32_le();
|
||||
trace!(
|
||||
"this record affects {}/{}/{} blk {}",
|
||||
rnode_spcnode,
|
||||
rnode_dbnode,
|
||||
rnode_relnode,
|
||||
blk.blkno
|
||||
);
|
||||
|
||||
//println!("this record affects {}/{}/{} blk {}",rnode_spcnode, rnode_dbnode, rnode_relnode, blk.blkno);
|
||||
|
||||
blocks.push(blk);
|
||||
}
|
||||
@@ -599,58 +651,21 @@ pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Decode blocks.
|
||||
// We don't need them, so just skip blocks_total_len bytes
|
||||
buf.advance(blocks_total_len as usize);
|
||||
/*
|
||||
* Ok, we've parsed the fragment headers, and verified that the total
|
||||
* length of the payload in the fragments is equal to the amount of data
|
||||
* left. Copy the data of each fragment to a separate buffer.
|
||||
*
|
||||
* We could just set up pointers into readRecordBuf, but we want to align
|
||||
* the data for the convenience of the callers. Backup images are not
|
||||
* copied, however; they don't need alignment.
|
||||
*/
|
||||
|
||||
let main_data_offset = (xl_tot_len - main_data_len) as usize;
|
||||
// Since we don't care about the data payloads here, we're done.
|
||||
|
||||
// 4. Decode main_data
|
||||
if main_data_len > 0 {
|
||||
assert_eq!(buf.remaining(), main_data_len as usize);
|
||||
}
|
||||
|
||||
//5. Handle special CLOG and XACT records
|
||||
if xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = buf.get_i32_le() as u32;
|
||||
trace!("RM_CLOG_ID updates block {}", blk.blkno);
|
||||
blocks.push(blk);
|
||||
} else if xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT xl_prev {:X}/{:X} xid {} updates block {}",
|
||||
(xl_prev >> 32),
|
||||
xl_prev & 0xffffffff,
|
||||
xl_xid,
|
||||
blk.blkno
|
||||
);
|
||||
blocks.push(blk);
|
||||
//TODO parse commit record to extract subtrans entries
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_ABORT xl_prev {:X}/{:X} xid {} updates block {}",
|
||||
(xl_prev >> 32),
|
||||
xl_prev & 0xffffffff,
|
||||
xl_xid,
|
||||
blk.blkno
|
||||
);
|
||||
blocks.push(blk);
|
||||
//TODO parse abort record to extract subtrans entries
|
||||
}
|
||||
}
|
||||
|
||||
DecodedWALRecord {
|
||||
return DecodedWALRecord {
|
||||
lsn: lsn,
|
||||
record: rec,
|
||||
blocks,
|
||||
main_data_offset: main_data_offset,
|
||||
}
|
||||
blocks: blocks,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1,96 +1,29 @@
|
||||
//!
|
||||
//! WAL receiver
|
||||
//!
|
||||
//! The WAL receiver connects to the WAL safekeeper service, and streams WAL.
|
||||
//! For each WAL record, it decodes the record to figure out which data blocks
|
||||
//! the record affects, and adds the records to the page cache.
|
||||
//!
|
||||
//
|
||||
// WAL receiver
|
||||
//
|
||||
// The WAL receiver connects to the WAL safekeeper service, and streams WAL.
|
||||
// For each WAL record, it decodes the record to figure out which data blocks
|
||||
// the record affects, and adds the records to the page cache.
|
||||
//
|
||||
use log::*;
|
||||
|
||||
use tokio::runtime;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tokio_stream::StreamExt;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
|
||||
use crate::waldecoder::WalStreamDecoder;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Error;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use tokio::runtime;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
|
||||
use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use tokio_stream::StreamExt;
|
||||
|
||||
//
|
||||
// We keep one WAL Receiver active per timeline.
|
||||
//
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref WAL_RECEIVERS: Mutex<HashMap<ZTimelineId, WalReceiverEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
|
||||
match receivers.get_mut(&timelineid) {
|
||||
Some(receiver) => {
|
||||
receiver.wal_producer_connstr = wal_producer_connstr.into();
|
||||
}
|
||||
None => {
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
let conf_copy = conf.clone();
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
thread_main(&conf_copy, timelineid);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Look up current WAL producer connection string in the hash table
|
||||
fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
|
||||
let receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
|
||||
receivers
|
||||
.get(&timelineid)
|
||||
.unwrap()
|
||||
.wal_producer_connstr
|
||||
.clone()
|
||||
}
|
||||
use tokio_postgres::{connect_replication, Error, NoTls, ReplicationMode};
|
||||
|
||||
//
|
||||
// This is the entry point for the WAL receiver thread.
|
||||
//
|
||||
fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
info!(
|
||||
"WAL receiver thread started for timeline : '{}'",
|
||||
timelineid
|
||||
);
|
||||
pub fn thread_main(conf: PageServerConf, wal_producer_connstr: &String) {
|
||||
info!("WAL receiver thread started: '{}'", wal_producer_connstr);
|
||||
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
@@ -99,32 +32,31 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
|
||||
runtime.block_on(async {
|
||||
loop {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
let _res = walreceiver_main(conf.clone(), wal_producer_connstr).await;
|
||||
|
||||
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await;
|
||||
|
||||
if let Err(e) = res {
|
||||
info!(
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
// TODO: print/log the error
|
||||
info!(
|
||||
"WAL streaming connection failed, retrying in 1 second...: {:?}",
|
||||
_res
|
||||
);
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
async fn walreceiver_main(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
conf: PageServerConf,
|
||||
wal_producer_connstr: &String,
|
||||
) -> Result<(), Error> {
|
||||
// Connect to the database in replication mode.
|
||||
info!("connecting to {:?}", wal_producer_connstr);
|
||||
let connect_cfg = format!("{} replication=true", wal_producer_connstr);
|
||||
let (rclient, connection) = tokio_postgres::connect(&connect_cfg, NoTls).await?;
|
||||
info!("connected!");
|
||||
debug!("connecting to {}...", wal_producer_connstr);
|
||||
let (mut rclient, connection) = connect_replication(
|
||||
wal_producer_connstr.as_str(),
|
||||
NoTls,
|
||||
ReplicationMode::Physical,
|
||||
)
|
||||
.await?;
|
||||
debug!("connected!");
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
@@ -134,29 +66,28 @@ async fn walreceiver_main(
|
||||
}
|
||||
});
|
||||
|
||||
let identify = identify_system(&rclient).await?;
|
||||
info!("{:?}", identify);
|
||||
let end_of_wal = u64::from(identify.xlogpos);
|
||||
let identify_system = rclient.identify_system().await?;
|
||||
let end_of_wal = u64::from(identify_system.xlogpos());
|
||||
let mut caught_up = false;
|
||||
|
||||
let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
|
||||
let sysid: u64 = identify_system.systemid().parse().unwrap();
|
||||
let pcache = page_cache::get_pagecache(conf, sysid);
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
//
|
||||
let mut startpoint = pcache.get_last_valid_lsn();
|
||||
let last_valid_lsn = pcache.get_last_valid_lsn();
|
||||
if startpoint == 0 {
|
||||
// If we start here with identify.xlogpos we will have race condition with
|
||||
// If we start here with identify_system.xlogpos() we will have race condition with
|
||||
// postgres start: insert into postgres may request page that was modified with lsn
|
||||
// smaller than identify.xlogpos.
|
||||
// smaller than identify_system.xlogpos().
|
||||
//
|
||||
// Current procedure for starting postgres will anyway be changed to something
|
||||
// different like having 'initdb' method on a pageserver (or importing some shared
|
||||
// empty database snapshot), so for now I just put start of first segment which
|
||||
// seems to be a valid record.
|
||||
pcache.init_valid_lsn(0x_1_000_000_u64);
|
||||
startpoint = 0x_1_000_000_u64;
|
||||
startpoint = u64::from(0x_1_000_000_u64);
|
||||
} else {
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
@@ -168,23 +99,16 @@ async fn walreceiver_main(
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"last_valid_lsn {:X}/{:X} starting replication from {:X}/{:X} for timeline {}, server is at {:X}/{:X}...",
|
||||
(last_valid_lsn >> 32),
|
||||
(last_valid_lsn & 0xffffffff),
|
||||
"starting replication from {:X}/{:X}, server is at {:X}/{:X}...",
|
||||
(startpoint >> 32),
|
||||
(startpoint & 0xffffffff),
|
||||
timelineid,
|
||||
(end_of_wal >> 32),
|
||||
(end_of_wal & 0xffffffff)
|
||||
);
|
||||
|
||||
let startpoint = PgLsn::from(startpoint);
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
let copy_stream = rclient.copy_both_simple::<bytes::Bytes>(&query).await?;
|
||||
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
tokio::pin!(physical_stream);
|
||||
|
||||
let startpoint = tokio_postgres::types::Lsn::from(startpoint);
|
||||
let mut physical_stream = rclient
|
||||
.start_physical_replication(None, startpoint, None)
|
||||
.await?;
|
||||
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
|
||||
|
||||
while let Some(replication_message) = physical_stream.next().await {
|
||||
@@ -196,13 +120,6 @@ async fn walreceiver_main(
|
||||
let startlsn = xlog_data.wal_start();
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
|
||||
write_wal_file(
|
||||
startlsn,
|
||||
timelineid,
|
||||
16 * 1024 * 1024, // FIXME
|
||||
data,
|
||||
)?;
|
||||
|
||||
trace!(
|
||||
"received XLogData between {:X}/{:X} and {:X}/{:X}",
|
||||
(startlsn >> 32),
|
||||
@@ -214,8 +131,10 @@ async fn walreceiver_main(
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
loop {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode() {
|
||||
let decoded =
|
||||
crate::waldecoder::decode_wal_record(startlsn, recdata.clone());
|
||||
|
||||
// Put the WAL record to the page cache. We make a separate copy of
|
||||
// it for every block it modifies. (The actual WAL record is kept in
|
||||
// a Bytes, which uses a reference counter for the underlying buffer,
|
||||
@@ -230,17 +149,17 @@ async fn walreceiver_main(
|
||||
};
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
lsn: lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset,
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_record_lsn(lsn);
|
||||
pcache.advance_last_valid_lsn(lsn);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@@ -264,230 +183,12 @@ async fn walreceiver_main(
|
||||
}
|
||||
}
|
||||
|
||||
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
|
||||
let wal_end = keepalive.wal_end();
|
||||
let timestamp = keepalive.timestamp();
|
||||
let reply_requested: bool = keepalive.reply() != 0;
|
||||
|
||||
trace!(
|
||||
"received PrimaryKeepAlive(wal_end: {}, timestamp: {} reply: {})",
|
||||
wal_end,
|
||||
timestamp,
|
||||
reply_requested,
|
||||
);
|
||||
if reply_requested {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(pcache.get_last_valid_lsn());
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::INVALID;
|
||||
let ts = PgTimestamp::now()?;
|
||||
const NO_REPLY: u8 = 0u8;
|
||||
|
||||
physical_stream
|
||||
.as_mut()
|
||||
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)
|
||||
.await?;
|
||||
}
|
||||
ReplicationMessage::PrimaryKeepAlive(_keepalive) => {
|
||||
trace!("received PrimaryKeepAlive");
|
||||
// FIXME: Reply, or the connection will time out
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
///
|
||||
/// [postgres docs]: https://www.postgresql.org/docs/current/protocol-replication.html
|
||||
#[derive(Debug)]
|
||||
pub struct IdentifySystem {
|
||||
systemid: u64,
|
||||
timeline: u32,
|
||||
xlogpos: PgLsn,
|
||||
dbname: Option<String>,
|
||||
}
|
||||
|
||||
/// There was a problem parsing the response to
|
||||
/// a postgres IDENTIFY_SYSTEM command.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
#[error("IDENTIFY_SYSTEM parse error")]
|
||||
pub struct IdentifyError;
|
||||
|
||||
/// Run the postgres `IDENTIFY_SYSTEM` command
|
||||
pub async fn identify_system(client: &tokio_postgres::Client) -> Result<IdentifySystem, Error> {
|
||||
let query_str = "IDENTIFY_SYSTEM";
|
||||
let response = client.simple_query(query_str).await?;
|
||||
|
||||
// get(N) from row, then parse it as some destination type.
|
||||
fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
|
||||
where
|
||||
T: FromStr,
|
||||
{
|
||||
let val = row.get(idx).ok_or(IdentifyError)?;
|
||||
val.parse::<T>().or(Err(IdentifyError))
|
||||
}
|
||||
|
||||
// extract the row contents into an IdentifySystem struct.
|
||||
// written as a closure so I can use ? for Option here.
|
||||
if let Some(SimpleQueryMessage::Row(first_row)) = response.get(0) {
|
||||
Ok(IdentifySystem {
|
||||
systemid: get_parse(first_row, 0)?,
|
||||
timeline: get_parse(first_row, 1)?,
|
||||
xlogpos: get_parse(first_row, 2)?,
|
||||
dbname: get_parse(first_row, 3).ok(),
|
||||
})
|
||||
} else {
|
||||
Err(IdentifyError)?
|
||||
}
|
||||
}
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLOG_PAGE_MAGIC: u16 = 0xD109;
|
||||
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
|
||||
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
|
||||
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = u64;
|
||||
pub type XLogSegNo = u64;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
|
||||
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
return xlogptr / wal_segsz_bytes as u64;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegNoOffsetToRecPtr(
|
||||
segno: XLogSegNo,
|
||||
offset: u32,
|
||||
wal_segsz_bytes: usize,
|
||||
) -> XLogRecPtr {
|
||||
return segno * (wal_segsz_bytes as u64) + (offset as u64);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
|
||||
return format!(
|
||||
"{:>08X}{:>08X}{:>08X}",
|
||||
tli,
|
||||
logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
|
||||
logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
|
||||
);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
|
||||
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
|
||||
}
|
||||
|
||||
fn write_wal_file(
|
||||
startpos: XLogRecPtr,
|
||||
timeline: ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
buf: &[u8],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
|
||||
/*
|
||||
* If crossing a WAL boundary, only write up until we reach wal
|
||||
* segment size.
|
||||
*/
|
||||
if xlogoff + bytes_left > wal_seg_size {
|
||||
bytes_to_write = wal_seg_size - xlogoff;
|
||||
} else {
|
||||
bytes_to_write = bytes_left;
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = XLByteToSeg(start_pos, wal_seg_size);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
wal_seg_size,
|
||||
);
|
||||
let wal_file_path = wal_dir.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = wal_dir.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
let mut wal_file: File;
|
||||
/* Try to open already completed segment */
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
|
||||
wal_file = file;
|
||||
partial = false;
|
||||
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
|
||||
/* Try to open existed partial file */
|
||||
wal_file = file;
|
||||
partial = true;
|
||||
} else {
|
||||
/* Create and fill new partial file */
|
||||
partial = true;
|
||||
match OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(&ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
|
||||
|
||||
// FIXME: Flush the file
|
||||
//wal_file.sync_all()?;
|
||||
}
|
||||
/* Write was successful, advance our position */
|
||||
bytes_written += bytes_to_write;
|
||||
bytes_left -= bytes_to_write;
|
||||
start_pos += bytes_to_write as u64;
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if XLogSegmentOffset(start_pos, wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -19,31 +19,30 @@ use std::assert;
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::io::Error;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use std::{path::PathBuf, process::Stdio};
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{Child, ChildStdin, ChildStdout, Command};
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::time::timeout;
|
||||
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::CacheEntry;
|
||||
use crate::page_cache::WALRecord;
|
||||
use crate::ZTimelineId;
|
||||
use crate::{page_cache::BufferTag, pg_constants, PageServerConf};
|
||||
use crate::{page_cache::BufferTag, PageServerConf};
|
||||
|
||||
static TIMEOUT: Duration = Duration::from_secs(20);
|
||||
|
||||
//
|
||||
// Main entry point for the WAL applicator thread.
|
||||
//
|
||||
pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
info!("WAL redo thread started {}", timelineid);
|
||||
pub fn wal_redo_main(conf: PageServerConf, sys_id: u64) {
|
||||
info!("WAL redo thread started {}", sys_id);
|
||||
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
@@ -53,15 +52,15 @@ pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, timelineid).unwrap();
|
||||
let pcache = page_cache::get_pagecache(conf.clone(), sys_id);
|
||||
|
||||
// Loop forever, handling requests as they come.
|
||||
let walredo_channel_receiver = &pcache.walredo_receiver;
|
||||
loop {
|
||||
let mut process: WalRedoProcess;
|
||||
let datadir = format!("wal-redo/{}", timelineid);
|
||||
let datadir = conf.data_dir.join(format!("wal-redo/{}", sys_id));
|
||||
|
||||
info!("launching WAL redo postgres process {}", timelineid);
|
||||
info!("launching WAL redo postgres process {}", sys_id);
|
||||
{
|
||||
let _guard = runtime.enter();
|
||||
process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
|
||||
@@ -89,59 +88,6 @@ pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
}
|
||||
}
|
||||
|
||||
fn transaction_id_set_status_bit(
|
||||
xl_info: u8,
|
||||
xl_rmid: u8,
|
||||
xl_xid: u32,
|
||||
record: WALRecord,
|
||||
page: &mut BytesMut,
|
||||
) {
|
||||
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let mut status = 0;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
} else {
|
||||
trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
return;
|
||||
}
|
||||
|
||||
trace!("handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort) lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
|
||||
let byteno: usize = ((xl_rmid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let byteptr = &mut page[byteno..byteno + 1];
|
||||
let bshift: u8 = ((xl_xid % pg_constants::CLOG_XACTS_PER_BYTE)
|
||||
* pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
let mut curval = byteptr[0];
|
||||
curval = (curval >> bshift) & pg_constants::CLOG_XACT_BITMASK;
|
||||
|
||||
let mut byteval = [0];
|
||||
byteval[0] = curval;
|
||||
byteval[0] &= !(((1 << pg_constants::CLOG_BITS_PER_XACT as u8) - 1) << bshift);
|
||||
byteval[0] |= status << bshift;
|
||||
|
||||
byteptr.copy_from_slice(&byteval);
|
||||
trace!(
|
||||
"xl_xid {} byteno {} curval {} byteval {}",
|
||||
xl_xid,
|
||||
byteno,
|
||||
curval,
|
||||
byteval[0]
|
||||
);
|
||||
}
|
||||
|
||||
fn handle_apply_request(
|
||||
pcache: &page_cache::PageCache,
|
||||
process: &WalRedoProcess,
|
||||
@@ -158,46 +104,7 @@ fn handle_apply_request(
|
||||
let nrecords = records.len();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if tag.forknum == pg_constants::PG_XACT_FORKNUM as u8 {
|
||||
//TODO use base image if any
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let zero_page_bytes: &[u8] = &ZERO_PAGE;
|
||||
let mut page = BytesMut::from(zero_page_bytes);
|
||||
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let _xl_tot_len = buf.get_u32_le();
|
||||
let xl_xid = buf.get_u32_le();
|
||||
let _xl_prev = buf.get_u64_le();
|
||||
let xl_info = buf.get_u8();
|
||||
let xl_rmid = buf.get_u8();
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
if xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
page.clone_from_slice(zero_page_bytes);
|
||||
trace!("handle_apply_request for RM_CLOG_ID-CLOG_ZEROPAGE lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
}
|
||||
} else if xl_rmid == pg_constants::RM_XACT_ID {
|
||||
transaction_id_set_status_bit(xl_info, xl_rmid, xl_xid, record, &mut page);
|
||||
}
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
} else {
|
||||
apply_result = process.apply_wal_records(runtime, tag, base_img, records);
|
||||
}
|
||||
|
||||
let apply_result = process.apply_wal_records(runtime, tag, base_img, records);
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result;
|
||||
@@ -240,13 +147,13 @@ impl WalRedoProcess {
|
||||
// Tests who run pageserver binary are setting proper PG_BIN_DIR
|
||||
// and PG_LIB_DIR so that WalRedo would start right postgres. We may later
|
||||
// switch to setting same things in pageserver config file.
|
||||
fn launch(datadir: &str, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
|
||||
fn launch(datadir: &PathBuf, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres deleting old one.
|
||||
fs::remove_dir_all(datadir).ok();
|
||||
fs::remove_dir_all(datadir.to_str().unwrap()).ok();
|
||||
let initdb = runtime
|
||||
.block_on(
|
||||
Command::new("initdb")
|
||||
.args(&["-D", datadir])
|
||||
.args(&["-D", datadir.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.output(),
|
||||
)
|
||||
@@ -266,11 +173,14 @@ impl WalRedoProcess {
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env("PGDATA", datadir)
|
||||
.env("PGDATA", datadir.to_str().unwrap())
|
||||
.spawn()
|
||||
.expect("postgres --wal-redo command failed to start");
|
||||
|
||||
info!("launched WAL redo postgres process on {}", datadir);
|
||||
info!(
|
||||
"launched WAL redo postgres process on {}",
|
||||
datadir.to_str().unwrap()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().expect("failed to open child's stdin");
|
||||
let stderr = child.stderr.take().expect("failed to open child's stderr");
|
||||
@@ -298,7 +208,7 @@ impl WalRedoProcess {
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(WalRedoProcess {
|
||||
child,
|
||||
child: child,
|
||||
stdin: RefCell::new(stdin),
|
||||
stdout: RefCell::new(stdout),
|
||||
})
|
||||
|
||||
@@ -10,10 +10,6 @@
|
||||
#
|
||||
# 2) installs postgres to REPO_ROOT/tmp_install/
|
||||
#
|
||||
|
||||
# Halt immediately if any command fails
|
||||
set -e
|
||||
|
||||
REPO_ROOT=$(dirname "$0")
|
||||
REPO_ROOT="`( cd \"$REPO_ROOT\" && pwd )`"
|
||||
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
[package]
|
||||
name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
authors = ["Heikki Linnakangas <heikki@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.53.1"
|
||||
@@ -1,42 +0,0 @@
|
||||
extern crate bindgen;
|
||||
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn main() {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=pg_control_ffi.h");
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
// The input header we would like to generate
|
||||
// bindings for.
|
||||
.header("pg_control_ffi.h")
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
|
||||
.whitelist_type("ControlFileData")
|
||||
.whitelist_var("PG_CONTROL_FILE_SIZE")
|
||||
.whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
|
||||
.whitelist_type("DBState")
|
||||
// Path the server include dir. It is in tmp_install/include/server, if you did
|
||||
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
|
||||
// and used DESTDIR to move it into tmp_install, then it's in
|
||||
// tmp_install/include/postgres/server (that's how the pgbuild.sh script does it).
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
.clang_arg("-I../tmp_install/include/server")
|
||||
.clang_arg("-I../tmp_install/include/postgresql/server")
|
||||
// Finish the builder and generate the bindings.
|
||||
.generate()
|
||||
// Unwrap the Result and panic on failure.
|
||||
.expect("Unable to generate bindings");
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
|
||||
bindings
|
||||
.write_to_file(out_path.join("bindings.rs"))
|
||||
.expect("Couldn't write bindings!");
|
||||
}
|
||||
@@ -1,4 +0,0 @@
|
||||
#include "c.h"
|
||||
#include "catalog/pg_control.h"
|
||||
|
||||
const uint32 PG_CONTROLFILEDATA_OFFSETOF_CRC = offsetof(ControlFileData, crc);
|
||||
@@ -1,67 +0,0 @@
|
||||
#![allow(non_upper_case_globals)]
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
|
||||
// sizeof(ControlFileData)
|
||||
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
|
||||
const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize;
|
||||
|
||||
impl ControlFileData {
|
||||
// Initialize an all-zeros ControlFileData struct
|
||||
pub fn new() -> ControlFileData {
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
let b = [0u8; SIZEOF_CONTROLDATA];
|
||||
controlfile =
|
||||
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
return controlfile;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_pg_control(buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
|
||||
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
|
||||
buf.clone().copy_to_slice(&mut b);
|
||||
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
// TODO: verify CRC
|
||||
let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
|
||||
data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
let expectedcrc = crc32c::crc32c(&data_without_crc);
|
||||
|
||||
controlfile = unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
if expectedcrc != controlfile.crc {
|
||||
anyhow::bail!(
|
||||
"invalid CRC in control file: expected {:08X}, was {:08X}",
|
||||
expectedcrc,
|
||||
controlfile.crc
|
||||
);
|
||||
}
|
||||
|
||||
Ok(controlfile)
|
||||
}
|
||||
|
||||
pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
|
||||
let b: [u8; SIZEOF_CONTROLDATA];
|
||||
|
||||
b = unsafe { std::mem::transmute::<ControlFileData, [u8; SIZEOF_CONTROLDATA]>(controlfile) };
|
||||
|
||||
// Recompute the CRC
|
||||
let mut data_without_crc: [u8; OFFSETOF_CRC] = [0u8; OFFSETOF_CRC];
|
||||
data_without_crc.copy_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
let newcrc = crc32c::crc32c(&data_without_crc);
|
||||
|
||||
let mut buf = BytesMut::with_capacity(PG_CONTROL_FILE_SIZE as usize);
|
||||
|
||||
buf.extend_from_slice(&b[0..OFFSETOF_CRC]);
|
||||
buf.extend_from_slice(&newcrc.to_ne_bytes());
|
||||
// Fill the rest of the control file with zeros.
|
||||
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
|
||||
|
||||
return buf.into();
|
||||
}
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: 67da6b1df6...a71b5c24eb
@@ -7,10 +7,14 @@ edition = "2018"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
crossbeam-channel = "0.5.0"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3.13"
|
||||
lazy_static = "1.4.0"
|
||||
slog-stdlog = "4.1.0"
|
||||
slog-async = "2.6.0"
|
||||
@@ -19,14 +23,16 @@ slog-term = "2.8.0"
|
||||
slog = "2.7.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
termion = "1.5.6"
|
||||
tui = "0.14.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres-protocol = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
postgres = { git = "https://github.com/kelvich/rust-postgres", branch = "replication_rebase" }
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
|
||||
@@ -9,15 +9,17 @@ use std::path::PathBuf;
|
||||
use std::thread;
|
||||
use std::{fs::File, fs::OpenOptions};
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
|
||||
use slog;
|
||||
use slog::Drain;
|
||||
use slog_scope;
|
||||
use slog_stdlog;
|
||||
|
||||
use walkeeper::wal_service;
|
||||
use walkeeper::WalAcceptorConf;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
fn main() -> Result<(), io::Error> {
|
||||
let arg_matches = App::new("Zenith wal_acceptor")
|
||||
.about("Store WAL stream to local file system and push it to WAL receivers")
|
||||
.arg(
|
||||
@@ -27,13 +29,6 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Path to the WAL acceptor data directory"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("systemid")
|
||||
.long("systemid")
|
||||
.takes_value(true)
|
||||
.required(true)
|
||||
.help("PostgreSQL system id, from pg_control"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen")
|
||||
.short("l")
|
||||
@@ -64,23 +59,16 @@ fn main() -> Result<()> {
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let systemid_str = arg_matches.value_of("systemid").unwrap();
|
||||
let systemid: u64 = systemid_str.parse()?;
|
||||
|
||||
let mut conf = WalAcceptorConf {
|
||||
data_dir: PathBuf::from("./"),
|
||||
systemid: systemid,
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
pageserver_addr: None,
|
||||
listen_addr: "127.0.0.1:5454".parse()?,
|
||||
listen_addr: "127.0.0.1:5454".parse().unwrap(),
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
conf.data_dir = PathBuf::from(dir);
|
||||
|
||||
// change into the data directory.
|
||||
std::env::set_current_dir(&conf.data_dir)?;
|
||||
}
|
||||
|
||||
if arg_matches.is_present("no-sync") {
|
||||
@@ -102,9 +90,9 @@ fn main() -> Result<()> {
|
||||
start_wal_acceptor(conf)
|
||||
}
|
||||
|
||||
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<(), io::Error> {
|
||||
// Initialize logger
|
||||
let _scope_guard = init_logging(&conf)?;
|
||||
let _scope_guard = init_logging(&conf);
|
||||
let _log_guard = slog_stdlog::init().unwrap();
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
@@ -113,20 +101,20 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
info!("daemonizing...");
|
||||
|
||||
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
// that we will see any accidental manual fpritf's or backtraces.
|
||||
let stdout = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open("wal_acceptor.log")
|
||||
.open(conf.data_dir.join("wal_acceptor.log"))
|
||||
.unwrap();
|
||||
let stderr = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open("wal_acceptor.log")
|
||||
.open(conf.data_dir.join("wal_acceptor.log"))
|
||||
.unwrap();
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file("wal_acceptor.pid")
|
||||
.pid_file(conf.data_dir.join("wal_acceptor.pid"))
|
||||
.working_directory(Path::new("."))
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
@@ -153,24 +141,20 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(conf: &WalAcceptorConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
fn init_logging(conf: &WalAcceptorConf) -> slog_scope::GlobalLoggerGuard {
|
||||
if conf.daemonize {
|
||||
let log = conf.data_dir.join("wal_acceptor.log");
|
||||
let log_file = File::create(&log).map_err(|err| {
|
||||
// We failed to initialize logging, so we can't log this message with error!
|
||||
eprintln!("Could not create log file {:?}: {}", log, err);
|
||||
err
|
||||
})?;
|
||||
let log_file = File::create(log).unwrap_or_else(|_| panic!("Could not create log file"));
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
slog_scope::set_global_logger(logger)
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build().fuse();
|
||||
let drain = slog_async::Async::new(drain).chan_size(1000).build().fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
return slog_scope::set_global_logger(logger);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,12 +6,10 @@ mod pq_protocol;
|
||||
pub mod wal_service;
|
||||
pub mod xlog_utils;
|
||||
|
||||
use crate::pq_protocol::SystemId;
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WalAcceptorConf {
|
||||
pub data_dir: PathBuf,
|
||||
pub systemid: SystemId,
|
||||
pub daemonize: bool,
|
||||
pub no_sync: bool,
|
||||
pub listen_addr: SocketAddr,
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use pageserver::ZTimelineId;
|
||||
use std::io;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type SystemId = u64;
|
||||
@@ -39,7 +37,7 @@ pub enum BeMessage<'a> {
|
||||
pub struct FeStartupMessage {
|
||||
pub version: u32,
|
||||
pub kind: StartupRequestCode,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub system_id: SystemId,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -85,33 +83,26 @@ impl FeStartupMessage {
|
||||
let params_str = str::from_utf8(¶ms_bytes).unwrap();
|
||||
let params = params_str.split('\0');
|
||||
let mut options = false;
|
||||
let mut timelineid: Option<ZTimelineId> = None;
|
||||
let mut system_id: u64 = 0;
|
||||
for p in params {
|
||||
if p == "options" {
|
||||
options = true;
|
||||
} else if options {
|
||||
for opt in p.split(' ') {
|
||||
if opt.starts_with("ztimelineid=") {
|
||||
// FIXME: rethrow parsing error, don't unwrap
|
||||
timelineid = Some(ZTimelineId::from_str(&opt[12..]).unwrap());
|
||||
if opt.starts_with("system.id=") {
|
||||
system_id = opt[10..].parse::<u64>().unwrap();
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
if timelineid.is_none() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"timelineid is required",
|
||||
));
|
||||
}
|
||||
|
||||
buf.advance(len as usize);
|
||||
Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
|
||||
version,
|
||||
kind,
|
||||
timelineid: timelineid.unwrap(),
|
||||
system_id,
|
||||
})))
|
||||
}
|
||||
}
|
||||
@@ -155,20 +146,20 @@ impl<'a> BeMessage<'a> {
|
||||
|
||||
BeMessage::RowDescription(rows) => {
|
||||
buf.put_u8(b'T');
|
||||
|
||||
let mut body = BytesMut::new();
|
||||
body.put_i16(rows.len() as i16); // # of fields
|
||||
let total_len: u32 = rows
|
||||
.iter()
|
||||
.fold(0, |acc, row| acc + row.name.len() as u32 + 3 * (4 + 2));
|
||||
buf.put_u32(4 + 2 + total_len);
|
||||
for row in rows.iter() {
|
||||
body.put_slice(row.name);
|
||||
body.put_i32(0); /* table oid */
|
||||
body.put_i16(0); /* attnum */
|
||||
body.put_u32(row.typoid);
|
||||
body.put_i16(row.typlen);
|
||||
body.put_i32(-1); /* typmod */
|
||||
body.put_i16(0); /* format code */
|
||||
buf.put_i16(row.name.len() as i16);
|
||||
buf.put_slice(row.name);
|
||||
buf.put_i32(0); /* table oid */
|
||||
buf.put_i16(0); /* attnum */
|
||||
buf.put_u32(row.typoid);
|
||||
buf.put_i16(row.typlen);
|
||||
buf.put_i32(-1); /* typmod */
|
||||
buf.put_i16(0); /* format code */
|
||||
}
|
||||
buf.put_i32((4 + body.len()) as i32); // # of bytes, including len field itself
|
||||
buf.put(body);
|
||||
}
|
||||
|
||||
BeMessage::DataRow(vals) => {
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
// receive WAL from wal_proposer and send it to WAL receivers
|
||||
//
|
||||
|
||||
extern crate fs2;
|
||||
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use fs2::FileExt;
|
||||
@@ -31,7 +33,6 @@ use tokio_postgres::{connect, Error, NoTls};
|
||||
use crate::pq_protocol::*;
|
||||
use crate::xlog_utils::*;
|
||||
use crate::WalAcceptorConf;
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
type FullTransactionId = u64;
|
||||
|
||||
@@ -63,8 +64,7 @@ struct ServerInfo {
|
||||
protocol_version: u32, /* proxy-safekeeper protocol version */
|
||||
pg_version: u32, /* Postgres server version */
|
||||
node_id: NodeId,
|
||||
system_id: SystemId,
|
||||
timeline_id: ZTimelineId, /* Zenith timelineid */
|
||||
system_id: SystemId, /* Postgres system identifier */
|
||||
wal_end: XLogRecPtr,
|
||||
timeline: TimeLineID,
|
||||
wal_seg_size: u32,
|
||||
@@ -146,8 +146,8 @@ struct SharedState {
|
||||
* Database instance (tenant)
|
||||
*/
|
||||
#[derive(Debug)]
|
||||
pub struct Timeline {
|
||||
timelineid: ZTimelineId,
|
||||
pub struct System {
|
||||
id: SystemId,
|
||||
mutex: Mutex<SharedState>,
|
||||
cond: Notify, /* conditional variable used to notify wal senders */
|
||||
}
|
||||
@@ -157,7 +157,7 @@ pub struct Timeline {
|
||||
*/
|
||||
#[derive(Debug)]
|
||||
struct Connection {
|
||||
timeline: Option<Arc<Timeline>>,
|
||||
system: Option<Arc<System>>,
|
||||
stream: TcpStream, /* Postgres connection */
|
||||
inbuf: BytesMut, /* input buffer */
|
||||
outbuf: BytesMut, /* output buffer */
|
||||
@@ -211,7 +211,6 @@ impl Serializer for ServerInfo {
|
||||
buf.put_u32_le(self.pg_version);
|
||||
self.node_id.pack(buf);
|
||||
buf.put_u64_le(self.system_id);
|
||||
buf.put_slice(&self.timeline_id.as_arr());
|
||||
buf.put_u64_le(self.wal_end);
|
||||
buf.put_u32_le(self.timeline);
|
||||
buf.put_u32_le(self.wal_seg_size);
|
||||
@@ -222,7 +221,6 @@ impl Serializer for ServerInfo {
|
||||
pg_version: buf.get_u32_le(),
|
||||
node_id: NodeId::unpack(buf),
|
||||
system_id: buf.get_u64_le(),
|
||||
timeline_id: ZTimelineId::get_from_buf(buf),
|
||||
wal_end: buf.get_u64_le(),
|
||||
timeline: buf.get_u32_le(),
|
||||
wal_seg_size: buf.get_u32_le(),
|
||||
@@ -280,7 +278,6 @@ impl SafeKeeperInfo {
|
||||
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
|
||||
node_id: NodeId { term: 0, uuid: 0 },
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
timeline_id: ZTimelineId::from([0u8; 16]),
|
||||
wal_end: 0,
|
||||
timeline: 0,
|
||||
wal_seg_size: 0,
|
||||
@@ -352,8 +349,7 @@ impl Serializer for SafeKeeperResponse {
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
pub static ref TIMELINES: Mutex<HashMap<ZTimelineId, Arc<Timeline>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
pub static ref SYSTEMS: Mutex<HashMap<SystemId, Arc<System>>> = Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
pub fn thread_main(conf: WalAcceptorConf) {
|
||||
@@ -370,7 +366,7 @@ pub fn thread_main(conf: WalAcceptorConf) {
|
||||
info!("Starting wal acceptor on {}", conf.listen_addr);
|
||||
|
||||
runtime.block_on(async {
|
||||
main_loop(&conf).await.unwrap();
|
||||
let _unused = main_loop(&conf).await;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -393,8 +389,8 @@ async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
pub fn new(timelineid: ZTimelineId) -> Timeline {
|
||||
impl System {
|
||||
pub fn new(id: SystemId) -> System {
|
||||
let shared_state = SharedState {
|
||||
commit_lsn: 0,
|
||||
info: SafeKeeperInfo::new(),
|
||||
@@ -405,8 +401,8 @@ impl Timeline {
|
||||
catalog_xmin: u64::MAX,
|
||||
},
|
||||
};
|
||||
Timeline {
|
||||
timelineid,
|
||||
System {
|
||||
id: id,
|
||||
mutex: Mutex::new(shared_state),
|
||||
cond: Notify::new(),
|
||||
}
|
||||
@@ -447,23 +443,12 @@ impl Timeline {
|
||||
return shared_state.hs_feedback;
|
||||
}
|
||||
|
||||
// Load and lock control file (prevent running more than one instance of safekeeper)
|
||||
fn load_control_file(&self, conf: &WalAcceptorConf) -> Result<()> {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
|
||||
if shared_state.control_file.is_some() {
|
||||
info!(
|
||||
"control file for timeline {} is already open",
|
||||
self.timelineid
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Load and lock control file (prevent running more than one instance of safekeeper
|
||||
fn load_control_file(&self, conf: &WalAcceptorConf) {
|
||||
let control_file_path = conf
|
||||
.data_dir
|
||||
.join(self.timelineid.to_string())
|
||||
.join(self.id.to_string())
|
||||
.join(CONTROL_FILE_NAME);
|
||||
info!("loading control file {}", control_file_path.display());
|
||||
match OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
@@ -475,13 +460,13 @@ impl Timeline {
|
||||
match file.try_lock_exclusive() {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
io_error!(
|
||||
panic!(
|
||||
"Control file {:?} is locked by some other process: {}",
|
||||
&control_file_path,
|
||||
e
|
||||
&control_file_path, e
|
||||
);
|
||||
}
|
||||
}
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.control_file = Some(file);
|
||||
|
||||
const SIZE: usize = mem::size_of::<SafeKeeperInfo>();
|
||||
@@ -498,13 +483,12 @@ impl Timeline {
|
||||
let my_info = SafeKeeperInfo::unpack(&mut input);
|
||||
|
||||
if my_info.magic != SK_MAGIC {
|
||||
io_error!("Invalid control file magic: {}", my_info.magic);
|
||||
panic!("Invalid control file magic: {}", my_info.magic);
|
||||
}
|
||||
if my_info.format_version != SK_FORMAT_VERSION {
|
||||
io_error!(
|
||||
panic!(
|
||||
"Incompatible format version: {} vs. {}",
|
||||
my_info.format_version,
|
||||
SK_FORMAT_VERSION
|
||||
my_info.format_version, SK_FORMAT_VERSION
|
||||
);
|
||||
}
|
||||
shared_state.info = my_info;
|
||||
@@ -517,7 +501,6 @@ impl Timeline {
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_control_file(&self, sync: bool) -> Result<()> {
|
||||
@@ -538,7 +521,7 @@ impl Timeline {
|
||||
impl Connection {
|
||||
pub fn new(socket: TcpStream, conf: &WalAcceptorConf) -> Connection {
|
||||
Connection {
|
||||
timeline: None,
|
||||
system: None,
|
||||
stream: socket,
|
||||
inbuf: BytesMut::with_capacity(10 * 1024),
|
||||
outbuf: BytesMut::with_capacity(10 * 1024),
|
||||
@@ -547,8 +530,8 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
|
||||
fn timeline(&self) -> Arc<Timeline> {
|
||||
self.timeline.as_ref().unwrap().clone()
|
||||
fn system(&self) -> Arc<System> {
|
||||
self.system.as_ref().unwrap().clone()
|
||||
}
|
||||
|
||||
async fn run(&mut self) -> Result<()> {
|
||||
@@ -580,15 +563,10 @@ impl Connection {
|
||||
"no_user",
|
||||
);
|
||||
let callme = format!(
|
||||
"callmemaybe {} host={} port={} options='-c ztimelineid={}'",
|
||||
self.timeline().timelineid,
|
||||
"callmemaybe host={} port={} replication=1 options='-c system.id={}'",
|
||||
self.conf.listen_addr.ip(),
|
||||
self.conf.listen_addr.port(),
|
||||
self.timeline().timelineid
|
||||
);
|
||||
info!(
|
||||
"requesting page server to connect to us: start {} {}",
|
||||
ps_connstr, callme
|
||||
self.system().get_info().server.system_id,
|
||||
);
|
||||
let (client, connection) = connect(&ps_connstr, NoTls).await?;
|
||||
|
||||
@@ -604,14 +582,22 @@ impl Connection {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_timeline(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
let mut timelines = TIMELINES.lock().unwrap();
|
||||
if !timelines.contains_key(&timelineid) {
|
||||
info!("creating timeline dir {}", timelineid);
|
||||
fs::create_dir_all(timelineid.to_string())?;
|
||||
timelines.insert(timelineid, Arc::new(Timeline::new(timelineid)));
|
||||
fn set_system(&mut self, id: SystemId) -> Result<()> {
|
||||
let mut systems = SYSTEMS.lock().unwrap();
|
||||
if id == 0 {
|
||||
// non-multitenant configuration: just a single instance
|
||||
if let Some(system) = systems.values().next() {
|
||||
self.system = Some(system.clone());
|
||||
return Ok(());
|
||||
}
|
||||
io_error!("No active instances");
|
||||
}
|
||||
self.timeline = Some(timelines.get(&timelineid).unwrap().clone());
|
||||
if !systems.contains_key(&id) {
|
||||
let system_dir = self.conf.data_dir.join(id.to_string());
|
||||
fs::create_dir_all(system_dir)?;
|
||||
systems.insert(id, Arc::new(System::new(id)));
|
||||
}
|
||||
self.system = Some(systems.get(&id).unwrap().clone());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -620,16 +606,14 @@ impl Connection {
|
||||
// Receive information about server
|
||||
let server_info = self.read_req::<ServerInfo>().await?;
|
||||
info!(
|
||||
"Start handshake with wal_proposer {} sysid {} timeline {}",
|
||||
"Start handshake with wal_proposer {} sysid {}",
|
||||
self.stream.peer_addr()?,
|
||||
server_info.system_id,
|
||||
server_info.timeline_id,
|
||||
server_info.system_id
|
||||
);
|
||||
// FIXME: also check that the system identifier matches
|
||||
self.set_timeline(server_info.timeline_id)?;
|
||||
self.timeline().load_control_file(&self.conf)?;
|
||||
self.set_system(server_info.system_id)?;
|
||||
self.system().load_control_file(&self.conf);
|
||||
|
||||
let mut my_info = self.timeline().get_info();
|
||||
let mut my_info = self.system().get_info();
|
||||
|
||||
/* Check protocol compatibility */
|
||||
if server_info.protocol_version != SK_PROTOCOL_VERSION {
|
||||
@@ -678,9 +662,9 @@ impl Connection {
|
||||
);
|
||||
}
|
||||
my_info.server.node_id = prop.node_id;
|
||||
self.timeline().set_info(&my_info);
|
||||
self.system().set_info(&my_info);
|
||||
/* Need to persist our vote first */
|
||||
self.timeline().save_control_file(true)?;
|
||||
self.system().save_control_file(true)?;
|
||||
|
||||
let mut flushed_restart_lsn: XLogRecPtr = 0;
|
||||
let wal_seg_size = server_info.wal_seg_size as usize;
|
||||
@@ -694,13 +678,12 @@ impl Connection {
|
||||
// Add far as replication in postgres is initiated by receiver, we should use callme mechanism
|
||||
if let Err(e) = self.request_callback().await {
|
||||
// Do not treate it as fatal error and continue work
|
||||
// FIXME: we should retry after a while...
|
||||
error!("Failed to send callme request to pageserver: {}", e);
|
||||
}
|
||||
|
||||
info!(
|
||||
"Start streaming from timeline {} address {:?}",
|
||||
server_info.timeline_id,
|
||||
"Start streaming from server {} address {:?}",
|
||||
server_info.system_id,
|
||||
self.stream.peer_addr()?
|
||||
);
|
||||
|
||||
@@ -722,15 +705,6 @@ impl Connection {
|
||||
let rec_size = (end_pos - start_pos) as usize;
|
||||
assert!(rec_size <= MAX_SEND_SIZE);
|
||||
|
||||
debug!(
|
||||
"received for {} bytes between {:X}/{:X} and {:X}/{:X}",
|
||||
rec_size,
|
||||
start_pos >> 32,
|
||||
start_pos & 0xffffffff,
|
||||
end_pos >> 32,
|
||||
end_pos & 0xffffffff
|
||||
);
|
||||
|
||||
/* Receive message body */
|
||||
self.inbuf.resize(rec_size, 0u8);
|
||||
self.stream.read_exact(&mut self.inbuf[0..rec_size]).await?;
|
||||
@@ -761,7 +735,7 @@ impl Connection {
|
||||
* when restart_lsn delta exceeds WAL segment size.
|
||||
*/
|
||||
sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn;
|
||||
self.timeline().save_control_file(sync_control_file)?;
|
||||
self.system().save_control_file(sync_control_file)?;
|
||||
|
||||
if sync_control_file {
|
||||
flushed_restart_lsn = my_info.restart_lsn;
|
||||
@@ -772,7 +746,7 @@ impl Connection {
|
||||
let resp = SafeKeeperResponse {
|
||||
epoch: my_info.epoch,
|
||||
flush_lsn: end_pos,
|
||||
hs_feedback: self.timeline().get_hs_feedback(),
|
||||
hs_feedback: self.system().get_hs_feedback(),
|
||||
};
|
||||
self.start_sending();
|
||||
resp.pack(&mut self.outbuf);
|
||||
@@ -782,7 +756,7 @@ impl Connection {
|
||||
* Ping wal sender that new data is available.
|
||||
* FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper.
|
||||
*/
|
||||
self.timeline()
|
||||
self.system()
|
||||
.notify_wal_senders(min(req.commit_lsn, end_pos));
|
||||
}
|
||||
Ok(())
|
||||
@@ -833,7 +807,7 @@ impl Connection {
|
||||
}
|
||||
|
||||
//
|
||||
// Send WAL to replica or WAL receiver using standard libpq replication protocol
|
||||
// Send WAL to replica or WAL sender using standard libpq replication protocol
|
||||
//
|
||||
async fn send_wal(&mut self) -> Result<()> {
|
||||
info!("WAL sender to {:?} is started", self.stream.peer_addr()?);
|
||||
@@ -854,7 +828,7 @@ impl Connection {
|
||||
BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery);
|
||||
self.send().await?;
|
||||
self.init_done = true;
|
||||
self.set_timeline(m.timelineid)?;
|
||||
self.set_system(m.system_id)?;
|
||||
}
|
||||
StartupRequestCode::Cancel => return Ok(()),
|
||||
}
|
||||
@@ -887,7 +861,7 @@ impl Connection {
|
||||
let (start_pos, timeline) = self.find_end_of_wal(false);
|
||||
let lsn = format!("{:X}/{:>08X}", (start_pos >> 32) as u32, start_pos as u32);
|
||||
let tli = timeline.to_string();
|
||||
let sysid = self.timeline().get_info().server.system_id.to_string();
|
||||
let sysid = self.system().get_info().server.system_id.to_string();
|
||||
let lsn_bytes = lsn.as_bytes();
|
||||
let tli_bytes = tli.as_bytes();
|
||||
let sysid_bytes = sysid.as_bytes();
|
||||
@@ -919,11 +893,11 @@ impl Connection {
|
||||
);
|
||||
BeMessage::write(
|
||||
&mut self.outbuf,
|
||||
&BeMessage::DataRow(&[Some(sysid_bytes), Some(tli_bytes), Some(lsn_bytes), None]),
|
||||
&BeMessage::DataRow(&[Some(lsn_bytes), Some(tli_bytes), Some(sysid_bytes), None]),
|
||||
);
|
||||
BeMessage::write(
|
||||
&mut self.outbuf,
|
||||
&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM\0"),
|
||||
&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM"),
|
||||
);
|
||||
BeMessage::write(&mut self.outbuf, &BeMessage::ReadyForQuery);
|
||||
self.send().await?;
|
||||
@@ -943,7 +917,7 @@ impl Connection {
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let wal_seg_size = self.timeline().get_info().server.wal_seg_size as usize;
|
||||
let wal_seg_size = self.system().get_info().server.wal_seg_size as usize;
|
||||
if wal_seg_size == 0 {
|
||||
io_error!("Can not start replication before connecting to wal_proposer");
|
||||
}
|
||||
@@ -961,6 +935,15 @@ impl Connection {
|
||||
BeMessage::write(&mut self.outbuf, &BeMessage::Copy);
|
||||
self.send().await?;
|
||||
|
||||
/*
|
||||
* Always start streaming at the beginning of a segment
|
||||
*
|
||||
* FIXME: It is common practice to start streaming at the beginning of
|
||||
* the segment, but it should be up to the client to decide that. We
|
||||
* shouldn't enforce that here.
|
||||
*/
|
||||
start_pos -= XLogSegmentOffset(start_pos, wal_seg_size) as u64;
|
||||
|
||||
let mut end_pos: XLogRecPtr;
|
||||
let mut commit_lsn: XLogRecPtr;
|
||||
let mut wal_file: Option<File> = None;
|
||||
@@ -977,18 +960,19 @@ impl Connection {
|
||||
end_pos = stop_pos;
|
||||
} else {
|
||||
/* normal mode */
|
||||
let timeline = self.timeline();
|
||||
loop {
|
||||
// Rust doesn't allow to grab async result from mutex scope
|
||||
let system = self.system();
|
||||
let notified = system.cond.notified();
|
||||
{
|
||||
let shared_state = timeline.mutex.lock().unwrap();
|
||||
let shared_state = system.mutex.lock().unwrap();
|
||||
commit_lsn = shared_state.commit_lsn;
|
||||
if start_pos < commit_lsn {
|
||||
end_pos = commit_lsn;
|
||||
break;
|
||||
}
|
||||
}
|
||||
timeline.cond.notified().await;
|
||||
notified.await;
|
||||
}
|
||||
}
|
||||
if end_pos == END_REPLICATION_MARKER {
|
||||
@@ -999,13 +983,13 @@ impl Connection {
|
||||
Ok(0) => break,
|
||||
Ok(_) => match self.parse_message()? {
|
||||
Some(FeMessage::CopyData(m)) => self
|
||||
.timeline()
|
||||
.system()
|
||||
.add_hs_feedback(HotStandbyFeedback::parse(&m.body)),
|
||||
_ => {}
|
||||
},
|
||||
Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1020,7 +1004,7 @@ impl Connection {
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline().timelineid.to_string())
|
||||
.join(self.system().id.to_string())
|
||||
.join(wal_file_name.clone() + ".partial");
|
||||
if let Ok(opened_file) = File::open(&wal_file_path) {
|
||||
file = opened_file;
|
||||
@@ -1028,30 +1012,21 @@ impl Connection {
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline().timelineid.to_string())
|
||||
.join(self.system().id.to_string())
|
||||
.join(wal_file_name);
|
||||
match File::open(&wal_file_path) {
|
||||
Ok(opened_file) => file = opened_file,
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
|
||||
|
||||
// How much to read and send in message? We cannot cross the WAL file
|
||||
// boundary, and we don't want send more than MAX_SEND_SIZE.
|
||||
let send_size = (end_pos - start_pos) as usize;
|
||||
let send_size = min(send_size, wal_seg_size - xlogoff);
|
||||
let send_size = min(send_size, MAX_SEND_SIZE);
|
||||
|
||||
let send_size = min((end_pos - start_pos) as usize, MAX_SEND_SIZE);
|
||||
let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size;
|
||||
let data_start = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE;
|
||||
let data_end = data_start + send_size;
|
||||
|
||||
file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
file.read_exact(&mut self.outbuf[data_start..data_end])?;
|
||||
self.outbuf[0] = b'd';
|
||||
BigEndian::write_u32(
|
||||
@@ -1066,12 +1041,6 @@ impl Connection {
|
||||
self.stream.write_all(&self.outbuf[0..msg_size]).await?;
|
||||
start_pos += send_size as u64;
|
||||
|
||||
debug!(
|
||||
"Sent WAL to page server up to {:X}/{:>08X}",
|
||||
(end_pos >> 32) as u32,
|
||||
end_pos as u32
|
||||
);
|
||||
|
||||
if XLogSegmentOffset(start_pos, wal_seg_size) != 0 {
|
||||
wal_file = Some(file);
|
||||
}
|
||||
@@ -1126,12 +1095,12 @@ impl Connection {
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline().timelineid.to_string())
|
||||
.join(self.system().id.to_string())
|
||||
.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline().timelineid.to_string())
|
||||
.join(self.system().id.to_string())
|
||||
.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
@@ -1161,7 +1130,7 @@ impl Connection {
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1194,7 +1163,7 @@ impl Connection {
|
||||
fn find_end_of_wal(&self, precise: bool) -> (XLogRecPtr, TimeLineID) {
|
||||
find_end_of_wal(
|
||||
&self.conf.data_dir,
|
||||
self.timeline().get_info().server.wal_seg_size as usize,
|
||||
self.system().get_info().server.wal_seg_size as usize,
|
||||
precise,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use log::*;
|
||||
use std::cmp::min;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::PathBuf;
|
||||
use std::time::SystemTime;
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
@@ -89,7 +89,7 @@ pub fn get_current_timestamp() -> TimestampTz {
|
||||
}
|
||||
|
||||
fn find_end_of_wal_segment(
|
||||
data_dir: &Path,
|
||||
data_dir: &PathBuf,
|
||||
segno: XLogSegNo,
|
||||
tli: TimeLineID,
|
||||
wal_seg_size: usize,
|
||||
@@ -185,7 +185,7 @@ fn find_end_of_wal_segment(
|
||||
}
|
||||
|
||||
pub fn find_end_of_wal(
|
||||
data_dir: &Path,
|
||||
data_dir: &PathBuf,
|
||||
wal_seg_size: usize,
|
||||
precise: bool,
|
||||
) -> (XLogRecPtr, TimeLineID) {
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
[package]
|
||||
name = "zenith"
|
||||
version = "0.1.0"
|
||||
authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
clap = "2.33.0"
|
||||
anyhow = "1.0"
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
control_plane = { path = "../control_plane" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
@@ -1,336 +0,0 @@
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{anyhow, bail};
|
||||
use clap::{App, Arg, ArgMatches, SubCommand};
|
||||
|
||||
use control_plane::local_env::LocalEnv;
|
||||
use control_plane::storage::PageServerNode;
|
||||
use control_plane::{compute::ComputeControlPlane, local_env, storage};
|
||||
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
fn zenith_repo_dir() -> PathBuf {
|
||||
// Find repository path
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
// Main entry point for the 'zenith' CLI utility
|
||||
//
|
||||
// This utility can used to work with a local zenith repository.
|
||||
// In order to run queries in it, you need to launch the page server,
|
||||
// and a compute node against the page server
|
||||
fn main() -> Result<()> {
|
||||
let name_arg = Arg::with_name("NAME")
|
||||
.short("n")
|
||||
.index(1)
|
||||
.help("name of this postgres instance")
|
||||
.required(true);
|
||||
let matches = App::new("zenith")
|
||||
.about("Zenith CLI")
|
||||
.subcommand(
|
||||
SubCommand::with_name("init")
|
||||
.about("Initialize a new Zenith repository in current directory"),
|
||||
)
|
||||
.subcommand(
|
||||
SubCommand::with_name("branch")
|
||||
.about("Create a new branch")
|
||||
.arg(Arg::with_name("branchname").required(false).index(1))
|
||||
.arg(Arg::with_name("start-point").required(false).index(2)),
|
||||
)
|
||||
.subcommand(
|
||||
SubCommand::with_name("pageserver")
|
||||
.about("Manage pageserver instance")
|
||||
.subcommand(SubCommand::with_name("status"))
|
||||
.subcommand(SubCommand::with_name("start"))
|
||||
.subcommand(SubCommand::with_name("stop")),
|
||||
)
|
||||
.subcommand(
|
||||
SubCommand::with_name("pg")
|
||||
.about("Manage postgres instances")
|
||||
.subcommand(
|
||||
SubCommand::with_name("create")
|
||||
// .arg(name_arg.clone()
|
||||
// .required(false)
|
||||
// .help("name of this postgres instance (will be pgN if omitted)"))
|
||||
.arg(Arg::with_name("timeline").required(false).index(1)),
|
||||
)
|
||||
.subcommand(SubCommand::with_name("list"))
|
||||
.subcommand(SubCommand::with_name("start").arg(name_arg.clone()))
|
||||
.subcommand(SubCommand::with_name("stop").arg(name_arg.clone()))
|
||||
.subcommand(SubCommand::with_name("destroy").arg(name_arg.clone())),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
// handle init separately and exit
|
||||
if let ("init", Some(sub_args)) = matches.subcommand() {
|
||||
run_init_cmd(sub_args.clone())?;
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// all other commands would need config
|
||||
|
||||
let repopath = PathBuf::from(zenith_repo_dir());
|
||||
if !repopath.exists() {
|
||||
bail!(
|
||||
"Zenith repository does not exists in {}.\n\
|
||||
Set ZENITH_REPO_DIR or initialize a new repository with 'zenith init'",
|
||||
repopath.display()
|
||||
);
|
||||
}
|
||||
// TODO: check that it looks like a zenith repository
|
||||
let env = match local_env::load_config(&repopath) {
|
||||
Ok(conf) => conf,
|
||||
Err(e) => {
|
||||
eprintln!("Error loading config from {}: {}", repopath.display(), e);
|
||||
exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
match matches.subcommand() {
|
||||
("init", Some(_)) => {
|
||||
panic!() /* Should not happen. Init was handled before */
|
||||
}
|
||||
|
||||
("branch", Some(sub_args)) => run_branch_cmd(&env, sub_args.clone())?,
|
||||
("pageserver", Some(sub_args)) => run_pageserver_cmd(&env, sub_args.clone())?,
|
||||
|
||||
("start", Some(_sub_m)) => {
|
||||
let pageserver = storage::PageServerNode::from_env(&env);
|
||||
|
||||
if let Err(e) = pageserver.start() {
|
||||
eprintln!("pageserver start: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
("stop", Some(_sub_m)) => {
|
||||
let pageserver = storage::PageServerNode::from_env(&env);
|
||||
if let Err(e) = pageserver.stop() {
|
||||
eprintln!("pageserver stop: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
("status", Some(_sub_m)) => {}
|
||||
|
||||
("pg", Some(pg_match)) => {
|
||||
if let Err(e) = handle_pg(pg_match, &env) {
|
||||
eprintln!("pg operation failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn run_pageserver_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> {
|
||||
match args.subcommand() {
|
||||
("status", Some(_sub_m)) => {
|
||||
todo!();
|
||||
}
|
||||
("start", Some(_sub_m)) => {
|
||||
let psnode = PageServerNode::from_env(local_env);
|
||||
psnode.start()?;
|
||||
println!("Page server started");
|
||||
}
|
||||
("stop", Some(_sub_m)) => {
|
||||
todo!();
|
||||
}
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Peek into the repository, to grab the timeline ID of given branch
|
||||
pub fn get_branch_timeline(repopath: &Path, branchname: &str) -> ZTimelineId {
|
||||
let branchpath = repopath.join("refs/branches/".to_owned() + branchname);
|
||||
|
||||
ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
|
||||
}
|
||||
|
||||
fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
|
||||
match pg_match.subcommand() {
|
||||
("create", Some(sub_m)) => {
|
||||
// FIXME: cheat and resolve the timeline by peeking into the
|
||||
// repository. In reality, when you're launching a compute node
|
||||
// against a possibly-remote page server, we wouldn't know what
|
||||
// branches exist in the remote repository. Or would we require
|
||||
// that you "zenith fetch" them into a local repoitory first?
|
||||
let timeline_arg = sub_m.value_of("timeline").unwrap_or("main");
|
||||
let timeline = get_branch_timeline(&env.repo_path, timeline_arg);
|
||||
|
||||
println!("Initializing Postgres on timeline {}...", timeline);
|
||||
|
||||
cplane.new_node(timeline)?;
|
||||
}
|
||||
("list", Some(_sub_m)) => {
|
||||
println!("NODE\tADDRESS\tSTATUS");
|
||||
for (node_name, node) in cplane.nodes.iter() {
|
||||
println!("{}\t{}\t{}", node_name, node.address, node.status());
|
||||
}
|
||||
}
|
||||
("start", Some(sub_m)) => {
|
||||
let name = sub_m.value_of("NAME").unwrap();
|
||||
let node = cplane
|
||||
.nodes
|
||||
.get(name)
|
||||
.ok_or(anyhow!("postgres {} is not found", name))?;
|
||||
node.start()?;
|
||||
}
|
||||
("stop", Some(sub_m)) => {
|
||||
let name = sub_m.value_of("NAME").unwrap();
|
||||
let node = cplane
|
||||
.nodes
|
||||
.get(name)
|
||||
.ok_or(anyhow!("postgres {} is not found", name))?;
|
||||
node.stop()?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// "zenith init" - Initialize a new Zenith repository in current dir
|
||||
fn run_init_cmd(_args: ArgMatches) -> Result<()> {
|
||||
local_env::init()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// handle "zenith branch" subcommand
|
||||
fn run_branch_cmd(local_env: &LocalEnv, args: ArgMatches) -> Result<()> {
|
||||
let repopath = local_env.repo_path.to_str().unwrap();
|
||||
|
||||
if let Some(branchname) = args.value_of("branchname") {
|
||||
if PathBuf::from(format!("{}/refs/branches/{}", repopath, branchname)).exists() {
|
||||
anyhow::bail!("branch {} already exists", branchname);
|
||||
}
|
||||
|
||||
if let Some(startpoint_str) = args.value_of("start-point") {
|
||||
let mut startpoint = parse_point_in_time(startpoint_str)?;
|
||||
|
||||
if startpoint.lsn == 0 {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = local_env::find_end_of_wal(local_env, startpoint.timelineid)?;
|
||||
|
||||
println!(
|
||||
"branching at end of WAL: {:X}/{:X}",
|
||||
end_of_wal >> 32,
|
||||
end_of_wal & 0xffffffff
|
||||
);
|
||||
|
||||
startpoint.lsn = end_of_wal;
|
||||
}
|
||||
|
||||
return local_env::create_branch(local_env, branchname, startpoint);
|
||||
} else {
|
||||
panic!("Missing start-point");
|
||||
}
|
||||
} else {
|
||||
// No arguments, list branches
|
||||
list_branches()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn list_branches() -> Result<()> {
|
||||
// list branches
|
||||
let paths = fs::read_dir(zenith_repo_dir().join("refs").join("branches"))?;
|
||||
|
||||
for path in paths {
|
||||
println!(" {}", path?.file_name().to_str().unwrap());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// Parse user-given string that represents a point-in-time.
|
||||
//
|
||||
// We support multiple variants:
|
||||
//
|
||||
// Raw timeline id in hex, meaning the end of that timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d
|
||||
//
|
||||
// A specific LSN on a timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
|
||||
//
|
||||
// Same, with a human-friendly branch name:
|
||||
// main
|
||||
// main@2/15D3DD8
|
||||
//
|
||||
// Human-friendly tag name:
|
||||
// mytag
|
||||
//
|
||||
//
|
||||
fn parse_point_in_time(s: &str) -> Result<local_env::PointInTime> {
|
||||
let mut strings = s.split("@");
|
||||
let name = strings.next().unwrap();
|
||||
|
||||
let lsn: Option<u64>;
|
||||
if let Some(lsnstr) = strings.next() {
|
||||
let mut s = lsnstr.split("/");
|
||||
let lsn_hi: u64 = s
|
||||
.next()
|
||||
.ok_or(anyhow!("invalid LSN in point-in-time specification"))?
|
||||
.parse()?;
|
||||
let lsn_lo: u64 = s
|
||||
.next()
|
||||
.ok_or(anyhow!("invalid LSN in point-in-time specification"))?
|
||||
.parse()?;
|
||||
lsn = Some(lsn_hi << 32 | lsn_lo);
|
||||
} else {
|
||||
lsn = None
|
||||
}
|
||||
|
||||
// Check if it's a tag
|
||||
if lsn.is_none() {
|
||||
let tagpath = zenith_repo_dir().join("refs").join("tags").join(name);
|
||||
if tagpath.exists() {
|
||||
let pointstr = fs::read_to_string(tagpath)?;
|
||||
|
||||
return parse_point_in_time(&pointstr);
|
||||
}
|
||||
}
|
||||
// Check if it's a branch
|
||||
// Check if it's branch @ LSN
|
||||
let branchpath = zenith_repo_dir().join("refs").join("branches").join(name);
|
||||
if branchpath.exists() {
|
||||
let pointstr = fs::read_to_string(branchpath)?;
|
||||
|
||||
let mut result = parse_point_in_time(&pointstr)?;
|
||||
if lsn.is_some() {
|
||||
result.lsn = lsn.unwrap();
|
||||
} else {
|
||||
result.lsn = 0;
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
// Check if it's a timelineid
|
||||
// Check if it's timelineid @ LSN
|
||||
let tlipath = zenith_repo_dir().join("timelines").join(name);
|
||||
if tlipath.exists() {
|
||||
let result = local_env::PointInTime {
|
||||
timelineid: ZTimelineId::from_str(name)?,
|
||||
lsn: lsn.unwrap_or(0),
|
||||
};
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
panic!("could not parse point-in-time {}", s);
|
||||
}
|
||||
@@ -1,7 +0,0 @@
|
||||
[package]
|
||||
name = "zenith_utils"
|
||||
version = "0.1.0"
|
||||
authors = ["Eric Seppanen <eric@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
@@ -1,2 +0,0 @@
|
||||
//! zenith_utils is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
Reference in New Issue
Block a user