mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-05 22:40:37 +00:00
Compare commits
1 Commits
embedded_w
...
issue_56
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
babd2339cc |
45
.github/workflows/notifications.yml
vendored
45
.github/workflows/notifications.yml
vendored
@@ -1,45 +0,0 @@
|
||||
name: Send Notifications
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
|
||||
jobs:
|
||||
send-notifications:
|
||||
timeout-minutes: 30
|
||||
name: send commit notifications
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Form variables for notification message
|
||||
id: git_info_grab
|
||||
run: |
|
||||
git_stat=$(git show --stat=50)
|
||||
git_stat="${git_stat//'%'/'%25'}"
|
||||
git_stat="${git_stat//$'\n'/'%0A'}"
|
||||
git_stat="${git_stat//$'\r'/'%0D'}"
|
||||
git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
|
||||
echo "::set-output name=git_stat::$git_stat"
|
||||
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
|
||||
echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
|
||||
|
||||
- name: Send notification
|
||||
uses: appleboy/telegram-action@master
|
||||
with:
|
||||
to: ${{ secrets.TELEGRAM_TO }}
|
||||
token: ${{ secrets.TELEGRAM_TOKEN }}
|
||||
format: markdown
|
||||
args: |
|
||||
*@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
|
||||
|
||||
```
|
||||
${{ steps.git_info_grab.outputs.git_stat }}
|
||||
```
|
||||
|
||||
61
.github/workflows/testing.yml
vendored
61
.github/workflows/testing.yml
vendored
@@ -1,36 +1,45 @@
|
||||
name: Build and Test
|
||||
name: regression check
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
on: [push]
|
||||
|
||||
jobs:
|
||||
regression-check:
|
||||
strategy:
|
||||
matrix:
|
||||
# If we want to duplicate this job for different
|
||||
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
|
||||
rust_toolchain: [stable]
|
||||
os: [ubuntu-latest]
|
||||
timeout-minutes: 30
|
||||
timeout-minutes: 10
|
||||
name: run regression test suite
|
||||
runs-on: ${{ matrix.os }}
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
|
||||
- name: install rust toolchain ${{ matrix.rust_toolchain }}
|
||||
uses: actions-rs/toolchain@v1
|
||||
- name: Form variables for notification message
|
||||
id: git_info_grab
|
||||
run: |
|
||||
git_stat=$(git show --stat=50)
|
||||
git_stat="${git_stat//'%'/'%25'}"
|
||||
git_stat="${git_stat//$'\n'/'%0A'}"
|
||||
git_stat="${git_stat//$'\r'/'%0D'}"
|
||||
git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
|
||||
echo "::set-output name=git_stat::$git_stat"
|
||||
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
|
||||
echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
|
||||
|
||||
- name: Send notification
|
||||
uses: appleboy/telegram-action@master
|
||||
with:
|
||||
profile: minimal
|
||||
toolchain: ${{ matrix.rust_toolchain }}
|
||||
override: true
|
||||
to: ${{ secrets.TELEGRAM_TO }}
|
||||
token: ${{ secrets.TELEGRAM_TOKEN }}
|
||||
format: markdown
|
||||
args: |
|
||||
*@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
|
||||
|
||||
```
|
||||
${{ steps.git_info_grab.outputs.git_stat }}
|
||||
```
|
||||
|
||||
- name: Install postgres dependencies
|
||||
run: |
|
||||
@@ -52,7 +61,11 @@ jobs:
|
||||
- name: Build postgres
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
make postgres
|
||||
./pgbuild.sh
|
||||
|
||||
- name: Install rust
|
||||
run: |
|
||||
sudo apt install -y cargo
|
||||
|
||||
- name: Cache cargo deps
|
||||
id: cache_cargo
|
||||
@@ -64,10 +77,10 @@ jobs:
|
||||
target
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
|
||||
|
||||
- name: Run cargo build
|
||||
- name: Build
|
||||
run: |
|
||||
cargo build --workspace --bins --examples --tests
|
||||
cargo build
|
||||
|
||||
- name: Run cargo test
|
||||
- name: Run test
|
||||
run: |
|
||||
cargo test -- --nocapture --test-threads=1
|
||||
cargo test --test test_pageserver -- --nocapture --test-threads=1
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -3,4 +3,3 @@
|
||||
/tmp_install
|
||||
/tmp_check_cli
|
||||
.vscode
|
||||
.zenith
|
||||
|
||||
195
Cargo.lock
generated
195
Cargo.lock
generated
@@ -91,19 +91,19 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "async-io"
|
||||
version = "1.4.0"
|
||||
version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fcb9af4888a70ad78ecb5efcb0ba95d66a3cf54a88b62ae81559954c7588c7a2"
|
||||
checksum = "9315f8f07556761c3e48fec2e6b276004acf426e6dc068b2c2251854d65ee0fd"
|
||||
dependencies = [
|
||||
"concurrent-queue",
|
||||
"fastrand",
|
||||
"futures-lite",
|
||||
"libc",
|
||||
"log",
|
||||
"nb-connect",
|
||||
"once_cell",
|
||||
"parking",
|
||||
"polling",
|
||||
"socket2",
|
||||
"vec-arena",
|
||||
"waker-fn",
|
||||
"winapi",
|
||||
@@ -111,9 +111,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "async-lock"
|
||||
version = "2.4.0"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6a8ea61bf9947a1007c5cada31e647dbc77b103c679858150003ba697ea798b"
|
||||
checksum = "1996609732bde4a9988bc42125f55f2af5f3c36370e27c778d5191a4a1b63bfb"
|
||||
dependencies = [
|
||||
"event-listener",
|
||||
]
|
||||
@@ -162,9 +162,9 @@ checksum = "e91831deabf0d6d7ec49552e489aed63b7456a7a3c46cff62adad428110b0af0"
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.50"
|
||||
version = "0.1.49"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b98e84bbb4cbcdd97da190ba0c58a1bb0de2c1fdf67d159e192ed766aeca722"
|
||||
checksum = "589652ce7ccb335d1e7ecb3be145425702b290dbcb7029bbeaae263fc1d87b48"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -243,12 +243,13 @@ checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.57.0"
|
||||
version = "0.53.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd4865004a46a0aafb2a0a5eb19d3c9fc46ee5f063a6cfc605c69ac9ecf5263d"
|
||||
checksum = "c72a978d268b1d70b0e963217e60fdabd9523a941457a6c42a7315d15c7e89e5"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cexpr",
|
||||
"cfg-if 0.1.10",
|
||||
"clang-sys",
|
||||
"clap",
|
||||
"env_logger",
|
||||
@@ -345,9 +346,6 @@ name = "cc"
|
||||
version = "1.0.67"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cexpr"
|
||||
@@ -385,9 +383,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clang-sys"
|
||||
version = "1.2.0"
|
||||
version = "0.29.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "853eda514c284c2287f4bf20ae614f8781f40a81d32ecda6e91449304dfe077c"
|
||||
checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a"
|
||||
dependencies = [
|
||||
"glob",
|
||||
"libc",
|
||||
@@ -598,9 +596,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.8.3"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "17392a012ea30ef05a610aa97dfb49496e71c9f676b27879922ea5bdf60d9d3f"
|
||||
checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"humantime",
|
||||
@@ -924,9 +922,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "httparse"
|
||||
version = "1.4.0"
|
||||
version = "1.3.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a1ce40d6fc9764887c2fdc7305c3dcc429ba11ff981c1509416afd5697e4437"
|
||||
checksum = "bc35c995b9d93ec174cf9a27d425c7892722101e14993cd227fdb51d70cf9589"
|
||||
|
||||
[[package]]
|
||||
name = "httpdate"
|
||||
@@ -936,9 +934,12 @@ checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||
checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f"
|
||||
dependencies = [
|
||||
"quick-error",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
@@ -979,9 +980,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.2.3"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8"
|
||||
checksum = "89829a5d69c23d348314a7ac337fe39173b61149a9864deabd260983aed48c21"
|
||||
dependencies = [
|
||||
"matches",
|
||||
"unicode-bidi",
|
||||
@@ -1032,15 +1033,6 @@ version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "972f5ae5d1cb9c6ae417789196c803205313edde988685da5e3aae0827b9e7fd"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.50"
|
||||
@@ -1079,24 +1071,12 @@ checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"
|
||||
|
||||
[[package]]
|
||||
name = "libloading"
|
||||
version = "0.7.0"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f84d96438c15fcd6c3f244c8fce01d1e2b9c6b5623e9c711dc9286d8fc92d6a"
|
||||
checksum = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "librocksdb-sys"
|
||||
version = "6.17.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5da125e1c0f22c7cae785982115523a0738728498547f415c9054cb17c7e89f9"
|
||||
dependencies = [
|
||||
"bindgen",
|
||||
"cc",
|
||||
"glob",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1204,6 +1184,16 @@ dependencies = [
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nb-connect"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a19900e7eee95eb2b3c2e26d12a874cc80aaf750e31be6fcbe743ead369fa45d"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"socket2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "5.1.2"
|
||||
@@ -1223,41 +1213,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8536030f9fea7127f841b45bb6243b27255787fb4eb83958aa1ef9d2fdc0c36"
|
||||
dependencies = [
|
||||
"num-bigint",
|
||||
"num-complex",
|
||||
"num-integer",
|
||||
"num-iter",
|
||||
"num-rational",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-complex"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6b19411a9719e753aff12e5187b74d60d3dc449ec3f4dc21e3989c3f554bc95"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.44"
|
||||
@@ -1268,29 +1223,6 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-iter"
|
||||
version = "0.1.42"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-rational"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c000134b5dbf44adc5cb772486d335293351644b801551abe8f75c84cfa4aef"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"num-bigint",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.14"
|
||||
@@ -1381,19 +1313,18 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"crossbeam-channel",
|
||||
"daemonize",
|
||||
"futures",
|
||||
"hex",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"parse_duration",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
"postgres_ffi",
|
||||
"rand 0.8.3",
|
||||
"regex",
|
||||
"rocksdb",
|
||||
"rust-s3",
|
||||
"slog",
|
||||
"slog-async",
|
||||
@@ -1408,7 +1339,6 @@ dependencies = [
|
||||
"tokio-stream",
|
||||
"tui",
|
||||
"walkdir",
|
||||
"zenith_utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1442,17 +1372,6 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parse_duration"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7037e5e93e0172a5a96874380bf73bc6ecef022e26fa25f2be26864d6b3ba95d"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"num",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "peeking_take_while"
|
||||
version = "0.1.2"
|
||||
@@ -1485,18 +1404,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "pin-project"
|
||||
version = "1.0.7"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7509cc106041c40a4518d2af7a61530e1eed0e6285296a3d8c5472806ccc4a4"
|
||||
checksum = "bc174859768806e91ae575187ada95c91a29e96a98dc5d2cd9a1fed039501ba6"
|
||||
dependencies = [
|
||||
"pin-project-internal",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pin-project-internal"
|
||||
version = "1.0.7"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48c950132583b500556b1efd71d45b319029f2b71518d979fcc208e16b42426f"
|
||||
checksum = "a490329918e856ed1b083f244e3bfe2d8c4f336407e4ea9e1a9f479ff09049e5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -1585,7 +1504,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"crc32c",
|
||||
"hex",
|
||||
"log",
|
||||
"rand 0.8.3",
|
||||
]
|
||||
|
||||
@@ -1616,6 +1534,12 @@ dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-error"
|
||||
version = "1.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.9"
|
||||
@@ -1812,16 +1736,6 @@ dependencies = [
|
||||
"winreg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rocksdb"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c749134fda8bfc90d0de643d59bfc841dcb3ac8a1062e12b6754bd60235c48b3"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"librocksdb-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rust-argon2"
|
||||
version = "0.8.3"
|
||||
@@ -2059,9 +1973,9 @@ checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.3"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527"
|
||||
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
|
||||
|
||||
[[package]]
|
||||
name = "slog"
|
||||
@@ -2499,9 +2413,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.12"
|
||||
version = "0.2.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbdbff6266a24120518560b5dc983096efb98462e51d0d68169895b237be3e5d"
|
||||
checksum = "b00bca6106a5e23f3eee943593759b7fcddb00554332e856d990c893966879fb"
|
||||
|
||||
[[package]]
|
||||
name = "vec-arena"
|
||||
@@ -2552,12 +2466,9 @@ dependencies = [
|
||||
"lazy_static",
|
||||
"log",
|
||||
"pageserver",
|
||||
"parse_duration",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres_ffi",
|
||||
"regex",
|
||||
"rust-s3",
|
||||
"slog",
|
||||
"slog-async",
|
||||
"slog-scope",
|
||||
@@ -2566,7 +2477,6 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-stream",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2762,6 +2672,3 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "zenith_utils"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
53
Makefile
53
Makefile
@@ -1,53 +0,0 @@
|
||||
#
|
||||
# Top level Makefile to build Zenith and PostgreSQL
|
||||
#
|
||||
all: zenith postgres
|
||||
|
||||
# We don't want to run 'cargo build' in parallel with the postgres build,
|
||||
# because interleaving cargo build output with postgres build output looks
|
||||
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
|
||||
# much parallelism. (Recursive invocation of postgres target still gets any
|
||||
# '-j' flag from the command line, so 'make -j' is still useful.)
|
||||
.NOTPARALLEL:
|
||||
|
||||
### Zenith Rust bits
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
zenith: postgres-headers
|
||||
cargo build
|
||||
|
||||
### PostgreSQL parts
|
||||
tmp_install/build/config.status:
|
||||
+@echo "Configuring postgres build"
|
||||
mkdir -p tmp_install/build
|
||||
(cd tmp_install/build && \
|
||||
../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
|
||||
--enable-depend --with-libxml --prefix=$(abspath tmp_install) > configure.log)
|
||||
|
||||
# nicer alias for running 'configure'
|
||||
postgres-configure: tmp_install/build/config.status
|
||||
|
||||
# Install the PostgreSQL header files into tmp_install/include
|
||||
postgres-headers: postgres-configure
|
||||
+@echo "Installing PostgreSQL headers"
|
||||
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
|
||||
|
||||
# Compile and install PostgreSQL
|
||||
postgres: postgres-configure
|
||||
+@echo "Compiling PostgreSQL"
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
|
||||
|
||||
postgres-clean:
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
clean:
|
||||
cd tmp_install/build && ${MAKE} clean
|
||||
cargo clean
|
||||
|
||||
# This removes everything
|
||||
distclean:
|
||||
rm -rf tmp_install
|
||||
cargo clean
|
||||
|
||||
.PHONY: postgres-configure postgres postgres-headers zenith
|
||||
@@ -8,7 +8,8 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus
|
||||
```sh
|
||||
git clone --recursive https://github.com/libzenith/zenith.git
|
||||
cd zenith
|
||||
make
|
||||
./pgbuild.sh # builds postgres and installs it to ./tmp_install
|
||||
cargo build
|
||||
```
|
||||
|
||||
2. Start pageserver and postggres on top of it (should be called from repo root):
|
||||
@@ -53,7 +54,7 @@ postgres=# select * from t;
|
||||
|
||||
```sh
|
||||
git clone --recursive https://github.com/libzenith/zenith.git
|
||||
make # builds also postgres and installs it to ./tmp_install
|
||||
./pgbuild.sh # builds postgres and installs it to ./tmp_install
|
||||
cargo test -- --test-threads=1
|
||||
```
|
||||
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::io::{Read, Write};
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, ExitStatus};
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
@@ -12,12 +11,13 @@ use std::{collections::BTreeMap, path::PathBuf};
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use tar;
|
||||
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::storage::{PageServerNode, WalProposerNode};
|
||||
use pageserver::{zenith_repo_dir, ZTimelineId};
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
@@ -190,11 +190,11 @@ impl PostgresNode {
|
||||
);
|
||||
let port: u16 = CONF_PORT_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.ok_or(anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.ok_or(anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or(anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
@@ -277,9 +277,7 @@ impl PostgresNode {
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
fsync = off\n\
|
||||
max_connections = 100\n\
|
||||
wal_sender_timeout = 0\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n",
|
||||
@@ -292,7 +290,7 @@ impl PostgresNode {
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeepr or
|
||||
// page server yet. But this will do for now.
|
||||
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n");
|
||||
self.append_conf("postgresql.conf", &format!("wal_keep_size='10TB'\n"));
|
||||
|
||||
// Connect it to the page server.
|
||||
|
||||
@@ -355,7 +353,6 @@ impl PostgresNode {
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.with_context(|| "pg_ctl failed")?;
|
||||
if !pg_ctl.success() {
|
||||
@@ -399,14 +396,6 @@ impl PostgresNode {
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
fn dump_log_file(&self) {
|
||||
if let Ok(mut file) = File::open(self.env.repo_path.join("pageserver.log")) {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- pageserver.log:\n{}", buffer);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
@@ -418,11 +407,7 @@ impl PostgresNode {
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
let result = client.query(sql, &[]);
|
||||
if result.is_err() {
|
||||
self.dump_log_file();
|
||||
}
|
||||
result.unwrap()
|
||||
client.query(sql, &[]).unwrap()
|
||||
}
|
||||
|
||||
pub fn open_psql(&self, db: &str) -> Client {
|
||||
@@ -458,92 +443,8 @@ impl PostgresNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pg_regress(&self) -> ExitStatus {
|
||||
self.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
let data_dir = zenith_repo_dir();
|
||||
let regress_run_path = data_dir.join("regress");
|
||||
fs::create_dir_all(®ress_run_path).unwrap();
|
||||
fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGPORT", self.address.port().to_string())
|
||||
.env("PGUSER", self.whoami())
|
||||
.env("PGHOST", self.address.ip().to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
if !regress_check.success() {
|
||||
if let Ok(mut file) = File::open("regression.diffs") {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- regression.diffs:\n{}", buffer);
|
||||
}
|
||||
self.dump_log_file();
|
||||
if let Ok(mut file) = File::open(
|
||||
self.env
|
||||
.repo_path
|
||||
.join("pgdatadirs")
|
||||
.join("pg1")
|
||||
.join("log"),
|
||||
) {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- pgdatadirs/pg1/log:\n{}", buffer);
|
||||
}
|
||||
}
|
||||
regress_check
|
||||
}
|
||||
|
||||
pub fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus {
|
||||
let port = self.address.port().to_string();
|
||||
let clients = clients.to_string();
|
||||
let seconds = seconds.to_string();
|
||||
let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&["-i", "-p", port.as_str(), "postgres"])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench -i");
|
||||
let pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&[
|
||||
"-p",
|
||||
port.as_str(),
|
||||
"-T",
|
||||
seconds.as_str(),
|
||||
"-P",
|
||||
"1",
|
||||
"-c",
|
||||
clients.as_str(),
|
||||
"-M",
|
||||
"prepared",
|
||||
"postgres",
|
||||
])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench run");
|
||||
pg_bench_run
|
||||
}
|
||||
// TODO
|
||||
pub fn pg_bench() {}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
|
||||
@@ -15,9 +15,8 @@ use std::process::{Command, Stdio};
|
||||
use anyhow::Result;
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
use pageserver::zenith_repo_dir;
|
||||
use pageserver::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils;
|
||||
use walkeeper::xlog_utils;
|
||||
|
||||
//
|
||||
// This data structure represents deserialized zenith config, which should be
|
||||
@@ -53,6 +52,14 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
fn zenith_repo_dir() -> PathBuf {
|
||||
// Find repository path
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
@@ -71,7 +78,7 @@ pub fn init() -> Result<()> {
|
||||
let cargo_path = env::current_dir()?;
|
||||
if !cargo_path.join("pageserver/Cargo.toml").exists() {
|
||||
anyhow::bail!(
|
||||
"Current directory does not look like a zenith repo. \
|
||||
"Current dirrectory does not look like a zenith repo. \
|
||||
Please, run 'init' from zenith repo root."
|
||||
);
|
||||
}
|
||||
@@ -84,7 +91,7 @@ pub fn init() -> Result<()> {
|
||||
if !pg_path.exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find postres binary at {}. \
|
||||
Perhaps 'make postgres' is needed to build it first.",
|
||||
Perhaps './pgbuild.sh' is needed to build it first.",
|
||||
pg_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
@@ -101,7 +108,7 @@ pub fn init() -> Result<()> {
|
||||
|
||||
// ok, we are good to go
|
||||
let mut conf = LocalEnv {
|
||||
repo_path,
|
||||
repo_path: repo_path.clone(),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir,
|
||||
systemid: 0,
|
||||
@@ -129,38 +136,30 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
|
||||
|
||||
// Run initdb
|
||||
//
|
||||
// We create the cluster temporarily in a "tmp" directory inside the repository,
|
||||
// and move it to the right location from there.
|
||||
let tmppath = repopath.join("tmp");
|
||||
|
||||
// FIXME: we create it temporarily in "tmp" directory, and move it into
|
||||
// the repository. Use "tempdir()" or something? Or just create it directly
|
||||
// in the repo?
|
||||
let initdb_path = local_env.pg_bin_dir().join("initdb");
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", tmppath.to_str().unwrap()])
|
||||
let _initdb = Command::new(initdb_path)
|
||||
.args(&["-D", "tmp"])
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap())
|
||||
.env(
|
||||
"DYLD_LIBRARY_PATH",
|
||||
local_env.pg_lib_dir().to_str().unwrap(),
|
||||
)
|
||||
.stdout(Stdio::null())
|
||||
.status()
|
||||
.with_context(|| "failed to execute initdb")?;
|
||||
if !initdb.success() {
|
||||
anyhow::bail!("initdb failed");
|
||||
}
|
||||
println!("initdb succeeded");
|
||||
|
||||
// Read control file to extract the LSN and system id
|
||||
let controlfile_path = tmppath.join("global").join("pg_control");
|
||||
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
|
||||
let controlfile =
|
||||
postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?;
|
||||
let systemid = controlfile.system_identifier;
|
||||
let lsn = controlfile.checkPoint;
|
||||
let lsnstr = format!("{:016X}", lsn);
|
||||
|
||||
// Move the initial WAL file
|
||||
fs::rename(
|
||||
tmppath.join("pg_wal").join("000000010000000000000001"),
|
||||
"tmp/pg_wal/000000010000000000000001",
|
||||
timelinedir
|
||||
.join("wal")
|
||||
.join("000000010000000000000001.partial"),
|
||||
@@ -168,13 +167,14 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
|
||||
println!("moved initial WAL file");
|
||||
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all(tmppath.join("pg_wal"))?;
|
||||
fs::remove_dir_all("tmp/pg_wal")?;
|
||||
println!("removed tmp/pg_wal");
|
||||
|
||||
force_crash_recovery(&tmppath)?;
|
||||
force_crash_recovery(&PathBuf::from("tmp"))?;
|
||||
println!("updated pg_control");
|
||||
|
||||
let target = timelinedir.join("snapshots").join(&lsnstr);
|
||||
fs::rename(tmppath, &target)?;
|
||||
fs::rename("tmp", &target)?;
|
||||
println!("moved 'tmp' to {}", target.display());
|
||||
|
||||
// Create 'main' branch to refer to the initial timeline
|
||||
@@ -254,7 +254,7 @@ pub fn test_env(testname: &str) -> LocalEnv {
|
||||
systemid: 0,
|
||||
};
|
||||
init_repo(&mut local_env).expect("could not initialize zenith repository");
|
||||
local_env
|
||||
return local_env;
|
||||
}
|
||||
|
||||
// Find the directory where the binaries were put (i.e. target/debug/)
|
||||
@@ -266,7 +266,7 @@ pub fn cargo_bin_dir() -> PathBuf {
|
||||
pathbuf.pop();
|
||||
}
|
||||
|
||||
pathbuf
|
||||
return pathbuf;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
@@ -358,7 +358,7 @@ pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u6
|
||||
|
||||
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true);
|
||||
|
||||
Ok(lsn)
|
||||
return Ok(lsn);
|
||||
}
|
||||
|
||||
// Find the latest snapshot for a timeline
|
||||
|
||||
@@ -13,6 +13,7 @@ use std::time::Duration;
|
||||
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::compute::PostgresNode;
|
||||
use crate::local_env::LocalEnv;
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
@@ -55,7 +56,24 @@ impl TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath,
|
||||
repopath: repopath,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn one_page_server_no_start(local_env: &LocalEnv) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let pserver = Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
});
|
||||
|
||||
TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath: repopath,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,7 +89,7 @@ impl TestStorageControlPlane {
|
||||
listen_address: None,
|
||||
}),
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath,
|
||||
repopath: repopath,
|
||||
};
|
||||
cplane.pageserver.start().unwrap();
|
||||
|
||||
@@ -93,9 +111,6 @@ impl TestStorageControlPlane {
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
let _ = wa.stop();
|
||||
}
|
||||
self.test_done.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
@@ -167,8 +182,7 @@ impl PageServerNode {
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.env("ZENITH_REPO_DIR", self.repo_path())
|
||||
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
anyhow::bail!(
|
||||
@@ -219,7 +233,7 @@ impl PageServerNode {
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to stop pageserver with pid {}", pid);
|
||||
} else {
|
||||
Ok(())
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -351,6 +365,42 @@ impl Drop for WalProposerNode {
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub fn regress_check(pg: &PostgresNode) {
|
||||
pg.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
|
||||
let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress");
|
||||
fs::create_dir_all(regress_run_path.clone()).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let _regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", pg.env.pg_bin_dir().to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", pg.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGHOST", pg.address.ip().to_string())
|
||||
.env("PGPORT", pg.address.port().to_string())
|
||||
.env("PGUSER", pg.whoami())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
}
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// This should contain an unsigned integer, but we return it as a String
|
||||
|
||||
@@ -50,6 +50,7 @@ fn test_redo_cases() {
|
||||
|
||||
// Runs pg_regress on a compute node
|
||||
#[test]
|
||||
#[ignore]
|
||||
fn test_regress() {
|
||||
let local_env = local_env::test_env("test_regress");
|
||||
|
||||
@@ -62,26 +63,7 @@ fn test_regress() {
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
|
||||
let status = node.pg_regress();
|
||||
assert!(status.success());
|
||||
}
|
||||
|
||||
// Runs pg_bench on a compute node
|
||||
#[test]
|
||||
fn pgbench() {
|
||||
let local_env = local_env::test_env("pgbench");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
|
||||
let status = node.pg_bench(10, 100);
|
||||
assert!(status.success());
|
||||
control_plane::storage::regress_check(&node);
|
||||
}
|
||||
|
||||
// Run two postgres instances on one pageserver, on different timelines
|
||||
|
||||
@@ -10,46 +10,6 @@ use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use std::{thread, time};
|
||||
|
||||
const DOWNTIME: u64 = 2;
|
||||
|
||||
#[test]
|
||||
//#[ignore]
|
||||
fn test_embedded_wal_proposer() {
|
||||
let local_env = local_env::test_env("test_embedded_wal_proposer");
|
||||
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!("wal_acceptors='{}'\n", wal_acceptors),
|
||||
);
|
||||
node.start().unwrap();
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
// check wal files equality
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_acceptors_normal_work() {
|
||||
let local_env = local_env::test_env("test_acceptors_normal_work");
|
||||
@@ -212,7 +172,7 @@ fn test_acceptors_restarts() {
|
||||
fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
|
||||
let cp = cplane.clone();
|
||||
thread::spawn(move || {
|
||||
thread::sleep(time::Duration::from_secs(DOWNTIME));
|
||||
thread::sleep(time::Duration::from_secs(1));
|
||||
cp.wal_acceptors[no].start();
|
||||
});
|
||||
}
|
||||
@@ -248,16 +208,13 @@ fn test_acceptors_unavailability() {
|
||||
psql.execute("INSERT INTO t values (1, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
// Shut down all wal acceptors
|
||||
storage_cplane.wal_acceptors[0].stop().unwrap();
|
||||
let cp = Arc::new(storage_cplane);
|
||||
start_acceptor(&cp, 0);
|
||||
let now = SystemTime::now();
|
||||
psql.execute("INSERT INTO t values (2, 'payload')", &[])
|
||||
.unwrap();
|
||||
// Here we check that the query above was hanging
|
||||
// while wal_acceptor was unavailiable
|
||||
assert!(now.elapsed().unwrap().as_secs() >= DOWNTIME);
|
||||
assert!(now.elapsed().unwrap().as_secs() > 1);
|
||||
psql.execute("INSERT INTO t values (3, 'payload')", &[])
|
||||
.unwrap();
|
||||
|
||||
@@ -265,9 +222,7 @@ fn test_acceptors_unavailability() {
|
||||
start_acceptor(&cp, 1);
|
||||
psql.execute("INSERT INTO t values (4, 'payload')", &[])
|
||||
.unwrap();
|
||||
// Here we check that the query above was hanging
|
||||
// while wal_acceptor was unavailiable
|
||||
assert!(now.elapsed().unwrap().as_secs() >= 2 * DOWNTIME);
|
||||
assert!(now.elapsed().unwrap().as_secs() > 2);
|
||||
|
||||
psql.execute("INSERT INTO t values (5, 'payload')", &[])
|
||||
.unwrap();
|
||||
@@ -278,8 +233,6 @@ fn test_acceptors_unavailability() {
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
// Ensure that all inserts succeeded.
|
||||
// Including ones that were waiting for wal acceptor restart.
|
||||
assert_eq!(count, 15);
|
||||
}
|
||||
|
||||
|
||||
2373
pageserver/Cargo.lock
generated
Normal file
2373
pageserver/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,7 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
crossbeam-channel = "0.5.0"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
@@ -31,14 +32,11 @@ tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
rocksdb = "0.16.0"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
walkdir = "2"
|
||||
thiserror = "1.0"
|
||||
hex = "0.4.3"
|
||||
tar = "0.4.33"
|
||||
parse_duration = "*"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
41
pageserver/build.rs
Normal file
41
pageserver/build.rs
Normal file
@@ -0,0 +1,41 @@
|
||||
//
|
||||
// Triggers postgres build if there is no postgres binary present at
|
||||
// 'REPO_ROOT/tmp_install/bin/postgres'.
|
||||
//
|
||||
// I can see a lot of disadvantages with such automatization and main
|
||||
// advantage here is ability to build everything and run integration tests
|
||||
// in a bare repo by running 'cargo test'.
|
||||
//
|
||||
// We can interceipt whether it is debug or release build and run
|
||||
// corresponding pg build. But it seems like an overkill for now.
|
||||
//
|
||||
// Problem #1 -- language server in my editor likes calling 'cargo build'
|
||||
// by himself. So if I delete tmp_install directory it would magically reappear
|
||||
// after some time. During this compilation 'cargo build' may whine about
|
||||
// "waiting for file lock on build directory".
|
||||
//
|
||||
// Problem #2 -- cargo build would run this only if something is changed in
|
||||
// the crate.
|
||||
//
|
||||
// And generally speaking postgres is not a build dependency for the pageserver,
|
||||
// just for integration tests. So let's not mix that. I'll leave this file in
|
||||
// place for some time just in case if anybody would start doing the same.
|
||||
//
|
||||
|
||||
// use std::path::Path;
|
||||
// use std::process::{Command};
|
||||
|
||||
fn main() {
|
||||
// // build some postgres if it is not done none yet
|
||||
// if !Path::new("../tmp_install/bin/postgres").exists() {
|
||||
// let make_res = Command::new("make")
|
||||
// .arg("postgres")
|
||||
// .env_clear()
|
||||
// .status()
|
||||
// .expect("failed to execute 'make postgres'");
|
||||
|
||||
// if !make_res.success() {
|
||||
// panic!("postgres build failed");
|
||||
// }
|
||||
// }
|
||||
}
|
||||
@@ -1,11 +1,12 @@
|
||||
use crate::ZTimelineId;
|
||||
use log::*;
|
||||
use postgres_ffi::FilePathError;
|
||||
use regex::Regex;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use tar::Builder;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::ZTimelineId;
|
||||
|
||||
pub fn send_snapshot_tarball(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
@@ -65,7 +66,7 @@ pub fn send_snapshot_tarball(
|
||||
continue;
|
||||
}
|
||||
|
||||
let archive_fname = relpath.to_str().unwrap();
|
||||
let archive_fname = relpath.to_str().unwrap().clone();
|
||||
let archive_fname = archive_fname
|
||||
.strip_suffix(".partial")
|
||||
.unwrap_or(&archive_fname);
|
||||
@@ -84,7 +85,45 @@ pub fn send_snapshot_tarball(
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
|
||||
let caps = re
|
||||
@@ -94,8 +133,13 @@ fn parse_filename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = postgres_ffi::forkname_to_forknum(forkname)?;
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
@@ -104,7 +148,7 @@ fn parse_filename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
return Ok((relnode, forknum, segno));
|
||||
}
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
@@ -128,9 +172,9 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
|
||||
|
||||
Ok(())
|
||||
return Ok(());
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split('/');
|
||||
let mut s = dbpath.split("/");
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
@@ -144,15 +188,15 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
|
||||
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
|
||||
|
||||
Ok(())
|
||||
return Ok(());
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
Err(FilePathError::new("tablespaces not supported"))
|
||||
return Err(FilePathError::new("tablespaces not supported"));
|
||||
} else {
|
||||
Err(FilePathError::new("invalid relation data file name"))
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
}
|
||||
}
|
||||
|
||||
fn is_rel_file_path(path: &str) -> bool {
|
||||
parse_rel_file_path(path).is_ok()
|
||||
return parse_rel_file_path(path).is_ok();
|
||||
}
|
||||
|
||||
@@ -3,12 +3,12 @@
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use parse_duration::parse;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io;
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{App, Arg};
|
||||
@@ -16,12 +16,18 @@ use daemonize::Daemonize;
|
||||
|
||||
use slog::Drain;
|
||||
|
||||
use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf};
|
||||
use pageserver::page_service;
|
||||
use pageserver::tui;
|
||||
//use pageserver::walreceiver;
|
||||
use pageserver::PageServerConf;
|
||||
|
||||
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
const DEFAULT_GC_PERIOD_SEC: u64 = 10;
|
||||
//const DEFAULT_GC_HORIZON: u64 = 1024 * 1024 * 1024;
|
||||
//const DEFAULT_GC_PERIOD_SEC: u64 = 600;
|
||||
fn zenith_repo_dir() -> String {
|
||||
// Find repository path
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => String::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
@@ -47,25 +53,11 @@ fn main() -> Result<()> {
|
||||
.takes_value(false)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_horizon")
|
||||
.long("gc_horizon")
|
||||
.takes_value(true)
|
||||
.help("Distance from current LSN to perform all wal records cleanup"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_period")
|
||||
.long("gc_period")
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
};
|
||||
|
||||
@@ -86,14 +78,6 @@ fn main() -> Result<()> {
|
||||
conf.listen_addr = addr.parse()?;
|
||||
}
|
||||
|
||||
if let Some(horizon) = arg_matches.value_of("gc_horizon") {
|
||||
conf.gc_horizon = horizon.parse()?;
|
||||
}
|
||||
|
||||
if let Some(period) = arg_matches.value_of("gc_period") {
|
||||
conf.gc_period = parse(period)?;
|
||||
}
|
||||
|
||||
start_pageserver(&conf)
|
||||
}
|
||||
|
||||
@@ -124,7 +108,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
let repodir = zenith_repo_dir();
|
||||
let repodir = PathBuf::from(zenith_repo_dir());
|
||||
|
||||
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
@@ -141,7 +125,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file(repodir.join("pageserver.pid"))
|
||||
.pid_file(repodir.clone().join("pageserver.pid"))
|
||||
.working_directory(repodir)
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
@@ -155,7 +139,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
// does this for us.
|
||||
let repodir = zenith_repo_dir();
|
||||
std::env::set_current_dir(&repodir)?;
|
||||
info!("Changed current directory to repository in {:?}", &repodir);
|
||||
info!("Changed current directory to repository in {}", &repodir);
|
||||
}
|
||||
|
||||
let mut threads = Vec::new();
|
||||
@@ -185,9 +169,9 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
.unwrap();
|
||||
threads.push(page_server_thread);
|
||||
|
||||
if let Some(tui_thread) = tui_thread {
|
||||
if tui_thread.is_some() {
|
||||
// The TUI thread exits when the user asks to Quit.
|
||||
tui_thread.join().unwrap();
|
||||
tui_thread.unwrap().join().unwrap();
|
||||
} else {
|
||||
// In non-interactive mode, wait forever.
|
||||
for t in threads {
|
||||
@@ -201,23 +185,19 @@ fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard,
|
||||
if conf.interactive {
|
||||
Ok(tui::init_logging())
|
||||
} else if conf.daemonize {
|
||||
let log = zenith_repo_dir().join("pageserver.log");
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log)
|
||||
.map_err(|err| {
|
||||
// We failed to initialize logging, so we can't log this message with error!
|
||||
eprintln!("Could not create log file {:?}: {}", log, err);
|
||||
err
|
||||
})?;
|
||||
let log = zenith_repo_dir() + "/pageserver.log";
|
||||
let log_file = File::create(&log).map_err(|err| {
|
||||
// We failed to initialize logging, so we can't log this message with error!
|
||||
eprintln!("Could not create log file {:?}: {}", log, err);
|
||||
err
|
||||
})?;
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
if record.level().is_at_least(slog::Level::Info) {
|
||||
if record.level().is_at_least(slog::Level::Debug) {
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
@@ -235,7 +215,7 @@ fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard,
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
use std::fmt;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pg_constants;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tui;
|
||||
pub mod tui_event;
|
||||
@@ -20,34 +19,9 @@ pub struct PageServerConf {
|
||||
pub daemonize: bool,
|
||||
pub interactive: bool,
|
||||
pub listen_addr: SocketAddr,
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
}
|
||||
|
||||
/// Zenith Timeline ID is a 128-bit random ID.
|
||||
///
|
||||
/// Zenith timeline IDs are different from PostgreSQL timeline
|
||||
/// IDs. They serve a similar purpose though: they differentiate
|
||||
/// between different "histories" of the same cluster. However,
|
||||
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
|
||||
/// 32-bits wide, and they must be in ascending order in any given
|
||||
/// timeline history. Those limitations mean that we cannot generate a
|
||||
/// new PostgreSQL timeline ID by just generating a random number. And
|
||||
/// that in turn is problematic for the "pull/push" workflow, where you
|
||||
/// have a local copy of a zenith repository, and you periodically sync
|
||||
/// the local changes with a remote server. When you work "detached"
|
||||
/// from the remote server, you cannot create a PostgreSQL timeline ID
|
||||
/// that's guaranteed to be different from all existing timelines in
|
||||
/// the remote server. For example, if two people are having a clone of
|
||||
/// the repository on their laptops, and they both create a new branch
|
||||
/// with different name. What timeline ID would they assign to their
|
||||
/// branches? If they pick the same one, and later try to push the
|
||||
/// branches to the same remote server, they will get mixed up.
|
||||
///
|
||||
/// To avoid those issues, Zenith has its own concept of timelines that
|
||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||
/// is usually printed out as a hex string.
|
||||
// Zenith Timeline ID is a 32-byte random ID.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct ZTimelineId([u8; 16]);
|
||||
|
||||
@@ -84,11 +58,3 @@ impl fmt::Display for ZTimelineId {
|
||||
f.write_str(&hex::encode(self.0))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn zenith_repo_dir() -> PathBuf {
|
||||
// Find repository path
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -10,17 +10,20 @@
|
||||
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
|
||||
//
|
||||
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt, BE};
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::io;
|
||||
use std::io::{BufReader, BufWriter, Read, Write};
|
||||
use std::net::{TcpListener, TcpStream};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
use tokio::runtime;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::task;
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::page_cache;
|
||||
@@ -47,8 +50,12 @@ enum FeMessage {
|
||||
// All that messages are actually CopyData from libpq point of view.
|
||||
//
|
||||
ZenithExistsRequest(ZenithRequest),
|
||||
ZenithTruncRequest(ZenithRequest),
|
||||
ZenithUnlinkRequest(ZenithRequest),
|
||||
ZenithNblocksRequest(ZenithRequest),
|
||||
ZenithReadRequest(ZenithRequest),
|
||||
ZenithCreateRequest(ZenithRequest),
|
||||
ZenithExtendRequest(ZenithRequest),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -80,7 +87,7 @@ struct ZenithRequest {
|
||||
relnode: u32,
|
||||
forknum: u8,
|
||||
blkno: u32,
|
||||
lsn: Lsn,
|
||||
lsn: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -111,41 +118,26 @@ enum StartupRequestCode {
|
||||
}
|
||||
|
||||
impl FeStartupMessage {
|
||||
pub fn read(stream: &mut dyn std::io::Read) -> Result<Option<FeMessage>> {
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
|
||||
const MAX_STARTUP_PACKET_LENGTH: u32 = 10000;
|
||||
const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678;
|
||||
const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679;
|
||||
const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680;
|
||||
|
||||
// Read length. If the connection is closed before reading anything (or before
|
||||
// reading 4 bytes, to be precise), return None to indicate that the connection
|
||||
// was closed. This matches the PostgreSQL server's behavior, which avoids noise
|
||||
// in the log if the client opens connection but closes it immediately.
|
||||
let len = match stream.read_u32::<BE>() {
|
||||
Ok(len) => len,
|
||||
Err(err) => {
|
||||
if err.kind() == std::io::ErrorKind::UnexpectedEof {
|
||||
return Ok(None);
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
if buf.len() < 4 {
|
||||
return Ok(None);
|
||||
}
|
||||
let len = BigEndian::read_u32(&buf[0..4]);
|
||||
|
||||
if len < 4 || len as u32 > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"invalid message length",
|
||||
));
|
||||
}
|
||||
let bodylen = len - 4;
|
||||
|
||||
// Read the rest of the startup packet
|
||||
let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
|
||||
stream.read_exact(&mut body_buf)?;
|
||||
let mut body = Bytes::from(body_buf);
|
||||
let version = BigEndian::read_u32(&buf[4..8]);
|
||||
|
||||
// Parse the first field, which indicates what kind of a packet it is
|
||||
let version = body.get_u32();
|
||||
let kind = match version {
|
||||
CANCEL_REQUEST_CODE => StartupRequestCode::Cancel,
|
||||
NEGOTIATE_SSL_CODE => StartupRequestCode::NegotiateSsl,
|
||||
@@ -153,8 +145,7 @@ impl FeStartupMessage {
|
||||
_ => StartupRequestCode::Normal,
|
||||
};
|
||||
|
||||
// Ignore the rest of the packet
|
||||
|
||||
buf.advance(len as usize);
|
||||
Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
|
||||
version,
|
||||
kind,
|
||||
@@ -198,11 +189,12 @@ fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes> {
|
||||
}
|
||||
result.put_u8(byte);
|
||||
}
|
||||
Ok(result.freeze())
|
||||
return Ok(result.freeze());
|
||||
}
|
||||
|
||||
impl FeParseMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
let query_string = read_null_terminated(&mut buf)?;
|
||||
let nparams = buf.get_i16();
|
||||
@@ -212,7 +204,7 @@ impl FeParseMessage {
|
||||
// now, just ignore the statement name, assuming that the client never
|
||||
// uses more than one prepared statement at a time.
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
if pstmt_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented in Parse",
|
||||
@@ -238,13 +230,14 @@ struct FeDescribeMessage {
|
||||
}
|
||||
|
||||
impl FeDescribeMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let kind = buf.get_u8();
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
if pstmt_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented in Describe",
|
||||
@@ -271,11 +264,12 @@ struct FeExecuteMessage {
|
||||
}
|
||||
|
||||
impl FeExecuteMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let maxrows = buf.get_i32();
|
||||
|
||||
if !portal_name.is_empty() {
|
||||
if portal_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named portals not implemented",
|
||||
@@ -298,11 +292,12 @@ impl FeExecuteMessage {
|
||||
struct FeBindMessage {}
|
||||
|
||||
impl FeBindMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
if !portal_name.is_empty() {
|
||||
if portal_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named portals not implemented",
|
||||
@@ -311,7 +306,7 @@ impl FeBindMessage {
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
if pstmt_name.len() != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented",
|
||||
@@ -328,7 +323,8 @@ impl FeBindMessage {
|
||||
struct FeCloseMessage {}
|
||||
|
||||
impl FeCloseMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(body: Bytes) -> Result<FeMessage> {
|
||||
let mut buf = body.clone();
|
||||
let _kind = buf.get_u8();
|
||||
let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
@@ -339,40 +335,37 @@ impl FeCloseMessage {
|
||||
}
|
||||
|
||||
impl FeMessage {
|
||||
pub fn read(stream: &mut dyn Read) -> Result<Option<FeMessage>> {
|
||||
// Each libpq message begins with a message type byte, followed by message length
|
||||
// If the client closes the connection, return None. But if the client closes the
|
||||
// connection in the middle of a message, we will return an error.
|
||||
let tag = match stream.read_u8() {
|
||||
Ok(b) => b,
|
||||
Err(err) => {
|
||||
if err.kind() == std::io::ErrorKind::UnexpectedEof {
|
||||
return Ok(None);
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
};
|
||||
let len = stream.read_u32::<BE>()?;
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
|
||||
if buf.len() < 5 {
|
||||
let to_read = 5 - buf.len();
|
||||
buf.reserve(to_read);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tag = buf[0];
|
||||
let len = BigEndian::read_u32(&buf[1..5]);
|
||||
|
||||
// The message length includes itself, so it better be at least 4
|
||||
if len < 4 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"invalid message length: parsing u32",
|
||||
));
|
||||
}
|
||||
let bodylen = len - 4;
|
||||
|
||||
// Read message body
|
||||
let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
|
||||
stream.read_exact(&mut body_buf)?;
|
||||
let total_len = len as usize + 1;
|
||||
if buf.len() < total_len {
|
||||
let to_read = total_len - buf.len();
|
||||
buf.reserve(to_read);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut body = Bytes::from(body_buf);
|
||||
let mut body = buf.split_to(total_len);
|
||||
body.advance(5);
|
||||
|
||||
let mut body = body.freeze();
|
||||
|
||||
// Parse it
|
||||
match tag {
|
||||
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))),
|
||||
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body: body }))),
|
||||
b'P' => Ok(Some(FeParseMessage::parse(body)?)),
|
||||
b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
|
||||
b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
|
||||
@@ -388,24 +381,28 @@ impl FeMessage {
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
blkno: body.get_u32(),
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
lsn: body.get_u64(),
|
||||
};
|
||||
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
match smgr_tag {
|
||||
0 => Ok(Some(FeMessage::ZenithExistsRequest(zreq))),
|
||||
1 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))),
|
||||
2 => Ok(Some(FeMessage::ZenithReadRequest(zreq))),
|
||||
1 => Ok(Some(FeMessage::ZenithTruncRequest(zreq))),
|
||||
2 => Ok(Some(FeMessage::ZenithUnlinkRequest(zreq))),
|
||||
3 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))),
|
||||
4 => Ok(Some(FeMessage::ZenithReadRequest(zreq))),
|
||||
5 => Ok(Some(FeMessage::ZenithCreateRequest(zreq))),
|
||||
6 => Ok(Some(FeMessage::ZenithExtendRequest(zreq))),
|
||||
_ => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("unknown smgr message tag: {},'{:?}'", smgr_tag, body),
|
||||
format!("unknown smgr message tag: {},'{:?}'", smgr_tag, buf),
|
||||
)),
|
||||
}
|
||||
}
|
||||
tag => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("unknown message tag: {},'{:?}'", tag, body),
|
||||
format!("unknown message tag: {},'{:?}'", tag, buf),
|
||||
)),
|
||||
}
|
||||
}
|
||||
@@ -413,233 +410,269 @@ impl FeMessage {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
///
|
||||
/// Main loop of the page service.
|
||||
///
|
||||
/// Listens for connections, and launches a new handler thread for each.
|
||||
///
|
||||
pub fn thread_main(conf: &PageServerConf) {
|
||||
// Create a new thread pool
|
||||
//
|
||||
// FIXME: It would be nice to keep this single-threaded for debugging purposes,
|
||||
// but that currently leads to a deadlock: if a GetPage@LSN request arrives
|
||||
// for an LSN that hasn't been received yet, the thread gets stuck waiting for
|
||||
// the WAL to arrive. If the WAL receiver hasn't been launched yet, i.e
|
||||
// we haven't received a "callmemaybe" request yet to tell us where to get the
|
||||
// WAL, we will not have a thread available to process the "callmemaybe"
|
||||
// request when it does arrive. Using a thread pool alleviates the problem so
|
||||
// that it doesn't happen in the tests anymore, but in principle it could still
|
||||
// happen if we receive enough GetPage@LSN requests to consume all of the
|
||||
// available threads.
|
||||
//let runtime = runtime::Builder::new_current_thread().enable_all().build().unwrap();
|
||||
let runtime = runtime::Runtime::new().unwrap();
|
||||
|
||||
info!("Starting page server on {}", conf.listen_addr);
|
||||
|
||||
let listener = TcpListener::bind(conf.listen_addr).unwrap();
|
||||
let runtime_ref = Arc::new(runtime);
|
||||
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().unwrap();
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
let mut conn_handler = Connection::new(conf.clone(), socket);
|
||||
runtime_ref.clone().block_on(async {
|
||||
let listener = TcpListener::bind(conf.listen_addr).await.unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = conn_handler.run() {
|
||||
error!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().await.unwrap();
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let mut conn_handler = Connection::new(conf.clone(), socket, &runtime_ref);
|
||||
|
||||
task::spawn(async move {
|
||||
if let Err(err) = conn_handler.run().await {
|
||||
error!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Connection {
|
||||
stream_in: BufReader<TcpStream>,
|
||||
stream: BufWriter<TcpStream>,
|
||||
buffer: BytesMut,
|
||||
init_done: bool,
|
||||
conf: PageServerConf,
|
||||
runtime: Arc<Runtime>,
|
||||
}
|
||||
|
||||
impl Connection {
|
||||
pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection {
|
||||
pub fn new(conf: PageServerConf, socket: TcpStream, runtime: &Arc<Runtime>) -> Connection {
|
||||
Connection {
|
||||
stream_in: BufReader::new(socket.try_clone().unwrap()),
|
||||
stream: BufWriter::new(socket),
|
||||
buffer: BytesMut::with_capacity(10 * 1024),
|
||||
init_done: false,
|
||||
conf,
|
||||
runtime: Arc::clone(runtime),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Read full message or return None if connection is closed
|
||||
//
|
||||
fn read_message(&mut self) -> Result<Option<FeMessage>> {
|
||||
if !self.init_done {
|
||||
FeStartupMessage::read(&mut self.stream_in)
|
||||
} else {
|
||||
FeMessage::read(&mut self.stream_in)
|
||||
async fn read_message(&mut self) -> Result<Option<FeMessage>> {
|
||||
loop {
|
||||
if let Some(message) = self.parse_message()? {
|
||||
return Ok(Some(message));
|
||||
}
|
||||
|
||||
if self.stream.read_buf(&mut self.buffer).await? == 0 {
|
||||
if self.buffer.is_empty() {
|
||||
return Ok(None);
|
||||
} else {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
"connection reset by peer",
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<()> {
|
||||
fn parse_message(&mut self) -> Result<Option<FeMessage>> {
|
||||
if !self.init_done {
|
||||
FeStartupMessage::parse(&mut self.buffer)
|
||||
} else {
|
||||
FeMessage::parse(&mut self.buffer)
|
||||
}
|
||||
}
|
||||
|
||||
async fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<()> {
|
||||
match message {
|
||||
BeMessage::AuthenticationOk => {
|
||||
self.stream.write_u8(b'R')?;
|
||||
self.stream.write_i32::<BE>(4 + 4)?;
|
||||
self.stream.write_i32::<BE>(0)?;
|
||||
self.stream.write_u8(b'R').await?;
|
||||
self.stream.write_i32(4 + 4).await?;
|
||||
self.stream.write_i32(0).await?;
|
||||
}
|
||||
|
||||
BeMessage::ReadyForQuery => {
|
||||
self.stream.write_u8(b'Z')?;
|
||||
self.stream.write_i32::<BE>(4 + 1)?;
|
||||
self.stream.write_u8(b'I')?;
|
||||
self.stream.write_u8(b'Z').await?;
|
||||
self.stream.write_i32(4 + 1).await?;
|
||||
self.stream.write_u8(b'I').await?;
|
||||
}
|
||||
|
||||
BeMessage::ParseComplete => {
|
||||
self.stream.write_u8(b'1')?;
|
||||
self.stream.write_i32::<BE>(4)?;
|
||||
self.stream.write_u8(b'1').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::BindComplete => {
|
||||
self.stream.write_u8(b'2')?;
|
||||
self.stream.write_i32::<BE>(4)?;
|
||||
self.stream.write_u8(b'2').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::CloseComplete => {
|
||||
self.stream.write_u8(b'3')?;
|
||||
self.stream.write_i32::<BE>(4)?;
|
||||
self.stream.write_u8(b'3').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::NoData => {
|
||||
self.stream.write_u8(b'n')?;
|
||||
self.stream.write_i32::<BE>(4)?;
|
||||
self.stream.write_u8(b'n').await?;
|
||||
self.stream.write_i32(4).await?;
|
||||
}
|
||||
|
||||
BeMessage::ParameterDescription => {
|
||||
self.stream.write_u8(b't')?;
|
||||
self.stream.write_i32::<BE>(6)?;
|
||||
self.stream.write_u8(b't').await?;
|
||||
self.stream.write_i32(6).await?;
|
||||
// we don't support params, so always 0
|
||||
self.stream.write_i16::<BE>(0)?;
|
||||
self.stream.write_i16(0).await?;
|
||||
}
|
||||
|
||||
BeMessage::RowDescription => {
|
||||
// XXX
|
||||
let b = Bytes::from("data\0");
|
||||
let mut b = Bytes::from("data\0");
|
||||
|
||||
self.stream.write_u8(b'T')?;
|
||||
self.stream.write_u8(b'T').await?;
|
||||
self.stream
|
||||
.write_i32::<BE>(4 + 2 + b.len() as i32 + 3 * (4 + 2))?;
|
||||
.write_i32(4 + 2 + b.len() as i32 + 3 * (4 + 2))
|
||||
.await?;
|
||||
|
||||
self.stream.write_i16::<BE>(1)?;
|
||||
self.stream.write_all(&b)?;
|
||||
self.stream.write_i32::<BE>(0)?; /* table oid */
|
||||
self.stream.write_i16::<BE>(0)?; /* attnum */
|
||||
self.stream.write_i32::<BE>(25)?; /* TEXTOID */
|
||||
self.stream.write_i16::<BE>(-1)?; /* typlen */
|
||||
self.stream.write_i32::<BE>(0)?; /* typmod */
|
||||
self.stream.write_i16::<BE>(0)?; /* format code */
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_all(&mut b).await?;
|
||||
self.stream.write_i32(0).await?; /* table oid */
|
||||
self.stream.write_i16(0).await?; /* attnum */
|
||||
self.stream.write_i32(25).await?; /* TEXTOID */
|
||||
self.stream.write_i16(-1).await?; /* typlen */
|
||||
self.stream.write_i32(0).await?; /* typmod */
|
||||
self.stream.write_i16(0).await?; /* format code */
|
||||
}
|
||||
|
||||
// XXX: accept some text data
|
||||
BeMessage::DataRow => {
|
||||
// XXX
|
||||
let b = Bytes::from("hello world");
|
||||
let mut b = Bytes::from("hello world");
|
||||
|
||||
self.stream.write_u8(b'D')?;
|
||||
self.stream.write_i32::<BE>(4 + 2 + 4 + b.len() as i32)?;
|
||||
self.stream.write_u8(b'D').await?;
|
||||
self.stream.write_i32(4 + 2 + 4 + b.len() as i32).await?;
|
||||
|
||||
self.stream.write_i16::<BE>(1)?;
|
||||
self.stream.write_i32::<BE>(b.len() as i32)?;
|
||||
self.stream.write_all(&b)?;
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_i32(b.len() as i32).await?;
|
||||
self.stream.write_all(&mut b).await?;
|
||||
}
|
||||
|
||||
BeMessage::ControlFile => {
|
||||
// TODO pass checkpoint and xid info in this message
|
||||
let b = Bytes::from("hello pg_control");
|
||||
let mut b = Bytes::from("hello pg_control");
|
||||
|
||||
self.stream.write_u8(b'D')?;
|
||||
self.stream.write_i32::<BE>(4 + 2 + 4 + b.len() as i32)?;
|
||||
self.stream.write_u8(b'D').await?;
|
||||
self.stream.write_i32(4 + 2 + 4 + b.len() as i32).await?;
|
||||
|
||||
self.stream.write_i16::<BE>(1)?;
|
||||
self.stream.write_i32::<BE>(b.len() as i32)?;
|
||||
self.stream.write_all(&b)?;
|
||||
self.stream.write_i16(1).await?;
|
||||
self.stream.write_i32(b.len() as i32).await?;
|
||||
self.stream.write_all(&mut b).await?;
|
||||
}
|
||||
|
||||
BeMessage::CommandComplete => {
|
||||
let b = Bytes::from("SELECT 1\0");
|
||||
let mut b = Bytes::from("SELECT 1\0");
|
||||
|
||||
self.stream.write_u8(b'C')?;
|
||||
self.stream.write_i32::<BE>(4 + b.len() as i32)?;
|
||||
self.stream.write_all(&b)?;
|
||||
self.stream.write_u8(b'C').await?;
|
||||
self.stream.write_i32(4 + b.len() as i32).await?;
|
||||
self.stream.write_all(&mut b).await?;
|
||||
}
|
||||
|
||||
BeMessage::ZenithStatusResponse(resp) => {
|
||||
self.stream.write_u8(b'd')?;
|
||||
self.stream.write_u32::<BE>(4 + 1 + 1 + 4)?;
|
||||
self.stream.write_u8(100)?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8)?;
|
||||
self.stream.write_u32::<BE>(resp.n_blocks)?;
|
||||
self.stream.write_u8(b'd').await?;
|
||||
self.stream.write_u32(4 + 1 + 1 + 4).await?;
|
||||
self.stream.write_u8(100).await?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8).await?;
|
||||
self.stream.write_u32(resp.n_blocks).await?;
|
||||
}
|
||||
|
||||
BeMessage::ZenithNblocksResponse(resp) => {
|
||||
self.stream.write_u8(b'd')?;
|
||||
self.stream.write_u32::<BE>(4 + 1 + 1 + 4)?;
|
||||
self.stream.write_u8(101)?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8)?;
|
||||
self.stream.write_u32::<BE>(resp.n_blocks)?;
|
||||
self.stream.write_u8(b'd').await?;
|
||||
self.stream.write_u32(4 + 1 + 1 + 4).await?;
|
||||
self.stream.write_u8(101).await?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8).await?;
|
||||
self.stream.write_u32(resp.n_blocks).await?;
|
||||
}
|
||||
|
||||
BeMessage::ZenithReadResponse(resp) => {
|
||||
self.stream.write_u8(b'd')?;
|
||||
self.stream.write_u8(b'd').await?;
|
||||
self.stream
|
||||
.write_u32::<BE>(4 + 1 + 1 + 4 + resp.page.len() as u32)?;
|
||||
self.stream.write_u8(102)?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8)?;
|
||||
self.stream.write_u32::<BE>(resp.n_blocks)?;
|
||||
self.stream.write_all(&resp.page.clone())?;
|
||||
.write_u32(4 + 1 + 1 + 4 + resp.page.len() as u32)
|
||||
.await?;
|
||||
self.stream.write_u8(102).await?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8).await?;
|
||||
self.stream.write_u32(resp.n_blocks).await?;
|
||||
self.stream.write_all(&mut resp.page.clone()).await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_message(&mut self, message: &BeMessage) -> io::Result<()> {
|
||||
self.write_message_noflush(message)?;
|
||||
self.stream.flush()
|
||||
async fn write_message(&mut self, message: &BeMessage) -> io::Result<()> {
|
||||
self.write_message_noflush(message).await?;
|
||||
self.stream.flush().await
|
||||
}
|
||||
|
||||
fn run(&mut self) -> Result<()> {
|
||||
async fn run(&mut self) -> Result<()> {
|
||||
let mut unnamed_query_string = Bytes::new();
|
||||
loop {
|
||||
let msg = self.read_message()?;
|
||||
trace!("got message {:?}", msg);
|
||||
let msg = self.read_message().await?;
|
||||
info!("got message {:?}", msg);
|
||||
match msg {
|
||||
Some(FeMessage::StartupMessage(m)) => {
|
||||
trace!("got message {:?}", m);
|
||||
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
|
||||
let b = Bytes::from("N");
|
||||
self.stream.write_all(&b)?;
|
||||
self.stream.flush()?;
|
||||
let mut b = Bytes::from("N");
|
||||
self.stream.write_all(&mut b).await?;
|
||||
self.stream.flush().await?;
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
self.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.write_message_noflush(&BeMessage::AuthenticationOk)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await?;
|
||||
self.init_done = true;
|
||||
}
|
||||
StartupRequestCode::Cancel => return Ok(()),
|
||||
}
|
||||
}
|
||||
Some(FeMessage::Query(m)) => {
|
||||
self.process_query(m.body)?;
|
||||
self.process_query(m.body).await?;
|
||||
}
|
||||
Some(FeMessage::Parse(m)) => {
|
||||
unnamed_query_string = m.query_string;
|
||||
self.write_message(&BeMessage::ParseComplete)?;
|
||||
self.write_message(&BeMessage::ParseComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Describe(_)) => {
|
||||
self.write_message_noflush(&BeMessage::ParameterDescription)?;
|
||||
self.write_message(&BeMessage::NoData)?;
|
||||
self.write_message_noflush(&BeMessage::ParameterDescription)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::NoData).await?;
|
||||
}
|
||||
Some(FeMessage::Bind(_)) => {
|
||||
self.write_message(&BeMessage::BindComplete)?;
|
||||
self.write_message(&BeMessage::BindComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Close(_)) => {
|
||||
self.write_message(&BeMessage::CloseComplete)?;
|
||||
self.write_message(&BeMessage::CloseComplete).await?;
|
||||
}
|
||||
Some(FeMessage::Execute(_)) => {
|
||||
self.process_query(unnamed_query_string.clone())?;
|
||||
self.process_query(unnamed_query_string.clone()).await?;
|
||||
}
|
||||
Some(FeMessage::Sync) => {
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await?;
|
||||
}
|
||||
Some(FeMessage::Terminate) => {
|
||||
break;
|
||||
@@ -658,7 +691,7 @@ impl Connection {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_query(&mut self, query_string: Bytes) -> Result<()> {
|
||||
async fn process_query(&mut self, query_string: Bytes) -> Result<()> {
|
||||
debug!("process query {:?}", query_string);
|
||||
|
||||
// remove null terminator, if any
|
||||
@@ -668,13 +701,13 @@ impl Connection {
|
||||
}
|
||||
|
||||
if query_string.starts_with(b"controlfile") {
|
||||
self.handle_controlfile()
|
||||
self.handle_controlfile().await
|
||||
} else if query_string.starts_with(b"pagestream ") {
|
||||
let (_l, r) = query_string.split_at("pagestream ".len());
|
||||
let timelineid_str = String::from_utf8(r.to_vec()).unwrap();
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
|
||||
|
||||
self.handle_pagerequests(timelineid)
|
||||
self.handle_pagerequests(timelineid).await
|
||||
} else if query_string.starts_with(b"basebackup ") {
|
||||
let (_l, r) = query_string.split_at("basebackup ".len());
|
||||
let r = r.to_vec();
|
||||
@@ -683,18 +716,21 @@ impl Connection {
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(timelineid)?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
self.handle_basebackup_request(timelineid).await?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
} else if query_string.starts_with(b"callmemaybe ") {
|
||||
let query_str = String::from_utf8(query_string.to_vec()).unwrap();
|
||||
let query_str = String::from_utf8(query_string.to_vec())
|
||||
.unwrap()
|
||||
.to_string();
|
||||
|
||||
// callmemaybe <zenith timelineid as hex string> <connstr>
|
||||
let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap();
|
||||
let caps = re.captures(&query_str);
|
||||
let caps = caps.unwrap();
|
||||
|
||||
let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||
let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str().clone()).unwrap();
|
||||
let connstr: String = String::from(caps.get(2).unwrap().as_str());
|
||||
|
||||
// Check that the timeline exists
|
||||
@@ -707,29 +743,36 @@ impl Connection {
|
||||
|
||||
walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr);
|
||||
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
} else if query_string.starts_with(b"status") {
|
||||
self.write_message_noflush(&BeMessage::RowDescription)?;
|
||||
self.write_message_noflush(&BeMessage::DataRow)?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::DataRow).await?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
} else {
|
||||
self.write_message_noflush(&BeMessage::RowDescription)?;
|
||||
self.write_message_noflush(&BeMessage::DataRow)?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::DataRow).await?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_controlfile(&mut self) -> Result<()> {
|
||||
self.write_message_noflush(&BeMessage::RowDescription)?;
|
||||
self.write_message_noflush(&BeMessage::ControlFile)?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
async fn handle_controlfile(&mut self) -> Result<()> {
|
||||
self.write_message_noflush(&BeMessage::RowDescription)
|
||||
.await?;
|
||||
self.write_message_noflush(&BeMessage::ControlFile).await?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)
|
||||
.await?;
|
||||
self.write_message(&BeMessage::ReadyForQuery).await
|
||||
}
|
||||
|
||||
fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
async fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
// Check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
@@ -740,17 +783,17 @@ impl Connection {
|
||||
let pcache = pcache.unwrap();
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
self.stream.write_u8(b'W')?;
|
||||
self.stream.write_i32::<BE>(4 + 1 + 2)?;
|
||||
self.stream.write_u8(0)?; /* copy_is_binary */
|
||||
self.stream.write_i16::<BE>(0)?; /* numAttributes */
|
||||
self.stream.flush()?;
|
||||
self.stream.write_u8(b'W').await?;
|
||||
self.stream.write_i32(4 + 1 + 2).await?;
|
||||
self.stream.write_u8(0).await?; /* copy_is_binary */
|
||||
self.stream.write_i16(0).await?; /* numAttributes */
|
||||
self.stream.flush().await?;
|
||||
|
||||
loop {
|
||||
let message = self.read_message()?;
|
||||
let message = self.read_message().await?;
|
||||
|
||||
if let Some(m) = &message {
|
||||
trace!("query({:?}): {:?}", timelineid, m);
|
||||
info!("query({:?}): {:?}", timelineid, m);
|
||||
};
|
||||
|
||||
if message.is_none() {
|
||||
@@ -767,12 +810,27 @@ impl Connection {
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
let exist = pcache.relsize_exist(&tag, req.lsn).unwrap_or(false);
|
||||
let exist = pcache.relsize_exist(&tag);
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: exist,
|
||||
n_blocks: 0,
|
||||
}))?
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithTruncRequest(_)) => {
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithUnlinkRequest(_)) => {
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithNblocksRequest(req)) => {
|
||||
let tag = page_cache::RelTag {
|
||||
@@ -782,21 +840,20 @@ impl Connection {
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
let n_blocks = pcache.relsize_get(&tag, req.lsn).unwrap_or(0);
|
||||
let n_blocks = pcache.relsize_get(&tag);
|
||||
|
||||
self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks,
|
||||
}))?
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithReadRequest(req)) => {
|
||||
let buf_tag = page_cache::BufferTag {
|
||||
rel: page_cache::RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
},
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
blknum: req.blkno,
|
||||
};
|
||||
|
||||
@@ -817,14 +874,46 @@ impl Connection {
|
||||
}
|
||||
};
|
||||
|
||||
self.write_message(&msg)?
|
||||
self.write_message(&msg).await?
|
||||
}
|
||||
Some(FeMessage::ZenithCreateRequest(req)) => {
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
pcache.relsize_inc(&tag, 0);
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
Some(FeMessage::ZenithExtendRequest(req)) => {
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
pcache.relsize_inc(&tag, req.blkno + 1);
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
}))
|
||||
.await?
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
async fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
// check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
@@ -835,11 +924,11 @@ impl Connection {
|
||||
|
||||
/* switch client to COPYOUT */
|
||||
let stream = &mut self.stream;
|
||||
stream.write_u8(b'H')?;
|
||||
stream.write_i32::<BE>(4 + 1 + 2)?;
|
||||
stream.write_u8(0)?; /* copy_is_binary */
|
||||
stream.write_i16::<BE>(0)?; /* numAttributes */
|
||||
stream.flush()?;
|
||||
stream.write_u8(b'H').await?;
|
||||
stream.write_i32(4 + 1 + 2).await?;
|
||||
stream.write_u8(0).await?; /* copy_is_binary */
|
||||
stream.write_i16(0).await?; /* numAttributes */
|
||||
stream.flush().await?;
|
||||
info!("sent CopyOut");
|
||||
|
||||
/* Send a tarball of the latest snapshot on the timeline */
|
||||
@@ -847,45 +936,72 @@ impl Connection {
|
||||
// find latest snapshot
|
||||
let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
|
||||
|
||||
basebackup::send_snapshot_tarball(&mut CopyDataSink { stream }, timelineid, snapshotlsn)?;
|
||||
// Stream it
|
||||
let (s, mut r) = mpsc::channel(5);
|
||||
|
||||
let f_tar = task::spawn_blocking(move || {
|
||||
basebackup::send_snapshot_tarball(&mut CopyDataSink(s), timelineid, snapshotlsn)?;
|
||||
Ok(())
|
||||
});
|
||||
let f_tar2 = async {
|
||||
let joinres = f_tar.await;
|
||||
|
||||
if joinres.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
joinres.unwrap_err(),
|
||||
));
|
||||
}
|
||||
return joinres.unwrap();
|
||||
};
|
||||
|
||||
let f_pump = async move {
|
||||
loop {
|
||||
let buf = r.recv().await;
|
||||
if buf.is_none() {
|
||||
break;
|
||||
}
|
||||
let mut buf = buf.unwrap();
|
||||
|
||||
// CopyData
|
||||
stream.write_u8(b'd').await?;
|
||||
stream.write_u32((4 + buf.len()) as u32).await?;
|
||||
stream.write_all(&mut buf).await?;
|
||||
trace!("CopyData sent for {} bytes!", buf.len());
|
||||
|
||||
// FIXME: flush isn't really required, but makes it easier
|
||||
// to view in wireshark
|
||||
stream.flush().await?;
|
||||
}
|
||||
Ok(())
|
||||
};
|
||||
|
||||
tokio::try_join!(f_tar2, f_pump)?;
|
||||
|
||||
// CopyDone
|
||||
self.stream.write_u8(b'c')?;
|
||||
self.stream.write_u32::<BE>(4)?;
|
||||
self.stream.flush()?;
|
||||
self.stream.write_u8(b'c').await?;
|
||||
self.stream.write_u32(4).await?;
|
||||
self.stream.flush().await?;
|
||||
debug!("CopyDone sent!");
|
||||
|
||||
// FIXME: I'm getting an error from the tokio copyout driver without this.
|
||||
// I think it happens when the CommandComplete, CloseComplete and ReadyForQuery
|
||||
// are sent in the same TCP packet as the CopyDone. I don't understand why.
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
thread::sleep(std::time::Duration::from_secs(1));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A std::io::Write implementation that wraps all data written to it in CopyData
|
||||
/// messages.
|
||||
///
|
||||
struct CopyDataSink<'a> {
|
||||
stream: &'a mut BufWriter<TcpStream>,
|
||||
}
|
||||
struct CopyDataSink(mpsc::Sender<Bytes>);
|
||||
|
||||
impl<'a> std::io::Write for CopyDataSink<'a> {
|
||||
impl std::io::Write for CopyDataSink {
|
||||
fn write(&mut self, data: &[u8]) -> std::result::Result<usize, std::io::Error> {
|
||||
// CopyData
|
||||
// FIXME: if the input is large, we should split it into multiple messages.
|
||||
// Not sure what the threshold should be, but the ultimate hard limit is that
|
||||
// the length cannot exceed u32.
|
||||
self.stream.write_u8(b'd')?;
|
||||
self.stream.write_u32::<BE>((4 + data.len()) as u32)?;
|
||||
self.stream.write_all(&data)?;
|
||||
trace!("CopyData sent for {} bytes!", data.len());
|
||||
let buf = Bytes::copy_from_slice(data);
|
||||
|
||||
// FIXME: flush isn't really required, but makes it easier
|
||||
// to view in wireshark
|
||||
self.stream.flush()?;
|
||||
if let Err(e) = self.0.blocking_send(buf) {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, e));
|
||||
}
|
||||
|
||||
Ok(data.len())
|
||||
}
|
||||
|
||||
53
pageserver/src/pg_constants.rs
Normal file
53
pageserver/src/pg_constants.rs
Normal file
@@ -0,0 +1,53 @@
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
//Special values for non-rel files' tags
|
||||
//TODO maybe use enum?
|
||||
pub const PG_CONTROLFILE_FORKNUM: u32 = 42;
|
||||
pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
|
||||
pub const PG_XACT_FORKNUM: u32 = 44;
|
||||
pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
|
||||
pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;
|
||||
|
||||
//
|
||||
// constants from clog.h
|
||||
//
|
||||
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
|
||||
pub const CLOG_BITS_PER_XACT: u8 = 2;
|
||||
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
|
||||
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
|
||||
pub const CLOG_ZEROPAGE: u8 = 0x00;
|
||||
pub const CLOG_TRUNCATE: u8 = 0x10;
|
||||
|
||||
// From xact.h
|
||||
pub const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
pub const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
/*
|
||||
* The following flags, stored in xinfo, determine which information is
|
||||
* contained in commit/abort records.
|
||||
*/
|
||||
pub const XACT_XINFO_HAS_DBINFO: u32 = 1;
|
||||
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 2;
|
||||
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 4;
|
||||
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const RM_XLOG_ID: u8 = 0;
|
||||
pub const RM_XACT_ID: u8 = 1;
|
||||
pub const RM_CLOG_ID: u8 = 3;
|
||||
// pub const RM_MULTIXACT_ID:u8 = 6;
|
||||
|
||||
// from xlogreader.h
|
||||
pub const XLR_INFO_MASK: u8 = 0x0F;
|
||||
@@ -12,8 +12,10 @@
|
||||
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::fmt;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::error::Error;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
@@ -27,14 +29,9 @@ use bytes::Bytes;
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::page_cache::PageCache;
|
||||
use crate::page_cache::RelTag;
|
||||
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::FilePathError;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
@@ -61,21 +58,20 @@ pub fn restore_timeline(
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots");
|
||||
|
||||
let mut last_snapshot_lsn: Lsn = Lsn(0);
|
||||
let mut last_snapshot_lsn: u64 = 0;
|
||||
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let direntry = direntry?;
|
||||
let filename = direntry.file_name();
|
||||
let lsn = Lsn::from_filename(&filename)?;
|
||||
let filename = direntry.file_name().to_str().unwrap().to_owned();
|
||||
|
||||
let lsn = u64::from_str_radix(&filename, 16)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
|
||||
// FIXME: pass filename as Path instead of str?
|
||||
let filename_str = filename.into_string().unwrap();
|
||||
restore_snapshot(conf, pcache, timeline, &filename_str)?;
|
||||
info!("restored snapshot at {:?}", filename_str);
|
||||
restore_snapshot(conf, pcache, timeline, &filename)?;
|
||||
info!("restored snapshot at {}", filename);
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == Lsn(0) {
|
||||
if last_snapshot_lsn == 0 {
|
||||
error!(
|
||||
"could not find valid snapshot in {}",
|
||||
snapshotspath.display()
|
||||
@@ -170,17 +166,7 @@ fn restore_snapshot(
|
||||
}
|
||||
}
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
restore_nonrelfile(
|
||||
conf,
|
||||
pcache,
|
||||
timeline,
|
||||
snapshot,
|
||||
pg_constants::PG_XACT_FORKNUM,
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
Ok(())
|
||||
@@ -195,33 +181,33 @@ fn restore_relfile(
|
||||
dboid: u32,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
let lsn = u64::from_str_radix(snapshot, 16)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
if let Err(e) = p {
|
||||
if p.is_err() {
|
||||
let e = p.unwrap_err();
|
||||
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
|
||||
return Err(e.into());
|
||||
return Err(e)?;
|
||||
}
|
||||
let (relnode, forknum, segno) = p.unwrap();
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192);
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
let tag = page_cache::BufferTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode: relnode,
|
||||
forknum: forknum as u8,
|
||||
blknum: blknum,
|
||||
};
|
||||
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
@@ -247,62 +233,13 @@ fn restore_relfile(
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_nonrelfile(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
_timeline: ZTimelineId,
|
||||
snapshot: &str,
|
||||
forknum: u8,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
let tag = page_cache::RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode: relnode,
|
||||
forknum: forknum as u8,
|
||||
};
|
||||
pcache.relsize_inc(&tag, blknum);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -313,16 +250,15 @@ fn restore_wal(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
startpoint: Lsn,
|
||||
startpoint: u64,
|
||||
) -> Result<()> {
|
||||
let walpath = format!("timelines/{}/wal", timeline);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
|
||||
|
||||
const SEG_SIZE: u64 = 16 * 1024 * 1024;
|
||||
let mut segno = startpoint.segment_number(SEG_SIZE);
|
||||
let mut offset = startpoint.segment_offset(SEG_SIZE);
|
||||
let mut last_lsn = Lsn(0);
|
||||
let mut segno = XLByteToSeg(startpoint, 16 * 1024 * 1024);
|
||||
let mut offset = XLogSegmentOffset(startpoint, 16 * 1024 * 1024);
|
||||
let mut last_lsn = 0;
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, 16 * 1024 * 1024);
|
||||
@@ -330,17 +266,18 @@ fn restore_wal(
|
||||
|
||||
// It could be as .partial
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path += ".partial";
|
||||
path = path + ".partial";
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let open_result = File::open(&path);
|
||||
if let Err(e) = &open_result {
|
||||
if let Err(e) = open_result {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
break;
|
||||
}
|
||||
return Err(e)?;
|
||||
}
|
||||
let mut file = open_result?;
|
||||
let mut file = open_result.unwrap();
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
@@ -370,20 +307,18 @@ fn restore_wal(
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
lsn: lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
truncate: false,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
main_data_offset: decoded.main_data_offset,
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
@@ -403,17 +338,117 @@ fn restore_wal(
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
info!(
|
||||
"reached end of WAL at {:X}/{:X}",
|
||||
last_lsn >> 32,
|
||||
last_lsn & 0xffffffff
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// FIXME: copied from xlog_utils.rs
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type XLogSegNo = u64;
|
||||
pub type TimeLineID = u32;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
|
||||
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
return xlogptr / wal_segsz_bytes as u64;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
|
||||
return format!(
|
||||
"{:>08X}{:>08X}{:>08X}",
|
||||
tli,
|
||||
logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
|
||||
logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
|
||||
);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
|
||||
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsXLogFileName(fname: &str) -> bool {
|
||||
return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsPartialXLogFileName(fname: &str) -> bool {
|
||||
if let Some(basefname) = fname.strip_suffix(".partial") {
|
||||
IsXLogFileName(basefname)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl Error for FilePathError {
|
||||
fn description(&self) -> &str {
|
||||
&self.msg
|
||||
}
|
||||
}
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ParsedBaseImageFileName {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u8,
|
||||
pub forknum: u32,
|
||||
pub segno: u32,
|
||||
|
||||
pub lsn: u64,
|
||||
@@ -425,7 +460,7 @@ struct ParsedBaseImageFileName {
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
|
||||
let caps = re
|
||||
@@ -435,8 +470,13 @@ fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = postgres_ffi::forkname_to_forknum(forkname)?;
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
@@ -445,5 +485,5 @@ fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
return Ok((relnode, forknum, segno));
|
||||
}
|
||||
|
||||
@@ -38,9 +38,12 @@ pub fn restore_main(conf: &PageServerConf) {
|
||||
let result = restore_chunk(conf).await;
|
||||
|
||||
match result {
|
||||
Ok(_) => {}
|
||||
Ok(_) => {
|
||||
return;
|
||||
}
|
||||
Err(err) => {
|
||||
error!("S3 error: {}", err);
|
||||
return;
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -133,12 +136,50 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ParsedBaseImageFileName {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u8,
|
||||
pub forknum: u32,
|
||||
pub segno: u32,
|
||||
|
||||
pub lsn: u64,
|
||||
@@ -150,7 +191,7 @@ struct ParsedBaseImageFileName {
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();
|
||||
|
||||
let caps = re
|
||||
@@ -158,23 +199,28 @@ fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode: u32 = relnode_str.parse()?;
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forkname_match = caps.name("forkname");
|
||||
let forkname = if forkname_match.is_none() {
|
||||
None
|
||||
} else {
|
||||
Some(forkname_match.unwrap().as_str())
|
||||
};
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
segno_match.unwrap().as_str().parse::<u32>()?
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
let lsn_hi: u64 = caps.name("lsnhi").unwrap().as_str().parse()?;
|
||||
let lsn_lo: u64 = caps.name("lsnlo").unwrap().as_str().parse()?;
|
||||
let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
|
||||
let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
|
||||
let lsn = lsn_hi << 32 | lsn_lo;
|
||||
|
||||
Ok((relnode, forknum, segno, lsn))
|
||||
return Ok((relnode, forknum, segno, lsn));
|
||||
}
|
||||
|
||||
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
|
||||
@@ -198,20 +244,20 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
Ok(ParsedBaseImageFileName {
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
})
|
||||
});
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split("/");
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let dbnode: u32 = dbnode_str.parse()?;
|
||||
let dbnode = u32::from_str_radix(dbnode_str, 10)?;
|
||||
let fname = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
@@ -221,19 +267,19 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
Ok(ParsedBaseImageFileName {
|
||||
return Ok(ParsedBaseImageFileName {
|
||||
spcnode: DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum,
|
||||
segno,
|
||||
lsn,
|
||||
})
|
||||
});
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
// TODO
|
||||
Err(FilePathError::new("tablespaces not supported"))
|
||||
return Err(FilePathError::new("tablespaces not supported"));
|
||||
} else {
|
||||
Err(FilePathError::new("invalid relation data file name"))
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -256,18 +302,17 @@ async fn slurp_base_file(
|
||||
|
||||
let mut bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
|
||||
while bytes.remaining() >= 8192 {
|
||||
let tag = page_cache::BufferTag {
|
||||
rel: page_cache::RelTag {
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum,
|
||||
},
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
blknum,
|
||||
};
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -41,7 +41,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -52,7 +52,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -65,7 +65,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
|
||||
@@ -84,11 +84,11 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
|
||||
return true;
|
||||
}
|
||||
|
||||
false
|
||||
return false;
|
||||
})
|
||||
.fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
slog_scope::set_global_logger(logger)
|
||||
return slog_scope::set_global_logger(logger);
|
||||
}
|
||||
|
||||
pub fn ui_main() -> Result<(), Box<dyn Error>> {
|
||||
@@ -240,9 +240,7 @@ fn get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
])
|
||||
}
|
||||
|
||||
// This is not used since LSNs were removed from page cache stats.
|
||||
// Maybe it will be used in the future?
|
||||
fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
|
||||
fn get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
@@ -250,6 +248,13 @@ fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
|
||||
])
|
||||
}
|
||||
|
||||
// FIXME: We really should define a datatype for LSNs, with Display trait and
|
||||
// helper functions. There's one in tokio-postgres, but I don't think we want
|
||||
// to rely on that.
|
||||
fn format_lsn(lsn: u64) -> String {
|
||||
return format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff);
|
||||
}
|
||||
|
||||
impl tui::widgets::Widget for MetricsWidget {
|
||||
fn render(self, area: Rect, buf: &mut Buffer) {
|
||||
let block = Block::default()
|
||||
@@ -263,19 +268,14 @@ impl tui::widgets::Widget for MetricsWidget {
|
||||
let mut lines: Vec<Spans> = Vec::new();
|
||||
|
||||
let page_cache_stats = crate::page_cache::get_stats();
|
||||
|
||||
// This is not used since LSNs were removed from page cache stats.
|
||||
// Maybe it will be used in the future?
|
||||
/*
|
||||
let lsnrange = format!(
|
||||
"{} - {}",
|
||||
page_cache_stats.first_valid_lsn, page_cache_stats.last_valid_lsn
|
||||
format_lsn(page_cache_stats.first_valid_lsn),
|
||||
format_lsn(page_cache_stats.last_valid_lsn)
|
||||
);
|
||||
let last_valid_recordlsn_str = page_cache_stats.last_record_lsn.to_string();
|
||||
let last_valid_recordlsn_str = format_lsn(page_cache_stats.last_record_lsn);
|
||||
lines.push(get_metric_str("Valid LSN range", &lsnrange));
|
||||
lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str));
|
||||
*/
|
||||
|
||||
lines.push(get_metric_u64(
|
||||
"# of cache entries",
|
||||
page_cache_stats.num_entries,
|
||||
|
||||
@@ -76,8 +76,8 @@ impl Events {
|
||||
};
|
||||
Events {
|
||||
rx,
|
||||
input_handle,
|
||||
ignore_exit_key,
|
||||
input_handle,
|
||||
tick_handle,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ impl Drain for TuiLogger {
|
||||
events.pop_back();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
use crate::pg_constants;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::XLogRecord;
|
||||
use std::cmp::min;
|
||||
use std::str;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
// FIXME: this is configurable in PostgreSQL, 16 MB is the default
|
||||
const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
|
||||
@@ -41,9 +40,9 @@ const SizeOfXLogLongPHD: usize = (2 + 2 + 4 + 8 + 4) + 4 + 8 + 4 + 4;
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: Lsn,
|
||||
lsn: u64,
|
||||
|
||||
startlsn: Lsn, // LSN where this record starts
|
||||
startlsn: u64, // LSN where this record starts
|
||||
contlen: u32,
|
||||
padlen: u32,
|
||||
|
||||
@@ -56,7 +55,7 @@ pub struct WalStreamDecoder {
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
msg: String,
|
||||
lsn: Lsn,
|
||||
lsn: u64,
|
||||
}
|
||||
|
||||
//
|
||||
@@ -64,11 +63,11 @@ pub struct WalDecodeError {
|
||||
// FIXME: This isn't a proper rust stream
|
||||
//
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: Lsn) -> WalStreamDecoder {
|
||||
pub fn new(lsn: u64) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
|
||||
startlsn: Lsn(0),
|
||||
startlsn: 0,
|
||||
contlen: 0,
|
||||
padlen: 0,
|
||||
|
||||
@@ -89,10 +88,10 @@ impl WalStreamDecoder {
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(u64, Bytes)>, WalDecodeError> {
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
|
||||
if self.lsn % WAL_SEGMENT_SIZE == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < SizeOfXLogLongPHD {
|
||||
@@ -100,7 +99,7 @@ impl WalStreamDecoder {
|
||||
}
|
||||
|
||||
let hdr = self.decode_XLogLongPageHeaderData();
|
||||
if hdr.std.xlp_pageaddr != self.lsn.0 {
|
||||
if hdr.std.xlp_pageaddr != self.lsn {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog segment header".into(),
|
||||
lsn: self.lsn,
|
||||
@@ -110,13 +109,15 @@ impl WalStreamDecoder {
|
||||
|
||||
self.lsn += SizeOfXLogLongPHD as u64;
|
||||
continue;
|
||||
} else if self.lsn.block_offset() == 0 {
|
||||
} else if self.lsn % (XLOG_BLCKSZ as u64) == 0 {
|
||||
// parse page header
|
||||
|
||||
if self.inputbuf.remaining() < SizeOfXLogShortPHD {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let hdr = self.decode_XLogPageHeaderData();
|
||||
if hdr.xlp_pageaddr != self.lsn.0 {
|
||||
if hdr.xlp_pageaddr != self.lsn {
|
||||
return Err(WalDecodeError {
|
||||
msg: "invalid xlog page header".into(),
|
||||
lsn: self.lsn,
|
||||
@@ -161,7 +162,7 @@ impl WalStreamDecoder {
|
||||
continue;
|
||||
} else {
|
||||
// we're continuing a record, possibly from previous page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
let pageleft: u32 = XLOG_BLCKSZ - (self.lsn % (XLOG_BLCKSZ as u64)) as u32;
|
||||
|
||||
// read the rest of the record, or as much as fits on this page.
|
||||
let n = min(self.contlen, pageleft) as usize;
|
||||
@@ -178,17 +179,20 @@ impl WalStreamDecoder {
|
||||
let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new());
|
||||
|
||||
let recordbuf = recordbuf.freeze();
|
||||
let mut buf = recordbuf.clone();
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen = self.lsn.calc_padding(WAL_SEGMENT_SIZE) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
if is_xlog_switch_record(&recordbuf) {
|
||||
trace!(
|
||||
"saw xlog switch record at {:X}/{:X}",
|
||||
(self.lsn >> 32),
|
||||
self.lsn & 0xffffffff
|
||||
);
|
||||
self.padlen = (WAL_SEGMENT_SIZE - (self.lsn % WAL_SEGMENT_SIZE)) as u32;
|
||||
}
|
||||
|
||||
if self.lsn % 8 != 0 {
|
||||
self.padlen = 8 - (self.lsn % 8) as u32;
|
||||
}
|
||||
|
||||
let result = (self.lsn, recordbuf);
|
||||
@@ -223,7 +227,7 @@ impl WalStreamDecoder {
|
||||
// FIXME: check that hdr.xlp_rem_len matches self.contlen
|
||||
//println!("next xlog page (xlp_rem_len: {})", hdr.xlp_rem_len);
|
||||
|
||||
hdr
|
||||
return hdr;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -235,10 +239,36 @@ impl WalStreamDecoder {
|
||||
xlp_xlog_blcksz: self.inputbuf.get_u32_le(),
|
||||
};
|
||||
|
||||
hdr
|
||||
return hdr;
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME:
|
||||
const BLCKSZ: u16 = 8192;
|
||||
|
||||
//
|
||||
// Constants from xlogrecord.h
|
||||
//
|
||||
|
||||
const XLR_MAX_BLOCK_ID: u8 = 32;
|
||||
|
||||
const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
|
||||
const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
|
||||
const XLR_BLOCK_ID_ORIGIN: u8 = 253;
|
||||
const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
|
||||
|
||||
const BKPBLOCK_FORK_MASK: u8 = 0x0F;
|
||||
const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
|
||||
const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
|
||||
const BKPBLOCK_HAS_DATA: u8 = 0x20;
|
||||
const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
|
||||
const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
|
||||
|
||||
/* Information stored in bimg_info */
|
||||
const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub struct DecodedBkpBlock {
|
||||
/* Is this block ref in use? */
|
||||
@@ -298,147 +328,36 @@ impl DecodedBkpBlock {
|
||||
const SizeOfXLogRecord: u32 = 24;
|
||||
|
||||
pub struct DecodedWALRecord {
|
||||
pub xl_info: u8,
|
||||
pub xl_rmid: u8,
|
||||
pub record: Bytes, // raw XLogRecord
|
||||
|
||||
pub blocks: Vec<DecodedBkpBlock>,
|
||||
pub main_data_offset: usize,
|
||||
}
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type TransactionId = u32;
|
||||
pub type BlockNumber = u32;
|
||||
pub type OffsetNumber = u16;
|
||||
// Is this record an XLOG_SWITCH record? They need some special processing,
|
||||
// so we need to check for that before the rest of the parsing.
|
||||
//
|
||||
// FIXME: refactor this and decode_wal_record() below to avoid the duplication.
|
||||
fn is_xlog_switch_record(rec: &Bytes) -> bool {
|
||||
let mut buf = rec.clone();
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
// FIXME: assume little-endian here
|
||||
let _xl_tot_len = buf.get_u32_le();
|
||||
let _xl_xid = buf.get_u32_le();
|
||||
let _xl_prev = buf.get_u64_le();
|
||||
let xl_info = buf.get_u8();
|
||||
let xl_rmid = buf.get_u8();
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
return xl_info == pg_constants::XLOG_SWITCH && xl_rmid == pg_constants::RM_XLOG_ID;
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct RelFileNode {
|
||||
pub spcnode: Oid, /* tablespace */
|
||||
pub dbnode: Oid, /* database */
|
||||
pub relnode: Oid, /* relation */
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlSmgrTruncate {
|
||||
pub blkno: BlockNumber,
|
||||
pub rnode: RelFileNode,
|
||||
pub flags: u32,
|
||||
}
|
||||
|
||||
impl XlSmgrTruncate {
|
||||
pub fn decode(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance((SizeOfXLogRecord + 2) as usize);
|
||||
XlSmgrTruncate {
|
||||
blkno: buf.get_u32_le(),
|
||||
rnode: RelFileNode {
|
||||
spcnode: buf.get_u32_le(), /* tablespace */
|
||||
dbnode: buf.get_u32_le(), /* database */
|
||||
relnode: buf.get_u32_le(), /* relation */
|
||||
},
|
||||
flags: buf.get_u32_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlCreateDatabase {
|
||||
pub db_id: Oid,
|
||||
pub tablespace_id: Oid,
|
||||
pub src_db_id: Oid,
|
||||
pub src_tablespace_id: Oid,
|
||||
}
|
||||
|
||||
impl XlCreateDatabase {
|
||||
pub fn decode(decoded: &DecodedWALRecord) -> XlCreateDatabase {
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance((SizeOfXLogRecord + 2) as usize);
|
||||
XlCreateDatabase {
|
||||
db_id: buf.get_u32_le(),
|
||||
tablespace_id: buf.get_u32_le(),
|
||||
src_db_id: buf.get_u32_le(),
|
||||
src_tablespace_id: buf.get_u32_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapInsert {
|
||||
pub offnum: OffsetNumber,
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
impl XlHeapInsert {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapInsert {
|
||||
XlHeapInsert {
|
||||
offnum: buf.get_u16_le(),
|
||||
flags: buf.get_u8(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapMultiInsert {
|
||||
pub flags: u8,
|
||||
pub ntuples: u16,
|
||||
}
|
||||
|
||||
impl XlHeapMultiInsert {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert {
|
||||
XlHeapMultiInsert {
|
||||
flags: buf.get_u8(),
|
||||
ntuples: buf.get_u16_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapDelete {
|
||||
pub xmax: TransactionId,
|
||||
pub offnum: OffsetNumber,
|
||||
pub infobits_set: u8,
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
impl XlHeapDelete {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
|
||||
XlHeapDelete {
|
||||
xmax: buf.get_u32_le(),
|
||||
offnum: buf.get_u16_le(),
|
||||
infobits_set: buf.get_u8(),
|
||||
flags: buf.get_u8(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapUpdate {
|
||||
pub old_xmax: TransactionId,
|
||||
pub old_offnum: OffsetNumber,
|
||||
pub old_infobits_set: u8,
|
||||
pub flags: u8,
|
||||
pub new_xmax: TransactionId,
|
||||
pub new_offnum: OffsetNumber,
|
||||
}
|
||||
|
||||
impl XlHeapUpdate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
|
||||
XlHeapUpdate {
|
||||
old_xmax: buf.get_u32_le(),
|
||||
old_offnum: buf.get_u16_le(),
|
||||
old_infobits_set: buf.get_u8(),
|
||||
flags: buf.get_u8(),
|
||||
new_xmax: buf.get_u32_le(),
|
||||
new_offnum: buf.get_u16_le(),
|
||||
}
|
||||
}
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
}
|
||||
|
||||
//
|
||||
@@ -448,10 +367,10 @@ impl XlHeapUpdate {
|
||||
// The overall layout of an XLOG record is:
|
||||
// Fixed-size header (XLogRecord struct)
|
||||
// XLogRecordBlockHeader struct
|
||||
// If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows
|
||||
// If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an
|
||||
// If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows
|
||||
// If BKPIMAGE_HAS_HOLE and BKPIMAGE_IS_COMPRESSED, an
|
||||
// XLogRecordBlockCompressHeader struct follows.
|
||||
// If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows
|
||||
// If BKPBLOCK_SAME_REL is not set, a RelFileNode follows
|
||||
// BlockNumber follows
|
||||
// XLogRecordBlockHeader struct
|
||||
// ...
|
||||
@@ -460,26 +379,32 @@ impl XlHeapUpdate {
|
||||
// block data
|
||||
// ...
|
||||
// main data
|
||||
pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
|
||||
let mut rnode_spcnode: u32 = 0;
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
let mut got_rnode = false;
|
||||
|
||||
let mut buf = record.clone();
|
||||
let mut buf = rec.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
|
||||
// FIXME: assume little-endian here
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
let xl_tot_len = buf.get_u32_le();
|
||||
let xl_xid = buf.get_u32_le();
|
||||
let xl_prev = buf.get_u64_le();
|
||||
let xl_info = buf.get_u8();
|
||||
let xl_rmid = buf.get_u8();
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
trace!(
|
||||
"decode_wal_record xl_rmid = {} xl_info = {}",
|
||||
xlogrec.xl_rmid,
|
||||
xlogrec.xl_info
|
||||
xl_rmid,
|
||||
xl_info
|
||||
);
|
||||
|
||||
let remaining = xlogrec.xl_tot_len - SizeOfXLogRecord;
|
||||
let remaining = xl_tot_len - SizeOfXLogRecord;
|
||||
|
||||
if buf.remaining() != remaining as usize {
|
||||
//TODO error
|
||||
@@ -498,29 +423,29 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
let block_id = buf.get_u8();
|
||||
|
||||
match block_id {
|
||||
pg_constants::XLR_BLOCK_ID_DATA_SHORT => {
|
||||
XLR_BLOCK_ID_DATA_SHORT => {
|
||||
/* XLogRecordDataHeaderShort */
|
||||
main_data_len = buf.get_u8() as u32;
|
||||
datatotal += main_data_len;
|
||||
}
|
||||
|
||||
pg_constants::XLR_BLOCK_ID_DATA_LONG => {
|
||||
XLR_BLOCK_ID_DATA_LONG => {
|
||||
/* XLogRecordDataHeaderLong */
|
||||
main_data_len = buf.get_u32_le();
|
||||
datatotal += main_data_len;
|
||||
}
|
||||
|
||||
pg_constants::XLR_BLOCK_ID_ORIGIN => {
|
||||
XLR_BLOCK_ID_ORIGIN => {
|
||||
// RepOriginId is uint16
|
||||
buf.advance(2);
|
||||
}
|
||||
|
||||
pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => {
|
||||
XLR_BLOCK_ID_TOPLEVEL_XID => {
|
||||
// TransactionId is uint32
|
||||
buf.advance(4);
|
||||
}
|
||||
|
||||
0..=pg_constants::XLR_MAX_BLOCK_ID => {
|
||||
0..=XLR_MAX_BLOCK_ID => {
|
||||
/* XLogRecordBlockHeader */
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
let fork_flags: u8;
|
||||
@@ -537,11 +462,11 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
max_block_id = block_id;
|
||||
|
||||
fork_flags = buf.get_u8();
|
||||
blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
|
||||
blk.forknum = fork_flags & BKPBLOCK_FORK_MASK;
|
||||
blk.flags = fork_flags;
|
||||
blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0;
|
||||
blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0;
|
||||
blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0;
|
||||
blk.has_image = (fork_flags & BKPBLOCK_HAS_IMAGE) != 0;
|
||||
blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0;
|
||||
blk.will_init = (fork_flags & BKPBLOCK_WILL_INIT) != 0;
|
||||
blk.data_len = buf.get_u16_le();
|
||||
|
||||
/* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */
|
||||
@@ -554,16 +479,16 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
blk.hole_offset = buf.get_u16_le();
|
||||
blk.bimg_info = buf.get_u8();
|
||||
|
||||
blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0;
|
||||
blk.apply_image = (blk.bimg_info & BKPIMAGE_APPLY) != 0;
|
||||
|
||||
if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 {
|
||||
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 {
|
||||
if blk.bimg_info & BKPIMAGE_IS_COMPRESSED != 0 {
|
||||
if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0 {
|
||||
blk.hole_length = buf.get_u16_le();
|
||||
} else {
|
||||
blk.hole_length = 0;
|
||||
}
|
||||
} else {
|
||||
blk.hole_length = pg_constants::BLCKSZ - blk.bimg_len;
|
||||
blk.hole_length = BLCKSZ - blk.bimg_len;
|
||||
}
|
||||
datatotal += blk.bimg_len as u32;
|
||||
blocks_total_len += blk.bimg_len as u32;
|
||||
@@ -572,15 +497,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
* cross-check that hole_offset > 0, hole_length > 0 and
|
||||
* bimg_len < BLCKSZ if the HAS_HOLE flag is set.
|
||||
*/
|
||||
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0
|
||||
&& (blk.hole_offset == 0
|
||||
|| blk.hole_length == 0
|
||||
|| blk.bimg_len == pg_constants::BLCKSZ)
|
||||
if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0
|
||||
&& (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ)
|
||||
{
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
|
||||
"BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
|
||||
(unsigned int) blk->hole_offset,
|
||||
(unsigned int) blk->hole_length,
|
||||
(unsigned int) blk->bimg_len,
|
||||
@@ -593,13 +516,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
* cross-check that hole_offset == 0 and hole_length == 0 if
|
||||
* the HAS_HOLE flag is not set.
|
||||
*/
|
||||
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0
|
||||
if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0
|
||||
&& (blk.hole_offset != 0 || blk.hole_length != 0)
|
||||
{
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
|
||||
"BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
|
||||
(unsigned int) blk->hole_offset,
|
||||
(unsigned int) blk->hole_length,
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
@@ -611,13 +534,11 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
* cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED
|
||||
* flag is set.
|
||||
*/
|
||||
if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0)
|
||||
&& blk.bimg_len == pg_constants::BLCKSZ
|
||||
{
|
||||
if (blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0) && blk.bimg_len == BLCKSZ {
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X",
|
||||
"BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X",
|
||||
(unsigned int) blk->bimg_len,
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err;
|
||||
@@ -628,21 +549,21 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
* cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor
|
||||
* IS_COMPRESSED flag is set.
|
||||
*/
|
||||
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0
|
||||
&& blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0
|
||||
&& blk.bimg_len != pg_constants::BLCKSZ
|
||||
if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0
|
||||
&& blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0
|
||||
&& blk.bimg_len != BLCKSZ
|
||||
{
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X",
|
||||
"neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X",
|
||||
(unsigned int) blk->data_len,
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err;
|
||||
*/
|
||||
}
|
||||
}
|
||||
if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 {
|
||||
if fork_flags & BKPBLOCK_SAME_REL == 0 {
|
||||
rnode_spcnode = buf.get_u32_le();
|
||||
rnode_dbnode = buf.get_u32_le();
|
||||
rnode_relnode = buf.get_u32_le();
|
||||
@@ -651,7 +572,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
"pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
|
||||
"BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
|
||||
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
|
||||
goto err; */
|
||||
}
|
||||
@@ -682,7 +603,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
// We don't need them, so just skip blocks_total_len bytes
|
||||
buf.advance(blocks_total_len as usize);
|
||||
|
||||
let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;
|
||||
let main_data_offset = (xl_tot_len - main_data_len) as usize;
|
||||
|
||||
// 4. Decode main_data
|
||||
if main_data_len > 0 {
|
||||
@@ -690,253 +611,46 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
}
|
||||
|
||||
//5. Handle special CLOG and XACT records
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
if xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = buf.get_i32_le() as u32;
|
||||
blk.will_init = true;
|
||||
trace!("RM_CLOG_ID updates block {}", blk.blkno);
|
||||
blocks.push(blk);
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
} else if xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
|
||||
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
|
||||
xlogrec.xl_prev & 0xffffffff,
|
||||
xlogrec.xl_xid,
|
||||
blk.blkno,
|
||||
main_data_len
|
||||
"XLOG_XACT_COMMIT xl_prev {:X}/{:X} xid {} updates block {}",
|
||||
(xl_prev >> 32),
|
||||
xl_prev & 0xffffffff,
|
||||
xl_xid,
|
||||
blk.blkno
|
||||
);
|
||||
blocks.push(blk);
|
||||
|
||||
//parse commit record to extract subtrans entries
|
||||
// xl_xact_commit starts with time of commit
|
||||
let _xact_time = buf.get_i64_le();
|
||||
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
let mut prev_blkno = u32::MAX;
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if prev_blkno != blkno {
|
||||
prev_blkno = blkno;
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blocks.push(blk);
|
||||
}
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO handle this too?
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
|
||||
let nmsgs = buf.get_i32_le();
|
||||
for _i in 0..nmsgs {
|
||||
let sizeof_shared_invalidation_message = 0;
|
||||
buf.advance(sizeof_shared_invalidation_message);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
|
||||
let _xid = buf.get_u32_le();
|
||||
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
|
||||
//TODO handle this to be able to restore pg_twophase on node start
|
||||
}
|
||||
//TODO parse commit record to extract subtrans entries
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_ABORT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
|
||||
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
|
||||
xlogrec.xl_prev & 0xffffffff,
|
||||
xlogrec.xl_xid,
|
||||
blk.blkno,
|
||||
main_data_len
|
||||
"XLOG_XACT_ABORT xl_prev {:X}/{:X} xid {} updates block {}",
|
||||
(xl_prev >> 32),
|
||||
xl_prev & 0xffffffff,
|
||||
xl_xid,
|
||||
blk.blkno
|
||||
);
|
||||
blocks.push(blk);
|
||||
//parse abort record to extract subtrans entries
|
||||
// xl_xact_abort starts with time of commit
|
||||
let _xact_time = buf.get_i64_le();
|
||||
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
let mut prev_blkno = u32::MAX;
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if prev_blkno != blkno {
|
||||
prev_blkno = blkno;
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blocks.push(blk);
|
||||
}
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
let nrels = buf.get_i32_le();
|
||||
for _i in 0..nrels {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO save these too
|
||||
trace!(
|
||||
"XLOG_XACT_ABORT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode
|
||||
);
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
|
||||
let _xid = buf.get_u32_le();
|
||||
trace!("XLOG_XACT_ABORT-XACT_XINFO_HAS_TWOPHASE");
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_DBASE_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_DBASE_CREATE {
|
||||
//buf points to main_data
|
||||
let db_id = buf.get_u32_le();
|
||||
let tablespace_id = buf.get_u32_le();
|
||||
let src_db_id = buf.get_u32_le();
|
||||
let src_tablespace_id = buf.get_u32_le();
|
||||
trace!(
|
||||
"XLOG_DBASE_CREATE tablespace_id/db_id {}/{} src_db_id {}/{}",
|
||||
tablespace_id,
|
||||
db_id,
|
||||
src_tablespace_id,
|
||||
src_db_id
|
||||
);
|
||||
// in postgres it is implemented as copydir
|
||||
// we need to copy all pages in page_cache
|
||||
} else {
|
||||
trace!("XLOG_DBASE_DROP is not handled yet");
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_TBLSPC_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_TBLSPC_CREATE {
|
||||
//buf points to main_data
|
||||
let ts_id = buf.get_u32_le();
|
||||
let ts_path = str::from_utf8(&buf).unwrap();
|
||||
trace!("XLOG_TBLSPC_CREATE ts_id {} ts_path {}", ts_id, ts_path);
|
||||
} else {
|
||||
trace!("XLOG_TBLSPC_DROP is not handled yet");
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = XlHeapInsert::decode(&mut buf);
|
||||
if (xlrec.flags
|
||||
& (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
|
||||
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
|
||||
!= 0
|
||||
{
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.rnode_spcnode = blocks[0].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[0].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[0].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = XlHeapDelete::decode(&mut buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.rnode_spcnode = blocks[0].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[0].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[0].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = XlHeapUpdate::decode(&mut buf);
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.rnode_spcnode = blocks[0].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[0].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[0].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
|
||||
&& blocks.len() > 1
|
||||
{
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
|
||||
blk.rnode_spcnode = blocks[1].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[1].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[1].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = XlHeapMultiInsert::decode(&mut buf);
|
||||
if (xlrec.flags
|
||||
& (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
|
||||
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
|
||||
!= 0
|
||||
{
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.rnode_spcnode = blocks[0].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[0].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[0].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
//TODO parse abort record to extract subtrans entries
|
||||
}
|
||||
}
|
||||
|
||||
DecodedWALRecord {
|
||||
xl_info: xlogrec.xl_info,
|
||||
xl_rmid: xlogrec.xl_rmid,
|
||||
record,
|
||||
record: rec,
|
||||
blocks,
|
||||
main_data_offset,
|
||||
main_data_offset: main_data_offset,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,15 +7,13 @@
|
||||
//!
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::{BufferTag, RelTag};
|
||||
use crate::waldecoder::*;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Error;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::collections::HashMap;
|
||||
@@ -26,13 +24,11 @@ use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::time::Duration;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::runtime;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
|
||||
use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use tokio_stream::StreamExt;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
//
|
||||
// We keep one WAL Receiver active per timeline.
|
||||
@@ -96,38 +92,30 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
timelineid
|
||||
);
|
||||
|
||||
// We need a tokio runtime to call the rust-postgres copy_both function.
|
||||
// Most functions in the rust-postgres driver have a blocking wrapper,
|
||||
// but copy_both does not (TODO: the copy_both support is still work-in-progress
|
||||
// as of this writing. Check later if that has changed, or implement the
|
||||
// wrapper ourselves in rust-postgres)
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
//
|
||||
loop {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
runtime.block_on(async {
|
||||
loop {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
let res = walreceiver_main(&runtime, conf, timelineid, &wal_producer_connstr);
|
||||
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await;
|
||||
|
||||
if let Err(e) = res {
|
||||
info!(
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
sleep(Duration::from_secs(1));
|
||||
if let Err(e) = res {
|
||||
info!(
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
runtime: &Runtime,
|
||||
async fn walreceiver_main(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
@@ -135,21 +123,20 @@ fn walreceiver_main(
|
||||
// Connect to the database in replication mode.
|
||||
info!("connecting to {:?}", wal_producer_connstr);
|
||||
let connect_cfg = format!("{} replication=true", wal_producer_connstr);
|
||||
|
||||
let (rclient, connection) = runtime.block_on(tokio_postgres::connect(&connect_cfg, NoTls))?;
|
||||
let (rclient, connection) = tokio_postgres::connect(&connect_cfg, NoTls).await?;
|
||||
info!("connected!");
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
runtime.spawn(async move {
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
let identify = identify_system(runtime, &rclient)?;
|
||||
let identify = identify_system(&rclient).await?;
|
||||
info!("{:?}", identify);
|
||||
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
|
||||
let end_of_wal = u64::from(identify.xlogpos);
|
||||
let mut caught_up = false;
|
||||
|
||||
let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
|
||||
@@ -159,7 +146,7 @@ fn walreceiver_main(
|
||||
//
|
||||
let mut startpoint = pcache.get_last_valid_lsn();
|
||||
let last_valid_lsn = pcache.get_last_valid_lsn();
|
||||
if startpoint == Lsn(0) {
|
||||
if startpoint == 0 {
|
||||
// If we start here with identify.xlogpos we will have race condition with
|
||||
// postgres start: insert into postgres may request page that was modified with lsn
|
||||
// smaller than identify.xlogpos.
|
||||
@@ -168,37 +155,45 @@ fn walreceiver_main(
|
||||
// different like having 'initdb' method on a pageserver (or importing some shared
|
||||
// empty database snapshot), so for now I just put start of first segment which
|
||||
// seems to be a valid record.
|
||||
pcache.init_valid_lsn(Lsn(0x0100_0000));
|
||||
startpoint = Lsn(0x0100_0000);
|
||||
pcache.init_valid_lsn(0x_1_000_000_u64);
|
||||
startpoint = 0x_1_000_000_u64;
|
||||
} else {
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
// FIXME: It probably would be better to always start streaming from the beginning
|
||||
// of the page, or the segment, so that we could check the page/segment headers
|
||||
// too. Just for the sake of paranoia.
|
||||
startpoint += startpoint.calc_padding(8u32);
|
||||
if startpoint % 8 != 0 {
|
||||
startpoint += 8 - (startpoint % 8);
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"last_valid_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_valid_lsn, startpoint, timelineid, end_of_wal
|
||||
"last_valid_lsn {:X}/{:X} starting replication from {:X}/{:X} for timeline {}, server is at {:X}/{:X}...",
|
||||
(last_valid_lsn >> 32),
|
||||
(last_valid_lsn & 0xffffffff),
|
||||
(startpoint >> 32),
|
||||
(startpoint & 0xffffffff),
|
||||
timelineid,
|
||||
(end_of_wal >> 32),
|
||||
(end_of_wal & 0xffffffff)
|
||||
);
|
||||
|
||||
let startpoint = PgLsn::from(startpoint);
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
|
||||
let copy_stream = runtime.block_on(rclient.copy_both_simple::<bytes::Bytes>(&query))?;
|
||||
let copy_stream = rclient.copy_both_simple::<bytes::Bytes>(&query).await?;
|
||||
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
tokio::pin!(physical_stream);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
|
||||
|
||||
while let Some(replication_message) = runtime.block_on(physical_stream.next()) {
|
||||
while let Some(replication_message) = physical_stream.next().await {
|
||||
match replication_message? {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let startlsn = xlog_data.wal_start();
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
|
||||
write_wal_file(
|
||||
@@ -208,7 +203,13 @@ fn walreceiver_main(
|
||||
data,
|
||||
)?;
|
||||
|
||||
trace!("received XLogData between {} and {}", startlsn, endlsn);
|
||||
trace!(
|
||||
"received XLogData between {:X}/{:X} and {:X}/{:X}",
|
||||
(startlsn >> 32),
|
||||
(startlsn & 0xffffffff),
|
||||
(endlsn >> 32),
|
||||
(endlsn & 0xffffffff)
|
||||
);
|
||||
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
@@ -221,63 +222,22 @@ fn walreceiver_main(
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
truncate: false,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
main_data_offset: decoded.main_data_offset,
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
// include truncate wal record in all pages
|
||||
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_SMGR_TRUNCATE
|
||||
{
|
||||
let truncate = XlSmgrTruncate::decode(&decoded);
|
||||
if (truncate.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: truncate.rnode.spcnode,
|
||||
dbnode: truncate.rnode.dbnode,
|
||||
relnode: truncate.rnode.relnode,
|
||||
forknum: pg_constants::MAIN_FORKNUM,
|
||||
},
|
||||
blknum: truncate.blkno,
|
||||
};
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
truncate: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
pcache.put_rel_wal_record(tag, rec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_DBASE_CREATE
|
||||
{
|
||||
let createdb = XlCreateDatabase::decode(&decoded);
|
||||
pcache.create_database(
|
||||
lsn,
|
||||
createdb.db_id,
|
||||
createdb.tablespace_id,
|
||||
createdb.src_db_id,
|
||||
createdb.src_tablespace_id,
|
||||
)?;
|
||||
}
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_record_lsn(lsn);
|
||||
@@ -295,7 +255,11 @@ fn walreceiver_main(
|
||||
pcache.advance_last_valid_lsn(endlsn);
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
info!(
|
||||
"caught up at LSN {:X}/{:X}",
|
||||
(endlsn >> 32),
|
||||
(endlsn & 0xffffffff)
|
||||
);
|
||||
caught_up = true;
|
||||
}
|
||||
}
|
||||
@@ -313,24 +277,23 @@ fn walreceiver_main(
|
||||
);
|
||||
if reply_requested {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(pcache.get_last_valid_lsn()));
|
||||
let last_lsn = PgLsn::from(pcache.get_last_valid_lsn());
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::INVALID;
|
||||
let ts = PgTimestamp::now()?;
|
||||
const NO_REPLY: u8 = 0u8;
|
||||
|
||||
runtime.block_on(
|
||||
physical_stream
|
||||
.as_mut()
|
||||
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY),
|
||||
)?;
|
||||
physical_stream
|
||||
.as_mut()
|
||||
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
@@ -353,12 +316,9 @@ pub struct IdentifySystem {
|
||||
pub struct IdentifyError;
|
||||
|
||||
/// Run the postgres `IDENTIFY_SYSTEM` command
|
||||
pub fn identify_system(
|
||||
runtime: &Runtime,
|
||||
client: &tokio_postgres::Client,
|
||||
) -> Result<IdentifySystem, Error> {
|
||||
pub async fn identify_system(client: &tokio_postgres::Client) -> Result<IdentifySystem, Error> {
|
||||
let query_str = "IDENTIFY_SYSTEM";
|
||||
let response = runtime.block_on(client.simple_query(query_str))?;
|
||||
let response = client.simple_query(query_str).await?;
|
||||
|
||||
// get(N) from row, then parse it as some destination type.
|
||||
fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
|
||||
@@ -383,8 +343,64 @@ pub fn identify_system(
|
||||
}
|
||||
}
|
||||
|
||||
pub const XLOG_FNAME_LEN: usize = 24;
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLOG_PAGE_MAGIC: u16 = 0xD109;
|
||||
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
|
||||
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
|
||||
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = u64;
|
||||
pub type XLogSegNo = u64;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
|
||||
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
return xlogptr / wal_segsz_bytes as u64;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegNoOffsetToRecPtr(
|
||||
segno: XLogSegNo,
|
||||
offset: u32,
|
||||
wal_segsz_bytes: usize,
|
||||
) -> XLogRecPtr {
|
||||
return segno * (wal_segsz_bytes as u64) + (offset as u64);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
|
||||
return format!(
|
||||
"{:>08X}{:>08X}{:>08X}",
|
||||
tli,
|
||||
logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
|
||||
logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
|
||||
);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
|
||||
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
|
||||
}
|
||||
|
||||
fn write_wal_file(
|
||||
startpos: Lsn,
|
||||
startpos: XLogRecPtr,
|
||||
timeline: ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
buf: &[u8],
|
||||
@@ -393,12 +409,12 @@ fn write_wal_file(
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size as u64) as usize;
|
||||
let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
@@ -414,7 +430,7 @@ fn write_wal_file(
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size as u64);
|
||||
let segno = XLByteToSeg(start_pos, wal_seg_size);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
@@ -466,7 +482,7 @@ fn write_wal_file(
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if start_pos.segment_offset(wal_seg_size as u64) == 0 {
|
||||
if XLogSegmentOffset(start_pos, wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
|
||||
@@ -14,341 +14,221 @@
|
||||
// TODO: Even though the postgres code runs in a separate process,
|
||||
// it's not a secure sandbox.
|
||||
//
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use std::assert;
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::Error;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::sync::mpsc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{ChildStdin, ChildStdout, Command};
|
||||
use tokio::process::{Child, ChildStdin, ChildStdout, Command};
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio::time::timeout;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::page_cache::BufferTag;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::CacheEntry;
|
||||
use crate::page_cache::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::XLogRecord;
|
||||
use crate::{page_cache::BufferTag, pg_constants, PageServerConf};
|
||||
|
||||
static TIMEOUT: Duration = Duration::from_secs(20);
|
||||
|
||||
///
|
||||
/// A WAL redo manager consists of two parts: WalRedoManager, and
|
||||
/// WalRedoManagerInternal. WalRedoManager is the public struct
|
||||
/// that can be used to send redo requests to the manager.
|
||||
/// WalRedoManagerInternal is used by the manager thread itself.
|
||||
///
|
||||
pub struct WalRedoManager {
|
||||
request_tx: Mutex<mpsc::Sender<WalRedoRequest>>,
|
||||
}
|
||||
//
|
||||
// Main entry point for the WAL applicator thread.
|
||||
//
|
||||
pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
info!("WAL redo thread started {}", timelineid);
|
||||
|
||||
struct WalRedoManagerInternal {
|
||||
_conf: PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
// runtime for the async part.
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
request_rx: mpsc::Receiver<WalRedoRequest>,
|
||||
}
|
||||
let pcache = page_cache::get_pagecache(conf, timelineid).unwrap();
|
||||
|
||||
#[derive(Debug)]
|
||||
struct WalRedoRequest {
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
// Loop forever, handling requests as they come.
|
||||
let walredo_channel_receiver = &pcache.walredo_receiver;
|
||||
loop {
|
||||
let mut process: WalRedoProcess;
|
||||
let datadir = format!("wal-redo/{}", timelineid);
|
||||
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
|
||||
response_channel: mpsc::Sender<Result<Bytes, WalRedoError>>,
|
||||
}
|
||||
|
||||
/// An error happened in WAL redo
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum WalRedoError {
|
||||
#[error(transparent)]
|
||||
IoError(#[from] std::io::Error),
|
||||
}
|
||||
|
||||
///
|
||||
/// Public interface of WAL redo manager
|
||||
///
|
||||
impl WalRedoManager {
|
||||
///
|
||||
/// Create a new WalRedoManager.
|
||||
///
|
||||
/// This only initializes the struct. You need to call WalRedoManager::launch to
|
||||
/// start the thread that processes the requests.
|
||||
pub fn new(conf: &PageServerConf, timelineid: ZTimelineId) -> WalRedoManager {
|
||||
let (tx, rx) = mpsc::channel();
|
||||
|
||||
//
|
||||
// Launch the WAL redo thread
|
||||
//
|
||||
// Get mutable references to the values that we need to pass to the
|
||||
// thread.
|
||||
let request_rx = rx;
|
||||
let conf_copy = conf.clone();
|
||||
|
||||
// Currently, the join handle is not saved anywhere and we
|
||||
// won't try restart the thread if it dies.
|
||||
let _walredo_thread = std::thread::Builder::new()
|
||||
.name("WAL redo thread".into())
|
||||
.spawn(move || {
|
||||
let mut internal = WalRedoManagerInternal {
|
||||
_conf: conf_copy,
|
||||
timelineid,
|
||||
request_rx,
|
||||
};
|
||||
internal.wal_redo_main();
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
WalRedoManager {
|
||||
request_tx: Mutex::new(tx),
|
||||
info!("launching WAL redo postgres process {}", timelineid);
|
||||
{
|
||||
let _guard = runtime.enter();
|
||||
process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Request the WAL redo manager to apply WAL records, to reconstruct the page image
|
||||
/// of the given page version.
|
||||
///
|
||||
pub fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
// Create a channel where to receive the response
|
||||
let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
|
||||
// Pretty arbitrarily, reuse the same Postgres process for 100 requests.
|
||||
// After that, kill it and start a new one. This is mostly to avoid
|
||||
// using up all shared buffers in Postgres's shared buffer cache; we don't
|
||||
// want to write any pages to disk in the WAL redo process.
|
||||
for _i in 1..100 {
|
||||
let request = walredo_channel_receiver.recv().unwrap();
|
||||
|
||||
let request = WalRedoRequest {
|
||||
tag,
|
||||
lsn,
|
||||
base_img,
|
||||
records,
|
||||
response_channel: tx,
|
||||
};
|
||||
|
||||
self.request_tx
|
||||
.lock()
|
||||
.unwrap()
|
||||
.send(request)
|
||||
.expect("could not send WAL redo request");
|
||||
|
||||
rx.recv()
|
||||
.expect("could not receive response to WAL redo request")
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// WAL redo thread
|
||||
///
|
||||
impl WalRedoManagerInternal {
|
||||
//
|
||||
// Main entry point for the WAL applicator thread.
|
||||
//
|
||||
fn wal_redo_main(&mut self) {
|
||||
info!("WAL redo thread started {}", self.timelineid);
|
||||
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
// runtime for the async part.
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let process: WalRedoProcess;
|
||||
let datadir = format!("wal-redo/{}", self.timelineid);
|
||||
|
||||
info!("launching WAL redo postgres process {}", self.timelineid);
|
||||
|
||||
process = runtime.block_on(WalRedoProcess::launch(&datadir)).unwrap();
|
||||
info!("WAL redo postgres started");
|
||||
|
||||
// Loop forever, handling requests as they come.
|
||||
loop {
|
||||
let request = self.request_rx.recv().unwrap();
|
||||
|
||||
let result = runtime.block_on(self.handle_apply_request(&process, &request));
|
||||
let result_ok = result.is_ok();
|
||||
|
||||
// Send the result to the requester
|
||||
let _ = request.response_channel.send(result);
|
||||
|
||||
if !result_ok {
|
||||
error!("wal-redo-postgres filed to apply request {:?}", request);
|
||||
let result = handle_apply_request(&pcache, &process, &runtime, request);
|
||||
if result.is_err() {
|
||||
// On error, kill the process.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
info!("killing WAL redo postgres process");
|
||||
let _ = runtime.block_on(process.stdin.get_mut().shutdown());
|
||||
let mut child = process.child;
|
||||
drop(process.stdin);
|
||||
let _ = runtime.block_on(child.wait());
|
||||
}
|
||||
}
|
||||
|
||||
fn transaction_id_set_status_bit(
|
||||
xl_info: u8,
|
||||
xl_rmid: u8,
|
||||
xl_xid: u32,
|
||||
record: WALRecord,
|
||||
page: &mut BytesMut,
|
||||
) {
|
||||
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let mut status = 0;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
} else {
|
||||
trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
return;
|
||||
}
|
||||
|
||||
fn transaction_id_set_status_bit(&self, xid: u32, status: u8, page: &mut BytesMut) {
|
||||
trace!(
|
||||
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
|
||||
status
|
||||
);
|
||||
trace!("handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort) lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
|
||||
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
let byteno: usize = ((xl_rmid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE)
|
||||
* pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
let byteptr = &mut page[byteno..byteno + 1];
|
||||
let bshift: u8 = ((xl_xid % pg_constants::CLOG_XACTS_PER_BYTE)
|
||||
* pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
page[byteno] =
|
||||
(page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
|
||||
}
|
||||
let mut curval = byteptr[0];
|
||||
curval = (curval >> bshift) & pg_constants::CLOG_XACT_BITMASK;
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo.
|
||||
///
|
||||
async fn handle_apply_request(
|
||||
&self,
|
||||
process: &WalRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let tag = request.tag;
|
||||
let lsn = request.lsn;
|
||||
let base_img = request.base_img.clone();
|
||||
let records = &request.records;
|
||||
let mut byteval = [0];
|
||||
byteval[0] = curval;
|
||||
byteval[0] &= !(((1 << pg_constants::CLOG_BITS_PER_XACT as u8) - 1) << bshift);
|
||||
byteval[0] |= status << bshift;
|
||||
|
||||
let nrecords = records.len();
|
||||
byteptr.copy_from_slice(&byteval);
|
||||
trace!(
|
||||
"xl_xid {} byteno {} curval {} byteval {}",
|
||||
xl_xid,
|
||||
byteno,
|
||||
curval,
|
||||
byteval[0]
|
||||
);
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
fn handle_apply_request(
|
||||
pcache: &page_cache::PageCache,
|
||||
process: &WalRedoProcess,
|
||||
runtime: &Runtime,
|
||||
entry_rc: Arc<CacheEntry>,
|
||||
) -> Result<(), Error> {
|
||||
let tag = entry_rc.key.tag;
|
||||
let lsn = entry_rc.key.lsn;
|
||||
let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if tag.rel.forknum == pg_constants::PG_XACT_FORKNUM {
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
let mut entry = entry_rc.content.lock().unwrap();
|
||||
entry.apply_pending = false;
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
let nrecords = records.len();
|
||||
|
||||
//move to main data
|
||||
// TODO probably, we should store some records in our special format
|
||||
// to avoid this weird parsing on replay
|
||||
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
|
||||
if buf.remaining() > skip {
|
||||
buf.advance(skip);
|
||||
}
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
page.clone_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let mut status = 0;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
|
||||
self.transaction_id_set_status_bit(xlogrec.xl_xid, status, &mut page);
|
||||
//handle subtrans
|
||||
let _xact_time = buf.get_i64_le();
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag.blknum == blkno {
|
||||
status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
|
||||
self.transaction_id_set_status_bit(subxact, status, &mut page);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
self.transaction_id_set_status_bit(xlogrec.xl_xid, status, &mut page);
|
||||
//handle subtrans
|
||||
let _xact_time = buf.get_i64_le();
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag.blknum == blkno {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
self.transaction_id_set_status_bit(subxact, status, &mut page);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn,
|
||||
record.main_data_offset, record.rec.len());
|
||||
}
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if tag.forknum == pg_constants::PG_XACT_FORKNUM as u8 {
|
||||
//TODO use base image if any
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let zero_page_bytes: &[u8] = &ZERO_PAGE;
|
||||
let mut page = BytesMut::from(zero_page_bytes);
|
||||
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let _xl_tot_len = buf.get_u32_le();
|
||||
let xl_xid = buf.get_u32_le();
|
||||
let _xl_prev = buf.get_u64_le();
|
||||
let xl_info = buf.get_u8();
|
||||
let xl_rmid = buf.get_u8();
|
||||
buf.advance(2); // 2 bytes of padding
|
||||
let _xl_crc = buf.get_u32_le();
|
||||
|
||||
if xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
page.clone_from_slice(zero_page_bytes);
|
||||
trace!("handle_apply_request for RM_CLOG_ID-CLOG_ZEROPAGE lsn {:X}/{:X} main_data_offset {}, rec.len {}",
|
||||
record.lsn >> 32,
|
||||
record.lsn & 0xffffffff,
|
||||
record.main_data_offset, record.rec.len());
|
||||
}
|
||||
} else if xl_rmid == pg_constants::RM_XACT_ID {
|
||||
transaction_id_set_status_bit(xl_info, xl_rmid, xl_xid, record, &mut page);
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
} else {
|
||||
apply_result = process.apply_wal_records(tag, base_img, records).await;
|
||||
}
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result: Result<Bytes, WalRedoError>;
|
||||
|
||||
trace!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
lsn
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
result = Err(WalRedoError::IoError(e));
|
||||
} else {
|
||||
let img = apply_result.unwrap();
|
||||
|
||||
result = Ok(img);
|
||||
}
|
||||
|
||||
// The caller is responsible for sending the response
|
||||
result
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
} else {
|
||||
apply_result = process.apply_wal_records(runtime, tag, base_img, records);
|
||||
}
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result;
|
||||
|
||||
debug!(
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
lsn >> 32,
|
||||
lsn & 0xffff_ffff
|
||||
);
|
||||
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
result = Err(e);
|
||||
} else {
|
||||
entry.page_image = Some(apply_result.unwrap());
|
||||
pcache
|
||||
.num_page_images
|
||||
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
|
||||
result = Ok(());
|
||||
}
|
||||
|
||||
// Wake up the requester, whether the operation succeeded or not.
|
||||
entry_rc.walredo_condvar.notify_all();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
struct WalRedoProcess {
|
||||
child: Child,
|
||||
stdin: RefCell<ChildStdin>,
|
||||
stdout: RefCell<ChildStdout>,
|
||||
}
|
||||
@@ -360,14 +240,16 @@ impl WalRedoProcess {
|
||||
// Tests who run pageserver binary are setting proper PG_BIN_DIR
|
||||
// and PG_LIB_DIR so that WalRedo would start right postgres. We may later
|
||||
// switch to setting same things in pageserver config file.
|
||||
async fn launch(datadir: &str) -> Result<WalRedoProcess, Error> {
|
||||
fn launch(datadir: &str, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres deleting old one.
|
||||
fs::remove_dir_all(datadir).ok();
|
||||
let initdb = Command::new("initdb")
|
||||
.args(&["-D", datadir])
|
||||
.arg("-N")
|
||||
.output()
|
||||
.await
|
||||
let initdb = runtime
|
||||
.block_on(
|
||||
Command::new("initdb")
|
||||
.args(&["-D", datadir])
|
||||
.arg("-N")
|
||||
.output(),
|
||||
)
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.status.success() {
|
||||
@@ -376,14 +258,8 @@ impl WalRedoProcess {
|
||||
std::str::from_utf8(&initdb.stdout).unwrap(),
|
||||
std::str::from_utf8(&initdb.stderr).unwrap()
|
||||
);
|
||||
} else {
|
||||
// Limit shared cache for wal-redo-postres
|
||||
let mut config = OpenOptions::new()
|
||||
.append(true)
|
||||
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
|
||||
config.write_all(b"shared_buffers=128kB\n")?;
|
||||
config.write_all(b"fsync=off\n")?;
|
||||
}
|
||||
|
||||
// Start postgres itself
|
||||
let mut child = Command::new("postgres")
|
||||
.arg("--wal-redo")
|
||||
@@ -414,7 +290,7 @@ impl WalRedoProcess {
|
||||
if res.unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
error!("wal-redo-postgres: {}", line.trim());
|
||||
debug!("wal-redo-postgres: {}", line.trim());
|
||||
line.clear();
|
||||
}
|
||||
Ok::<(), Error>(())
|
||||
@@ -422,6 +298,7 @@ impl WalRedoProcess {
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(WalRedoProcess {
|
||||
child,
|
||||
stdin: RefCell::new(stdin),
|
||||
stdout: RefCell::new(stdout),
|
||||
})
|
||||
@@ -431,91 +308,97 @@ impl WalRedoProcess {
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
async fn apply_wal_records(
|
||||
fn apply_wal_records(
|
||||
&self,
|
||||
runtime: &Runtime,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: &Vec<WALRecord>,
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, Error> {
|
||||
let mut stdin = self.stdin.borrow_mut();
|
||||
let mut stdout = self.stdout.borrow_mut();
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
// information that the child writes to its stderr to the page server's log.
|
||||
//
|
||||
// 'f_stdin' handles writing the base image and WAL records to the child process.
|
||||
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
|
||||
// tokio runtime in the 'launch' function already, forwards the logging.
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
return runtime.block_on(async {
|
||||
//
|
||||
// This async block sends all the commands to the process.
|
||||
//
|
||||
// For reasons I don't understand, this needs to be a "move" block;
|
||||
// otherwise the stdin pipe doesn't get closed, despite the shutdown()
|
||||
// call.
|
||||
//
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
}
|
||||
if base_img.is_some() {
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
}
|
||||
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.await?;
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.await?;
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
|
||||
let res = futures::try_join!(f_stdout, f_stdin)?;
|
||||
// Kill the process. This closes its stdin, which should signal the process
|
||||
// to terminate. TODO: SIGKILL if needed
|
||||
//child.wait();
|
||||
|
||||
let buf = res.0;
|
||||
let res = futures::try_join!(f_stdout, f_stdin)?;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
let buf = res.0;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Functions for constructing messages to send to the postgres WAL redo
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u8('B' as u8);
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
return buf.freeze();
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
@@ -524,39 +407,47 @@ fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
let len = 4 + 5 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u8('P' as u8);
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
buf.put(base_img);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
return buf.freeze();
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u8('A' as u8);
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put_u64(endlsn);
|
||||
buf.put(rec);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
return buf.freeze();
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u8('G' as u8);
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
buf.put_u32(tag.spcnode);
|
||||
buf.put_u32(tag.dbnode);
|
||||
buf.put_u32(tag.relnode);
|
||||
buf.put_u32(tag.forknum as u32);
|
||||
buf.put_u32(tag.blknum);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
return buf.freeze();
|
||||
}
|
||||
|
||||
33
pgbuild.sh
Executable file
33
pgbuild.sh
Executable file
@@ -0,0 +1,33 @@
|
||||
#!/bin/sh
|
||||
#
|
||||
# Purpose of this script is to build and install postgres in a local directory
|
||||
# so that zenith intergation tests would find pg binaries and support files.
|
||||
#
|
||||
# ./pgbuild.sh would do following:
|
||||
#
|
||||
# 1) run out-of-source build of postgres in REPO_ROOT/tmp_install/build directory (I'm reusing
|
||||
# tmp_install path here since it is already present in .gitignore)
|
||||
#
|
||||
# 2) installs postgres to REPO_ROOT/tmp_install/
|
||||
#
|
||||
|
||||
# Halt immediately if any command fails
|
||||
set -e
|
||||
|
||||
REPO_ROOT=$(dirname "$0")
|
||||
REPO_ROOT="`( cd \"$REPO_ROOT\" && pwd )`"
|
||||
|
||||
# configure
|
||||
echo "Configuring postgres build"
|
||||
mkdir -p $REPO_ROOT/tmp_install/build
|
||||
cd $REPO_ROOT/tmp_install/build
|
||||
../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
|
||||
--enable-depend --with-libxml --prefix=/ > configure.log
|
||||
|
||||
# compile
|
||||
echo "Compiling postgres"
|
||||
make -j8 -s
|
||||
export DESTDIR=$REPO_ROOT/tmp_install
|
||||
|
||||
echo "Installing postgres to $DESTDIR"
|
||||
make install -s
|
||||
@@ -14,7 +14,6 @@ byteorder = "1.4.3"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
log = "0.4.14"
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.57"
|
||||
bindgen = "0.53.1"
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
This module contains utility functions for interacting with PostgreSQL
|
||||
file formats.
|
||||
|
||||
@@ -24,7 +24,7 @@ fn main() {
|
||||
// Path the server include dir. It is in tmp_install/include/server, if you did
|
||||
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
|
||||
// and used DESTDIR to move it into tmp_install, then it's in
|
||||
// tmp_install/include/postgres/server
|
||||
// tmp_install/include/postgres/server (that's how the pgbuild.sh script does it).
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
.clang_arg("-I../tmp_install/include/server")
|
||||
|
||||
@@ -3,12 +3,7 @@
|
||||
#![allow(non_snake_case)]
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
pub mod pg_constants;
|
||||
pub mod xlog_utils;
|
||||
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
// sizeof(ControlFileData)
|
||||
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
|
||||
@@ -23,13 +18,13 @@ impl ControlFileData {
|
||||
controlfile =
|
||||
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
|
||||
|
||||
controlfile
|
||||
return controlfile;
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
|
||||
pub fn decode_pg_control(buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
|
||||
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
|
||||
buf.copy_to_slice(&mut b);
|
||||
buf.clone().copy_to_slice(&mut b);
|
||||
|
||||
let controlfile: ControlFileData;
|
||||
|
||||
@@ -51,50 +46,6 @@ pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Erro
|
||||
Ok(controlfile)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl Error for FilePathError {
|
||||
fn description(&self) -> &str {
|
||||
&self.msg
|
||||
}
|
||||
}
|
||||
|
||||
impl FilePathError {
|
||||
pub fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forkname_to_forknum(forkname: Option<&str>) -> Result<u8, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(pg_constants::MAIN_FORKNUM),
|
||||
Some("fsm") => Ok(pg_constants::FSM_FORKNUM),
|
||||
Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM),
|
||||
Some("init") => Ok(pg_constants::INIT_FORKNUM),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
|
||||
let b: [u8; SIZEOF_CONTROLDATA];
|
||||
|
||||
@@ -112,5 +63,5 @@ pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
|
||||
// Fill the rest of the control file with zeros.
|
||||
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
|
||||
|
||||
buf.into()
|
||||
return buf.into();
|
||||
}
|
||||
|
||||
@@ -1,130 +0,0 @@
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
//TODO maybe use enum?
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
//Special values for non-rel files' tags
|
||||
pub const PG_CONTROLFILE_FORKNUM: u8 = 42;
|
||||
pub const PG_FILENODEMAP_FORKNUM: u8 = 43;
|
||||
pub const PG_XACT_FORKNUM: u8 = 44;
|
||||
pub const PG_MXACT_OFFSETS_FORKNUM: u8 = 45;
|
||||
pub const PG_MXACT_MEMBERS_FORKNUM: u8 = 46;
|
||||
|
||||
//
|
||||
// constants from clog.h
|
||||
//
|
||||
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
|
||||
pub const CLOG_BITS_PER_XACT: u8 = 2;
|
||||
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
|
||||
|
||||
//
|
||||
// Constants from visbilitymap.h
|
||||
//
|
||||
pub const SIZE_OF_PAGE_HEADER: u16 = 24;
|
||||
pub const BITS_PER_HEAPBLOCK: u16 = 2;
|
||||
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
|
||||
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
|
||||
pub const CLOG_ZEROPAGE: u8 = 0x00;
|
||||
pub const CLOG_TRUNCATE: u8 = 0x10;
|
||||
|
||||
// From xact.h
|
||||
pub const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
|
||||
// From srlu.h
|
||||
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
pub const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
|
||||
|
||||
/*
|
||||
* The following flags, stored in xinfo, determine which information is
|
||||
* contained in commit/abort records.
|
||||
*/
|
||||
pub const XACT_XINFO_HAS_DBINFO: u32 = 1u32 << 0;
|
||||
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
|
||||
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
|
||||
pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
|
||||
pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
|
||||
// pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
|
||||
// pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
|
||||
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
|
||||
// From heapam_xlog.h
|
||||
pub const XLOG_HEAP_INSERT: u8 = 0x00;
|
||||
pub const XLOG_HEAP_DELETE: u8 = 0x10;
|
||||
pub const XLOG_HEAP_UPDATE: u8 = 0x20;
|
||||
pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
|
||||
pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
|
||||
pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
|
||||
pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
|
||||
pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
|
||||
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
|
||||
pub const RM_XLOG_ID: u8 = 0;
|
||||
pub const RM_XACT_ID: u8 = 1;
|
||||
pub const RM_SMGR_ID: u8 = 2;
|
||||
pub const RM_CLOG_ID: u8 = 3;
|
||||
pub const RM_DBASE_ID: u8 = 4;
|
||||
pub const RM_TBLSPC_ID: u8 = 5;
|
||||
pub const RM_MULTIXACT_ID: u8 = 6;
|
||||
pub const RM_RELMAP_ID: u8 = 7;
|
||||
pub const RM_STANDBY_ID: u8 = 8;
|
||||
pub const RM_HEAP2_ID: u8 = 9;
|
||||
pub const RM_HEAP_ID: u8 = 10;
|
||||
|
||||
// from xlogreader.h
|
||||
pub const XLR_INFO_MASK: u8 = 0x0F;
|
||||
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
|
||||
|
||||
// from dbcommands_xlog.h
|
||||
pub const XLOG_DBASE_CREATE: u8 = 0x00;
|
||||
pub const XLOG_DBASE_DROP: u8 = 0x10;
|
||||
|
||||
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
|
||||
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
|
||||
|
||||
pub const SIZEOF_XLOGRECORD: u32 = 24;
|
||||
|
||||
// FIXME:
|
||||
pub const BLCKSZ: u16 = 8192;
|
||||
|
||||
//
|
||||
// from xlogrecord.h
|
||||
//
|
||||
pub const XLR_MAX_BLOCK_ID: u8 = 32;
|
||||
|
||||
pub const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
|
||||
pub const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
|
||||
pub const XLR_BLOCK_ID_ORIGIN: u8 = 253;
|
||||
pub const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
|
||||
|
||||
pub const BKPBLOCK_FORK_MASK: u8 = 0x0F;
|
||||
pub const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
|
||||
pub const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
|
||||
pub const BKPBLOCK_HAS_DATA: u8 = 0x20;
|
||||
pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
|
||||
pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
|
||||
|
||||
/* Information stored in bimg_info */
|
||||
pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: 87080ddc02...67da6b1df6
2373
walkeeper/Cargo.lock
generated
Normal file
2373
walkeeper/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
@@ -20,7 +20,6 @@ slog = "2.7.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
@@ -28,9 +27,6 @@ postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
parse_duration = "*"
|
||||
walkdir = "2"
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
|
||||
@@ -3,11 +3,10 @@
|
||||
//
|
||||
use daemonize::Daemonize;
|
||||
use log::*;
|
||||
use parse_duration::parse;
|
||||
use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{fs::File, fs::OpenOptions};
|
||||
|
||||
use anyhow::Result;
|
||||
@@ -15,7 +14,6 @@ use clap::{App, Arg};
|
||||
|
||||
use slog::Drain;
|
||||
|
||||
use walkeeper::s3_offload;
|
||||
use walkeeper::wal_service;
|
||||
use walkeeper::WalAcceptorConf;
|
||||
|
||||
@@ -47,13 +45,8 @@ fn main() -> Result<()> {
|
||||
Arg::with_name("pageserver")
|
||||
.short("p")
|
||||
.long("pageserver")
|
||||
.takes_value(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("ttl")
|
||||
.long("ttl")
|
||||
.takes_value(true)
|
||||
.help("interval for keeping WAL as walkeeper node, after which them will be uploaded to S3 and removed locally"),
|
||||
.help("address ip:port of pageserver with which wal_acceptor should establish connection"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("daemonize")
|
||||
@@ -76,12 +69,11 @@ fn main() -> Result<()> {
|
||||
|
||||
let mut conf = WalAcceptorConf {
|
||||
data_dir: PathBuf::from("./"),
|
||||
systemid,
|
||||
systemid: systemid,
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
pageserver_addr: None,
|
||||
listen_addr: "127.0.0.1:5454".parse()?,
|
||||
ttl: None,
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
@@ -107,10 +99,6 @@ fn main() -> Result<()> {
|
||||
conf.pageserver_addr = Some(addr.parse().unwrap());
|
||||
}
|
||||
|
||||
if let Some(ttl) = arg_matches.value_of("ttl") {
|
||||
conf.ttl = Some::<Duration>(parse(ttl)?);
|
||||
}
|
||||
|
||||
start_wal_acceptor(conf)
|
||||
}
|
||||
|
||||
@@ -150,19 +138,6 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
}
|
||||
|
||||
let mut threads = Vec::new();
|
||||
|
||||
if conf.ttl.is_some() {
|
||||
let s3_conf = conf.clone();
|
||||
let s3_offload_thread = thread::Builder::new()
|
||||
.name("S3 offload thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
s3_offload::thread_main(s3_conf);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(s3_offload_thread);
|
||||
}
|
||||
|
||||
let wal_acceptor_thread = thread::Builder::new()
|
||||
.name("WAL acceptor thread".into())
|
||||
.spawn(|| {
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
//
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
mod pq_protocol;
|
||||
pub mod s3_offload;
|
||||
pub mod wal_service;
|
||||
pub mod xlog_utils;
|
||||
|
||||
use crate::pq_protocol::SystemId;
|
||||
|
||||
@@ -17,5 +16,4 @@ pub struct WalAcceptorConf {
|
||||
pub no_sync: bool,
|
||||
pub listen_addr: SocketAddr,
|
||||
pub pageserver_addr: Option<SocketAddr>,
|
||||
pub ttl: Option<Duration>,
|
||||
}
|
||||
|
||||
@@ -40,7 +40,6 @@ pub struct FeStartupMessage {
|
||||
pub version: u32,
|
||||
pub kind: StartupRequestCode,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub appname: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -87,17 +86,15 @@ impl FeStartupMessage {
|
||||
let params = params_str.split('\0');
|
||||
let mut options = false;
|
||||
let mut timelineid: Option<ZTimelineId> = None;
|
||||
let mut appname: Option<String> = None;
|
||||
for p in params {
|
||||
if p == "options" {
|
||||
options = true;
|
||||
} else if options {
|
||||
for opt in p.split(' ') {
|
||||
if let Some(ztimelineid_str) = opt.strip_prefix("ztimelineid=") {
|
||||
if opt.starts_with("ztimelineid=") {
|
||||
// FIXME: rethrow parsing error, don't unwrap
|
||||
timelineid = Some(ZTimelineId::from_str(ztimelineid_str).unwrap());
|
||||
} else if let Some(val) = opt.strip_prefix("application_name=") {
|
||||
appname = Some(val.to_string());
|
||||
timelineid = Some(ZTimelineId::from_str(&opt[12..]).unwrap());
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -114,7 +111,6 @@ impl FeStartupMessage {
|
||||
Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
|
||||
version,
|
||||
kind,
|
||||
appname,
|
||||
timelineid: timelineid.unwrap(),
|
||||
})))
|
||||
}
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
//
|
||||
// Offload old WAL segments to S3 and remove them locally
|
||||
//
|
||||
|
||||
use anyhow::Result;
|
||||
use log::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use s3::bucket::Bucket;
|
||||
use s3::creds::Credentials;
|
||||
use s3::region::Region;
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::iter::FromIterator;
|
||||
use std::path::PathBuf;
|
||||
use std::time::SystemTime;
|
||||
use tokio::runtime;
|
||||
use tokio::time::sleep;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::WalAcceptorConf;
|
||||
|
||||
pub fn thread_main(conf: WalAcceptorConf) {
|
||||
// Create a new thread pool
|
||||
//
|
||||
// FIXME: keep it single-threaded for now, make it easier to debug with gdb,
|
||||
// and we're not concerned with performance yet.
|
||||
//let runtime = runtime::Runtime::new().unwrap();
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
info!("Starting S3 offload task");
|
||||
|
||||
runtime.block_on(async {
|
||||
main_loop(&conf).await.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
async fn offload_files(
|
||||
bucket: &Bucket,
|
||||
listing: &HashSet<String>,
|
||||
dir_path: &PathBuf,
|
||||
conf: &WalAcceptorConf,
|
||||
) -> Result<u64> {
|
||||
let horizon = SystemTime::now() - conf.ttl.unwrap();
|
||||
let mut n: u64 = 0;
|
||||
for entry in WalkDir::new(dir_path) {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_file()
|
||||
&& IsXLogFileName(entry.file_name().to_str().unwrap())
|
||||
&& entry.metadata().unwrap().created().unwrap() <= horizon
|
||||
{
|
||||
let relpath = path.strip_prefix(&conf.data_dir).unwrap();
|
||||
let s3path = String::from("walarchive/") + relpath.to_str().unwrap();
|
||||
if !listing.contains(&s3path) {
|
||||
let mut file = File::open(&path)?;
|
||||
let mut content = Vec::new();
|
||||
file.read_to_end(&mut content)?;
|
||||
bucket.put_object(s3path, &content).await?;
|
||||
|
||||
fs::remove_file(&path)?;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
|
||||
let region = Region::Custom {
|
||||
region: env::var("S3_REGION").unwrap(),
|
||||
endpoint: env::var("S3_ENDPOINT").unwrap(),
|
||||
};
|
||||
let credentials = Credentials::new(
|
||||
Some(&env::var("S3_ACCESSKEY").unwrap()),
|
||||
Some(&env::var("S3_SECRET").unwrap()),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// Create Bucket in REGION for BUCKET
|
||||
let bucket = Bucket::new_with_path_style("zenith-testbucket", region, credentials)?;
|
||||
|
||||
loop {
|
||||
// List out contents of directory
|
||||
let results = bucket
|
||||
.list("walarchive/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
let listing = HashSet::from_iter(
|
||||
results
|
||||
.iter()
|
||||
.flat_map(|b| b.contents.iter().map(|o| o.key.clone())),
|
||||
);
|
||||
|
||||
let n = offload_files(&bucket, &listing, &conf.data_dir, conf).await?;
|
||||
info!("Offload {} files to S3", n);
|
||||
sleep(conf.ttl.unwrap()).await;
|
||||
}
|
||||
}
|
||||
@@ -29,9 +29,9 @@ use tokio::task;
|
||||
use tokio_postgres::{connect, Error, NoTls};
|
||||
|
||||
use crate::pq_protocol::*;
|
||||
use crate::xlog_utils::*;
|
||||
use crate::WalAcceptorConf;
|
||||
use pageserver::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
|
||||
type FullTransactionId = u64;
|
||||
|
||||
@@ -158,12 +158,11 @@ pub struct Timeline {
|
||||
#[derive(Debug)]
|
||||
struct Connection {
|
||||
timeline: Option<Arc<Timeline>>,
|
||||
stream: TcpStream, /* Postgres connection */
|
||||
inbuf: BytesMut, /* input buffer */
|
||||
outbuf: BytesMut, /* output buffer */
|
||||
init_done: bool, /* startup packet proceeded */
|
||||
appname: Option<String>, /* assigned application name */
|
||||
conf: WalAcceptorConf, /* wal acceptor configuration */
|
||||
stream: TcpStream, /* Postgres connection */
|
||||
inbuf: BytesMut, /* input buffer */
|
||||
outbuf: BytesMut, /* output buffer */
|
||||
init_done: bool, /* startup packet proceeded */
|
||||
conf: WalAcceptorConf, /* wal acceptor configuration */
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -194,14 +193,14 @@ fn parse_hex_str(s: &str) -> Result<u64> {
|
||||
|
||||
impl Serializer for NodeId {
|
||||
fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u64_le(self.term);
|
||||
buf.put_u128(self.uuid); // use big endian to provide compatibility with memcmp
|
||||
buf.put_u128_le(self.uuid);
|
||||
buf.put_u64(self.term); // use big endian to provide compatibility with memcmp
|
||||
}
|
||||
|
||||
fn unpack(buf: &mut BytesMut) -> NodeId {
|
||||
NodeId {
|
||||
term: buf.get_u64_le(),
|
||||
uuid: buf.get_u128(), // use big endian to provide compatibility with memcmp
|
||||
uuid: buf.get_u128_le(),
|
||||
term: buf.get_u64(), // use big endian to provide compatibility with memcmp
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -445,7 +444,7 @@ impl Timeline {
|
||||
|
||||
fn get_hs_feedback(&self) -> HotStandbyFeedback {
|
||||
let shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.hs_feedback
|
||||
return shared_state.hs_feedback;
|
||||
}
|
||||
|
||||
// Load and lock control file (prevent running more than one instance of safekeeper)
|
||||
@@ -528,7 +527,7 @@ impl Timeline {
|
||||
|
||||
let file = shared_state.control_file.as_mut().unwrap();
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.write_all(&buf[..])?;
|
||||
file.write_all(&mut buf[..])?;
|
||||
if sync {
|
||||
file.sync_all()?;
|
||||
}
|
||||
@@ -544,7 +543,6 @@ impl Connection {
|
||||
inbuf: BytesMut::with_capacity(10 * 1024),
|
||||
outbuf: BytesMut::with_capacity(10 * 1024),
|
||||
init_done: false,
|
||||
appname: None,
|
||||
conf: conf.clone(),
|
||||
}
|
||||
}
|
||||
@@ -556,7 +554,7 @@ impl Connection {
|
||||
async fn run(&mut self) -> Result<()> {
|
||||
self.inbuf.resize(4, 0u8);
|
||||
self.stream.read_exact(&mut self.inbuf[0..4]).await?;
|
||||
let startup_pkg_len = BigEndian::read_u32(&self.inbuf[0..4]);
|
||||
let startup_pkg_len = BigEndian::read_u32(&mut self.inbuf[0..4]);
|
||||
if startup_pkg_len == 0 {
|
||||
self.receive_wal().await?; // internal protocol between wal_proposer and wal_acceptor
|
||||
} else {
|
||||
@@ -857,7 +855,6 @@ impl Connection {
|
||||
self.send().await?;
|
||||
self.init_done = true;
|
||||
self.set_timeline(m.timelineid)?;
|
||||
self.appname = m.appname;
|
||||
}
|
||||
StartupRequestCode::Cancel => return Ok(()),
|
||||
}
|
||||
@@ -941,7 +938,7 @@ impl Connection {
|
||||
let mut caps = re.captures_iter(str::from_utf8(&cmd[..]).unwrap());
|
||||
let cap = caps.next().unwrap();
|
||||
let mut start_pos: XLogRecPtr = (parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])?;
|
||||
let mut stop_pos: XLogRecPtr = if let Some(cap) = caps.next() {
|
||||
let stop_pos: XLogRecPtr = if let Some(cap) = caps.next() {
|
||||
(parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])?
|
||||
} else {
|
||||
0
|
||||
@@ -954,9 +951,6 @@ impl Connection {
|
||||
if start_pos == 0 {
|
||||
start_pos = wal_end;
|
||||
}
|
||||
if stop_pos == 0 && self.appname == Some("wal_proposer_recovery".to_string()) {
|
||||
stop_pos = wal_end;
|
||||
}
|
||||
info!(
|
||||
"Start replication from {:X}/{:>08X} till {:X}/{:>08X}",
|
||||
(start_pos >> 32) as u32,
|
||||
@@ -1003,12 +997,12 @@ impl Connection {
|
||||
// Try to fetch replica's feedback
|
||||
match self.stream.try_read_buf(&mut self.inbuf) {
|
||||
Ok(0) => break,
|
||||
Ok(_) => {
|
||||
if let Some(FeMessage::CopyData(m)) = self.parse_message()? {
|
||||
self.timeline()
|
||||
.add_hs_feedback(HotStandbyFeedback::parse(&m.body))
|
||||
}
|
||||
}
|
||||
Ok(_) => match self.parse_message()? {
|
||||
Some(FeMessage::CopyData(m)) => self
|
||||
.timeline()
|
||||
.add_hs_feedback(HotStandbyFeedback::parse(&m.body)),
|
||||
_ => {}
|
||||
},
|
||||
Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
@@ -1108,7 +1102,7 @@ impl Connection {
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
|
||||
|
||||
@@ -1,15 +1,4 @@
|
||||
//
|
||||
// This file contains common utilities for dealing with PostgreSQL WAL files and
|
||||
// LSNs.
|
||||
//
|
||||
// Many of these functions have been copied from PostgreSQL, and rewritten in
|
||||
// Rust. That's why they don't follow the usual Rust naming conventions, they
|
||||
// have been named the same as the corresponding PostgreSQL functions instead.
|
||||
//
|
||||
|
||||
use crate::pg_constants;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, Bytes};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
@@ -34,17 +23,17 @@ pub type XLogSegNo = u64;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
|
||||
(xlogptr as u32) & (wal_segsz_bytes as u32 - 1)
|
||||
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
|
||||
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
xlogptr / wal_segsz_bytes as u64
|
||||
return xlogptr / wal_segsz_bytes as u64;
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -53,7 +42,7 @@ pub fn XLogSegNoOffsetToRecPtr(
|
||||
offset: u32,
|
||||
wal_segsz_bytes: usize,
|
||||
) -> XLogRecPtr {
|
||||
segno * (wal_segsz_bytes as u64) + (offset as u64)
|
||||
return segno * (wal_segsz_bytes as u64) + (offset as u64);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -71,7 +60,7 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
|
||||
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
|
||||
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
@@ -81,7 +70,7 @@ pub fn IsXLogFileName(fname: &str) -> bool {
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsPartialXLogFileName(fname: &str) -> bool {
|
||||
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
|
||||
return fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]);
|
||||
}
|
||||
|
||||
pub fn get_current_timestamp() -> TimestampTz {
|
||||
@@ -192,12 +181,9 @@ fn find_end_of_wal_segment(
|
||||
}
|
||||
}
|
||||
}
|
||||
last_valid_rec_pos as u32
|
||||
return last_valid_rec_pos as u32;
|
||||
}
|
||||
|
||||
///
|
||||
/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL.
|
||||
///
|
||||
pub fn find_end_of_wal(
|
||||
data_dir: &Path,
|
||||
wal_seg_size: usize,
|
||||
@@ -251,7 +237,7 @@ pub fn find_end_of_wal(
|
||||
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
|
||||
return (high_ptr, high_tli);
|
||||
}
|
||||
(0, 0)
|
||||
return (0, 0);
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
@@ -266,39 +252,3 @@ pub fn main() {
|
||||
tli
|
||||
);
|
||||
}
|
||||
|
||||
//
|
||||
// Xlog record parsing routines
|
||||
// TODO move here other related code from waldecoder.rs
|
||||
//
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XLogRecord {
|
||||
pub xl_tot_len: u32,
|
||||
pub xl_xid: u32,
|
||||
pub xl_prev: u64,
|
||||
pub xl_info: u8,
|
||||
pub xl_rmid: u8,
|
||||
pub xl_crc: u32,
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
|
||||
XLogRecord {
|
||||
xl_tot_len: buf.get_u32_le(),
|
||||
xl_xid: buf.get_u32_le(),
|
||||
xl_prev: buf.get_u64_le(),
|
||||
xl_info: buf.get_u8(),
|
||||
xl_rmid: buf.get_u8(),
|
||||
xl_crc: {
|
||||
buf.advance(2);
|
||||
buf.get_u32_le()
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Is this record an XLOG_SWITCH record? They need some special processing,
|
||||
pub fn is_xlog_switch_record(&self) -> bool {
|
||||
self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
}
|
||||
}
|
||||
@@ -34,7 +34,10 @@ fn main() -> Result<()> {
|
||||
.required(true);
|
||||
let matches = App::new("zenith")
|
||||
.about("Zenith CLI")
|
||||
.subcommand(SubCommand::with_name("init").about("Initialize a new Zenith repository"))
|
||||
.subcommand(
|
||||
SubCommand::with_name("init")
|
||||
.about("Initialize a new Zenith repository in current directory"),
|
||||
)
|
||||
.subcommand(
|
||||
SubCommand::with_name("branch")
|
||||
.about("Create a new branch")
|
||||
@@ -73,10 +76,10 @@ fn main() -> Result<()> {
|
||||
|
||||
// all other commands would need config
|
||||
|
||||
let repopath = zenith_repo_dir();
|
||||
let repopath = PathBuf::from(zenith_repo_dir());
|
||||
if !repopath.exists() {
|
||||
bail!(
|
||||
"Zenith repository does not exist in {}.\n\
|
||||
"Zenith repository does not exists in {}.\n\
|
||||
Set ZENITH_REPO_DIR or initialize a new repository with 'zenith init'",
|
||||
repopath.display()
|
||||
);
|
||||
@@ -183,7 +186,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let node = cplane
|
||||
.nodes
|
||||
.get(name)
|
||||
.ok_or_else(|| anyhow!("postgres {} is not found", name))?;
|
||||
.ok_or(anyhow!("postgres {} is not found", name))?;
|
||||
node.start()?;
|
||||
}
|
||||
("stop", Some(sub_m)) => {
|
||||
@@ -191,7 +194,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
let node = cplane
|
||||
.nodes
|
||||
.get(name)
|
||||
.ok_or_else(|| anyhow!("postgres {} is not found", name))?;
|
||||
.ok_or(anyhow!("postgres {} is not found", name))?;
|
||||
node.stop()?;
|
||||
}
|
||||
|
||||
@@ -274,19 +277,19 @@ fn list_branches() -> Result<()> {
|
||||
//
|
||||
//
|
||||
fn parse_point_in_time(s: &str) -> Result<local_env::PointInTime> {
|
||||
let mut strings = s.split('@');
|
||||
let mut strings = s.split("@");
|
||||
let name = strings.next().unwrap();
|
||||
|
||||
let lsn: Option<u64>;
|
||||
if let Some(lsnstr) = strings.next() {
|
||||
let mut s = lsnstr.split('/');
|
||||
let mut s = lsnstr.split("/");
|
||||
let lsn_hi: u64 = s
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("invalid LSN in point-in-time specification"))?
|
||||
.ok_or(anyhow!("invalid LSN in point-in-time specification"))?
|
||||
.parse()?;
|
||||
let lsn_lo: u64 = s
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("invalid LSN in point-in-time specification"))?
|
||||
.ok_or(anyhow!("invalid LSN in point-in-time specification"))?
|
||||
.parse()?;
|
||||
lsn = Some(lsn_hi << 32 | lsn_lo);
|
||||
} else {
|
||||
@@ -309,8 +312,11 @@ fn parse_point_in_time(s: &str) -> Result<local_env::PointInTime> {
|
||||
let pointstr = fs::read_to_string(branchpath)?;
|
||||
|
||||
let mut result = parse_point_in_time(&pointstr)?;
|
||||
|
||||
result.lsn = lsn.unwrap_or(0);
|
||||
if lsn.is_some() {
|
||||
result.lsn = lsn.unwrap();
|
||||
} else {
|
||||
result.lsn = 0;
|
||||
}
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
|
||||
@@ -5,4 +5,3 @@ authors = ["Eric Seppanen <eric@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
thiserror = "1"
|
||||
|
||||
@@ -1,10 +1,2 @@
|
||||
//! zenith_utils is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
pub mod seqwait;
|
||||
|
||||
// Async version of SeqWait. Currently unused.
|
||||
// pub mod seqwait_async;
|
||||
|
||||
@@ -1,251 +0,0 @@
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use std::fmt;
|
||||
use std::ops::{Add, AddAssign};
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
/// Transaction log block size in bytes
|
||||
pub const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
|
||||
#[derive(Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd)]
|
||||
pub struct Lsn(pub u64);
|
||||
|
||||
/// We tried to parse an LSN from a string, but failed
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
#[error("LsnParseError")]
|
||||
pub struct LsnParseError;
|
||||
|
||||
impl Lsn {
|
||||
/// Maximum possible value for an LSN
|
||||
pub const MAX: Lsn = Lsn(u64::MAX);
|
||||
|
||||
/// Subtract a number, returning None on overflow.
|
||||
pub fn checked_sub<T: Into<u64>>(self, other: T) -> Option<Lsn> {
|
||||
let other: u64 = other.into();
|
||||
self.0.checked_sub(other).map(Lsn)
|
||||
}
|
||||
|
||||
/// Parse an LSN from a filename in the form `0000000000000000`
|
||||
pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
|
||||
where
|
||||
F: AsRef<Path>,
|
||||
{
|
||||
let filename: &Path = filename.as_ref();
|
||||
let filename = filename.to_str().ok_or(LsnParseError)?;
|
||||
Lsn::from_hex(filename)
|
||||
}
|
||||
|
||||
/// Parse an LSN from a string in the form `0000000000000000`
|
||||
pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
|
||||
where
|
||||
S: AsRef<str>,
|
||||
{
|
||||
let s: &str = s.as_ref();
|
||||
let n = u64::from_str_radix(s, 16).or(Err(LsnParseError))?;
|
||||
Ok(Lsn(n))
|
||||
}
|
||||
|
||||
/// Compute the offset into a segment
|
||||
pub fn segment_offset(self, seg_sz: u64) -> u64 {
|
||||
self.0 % seg_sz
|
||||
}
|
||||
|
||||
/// Compute the segment number
|
||||
pub fn segment_number(self, seg_sz: u64) -> u64 {
|
||||
self.0 / seg_sz
|
||||
}
|
||||
|
||||
/// Compute the offset into a block
|
||||
pub fn block_offset(self) -> u64 {
|
||||
const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
|
||||
self.0 % BLCKSZ
|
||||
}
|
||||
|
||||
/// Compute the bytes remaining in this block
|
||||
///
|
||||
/// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`.
|
||||
pub fn remaining_in_block(self) -> u64 {
|
||||
const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
|
||||
BLCKSZ - (self.0 % BLCKSZ)
|
||||
}
|
||||
|
||||
/// Compute the bytes remaining to fill a chunk of some size
|
||||
///
|
||||
/// If the LSN is already at the chunk boundary, it will return 0.
|
||||
pub fn calc_padding<T: Into<u64>>(self, sz: T) -> u64 {
|
||||
let sz: u64 = sz.into();
|
||||
// By using wrapping_sub, we can subtract first and then mod second.
|
||||
// If it's done the other way around, then we would return a full
|
||||
// chunk size if we're already at the chunk boundary.
|
||||
// (Regular subtraction will panic on overflow in debug builds.)
|
||||
(sz.wrapping_sub(self.0)) % sz
|
||||
}
|
||||
}
|
||||
|
||||
impl From<u64> for Lsn {
|
||||
fn from(n: u64) -> Self {
|
||||
Lsn(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Lsn> for u64 {
|
||||
fn from(lsn: Lsn) -> u64 {
|
||||
lsn.0
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for Lsn {
|
||||
type Err = LsnParseError;
|
||||
|
||||
/// Parse an LSN from a string in the form `00000000/00000000`
|
||||
///
|
||||
/// If the input string is missing the '/' character, then use `Lsn::from_hex`
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut splitter = s.split('/');
|
||||
if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
|
||||
{
|
||||
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
|
||||
let right_num = u32::from_str_radix(right, 16).map_err(|_| LsnParseError)?;
|
||||
Ok(Lsn((left_num as u64) << 32 | right_num as u64))
|
||||
} else {
|
||||
Err(LsnParseError)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Lsn {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{:X}/{:X}", self.0 >> 32, self.0 & 0xffffffff)
|
||||
}
|
||||
}
|
||||
|
||||
impl Add<u64> for Lsn {
|
||||
type Output = Lsn;
|
||||
|
||||
fn add(self, other: u64) -> Self::Output {
|
||||
// panic if the addition overflows.
|
||||
Lsn(self.0.checked_add(other).unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
impl AddAssign<u64> for Lsn {
|
||||
fn add_assign(&mut self, other: u64) {
|
||||
// panic if the addition overflows.
|
||||
self.0 = self.0.checked_add(other).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
/// An [`Lsn`] that can be accessed atomically.
|
||||
pub struct AtomicLsn {
|
||||
inner: AtomicU64,
|
||||
}
|
||||
|
||||
impl AtomicLsn {
|
||||
/// Creates a new atomic `Lsn`.
|
||||
pub fn new(val: u64) -> Self {
|
||||
AtomicLsn {
|
||||
inner: AtomicU64::new(val),
|
||||
}
|
||||
}
|
||||
|
||||
/// Atomically retrieve the `Lsn` value from memory.
|
||||
pub fn load(&self) -> Lsn {
|
||||
Lsn(self.inner.load(Ordering::Acquire))
|
||||
}
|
||||
|
||||
/// Atomically store a new `Lsn` value to memory.
|
||||
pub fn store(&self, lsn: Lsn) {
|
||||
self.inner.store(lsn.0, Ordering::Release);
|
||||
}
|
||||
|
||||
/// Adds to the current value, returning the previous value.
|
||||
///
|
||||
/// This operation will panic on overflow.
|
||||
pub fn fetch_add(&self, val: u64) -> Lsn {
|
||||
let prev = self.inner.fetch_add(val, Ordering::AcqRel);
|
||||
if prev.checked_add(val).is_none() {
|
||||
panic!("AtomicLsn overflow");
|
||||
}
|
||||
Lsn(prev)
|
||||
}
|
||||
|
||||
/// Atomically sets the Lsn to the max of old and new value, returning the old value.
|
||||
pub fn fetch_max(&self, lsn: Lsn) -> Lsn {
|
||||
let prev = self.inner.fetch_max(lsn.0, Ordering::AcqRel);
|
||||
Lsn(prev)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_lsn_strings() {
|
||||
assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
|
||||
assert_eq!("aaaa/bbbb".parse(), Ok(Lsn(0x0000AAAA0000BBBB)));
|
||||
assert_eq!("1/A".parse(), Ok(Lsn(0x000000010000000A)));
|
||||
assert_eq!("0/0".parse(), Ok(Lsn(0)));
|
||||
"ABCDEFG/12345678".parse::<Lsn>().unwrap_err();
|
||||
"123456789/AAAA5555".parse::<Lsn>().unwrap_err();
|
||||
"12345678/AAAA55550".parse::<Lsn>().unwrap_err();
|
||||
"-1/0".parse::<Lsn>().unwrap_err();
|
||||
"1/-1".parse::<Lsn>().unwrap_err();
|
||||
|
||||
assert_eq!(format!("{}", Lsn(0x12345678AAAA5555)), "12345678/AAAA5555");
|
||||
assert_eq!(format!("{}", Lsn(0x000000010000000A)), "1/A");
|
||||
|
||||
assert_eq!(
|
||||
Lsn::from_hex("12345678AAAA5555"),
|
||||
Ok(Lsn(0x12345678AAAA5555))
|
||||
);
|
||||
assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
|
||||
assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_math() {
|
||||
assert_eq!(Lsn(1234) + 11u64, Lsn(1245));
|
||||
|
||||
assert_eq!(
|
||||
{
|
||||
let mut lsn = Lsn(1234);
|
||||
lsn += 11u64;
|
||||
lsn
|
||||
},
|
||||
Lsn(1245)
|
||||
);
|
||||
|
||||
assert_eq!(Lsn(1234).checked_sub(1233u64), Some(Lsn(1)));
|
||||
assert_eq!(Lsn(1234).checked_sub(1235u64), None);
|
||||
|
||||
let seg_sz = 16u64 * 1024 * 1024;
|
||||
assert_eq!(Lsn(0x1000007).segment_offset(seg_sz), 7u64);
|
||||
assert_eq!(Lsn(0x1000007).segment_number(seg_sz), 1u64);
|
||||
|
||||
assert_eq!(Lsn(0x4007).block_offset(), 7u64);
|
||||
assert_eq!(Lsn(0x4000).block_offset(), 0u64);
|
||||
assert_eq!(Lsn(0x4007).remaining_in_block(), 8185u64);
|
||||
assert_eq!(Lsn(0x4000).remaining_in_block(), 8192u64);
|
||||
|
||||
assert_eq!(Lsn(0xffff01).calc_padding(seg_sz), 255u64);
|
||||
assert_eq!(Lsn(0x2000000).calc_padding(seg_sz), 0u64);
|
||||
assert_eq!(Lsn(0xffff01).calc_padding(8u32), 7u64);
|
||||
assert_eq!(Lsn(0xffff00).calc_padding(8u32), 0u64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_atomic_lsn() {
|
||||
let lsn = AtomicLsn::new(0);
|
||||
assert_eq!(lsn.fetch_add(1234), Lsn(0));
|
||||
assert_eq!(lsn.load(), Lsn(1234));
|
||||
lsn.store(Lsn(5678));
|
||||
assert_eq!(lsn.load(), Lsn(5678));
|
||||
|
||||
assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
|
||||
assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
|
||||
}
|
||||
}
|
||||
@@ -1,264 +0,0 @@
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use std::cmp::{Eq, Ordering, PartialOrd};
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fmt::Debug;
|
||||
use std::mem;
|
||||
use std::sync::mpsc::{channel, Receiver, Sender};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
|
||||
/// An error happened while waiting for a number
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
#[error("SeqWaitError")]
|
||||
pub enum SeqWaitError {
|
||||
/// The wait timeout was reached
|
||||
Timeout,
|
||||
/// [`SeqWait::shutdown`] was called
|
||||
Shutdown,
|
||||
}
|
||||
|
||||
/// Internal components of a `SeqWait`
|
||||
struct SeqWaitInt<T>
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
waiters: BinaryHeap<Waiter<T>>,
|
||||
current: T,
|
||||
shutdown: bool,
|
||||
}
|
||||
|
||||
struct Waiter<T>
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
wake_num: T, // wake me when this number arrives ...
|
||||
wake_channel: Sender<()>, // ... by sending a message to this channel
|
||||
}
|
||||
|
||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||
// to get that.
|
||||
impl<T: Ord> PartialOrd for Waiter<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
other.wake_num.partial_cmp(&self.wake_num)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Ord> Ord for Waiter<T> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other.wake_num.cmp(&self.wake_num)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Ord> PartialEq for Waiter<T> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
other.wake_num == self.wake_num
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Ord> Eq for Waiter<T> {}
|
||||
|
||||
/// A tool for waiting on a sequence number
|
||||
///
|
||||
/// This provides a way to wait the arrival of a number.
|
||||
/// As soon as the number arrives by another caller calling
|
||||
/// [`advance`], then the waiter will be woken up.
|
||||
///
|
||||
/// This implementation takes a blocking Mutex on both [`wait_for`]
|
||||
/// and [`advance`], meaning there may be unexpected executor blocking
|
||||
/// due to thread scheduling unfairness. There are probably better
|
||||
/// implementations, but we can probably live with this for now.
|
||||
///
|
||||
/// [`wait_for`]: SeqWait::wait_for
|
||||
/// [`advance`]: SeqWait::advance
|
||||
///
|
||||
pub struct SeqWait<T>
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
internal: Mutex<SeqWaitInt<T>>,
|
||||
}
|
||||
|
||||
impl<T> SeqWait<T>
|
||||
where
|
||||
T: Ord + Debug + Copy,
|
||||
{
|
||||
/// Create a new `SeqWait`, initialized to a particular number
|
||||
pub fn new(starting_num: T) -> Self {
|
||||
let internal = SeqWaitInt {
|
||||
waiters: BinaryHeap::new(),
|
||||
current: starting_num,
|
||||
shutdown: false,
|
||||
};
|
||||
SeqWait {
|
||||
internal: Mutex::new(internal),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shut down a `SeqWait`, causing all waiters (present and
|
||||
/// future) to return an error.
|
||||
pub fn shutdown(&self) {
|
||||
let waiters = {
|
||||
// Prevent new waiters; wake all those that exist.
|
||||
// Wake everyone with an error.
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
// This will steal the entire waiters map.
|
||||
// When we drop it all waiters will be woken.
|
||||
mem::take(&mut internal.waiters)
|
||||
|
||||
// Drop the lock as we exit this scope.
|
||||
};
|
||||
|
||||
// When we drop the waiters list, each Receiver will
|
||||
// be woken with an error.
|
||||
// This drop doesn't need to be explicit; it's done
|
||||
// here to make it easier to read the code and understand
|
||||
// the order of events.
|
||||
drop(waiters);
|
||||
}
|
||||
|
||||
/// Wait for a number to arrive
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
pub fn wait_for(&self, num: T) -> Result<(), SeqWaitError> {
|
||||
match self.queue_for_wait(num) {
|
||||
Ok(None) => Ok(()),
|
||||
Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait for a number to arrive
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
///
|
||||
/// If that hasn't happened after the specified timeout duration,
|
||||
/// [`SeqWaitError::Timeout`] will be returned.
|
||||
pub fn wait_for_timeout(&self, num: T, timeout_duration: Duration) -> Result<(), SeqWaitError> {
|
||||
match self.queue_for_wait(num) {
|
||||
Ok(None) => Ok(()),
|
||||
Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e {
|
||||
std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout,
|
||||
std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown,
|
||||
}),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Register and return a channel that will be notified when a number arrives,
|
||||
/// or None, if it has already arrived.
|
||||
fn queue_for_wait(&self, num: T) -> Result<Option<Receiver<()>>, SeqWaitError> {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
if internal.current >= num {
|
||||
return Ok(None);
|
||||
}
|
||||
if internal.shutdown {
|
||||
return Err(SeqWaitError::Shutdown);
|
||||
}
|
||||
|
||||
// Create a new channel.
|
||||
let (tx, rx) = channel();
|
||||
internal.waiters.push(Waiter {
|
||||
wake_num: num,
|
||||
wake_channel: tx,
|
||||
});
|
||||
// Drop the lock as we exit this scope.
|
||||
Ok(Some(rx))
|
||||
}
|
||||
|
||||
/// Announce a new number has arrived
|
||||
///
|
||||
/// All waiters at this value or below will be woken.
|
||||
///
|
||||
/// Returns the old number.
|
||||
pub fn advance(&self, num: T) -> T {
|
||||
let old_value;
|
||||
let wake_these = {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
old_value = internal.current;
|
||||
if old_value >= num {
|
||||
return old_value;
|
||||
}
|
||||
internal.current = num;
|
||||
|
||||
// Pop all waiters <= num from the heap. Collect them in a vector, and
|
||||
// wake them up after releasing the lock.
|
||||
let mut wake_these = Vec::new();
|
||||
while let Some(n) = internal.waiters.peek() {
|
||||
if n.wake_num > num {
|
||||
break;
|
||||
}
|
||||
wake_these.push(internal.waiters.pop().unwrap().wake_channel);
|
||||
}
|
||||
wake_these
|
||||
};
|
||||
|
||||
for tx in wake_these {
|
||||
// This can fail if there are no receivers.
|
||||
// We don't care; discard the error.
|
||||
let _ = tx.send(());
|
||||
}
|
||||
old_value
|
||||
}
|
||||
|
||||
/// Read the current value, without waiting.
|
||||
pub fn load(&self) -> T {
|
||||
self.internal.lock().unwrap().current
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::Arc;
|
||||
use std::thread::sleep;
|
||||
use std::thread::spawn;
|
||||
use std::time::Duration;
|
||||
|
||||
#[test]
|
||||
fn seqwait() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
let seq3 = Arc::clone(&seq);
|
||||
spawn(move || {
|
||||
seq2.wait_for(42).expect("wait_for 42");
|
||||
let old = seq2.advance(100);
|
||||
assert_eq!(old, 99);
|
||||
seq2.wait_for(999).expect_err("no 999");
|
||||
});
|
||||
spawn(move || {
|
||||
seq3.wait_for(42).expect("wait_for 42");
|
||||
seq3.wait_for(0).expect("wait_for 0");
|
||||
});
|
||||
sleep(Duration::from_secs(1));
|
||||
let old = seq.advance(99);
|
||||
assert_eq!(old, 0);
|
||||
seq.wait_for(100).expect("wait_for 100");
|
||||
|
||||
// Calling advance with a smaller value is a no-op
|
||||
assert_eq!(seq.advance(98), 100);
|
||||
assert_eq!(seq.load(), 100);
|
||||
|
||||
seq.shutdown();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seqwait_timeout() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
spawn(move || {
|
||||
let timeout = Duration::from_millis(1);
|
||||
let res = seq2.wait_for_timeout(42, timeout);
|
||||
assert_eq!(res, Err(SeqWaitError::Timeout));
|
||||
});
|
||||
sleep(Duration::from_secs(1));
|
||||
// This will attempt to wake, but nothing will happen
|
||||
// because the waiter already dropped its Receiver.
|
||||
let old = seq.advance(99);
|
||||
assert_eq!(old, 0)
|
||||
}
|
||||
}
|
||||
@@ -1,224 +0,0 @@
|
||||
///
|
||||
/// Async version of 'seqwait.rs'
|
||||
///
|
||||
/// NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs.
|
||||
///
|
||||
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Debug;
|
||||
use std::mem;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use tokio::sync::watch::{channel, Receiver, Sender};
|
||||
use tokio::time::timeout;
|
||||
|
||||
/// An error happened while waiting for a number
|
||||
#[derive(Debug, PartialEq, thiserror::Error)]
|
||||
#[error("SeqWaitError")]
|
||||
pub enum SeqWaitError {
|
||||
/// The wait timeout was reached
|
||||
Timeout,
|
||||
/// [`SeqWait::shutdown`] was called
|
||||
Shutdown,
|
||||
}
|
||||
|
||||
/// Internal components of a `SeqWait`
|
||||
struct SeqWaitInt<T>
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
waiters: BTreeMap<T, (Sender<()>, Receiver<()>)>,
|
||||
current: T,
|
||||
shutdown: bool,
|
||||
}
|
||||
|
||||
/// A tool for waiting on a sequence number
|
||||
///
|
||||
/// This provides a way to await the arrival of a number.
|
||||
/// As soon as the number arrives by another caller calling
|
||||
/// [`advance`], then the waiter will be woken up.
|
||||
///
|
||||
/// This implementation takes a blocking Mutex on both [`wait_for`]
|
||||
/// and [`advance`], meaning there may be unexpected executor blocking
|
||||
/// due to thread scheduling unfairness. There are probably better
|
||||
/// implementations, but we can probably live with this for now.
|
||||
///
|
||||
/// [`wait_for`]: SeqWait::wait_for
|
||||
/// [`advance`]: SeqWait::advance
|
||||
///
|
||||
pub struct SeqWait<T>
|
||||
where
|
||||
T: Ord,
|
||||
{
|
||||
internal: Mutex<SeqWaitInt<T>>,
|
||||
}
|
||||
|
||||
impl<T> SeqWait<T>
|
||||
where
|
||||
T: Ord + Debug + Copy,
|
||||
{
|
||||
/// Create a new `SeqWait`, initialized to a particular number
|
||||
pub fn new(starting_num: T) -> Self {
|
||||
let internal = SeqWaitInt {
|
||||
waiters: BTreeMap::new(),
|
||||
current: starting_num,
|
||||
shutdown: false,
|
||||
};
|
||||
SeqWait {
|
||||
internal: Mutex::new(internal),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shut down a `SeqWait`, causing all waiters (present and
|
||||
/// future) to return an error.
|
||||
pub fn shutdown(&self) {
|
||||
let waiters = {
|
||||
// Prevent new waiters; wake all those that exist.
|
||||
// Wake everyone with an error.
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
// This will steal the entire waiters map.
|
||||
// When we drop it all waiters will be woken.
|
||||
mem::take(&mut internal.waiters)
|
||||
|
||||
// Drop the lock as we exit this scope.
|
||||
};
|
||||
|
||||
// When we drop the waiters list, each Receiver will
|
||||
// be woken with an error.
|
||||
// This drop doesn't need to be explicit; it's done
|
||||
// here to make it easier to read the code and understand
|
||||
// the order of events.
|
||||
drop(waiters);
|
||||
}
|
||||
|
||||
/// Wait for a number to arrive
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> {
|
||||
let mut rx = {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
if internal.current >= num {
|
||||
return Ok(());
|
||||
}
|
||||
if internal.shutdown {
|
||||
return Err(SeqWaitError::Shutdown);
|
||||
}
|
||||
|
||||
// If we already have a channel for waiting on this number, reuse it.
|
||||
if let Some((_, rx)) = internal.waiters.get_mut(&num) {
|
||||
// an Err from changed() means the sender was dropped.
|
||||
rx.clone()
|
||||
} else {
|
||||
// Create a new channel.
|
||||
let (tx, rx) = channel(());
|
||||
internal.waiters.insert(num, (tx, rx.clone()));
|
||||
rx
|
||||
}
|
||||
// Drop the lock as we exit this scope.
|
||||
};
|
||||
rx.changed().await.map_err(|_| SeqWaitError::Shutdown)
|
||||
}
|
||||
|
||||
/// Wait for a number to arrive
|
||||
///
|
||||
/// This call won't complete until someone has called `advance`
|
||||
/// with a number greater than or equal to the one we're waiting for.
|
||||
///
|
||||
/// If that hasn't happened after the specified timeout duration,
|
||||
/// [`SeqWaitError::Timeout`] will be returned.
|
||||
pub async fn wait_for_timeout(
|
||||
&self,
|
||||
num: T,
|
||||
timeout_duration: Duration,
|
||||
) -> Result<(), SeqWaitError> {
|
||||
timeout(timeout_duration, self.wait_for(num))
|
||||
.await
|
||||
.unwrap_or(Err(SeqWaitError::Timeout))
|
||||
}
|
||||
|
||||
/// Announce a new number has arrived
|
||||
///
|
||||
/// All waiters at this value or below will be woken.
|
||||
///
|
||||
/// `advance` will panic if you send it a lower number than
|
||||
/// a previous call.
|
||||
pub fn advance(&self, num: T) {
|
||||
let wake_these = {
|
||||
let mut internal = self.internal.lock().unwrap();
|
||||
|
||||
if internal.current > num {
|
||||
panic!(
|
||||
"tried to advance backwards, from {:?} to {:?}",
|
||||
internal.current, num
|
||||
);
|
||||
}
|
||||
internal.current = num;
|
||||
|
||||
// split_off will give me all the high-numbered waiters,
|
||||
// so split and then swap. Everything at or above `num`
|
||||
// stays.
|
||||
let mut split = internal.waiters.split_off(&num);
|
||||
std::mem::swap(&mut split, &mut internal.waiters);
|
||||
|
||||
// `split_at` didn't get the value at `num`; if it's
|
||||
// there take that too.
|
||||
if let Some(sleeper) = internal.waiters.remove(&num) {
|
||||
split.insert(num, sleeper);
|
||||
}
|
||||
|
||||
split
|
||||
};
|
||||
|
||||
for (_wake_num, (tx, _rx)) in wake_these {
|
||||
// This can fail if there are no receivers.
|
||||
// We don't care; discard the error.
|
||||
let _ = tx.send(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::{sleep, Duration};
|
||||
|
||||
#[tokio::test]
|
||||
async fn seqwait() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
let seq3 = Arc::clone(&seq);
|
||||
tokio::spawn(async move {
|
||||
seq2.wait_for(42).await.expect("wait_for 42");
|
||||
seq2.advance(100);
|
||||
seq2.wait_for(999).await.expect_err("no 999");
|
||||
});
|
||||
tokio::spawn(async move {
|
||||
seq3.wait_for(42).await.expect("wait_for 42");
|
||||
seq3.wait_for(0).await.expect("wait_for 0");
|
||||
});
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
seq.advance(99);
|
||||
seq.wait_for(100).await.expect("wait_for 100");
|
||||
seq.shutdown();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn seqwait_timeout() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
tokio::spawn(async move {
|
||||
let timeout = Duration::from_millis(1);
|
||||
let res = seq2.wait_for_timeout(42, timeout).await;
|
||||
assert_eq!(res, Err(SeqWaitError::Timeout));
|
||||
});
|
||||
sleep(Duration::from_secs(1)).await;
|
||||
// This will attempt to wake, but nothing will happen
|
||||
// because the waiter already dropped its Receiver.
|
||||
seq.advance(99);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user