Compare commits

..

1 Commits

Author SHA1 Message Date
anastasia
babd2339cc [issue #56] Fix race at postgres instance + walreceiver start. Uses postgres/vendor issue_56 branch.
TODO: rebase on main
2021-04-22 15:51:44 +03:00
49 changed files with 6838 additions and 3767 deletions

View File

@@ -1,45 +0,0 @@
# Workflow that sends a Telegram message describing each push to the main branch.
name: Send Notifications
on:
push:
branches: [ main ]
jobs:
send-notifications:
timeout-minutes: 30
name: send commit notifications
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: true
fetch-depth: 2
# This step publishes three step outputs used by the notification below:
# git_stat (percent-escaped `git show --stat=50` text), sha_short and git_branch.
# NOTE(review): the inline comment on the space substitution says spaces are
# replaced by an en space, but no replacement character is visible in this
# copy of the file - confirm the character was not lost.
- name: Form variables for notification message
id: git_info_grab
run: |
git_stat=$(git show --stat=50)
git_stat="${git_stat//'%'/'%25'}"
git_stat="${git_stat//$'\n'/'%0A'}"
git_stat="${git_stat//$'\r'/'%0D'}"
git_stat="${git_stat// /}" # space -> 'Space En', as github tends to eat ordinary spaces
echo "::set-output name=git_stat::$git_stat"
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
# Sends a markdown message built from the outputs above; recipient and bot token
# come from the TELEGRAM_TO and TELEGRAM_TOKEN repository secrets.
- name: Send notification
uses: appleboy/telegram-action@master
with:
to: ${{ secrets.TELEGRAM_TO }}
token: ${{ secrets.TELEGRAM_TOKEN }}
format: markdown
args: |
*@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
```
${{ steps.git_info_grab.outputs.git_stat }}
```

View File

@@ -1,36 +1,45 @@
name: Build and Test
name: regression check
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
on: [push]
jobs:
regression-check:
strategy:
matrix:
# If we want to duplicate this job for different
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
rust_toolchain: [stable]
os: [ubuntu-latest]
timeout-minutes: 30
timeout-minutes: 10
name: run regression test suite
runs-on: ${{ matrix.os }}
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: true
fetch-depth: 2
- name: install rust toolchain ${{ matrix.rust_toolchain }}
uses: actions-rs/toolchain@v1
- name: Form variables for notification message
id: git_info_grab
run: |
git_stat=$(git show --stat=50)
git_stat="${git_stat//'%'/'%25'}"
git_stat="${git_stat//$'\n'/'%0A'}"
git_stat="${git_stat//$'\r'/'%0D'}"
git_stat="${git_stat// /}" # space -> 'Space En', as github tends to eat ordinary spaces
echo "::set-output name=git_stat::$git_stat"
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
- name: Send notification
uses: appleboy/telegram-action@master
with:
profile: minimal
toolchain: ${{ matrix.rust_toolchain }}
override: true
to: ${{ secrets.TELEGRAM_TO }}
token: ${{ secrets.TELEGRAM_TOKEN }}
format: markdown
args: |
*@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
```
${{ steps.git_info_grab.outputs.git_stat }}
```
- name: Install postgres dependencies
run: |
@@ -52,7 +61,11 @@ jobs:
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres
./pgbuild.sh
- name: Install rust
run: |
sudo apt install -y cargo
- name: Cache cargo deps
id: cache_cargo
@@ -64,10 +77,10 @@ jobs:
target
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
- name: Run cargo build
- name: Build
run: |
cargo build --workspace --bins --examples --tests
cargo build
- name: Run cargo test
- name: Run test
run: |
cargo test -- --nocapture --test-threads=1
cargo test --test test_pageserver -- --nocapture --test-threads=1

1
.gitignore vendored
View File

@@ -3,4 +3,3 @@
/tmp_install
/tmp_check_cli
.vscode
.zenith

195
Cargo.lock generated
View File

@@ -91,19 +91,19 @@ dependencies = [
[[package]]
name = "async-io"
version = "1.4.0"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb9af4888a70ad78ecb5efcb0ba95d66a3cf54a88b62ae81559954c7588c7a2"
checksum = "9315f8f07556761c3e48fec2e6b276004acf426e6dc068b2c2251854d65ee0fd"
dependencies = [
"concurrent-queue",
"fastrand",
"futures-lite",
"libc",
"log",
"nb-connect",
"once_cell",
"parking",
"polling",
"socket2",
"vec-arena",
"waker-fn",
"winapi",
@@ -111,9 +111,9 @@ dependencies = [
[[package]]
name = "async-lock"
version = "2.4.0"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6a8ea61bf9947a1007c5cada31e647dbc77b103c679858150003ba697ea798b"
checksum = "1996609732bde4a9988bc42125f55f2af5f3c36370e27c778d5191a4a1b63bfb"
dependencies = [
"event-listener",
]
@@ -162,9 +162,9 @@ checksum = "e91831deabf0d6d7ec49552e489aed63b7456a7a3c46cff62adad428110b0af0"
[[package]]
name = "async-trait"
version = "0.1.50"
version = "0.1.49"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b98e84bbb4cbcdd97da190ba0c58a1bb0de2c1fdf67d159e192ed766aeca722"
checksum = "589652ce7ccb335d1e7ecb3be145425702b290dbcb7029bbeaae263fc1d87b48"
dependencies = [
"proc-macro2",
"quote",
@@ -243,12 +243,13 @@ checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd"
[[package]]
name = "bindgen"
version = "0.57.0"
version = "0.53.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd4865004a46a0aafb2a0a5eb19d3c9fc46ee5f063a6cfc605c69ac9ecf5263d"
checksum = "c72a978d268b1d70b0e963217e60fdabd9523a941457a6c42a7315d15c7e89e5"
dependencies = [
"bitflags",
"cexpr",
"cfg-if 0.1.10",
"clang-sys",
"clap",
"env_logger",
@@ -345,9 +346,6 @@ name = "cc"
version = "1.0.67"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd"
dependencies = [
"jobserver",
]
[[package]]
name = "cexpr"
@@ -385,9 +383,9 @@ dependencies = [
[[package]]
name = "clang-sys"
version = "1.2.0"
version = "0.29.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "853eda514c284c2287f4bf20ae614f8781f40a81d32ecda6e91449304dfe077c"
checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a"
dependencies = [
"glob",
"libc",
@@ -598,9 +596,9 @@ dependencies = [
[[package]]
name = "env_logger"
version = "0.8.3"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17392a012ea30ef05a610aa97dfb49496e71c9f676b27879922ea5bdf60d9d3f"
checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36"
dependencies = [
"atty",
"humantime",
@@ -924,9 +922,9 @@ dependencies = [
[[package]]
name = "httparse"
version = "1.4.0"
version = "1.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a1ce40d6fc9764887c2fdc7305c3dcc429ba11ff981c1509416afd5697e4437"
checksum = "bc35c995b9d93ec174cf9a27d425c7892722101e14993cd227fdb51d70cf9589"
[[package]]
name = "httpdate"
@@ -936,9 +934,12 @@ checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47"
[[package]]
name = "humantime"
version = "2.1.0"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f"
dependencies = [
"quick-error",
]
[[package]]
name = "hyper"
@@ -979,9 +980,9 @@ dependencies = [
[[package]]
name = "idna"
version = "0.2.3"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8"
checksum = "89829a5d69c23d348314a7ac337fe39173b61149a9864deabd260983aed48c21"
dependencies = [
"matches",
"unicode-bidi",
@@ -1032,15 +1033,6 @@ version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736"
[[package]]
name = "jobserver"
version = "0.1.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "972f5ae5d1cb9c6ae417789196c803205313edde988685da5e3aae0827b9e7fd"
dependencies = [
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.50"
@@ -1079,24 +1071,12 @@ checksum = "9385f66bf6105b241aa65a61cb923ef20efc665cb9f9bb50ac2f0c4b7f378d41"
[[package]]
name = "libloading"
version = "0.7.0"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f84d96438c15fcd6c3f244c8fce01d1e2b9c6b5623e9c711dc9286d8fc92d6a"
checksum = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753"
dependencies = [
"cfg-if 1.0.0",
"winapi",
]
[[package]]
name = "librocksdb-sys"
version = "6.17.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5da125e1c0f22c7cae785982115523a0738728498547f415c9054cb17c7e89f9"
dependencies = [
"bindgen",
"cc",
"glob",
"libc",
"winapi",
]
[[package]]
@@ -1204,6 +1184,16 @@ dependencies = [
"tempfile",
]
[[package]]
name = "nb-connect"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a19900e7eee95eb2b3c2e26d12a874cc80aaf750e31be6fcbe743ead369fa45d"
dependencies = [
"libc",
"socket2",
]
[[package]]
name = "nom"
version = "5.1.2"
@@ -1223,41 +1213,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "num"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8536030f9fea7127f841b45bb6243b27255787fb4eb83958aa1ef9d2fdc0c36"
dependencies = [
"num-bigint",
"num-complex",
"num-integer",
"num-iter",
"num-rational",
"num-traits",
]
[[package]]
name = "num-bigint"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-complex"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6b19411a9719e753aff12e5187b74d60d3dc449ec3f4dc21e3989c3f554bc95"
dependencies = [
"autocfg",
"num-traits",
]
[[package]]
name = "num-integer"
version = "0.1.44"
@@ -1268,29 +1223,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-iter"
version = "0.1.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2021c8337a54d21aca0d59a92577a029af9431cb59b909b03252b9c164fad59"
dependencies = [
"autocfg",
"num-integer",
"num-traits",
]
[[package]]
name = "num-rational"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c000134b5dbf44adc5cb772486d335293351644b801551abe8f75c84cfa4aef"
dependencies = [
"autocfg",
"num-bigint",
"num-integer",
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.14"
@@ -1381,19 +1313,18 @@ dependencies = [
"chrono",
"clap",
"crc32c",
"crossbeam-channel",
"daemonize",
"futures",
"hex",
"lazy_static",
"log",
"parse_duration",
"postgres",
"postgres-protocol",
"postgres-types",
"postgres_ffi",
"rand 0.8.3",
"regex",
"rocksdb",
"rust-s3",
"slog",
"slog-async",
@@ -1408,7 +1339,6 @@ dependencies = [
"tokio-stream",
"tui",
"walkdir",
"zenith_utils",
]
[[package]]
@@ -1442,17 +1372,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "parse_duration"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7037e5e93e0172a5a96874380bf73bc6ecef022e26fa25f2be26864d6b3ba95d"
dependencies = [
"lazy_static",
"num",
"regex",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
@@ -1485,18 +1404,18 @@ dependencies = [
[[package]]
name = "pin-project"
version = "1.0.7"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7509cc106041c40a4518d2af7a61530e1eed0e6285296a3d8c5472806ccc4a4"
checksum = "bc174859768806e91ae575187ada95c91a29e96a98dc5d2cd9a1fed039501ba6"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.0.7"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c950132583b500556b1efd71d45b319029f2b71518d979fcc208e16b42426f"
checksum = "a490329918e856ed1b083f244e3bfe2d8c4f336407e4ea9e1a9f479ff09049e5"
dependencies = [
"proc-macro2",
"quote",
@@ -1585,7 +1504,6 @@ dependencies = [
"chrono",
"crc32c",
"hex",
"log",
"rand 0.8.3",
]
@@ -1616,6 +1534,12 @@ dependencies = [
"unicode-xid",
]
[[package]]
name = "quick-error"
version = "1.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
[[package]]
name = "quote"
version = "1.0.9"
@@ -1812,16 +1736,6 @@ dependencies = [
"winreg",
]
[[package]]
name = "rocksdb"
version = "0.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c749134fda8bfc90d0de643d59bfc841dcb3ac8a1062e12b6754bd60235c48b3"
dependencies = [
"libc",
"librocksdb-sys",
]
[[package]]
name = "rust-argon2"
version = "0.8.3"
@@ -2059,9 +1973,9 @@ checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27"
[[package]]
name = "slab"
version = "0.4.3"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527"
checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8"
[[package]]
name = "slog"
@@ -2499,9 +2413,9 @@ dependencies = [
[[package]]
name = "vcpkg"
version = "0.2.12"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbdbff6266a24120518560b5dc983096efb98462e51d0d68169895b237be3e5d"
checksum = "b00bca6106a5e23f3eee943593759b7fcddb00554332e856d990c893966879fb"
[[package]]
name = "vec-arena"
@@ -2552,12 +2466,9 @@ dependencies = [
"lazy_static",
"log",
"pageserver",
"parse_duration",
"postgres",
"postgres-protocol",
"postgres_ffi",
"regex",
"rust-s3",
"slog",
"slog-async",
"slog-scope",
@@ -2566,7 +2477,6 @@ dependencies = [
"tokio",
"tokio-postgres",
"tokio-stream",
"walkdir",
]
[[package]]
@@ -2762,6 +2672,3 @@ dependencies = [
[[package]]
name = "zenith_utils"
version = "0.1.0"
dependencies = [
"thiserror",
]

View File

@@ -1,53 +0,0 @@
#
# Top level Makefile to build Zenith and PostgreSQL
#
# Main targets:
#   all        - build the Zenith Rust code and PostgreSQL (default)
#   zenith     - 'cargo build', after installing the PostgreSQL headers
#   postgres   - compile PostgreSQL and install it into tmp_install
#   clean      - clean both builds, keeping the 'configure' results
#   distclean  - remove tmp_install entirely and clean the cargo build
#
all: zenith postgres

# We don't want to run 'cargo build' in parallel with the postgres build,
# because interleaving cargo build output with postgres build output looks
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
# much parallelism. (Recursive invocation of postgres target still gets any
# '-j' flag from the command line, so 'make -j' is still useful.)
.NOTPARALLEL:

### Zenith Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
zenith: postgres-headers
	cargo build

### PostgreSQL parts
tmp_install/build/config.status:
	+@echo "Configuring postgres build"
	mkdir -p tmp_install/build
	(cd tmp_install/build && \
	../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
		--enable-depend --with-libxml --prefix=$(abspath tmp_install) > configure.log)

# nicer alias for running 'configure'
postgres-configure: tmp_install/build/config.status

# Install the PostgreSQL header files into tmp_install/include
postgres-headers: postgres-configure
	+@echo "Installing PostgreSQL headers"
	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install

# Compile and install PostgreSQL
postgres: postgres-configure
	+@echo "Compiling PostgreSQL"
	$(MAKE) -C tmp_install/build MAKELEVEL=0 install

postgres-clean:
	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean

# This doesn't remove the effects of 'configure'.
clean:
	cd tmp_install/build && $(MAKE) clean
	cargo clean

# This removes everything
distclean:
	rm -rf tmp_install
	cargo clean

# Every target that does not name a real file must be listed here; otherwise a
# stray file or directory called e.g. 'clean' would make that target a no-op.
.PHONY: all zenith postgres postgres-configure postgres-headers postgres-clean clean distclean

View File

@@ -8,7 +8,8 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus
```sh
git clone --recursive https://github.com/libzenith/zenith.git
cd zenith
make
./pgbuild.sh # builds postgres and installs it to ./tmp_install
cargo build
```
2. Start pageserver and postgres on top of it (should be called from repo root):
@@ -53,7 +54,7 @@ postgres=# select * from t;
```sh
git clone --recursive https://github.com/libzenith/zenith.git
make # builds also postgres and installs it to ./tmp_install
./pgbuild.sh # builds postgres and installs it to ./tmp_install
cargo test -- --test-threads=1
```

View File

@@ -1,10 +1,9 @@
use std::fs::{self, File, OpenOptions};
use std::fs::{self, OpenOptions};
use std::io::{Read, Write};
use std::net::SocketAddr;
use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, ExitStatus};
use std::process::Command;
use std::sync::Arc;
use std::time::Duration;
use std::{collections::BTreeMap, path::PathBuf};
@@ -12,12 +11,13 @@ use std::{collections::BTreeMap, path::PathBuf};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use tar;
use postgres::{Client, NoTls};
use crate::local_env::LocalEnv;
use crate::storage::{PageServerNode, WalProposerNode};
use pageserver::{zenith_repo_dir, ZTimelineId};
use pageserver::ZTimelineId;
//
// ComputeControlPlane
@@ -190,11 +190,11 @@ impl PostgresNode {
);
let port: u16 = CONF_PORT_RE
.captures(config.as_str())
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
.ok_or(anyhow::Error::msg(err_msg.clone() + " 1"))?
.iter()
.last()
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
.ok_or(anyhow::Error::msg(err_msg.clone() + " 2"))?
.ok_or(anyhow::Error::msg(err_msg.clone() + " 3"))?
.as_str()
.parse()
.with_context(|| err_msg)?;
@@ -277,9 +277,7 @@ impl PostgresNode {
max_replication_slots = 10\n\
hot_standby = on\n\
shared_buffers = 1MB\n\
fsync = off\n\
max_connections = 100\n\
wal_sender_timeout = 0\n\
wal_level = replica\n\
listen_addresses = '{address}'\n\
port = {port}\n",
@@ -292,7 +290,7 @@ impl PostgresNode {
// slot or something proper, to prevent the compute node
// from removing WAL that hasn't been streamed to the safekeeper or
// page server yet. But this will do for now.
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n");
self.append_conf("postgresql.conf", &format!("wal_keep_size='10TB'\n"));
// Connect it to the page server.
@@ -355,7 +353,6 @@ impl PostgresNode {
)
.env_clear()
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.status()
.with_context(|| "pg_ctl failed")?;
if !pg_ctl.success() {
@@ -399,14 +396,6 @@ impl PostgresNode {
String::from_utf8(output.stdout).unwrap().trim().to_string()
}
/// Prints the contents of `pageserver.log` from the repository directory to
/// stdout. Does nothing when the file cannot be opened; panics if the file
/// opens but cannot be read into a `String`.
fn dump_log_file(&self) {
    let log_path = self.env.repo_path.join("pageserver.log");
    let mut log_file = match File::open(log_path) {
        Ok(opened) => opened,
        // No readable log file: nothing to dump.
        Err(_) => return,
    };

    let mut contents = String::new();
    log_file.read_to_string(&mut contents).unwrap();
    println!("--------------- pageserver.log:\n{}", contents);
}
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
let connstring = format!(
"host={} port={} dbname={} user={}",
@@ -418,11 +407,7 @@ impl PostgresNode {
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
println!("Running {}", sql);
let result = client.query(sql, &[]);
if result.is_err() {
self.dump_log_file();
}
result.unwrap()
client.query(sql, &[]).unwrap()
}
pub fn open_psql(&self, db: &str) -> Client {
@@ -458,92 +443,8 @@ impl PostgresNode {
}
}
/// Runs the PostgreSQL `pg_regress` suite against this node and returns the
/// exit status of the `pg_regress` process.
///
/// Side effects visible in this function: it creates a `regression` database
/// through `safe_psql`, creates `<repo dir>/regress` and
/// `<repo dir>/regress/testtablespace`, and changes the current working
/// directory of the process to that `regress` directory without restoring it.
///
/// When `pg_regress` exits unsuccessfully, the diagnostic files that can be
/// opened are printed to stdout before the status is returned.
///
/// # Panics
/// Panics (among other cases) if the directories cannot be created, the
/// working directory cannot be changed, a path is not valid UTF-8, or the
/// `pg_regress` binary cannot be launched.
pub fn pg_regress(&self) -> ExitStatus {
self.safe_psql("postgres", "CREATE DATABASE regression");
let data_dir = zenith_repo_dir();
let regress_run_path = data_dir.join("regress");
fs::create_dir_all(&regress_run_path).unwrap();
fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
// pg_regress is run from inside the scratch directory; the relative
// "regression.diffs" path opened below depends on this.
std::env::set_current_dir(regress_run_path).unwrap();
// Both locations are resolved relative to this crate's manifest directory.
let regress_build_path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
let regress_src_path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
let regress_check = Command::new(regress_build_path.join("pg_regress"))
.args(&[
// NOTE(review): an empty --bindir is passed here and a real --bindir
// follows two arguments later; presumably the later one wins - confirm.
"--bindir=''",
"--use-existing",
format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
format!(
"--schedule={}",
regress_src_path.join("parallel_schedule").to_str().unwrap()
)
.as_str(),
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
])
// Start from an empty environment; only the variables set below are passed on.
.env_clear()
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("PGPORT", self.address.port().to_string())
.env("PGUSER", self.whoami())
.env("PGHOST", self.address.ip().to_string())
.status()
.expect("pg_regress failed");
// On failure, print each diagnostic file that can be opened; files that
// cannot be opened are skipped silently.
if !regress_check.success() {
if let Ok(mut file) = File::open("regression.diffs") {
let mut buffer = String::new();
file.read_to_string(&mut buffer).unwrap();
println!("--------------- regression.diffs:\n{}", buffer);
}
self.dump_log_file();
// The path is hard-coded to the "pg1" data directory.
if let Ok(mut file) = File::open(
self.env
.repo_path
.join("pgdatadirs")
.join("pg1")
.join("log"),
) {
let mut buffer = String::new();
file.read_to_string(&mut buffer).unwrap();
println!("--------------- pgdatadirs/pg1/log:\n{}", buffer);
}
}
regress_check
}
/// Initializes the pgbench tables in the `postgres` database on this node's
/// port, then runs pgbench with `clients` connections for `seconds` seconds
/// using prepared statements, and returns the exit status of that run.
///
/// The exit status of the initialization step is not checked.
///
/// # Panics
/// Panics if either pgbench invocation cannot be launched, or if the library
/// directory path is not valid UTF-8.
pub fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus {
    let port_arg = self.address.port().to_string();
    let client_count = clients.to_string();
    let duration_secs = seconds.to_string();

    // Both invocations share the binary and the library-path environment.
    let run_pgbench = |args: &[&str]| {
        Command::new(self.env.pg_bin_dir().join("pgbench"))
            .args(args)
            .env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
            .env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
            .status()
    };

    // Initialization run; its exit status is deliberately left unchecked.
    run_pgbench(&["-i", "-p", port_arg.as_str(), "postgres"]).expect("pgbench -i");

    run_pgbench(&[
        "-p",
        port_arg.as_str(),
        "-T",
        duration_secs.as_str(),
        "-P",
        "1",
        "-c",
        client_count.as_str(),
        "-M",
        "prepared",
        "postgres",
    ])
    .expect("pgbench run")
}
// TODO: not implemented yet. Placeholder associated function (it takes no
// `self` and no arguments) with an empty body.
pub fn pg_bench() {}
}
impl Drop for PostgresNode {

View File

@@ -15,9 +15,8 @@ use std::process::{Command, Stdio};
use anyhow::Result;
use serde_derive::{Deserialize, Serialize};
use pageserver::zenith_repo_dir;
use pageserver::ZTimelineId;
use postgres_ffi::xlog_utils;
use walkeeper::xlog_utils;
//
// This data structure represents deserialized zenith config, which should be
@@ -53,6 +52,14 @@ impl LocalEnv {
}
}
/// Returns the path of the zenith repository directory.
///
/// Uses the value of the `ZENITH_REPO_DIR` environment variable when it is
/// set, and falls back to the relative path `.zenith` otherwise. The value is
/// converted to a `PathBuf` directly, so a path that is not valid UTF-8 is
/// accepted rather than causing a panic.
fn zenith_repo_dir() -> PathBuf {
    std::env::var_os("ZENITH_REPO_DIR")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from(".zenith"))
}
//
// Initialize a new Zenith repository
//
@@ -71,7 +78,7 @@ pub fn init() -> Result<()> {
let cargo_path = env::current_dir()?;
if !cargo_path.join("pageserver/Cargo.toml").exists() {
anyhow::bail!(
"Current directory does not look like a zenith repo. \
"Current dirrectory does not look like a zenith repo. \
Please, run 'init' from zenith repo root."
);
}
@@ -84,7 +91,7 @@ pub fn init() -> Result<()> {
if !pg_path.exists() {
anyhow::bail!(
"Can't find postres binary at {}. \
Perhaps 'make postgres' is needed to build it first.",
Perhaps './pgbuild.sh' is needed to build it first.",
pg_path.to_str().unwrap()
);
}
@@ -101,7 +108,7 @@ pub fn init() -> Result<()> {
// ok, we are good to go
let mut conf = LocalEnv {
repo_path,
repo_path: repo_path.clone(),
pg_distrib_dir,
zenith_distrib_dir,
systemid: 0,
@@ -129,38 +136,30 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
// Run initdb
//
// We create the cluster temporarily in a "tmp" directory inside the repository,
// and move it to the right location from there.
let tmppath = repopath.join("tmp");
// FIXME: we create it temporarily in "tmp" directory, and move it into
// the repository. Use "tempdir()" or something? Or just create it directly
// in the repo?
let initdb_path = local_env.pg_bin_dir().join("initdb");
let initdb = Command::new(initdb_path)
.args(&["-D", tmppath.to_str().unwrap()])
let _initdb = Command::new(initdb_path)
.args(&["-D", "tmp"])
.arg("--no-instructions")
.env_clear()
.env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap())
.env(
"DYLD_LIBRARY_PATH",
local_env.pg_lib_dir().to_str().unwrap(),
)
.stdout(Stdio::null())
.status()
.with_context(|| "failed to execute initdb")?;
if !initdb.success() {
anyhow::bail!("initdb failed");
}
println!("initdb succeeded");
// Read control file to extract the LSN and system id
let controlfile_path = tmppath.join("global").join("pg_control");
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
let controlfile =
postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?;
let systemid = controlfile.system_identifier;
let lsn = controlfile.checkPoint;
let lsnstr = format!("{:016X}", lsn);
// Move the initial WAL file
fs::rename(
tmppath.join("pg_wal").join("000000010000000000000001"),
"tmp/pg_wal/000000010000000000000001",
timelinedir
.join("wal")
.join("000000010000000000000001.partial"),
@@ -168,13 +167,14 @@ pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
println!("moved initial WAL file");
// Remove pg_wal
fs::remove_dir_all(tmppath.join("pg_wal"))?;
fs::remove_dir_all("tmp/pg_wal")?;
println!("removed tmp/pg_wal");
force_crash_recovery(&tmppath)?;
force_crash_recovery(&PathBuf::from("tmp"))?;
println!("updated pg_control");
let target = timelinedir.join("snapshots").join(&lsnstr);
fs::rename(tmppath, &target)?;
fs::rename("tmp", &target)?;
println!("moved 'tmp' to {}", target.display());
// Create 'main' branch to refer to the initial timeline
@@ -254,7 +254,7 @@ pub fn test_env(testname: &str) -> LocalEnv {
systemid: 0,
};
init_repo(&mut local_env).expect("could not initialize zenith repository");
local_env
return local_env;
}
// Find the directory where the binaries were put (i.e. target/debug/)
@@ -266,7 +266,7 @@ pub fn cargo_bin_dir() -> PathBuf {
pathbuf.pop();
}
pathbuf
return pathbuf;
}
#[derive(Debug, Clone, Copy)]
@@ -358,7 +358,7 @@ pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u6
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true);
Ok(lsn)
return Ok(lsn);
}
// Find the latest snapshot for a timeline

View File

@@ -13,6 +13,7 @@ use std::time::Duration;
use postgres::{Client, NoTls};
use crate::compute::PostgresNode;
use crate::local_env::LocalEnv;
use pageserver::ZTimelineId;
@@ -55,7 +56,24 @@ impl TestStorageControlPlane {
wal_acceptors: Vec::new(),
pageserver: pserver,
test_done: AtomicBool::new(false),
repopath,
repopath: repopath,
}
}
/// Builds a control plane with a single page server node and no WAL
/// acceptors. This constructor does not call `start()` on the page server.
///
/// The page server node is created with `kill_on_exit: true` and no listen
/// address, using a clone of `local_env`.
pub fn one_page_server_no_start(local_env: &LocalEnv) -> TestStorageControlPlane {
    let repopath = local_env.repo_path.clone();
    let pageserver = Arc::new(PageServerNode {
        env: local_env.clone(),
        kill_on_exit: true,
        listen_address: None,
    });

    TestStorageControlPlane {
        wal_acceptors: Vec::new(),
        pageserver,
        test_done: AtomicBool::new(false),
        repopath,
    }
}
@@ -71,7 +89,7 @@ impl TestStorageControlPlane {
listen_address: None,
}),
test_done: AtomicBool::new(false),
repopath,
repopath: repopath,
};
cplane.pageserver.start().unwrap();
@@ -93,9 +111,6 @@ impl TestStorageControlPlane {
}
/// Calls `stop()` on every WAL acceptor, ignoring whatever each call returns,
/// and then sets the `test_done` flag.
pub fn stop(&self) {
    self.wal_acceptors.iter().for_each(|acceptor| {
        // The outcome of stopping is deliberately discarded.
        let _ = acceptor.stop();
    });
    self.test_done.store(true, Ordering::Relaxed);
}
@@ -167,8 +182,7 @@ impl PageServerNode {
.env("RUST_BACKTRACE", "1")
.env("ZENITH_REPO_DIR", self.repo_path())
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
if !cmd.status()?.success() {
anyhow::bail!(
@@ -219,7 +233,7 @@ impl PageServerNode {
if !status.success() {
anyhow::bail!("Failed to stop pageserver with pid {}", pid);
} else {
Ok(())
return Ok(());
}
}
@@ -351,6 +365,42 @@ impl Drop for WalProposerNode {
}
}
///////////////////////////////////////////////////////////////////////////////
/// Runs the PostgreSQL `pg_regress` suite against the given compute node.
///
/// Creates a `regression` database through `safe_psql`, creates the scratch
/// directory `tmp_check/regress` under this crate's manifest directory, makes
/// it the current working directory of the process (not restored afterwards),
/// and runs `pg_regress` with the upstream parallel schedule against the
/// node's address and port.
///
/// # Panics
/// Panics if the scratch directory cannot be created or entered, if a path is
/// not valid UTF-8, if `pg_regress` cannot be launched, or if `pg_regress`
/// exits unsuccessfully. The last check ensures a failing suite fails the
/// calling test.
pub fn regress_check(pg: &PostgresNode) {
    pg.safe_psql("postgres", "CREATE DATABASE regression");

    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));

    let regress_run_path = manifest_dir.join("tmp_check/regress");
    fs::create_dir_all(&regress_run_path).unwrap();
    std::env::set_current_dir(&regress_run_path).unwrap();

    let regress_build_path = manifest_dir.join("../tmp_install/build/src/test/regress");
    let regress_src_path = manifest_dir.join("../vendor/postgres/src/test/regress");

    let status = Command::new(regress_build_path.join("pg_regress"))
        .args(&[
            // NOTE(review): an empty --bindir is passed here and a real one
            // follows below; presumably the later one wins - confirm.
            "--bindir=''",
            "--use-existing",
            format!("--bindir={}", pg.env.pg_bin_dir().to_str().unwrap()).as_str(),
            format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
            format!(
                "--schedule={}",
                regress_src_path.join("parallel_schedule").to_str().unwrap()
            )
            .as_str(),
            format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
        ])
        // Start from an empty environment; only the variables below are passed on.
        .env_clear()
        .env("LD_LIBRARY_PATH", pg.env.pg_lib_dir().to_str().unwrap())
        .env("PGHOST", pg.address.ip().to_string())
        .env("PGPORT", pg.address.port().to_string())
        .env("PGUSER", pg.whoami())
        .status()
        .expect("pg_regress failed");

    // `expect` above only covers a failure to launch the process; the exit
    // status has to be checked separately to notice failing regression tests.
    assert!(
        status.success(),
        "pg_regress exited unsuccessfully: {}",
        status
    );
}
/// Read a PID file
///
/// This should contain an unsigned integer, but we return it as a String

View File

@@ -50,6 +50,7 @@ fn test_redo_cases() {
// Runs pg_regress on a compute node
#[test]
#[ignore]
fn test_regress() {
let local_env = local_env::test_env("test_regress");
@@ -62,26 +63,7 @@ fn test_regress() {
let node = compute_cplane.new_test_node(maintli);
node.start().unwrap();
let status = node.pg_regress();
assert!(status.success());
}
// Runs pg_bench on a compute node
#[test]
fn pgbench() {
let local_env = local_env::test_env("pgbench");
// Start pageserver that reads WAL directly from that postgres
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
// start postgres
let maintli = storage_cplane.get_branch_timeline("main");
let node = compute_cplane.new_test_node(maintli);
node.start().unwrap();
let status = node.pg_bench(10, 100);
assert!(status.success());
control_plane::storage::regress_check(&node);
}
// Run two postgres instances on one pageserver, on different timelines

View File

@@ -10,46 +10,6 @@ use std::sync::Arc;
use std::time::SystemTime;
use std::{thread, time};
const DOWNTIME: u64 = 2;
/// Creates a storage control plane via `fault_tolerant(.., 3)`, starts a
/// master compute node whose `postgresql.conf` lists the WAL acceptors, and
/// checks that a simple insert/select workload returns the expected sum.
#[test]
//#[ignore]
fn test_embedded_wal_proposer() {
    const REDUNDANCY: usize = 3;

    let local_env = local_env::test_env("test_embedded_wal_proposer");
    let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
    let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
    let acceptor_conn_info = storage_cplane.get_wal_acceptor_conn_info();

    // start postgres
    let main_timeline = storage_cplane.get_branch_timeline("main");
    let master = compute_cplane.new_test_master_node(main_timeline);
    let acceptors_conf = format!("wal_acceptors='{}'\n", acceptor_conn_info);
    master.append_conf("postgresql.conf", &acceptors_conf);
    master.start().unwrap();

    // check basic work with table
    master.safe_psql(
        "postgres",
        "CREATE TABLE t(key int primary key, value text)",
    );
    master.safe_psql(
        "postgres",
        "INSERT INTO t SELECT generate_series(1,100000), 'payload'",
    );

    let rows = master.safe_psql("postgres", "SELECT sum(key) FROM t");
    let count: i64 = rows.first().unwrap().get(0);
    println!("sum = {}", count);
    // 1 + 2 + ... + 100000
    assert_eq!(count, 5000050000);

    // TODO: check wal files equality
}
#[test]
fn test_acceptors_normal_work() {
let local_env = local_env::test_env("test_acceptors_normal_work");
@@ -212,7 +172,7 @@ fn test_acceptors_restarts() {
fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
let cp = cplane.clone();
thread::spawn(move || {
thread::sleep(time::Duration::from_secs(DOWNTIME));
thread::sleep(time::Duration::from_secs(1));
cp.wal_acceptors[no].start();
});
}
@@ -248,16 +208,13 @@ fn test_acceptors_unavailability() {
psql.execute("INSERT INTO t values (1, 'payload')", &[])
.unwrap();
// Shut down all wal acceptors
storage_cplane.wal_acceptors[0].stop().unwrap();
let cp = Arc::new(storage_cplane);
start_acceptor(&cp, 0);
let now = SystemTime::now();
psql.execute("INSERT INTO t values (2, 'payload')", &[])
.unwrap();
// Here we check that the query above was hanging
// while wal_acceptor was unavailable
assert!(now.elapsed().unwrap().as_secs() >= DOWNTIME);
assert!(now.elapsed().unwrap().as_secs() > 1);
psql.execute("INSERT INTO t values (3, 'payload')", &[])
.unwrap();
@@ -265,9 +222,7 @@ fn test_acceptors_unavailability() {
start_acceptor(&cp, 1);
psql.execute("INSERT INTO t values (4, 'payload')", &[])
.unwrap();
// Here we check that the query above was hanging
// while wal_acceptor was unavailable
assert!(now.elapsed().unwrap().as_secs() >= 2 * DOWNTIME);
assert!(now.elapsed().unwrap().as_secs() > 2);
psql.execute("INSERT INTO t values (5, 'payload')", &[])
.unwrap();
@@ -278,8 +233,6 @@ fn test_acceptors_unavailability() {
.unwrap()
.get(0);
println!("sum = {}", count);
// Ensure that all inserts succeeded.
// Including ones that were waiting for wal acceptor restart.
assert_eq!(count, 15);
}

2373
pageserver/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -8,6 +8,7 @@ edition = "2018"
[dependencies]
chrono = "0.4.19"
crossbeam-channel = "0.5.0"
rand = "0.8.3"
regex = "1.4.5"
bytes = "1.0.1"
@@ -31,14 +32,11 @@ tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
rocksdb = "0.16.0"
anyhow = "1.0"
crc32c = "0.6.0"
walkdir = "2"
thiserror = "1.0"
hex = "0.4.3"
tar = "0.4.33"
parse_duration = "*"
postgres_ffi = { path = "../postgres_ffi" }
zenith_utils = { path = "../zenith_utils" }

41
pageserver/build.rs Normal file
View File

@@ -0,0 +1,41 @@
//
// Triggers postgres build if there is no postgres binary present at
// 'REPO_ROOT/tmp_install/bin/postgres'.
//
// I can see a lot of disadvantages with such automation; the main
// advantage here is the ability to build everything and run integration tests
// in a bare repo by running 'cargo test'.
//
// We can intercept whether it is a debug or release build and run the
// corresponding pg build. But it seems like overkill for now.
//
// Problem #1 -- language server in my editor likes calling 'cargo build'
// by itself. So if I delete tmp_install directory it would magically reappear
// after some time. During this compilation 'cargo build' may whine about
// "waiting for file lock on build directory".
//
// Problem #2 -- cargo build would run this only if something is changed in
// the crate.
//
// And generally speaking postgres is not a build dependency for the pageserver,
// just for integration tests. So let's not mix that. I'll leave this file in
// place for some time just in case if anybody would start doing the same.
//
// use std::path::Path;
// use std::process::{Command};
// Build-script entry point. Deliberately a no-op: the code that would run
// 'make postgres' when '../tmp_install/bin/postgres' is missing is kept below,
// commented out, for reference only (see the header comment for the reasons).
fn main() {
// // build some postgres if it is not done yet
// if !Path::new("../tmp_install/bin/postgres").exists() {
// let make_res = Command::new("make")
// .arg("postgres")
// .env_clear()
// .status()
// .expect("failed to execute 'make postgres'");
// if !make_res.success() {
// panic!("postgres build failed");
// }
// }
}

View File

@@ -1,11 +1,12 @@
use crate::ZTimelineId;
use log::*;
use postgres_ffi::FilePathError;
use regex::Regex;
use std::fmt;
use std::io::Write;
use tar::Builder;
use walkdir::WalkDir;
use crate::ZTimelineId;
pub fn send_snapshot_tarball(
write: &mut dyn Write,
timelineid: ZTimelineId,
@@ -65,7 +66,7 @@ pub fn send_snapshot_tarball(
continue;
}
let archive_fname = relpath.to_str().unwrap();
let archive_fname = relpath.to_str().unwrap().clone();
let archive_fname = archive_fname
.strip_suffix(".partial")
.unwrap_or(&archive_fname);
@@ -84,7 +85,45 @@ pub fn send_snapshot_tarball(
// <oid>.<segment number>
// <oid>_<fork name>.<segment number>
fn parse_filename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
/// Error produced when a path or file name cannot be interpreted as a
/// relation data file.
#[derive(Debug)]
struct FilePathError {
    /// Human-readable reason for the failure; this is what `Display` prints.
    msg: String,
}

impl FilePathError {
    /// Creates an error whose description is `msg`.
    fn new(msg: &str) -> FilePathError {
        FilePathError {
            msg: msg.to_string(),
        }
    }
}

/// Allows `?` to be used on integer-parsing results inside functions that
/// return `FilePathError`; the parse error text is kept in the message.
impl From<core::num::ParseIntError> for FilePathError {
    fn from(e: core::num::ParseIntError) -> Self {
        FilePathError {
            msg: format!("invalid filename: {}", e),
        }
    }
}

impl fmt::Display for FilePathError {
    /// Writes the stored message.
    ///
    /// Previously this always printed the fixed text "invalid filename" and
    /// dropped `msg`, so distinct failures (bad fork name, unsupported
    /// tablespace, unparsable number) could not be told apart in output.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.msg)
    }
}
/// Maps the optional fork-name part of a relation file name to its fork
/// number.
///
/// The "main" fork never appears in file names, so an absent fork name means
/// fork 0. Any name other than "fsm", "vm" or "init" is rejected with an
/// "invalid forkname" error.
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
    let forknum = match forkname {
        None => 0,
        Some("fsm") => 1,
        Some("vm") => 2,
        Some("init") => 3,
        Some(_) => return Err(FilePathError::new("invalid forkname")),
    };
    Ok(forknum)
}
fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
let caps = re
@@ -94,8 +133,13 @@ fn parse_filename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
let relnode_str = caps.name("relnode").unwrap().as_str();
let relnode = u32::from_str_radix(relnode_str, 10)?;
let forkname = caps.name("forkname").map(|f| f.as_str());
let forknum = postgres_ffi::forkname_to_forknum(forkname)?;
let forkname_match = caps.name("forkname");
let forkname = if forkname_match.is_none() {
None
} else {
Some(forkname_match.unwrap().as_str())
};
let forknum = forkname_to_forknum(forkname)?;
let segno_match = caps.name("segno");
let segno = if segno_match.is_none() {
@@ -104,7 +148,7 @@ fn parse_filename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
};
Ok((relnode, forknum, segno))
return Ok((relnode, forknum, segno));
}
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
@@ -128,9 +172,9 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
if let Some(fname) = path.strip_prefix("global/") {
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
Ok(())
return Ok(());
} else if let Some(dbpath) = path.strip_prefix("base/") {
let mut s = dbpath.split('/');
let mut s = dbpath.split("/");
let dbnode_str = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
@@ -144,15 +188,15 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
Ok(())
return Ok(());
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
// TODO
Err(FilePathError::new("tablespaces not supported"))
return Err(FilePathError::new("tablespaces not supported"));
} else {
Err(FilePathError::new("invalid relation data file name"))
return Err(FilePathError::new("invalid relation data file name"));
}
}
fn is_rel_file_path(path: &str) -> bool {
parse_rel_file_path(path).is_ok()
return parse_rel_file_path(path).is_ok();
}

View File

@@ -3,12 +3,12 @@
//
use log::*;
use parse_duration::parse;
use std::fs::{self, OpenOptions};
use std::fs;
use std::fs::{File, OpenOptions};
use std::io;
use std::path::PathBuf;
use std::process::exit;
use std::thread;
use std::time::Duration;
use anyhow::{Context, Result};
use clap::{App, Arg};
@@ -16,12 +16,18 @@ use daemonize::Daemonize;
use slog::Drain;
use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf};
use pageserver::page_service;
use pageserver::tui;
//use pageserver::walreceiver;
use pageserver::PageServerConf;
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
const DEFAULT_GC_PERIOD_SEC: u64 = 10;
//const DEFAULT_GC_HORIZON: u64 = 1024 * 1024 * 1024;
//const DEFAULT_GC_PERIOD_SEC: u64 = 600;
/// Returns the repository directory as a string: the value of the
/// `ZENITH_REPO_DIR` environment variable if it is set, otherwise ".zenith".
///
/// Panics if the variable is set to a value that is not valid UTF-8.
fn zenith_repo_dir() -> String {
    std::env::var_os("ZENITH_REPO_DIR")
        .map(|dir| dir.to_str().unwrap().to_owned())
        .unwrap_or_else(|| ".zenith".into())
}
fn main() -> Result<()> {
let arg_matches = App::new("Zenith page server")
@@ -47,25 +53,11 @@ fn main() -> Result<()> {
.takes_value(false)
.help("Run in the background"),
)
.arg(
Arg::with_name("gc_horizon")
.long("gc_horizon")
.takes_value(true)
.help("Distance from current LSN to perform all wal records cleanup"),
)
.arg(
Arg::with_name("gc_period")
.long("gc_period")
.takes_value(true)
.help("Interval between garbage collector iterations"),
)
.get_matches();
let mut conf = PageServerConf {
daemonize: false,
interactive: false,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
listen_addr: "127.0.0.1:5430".parse().unwrap(),
};
@@ -86,14 +78,6 @@ fn main() -> Result<()> {
conf.listen_addr = addr.parse()?;
}
if let Some(horizon) = arg_matches.value_of("gc_horizon") {
conf.gc_horizon = horizon.parse()?;
}
if let Some(period) = arg_matches.value_of("gc_period") {
conf.gc_period = parse(period)?;
}
start_pageserver(&conf)
}
@@ -124,7 +108,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
if conf.daemonize {
info!("daemonizing...");
let repodir = zenith_repo_dir();
let repodir = PathBuf::from(zenith_repo_dir());
// There shouldn't be any logging to stdout/stderr. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
@@ -141,7 +125,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
.with_context(|| format!("failed to open {:?}", &log_filename))?;
let daemonize = Daemonize::new()
.pid_file(repodir.join("pageserver.pid"))
.pid_file(repodir.clone().join("pageserver.pid"))
.working_directory(repodir)
.stdout(stdout)
.stderr(stderr);
@@ -155,7 +139,7 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
// does this for us.
let repodir = zenith_repo_dir();
std::env::set_current_dir(&repodir)?;
info!("Changed current directory to repository in {:?}", &repodir);
info!("Changed current directory to repository in {}", &repodir);
}
let mut threads = Vec::new();
@@ -185,9 +169,9 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
.unwrap();
threads.push(page_server_thread);
if let Some(tui_thread) = tui_thread {
if tui_thread.is_some() {
// The TUI thread exits when the user asks to Quit.
tui_thread.join().unwrap();
tui_thread.unwrap().join().unwrap();
} else {
// In non-interactive mode, wait forever.
for t in threads {
@@ -201,23 +185,19 @@ fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard,
if conf.interactive {
Ok(tui::init_logging())
} else if conf.daemonize {
let log = zenith_repo_dir().join("pageserver.log");
let log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log)
.map_err(|err| {
// We failed to initialize logging, so we can't log this message with error!
eprintln!("Could not create log file {:?}: {}", log, err);
err
})?;
let log = zenith_repo_dir() + "/pageserver.log";
let log_file = File::create(&log).map_err(|err| {
// We failed to initialize logging, so we can't log this message with error!
eprintln!("Could not create log file {:?}: {}", log, err);
err
})?;
let decorator = slog_term::PlainSyncDecorator::new(log_file);
let drain = slog_term::CompactFormat::new(decorator).build();
let drain = slog::Filter::new(drain, |record: &slog::Record| {
if record.level().is_at_least(slog::Level::Info) {
if record.level().is_at_least(slog::Level::Debug) {
return true;
}
false
return false;
});
let drain = std::sync::Mutex::new(drain).fuse();
let logger = slog::Logger::root(drain, slog::o!());
@@ -235,7 +215,7 @@ fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard,
{
return true;
}
false
return false;
})
.fuse();
let logger = slog::Logger::root(drain, slog::o!());

View File

@@ -1,12 +1,11 @@
use std::fmt;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
pub mod basebackup;
pub mod page_cache;
pub mod page_service;
pub mod pg_constants;
pub mod restore_local_repo;
pub mod tui;
pub mod tui_event;
@@ -20,34 +19,9 @@ pub struct PageServerConf {
pub daemonize: bool,
pub interactive: bool,
pub listen_addr: SocketAddr,
pub gc_horizon: u64,
pub gc_period: Duration,
}
/// Zenith Timeline ID is a 128-bit random ID.
///
/// Zenith timeline IDs are different from PostgreSQL timeline
/// IDs. They serve a similar purpose though: they differentiate
/// between different "histories" of the same cluster. However,
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
/// 32-bits wide, and they must be in ascending order in any given
/// timeline history. Those limitations mean that we cannot generate a
/// new PostgreSQL timeline ID by just generating a random number. And
/// that in turn is problematic for the "pull/push" workflow, where you
/// have a local copy of a zenith repository, and you periodically sync
/// the local changes with a remote server. When you work "detached"
/// from the remote server, you cannot create a PostgreSQL timeline ID
/// that's guaranteed to be different from all existing timelines in
/// the remote server. For example, if two people are having a clone of
/// the repository on their laptops, and they both create a new branch
/// with different name. What timeline ID would they assign to their
/// branches? If they pick the same one, and later try to push the
/// branches to the same remote server, they will get mixed up.
///
/// To avoid those issues, Zenith has its own concept of timelines that
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A zenith timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
// Zenith Timeline ID is a 16-byte (128-bit) random ID.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct ZTimelineId([u8; 16]);
@@ -84,11 +58,3 @@ impl fmt::Display for ZTimelineId {
f.write_str(&hex::encode(self.0))
}
}
/// Returns the path of the zenith repository directory.
///
/// Uses the value of the `ZENITH_REPO_DIR` environment variable if it is set,
/// otherwise falls back to the relative path ".zenith".
///
/// The value is converted to a `PathBuf` directly from the OS string, so a
/// value that is not valid UTF-8 is accepted instead of causing a panic.
pub fn zenith_repo_dir() -> PathBuf {
    // Find repository path
    std::env::var_os("ZENITH_REPO_DIR")
        .map(PathBuf::from)
        .unwrap_or_else(|| PathBuf::from(".zenith"))
}

File diff suppressed because it is too large Load Diff

View File

@@ -10,17 +10,20 @@
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
//
use byteorder::{ReadBytesExt, WriteBytesExt, BE};
use byteorder::{BigEndian, ByteOrder};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use regex::Regex;
use std::io;
use std::io::{BufReader, BufWriter, Read, Write};
use std::net::{TcpListener, TcpStream};
use std::str::FromStr;
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use zenith_utils::lsn::Lsn;
use tokio::io::{AsyncReadExt, AsyncWriteExt, BufWriter};
use tokio::net::{TcpListener, TcpStream};
use tokio::runtime;
use tokio::runtime::Runtime;
use tokio::sync::mpsc;
use tokio::task;
use crate::basebackup;
use crate::page_cache;
@@ -47,8 +50,12 @@ enum FeMessage {
// All of these messages are actually CopyData from libpq's point of view.
//
ZenithExistsRequest(ZenithRequest),
ZenithTruncRequest(ZenithRequest),
ZenithUnlinkRequest(ZenithRequest),
ZenithNblocksRequest(ZenithRequest),
ZenithReadRequest(ZenithRequest),
ZenithCreateRequest(ZenithRequest),
ZenithExtendRequest(ZenithRequest),
}
#[derive(Debug)]
@@ -80,7 +87,7 @@ struct ZenithRequest {
relnode: u32,
forknum: u8,
blkno: u32,
lsn: Lsn,
lsn: u64,
}
#[derive(Debug)]
@@ -111,41 +118,26 @@ enum StartupRequestCode {
}
impl FeStartupMessage {
pub fn read(stream: &mut dyn std::io::Read) -> Result<Option<FeMessage>> {
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
const MAX_STARTUP_PACKET_LENGTH: u32 = 10000;
const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678;
const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679;
const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680;
// Read length. If the connection is closed before reading anything (or before
// reading 4 bytes, to be precise), return None to indicate that the connection
// was closed. This matches the PostgreSQL server's behavior, which avoids noise
// in the log if the client opens connection but closes it immediately.
let len = match stream.read_u32::<BE>() {
Ok(len) => len,
Err(err) => {
if err.kind() == std::io::ErrorKind::UnexpectedEof {
return Ok(None);
} else {
return Err(err);
}
}
};
if buf.len() < 4 {
return Ok(None);
}
let len = BigEndian::read_u32(&buf[0..4]);
if len < 4 || len as u32 > MAX_STARTUP_PACKET_LENGTH {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"invalid message length",
));
}
let bodylen = len - 4;
// Read the rest of the startup packet
let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
stream.read_exact(&mut body_buf)?;
let mut body = Bytes::from(body_buf);
let version = BigEndian::read_u32(&buf[4..8]);
// Parse the first field, which indicates what kind of a packet it is
let version = body.get_u32();
let kind = match version {
CANCEL_REQUEST_CODE => StartupRequestCode::Cancel,
NEGOTIATE_SSL_CODE => StartupRequestCode::NegotiateSsl,
@@ -153,8 +145,7 @@ impl FeStartupMessage {
_ => StartupRequestCode::Normal,
};
// Ignore the rest of the packet
buf.advance(len as usize);
Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
version,
kind,
@@ -198,11 +189,12 @@ fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes> {
}
result.put_u8(byte);
}
Ok(result.freeze())
return Ok(result.freeze());
}
impl FeParseMessage {
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
pub fn parse(body: Bytes) -> Result<FeMessage> {
let mut buf = body.clone();
let _pstmt_name = read_null_terminated(&mut buf)?;
let query_string = read_null_terminated(&mut buf)?;
let nparams = buf.get_i16();
@@ -212,7 +204,7 @@ impl FeParseMessage {
// now, just ignore the statement name, assuming that the client never
// uses more than one prepared statement at a time.
/*
if !pstmt_name.is_empty() {
if pstmt_name.len() != 0 {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"named prepared statements not implemented in Parse",
@@ -238,13 +230,14 @@ struct FeDescribeMessage {
}
impl FeDescribeMessage {
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
pub fn parse(body: Bytes) -> Result<FeMessage> {
let mut buf = body.clone();
let kind = buf.get_u8();
let _pstmt_name = read_null_terminated(&mut buf)?;
// FIXME: see FeParseMessage::parse
/*
if !pstmt_name.is_empty() {
if pstmt_name.len() != 0 {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"named prepared statements not implemented in Describe",
@@ -271,11 +264,12 @@ struct FeExecuteMessage {
}
impl FeExecuteMessage {
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
pub fn parse(body: Bytes) -> Result<FeMessage> {
let mut buf = body.clone();
let portal_name = read_null_terminated(&mut buf)?;
let maxrows = buf.get_i32();
if !portal_name.is_empty() {
if portal_name.len() != 0 {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"named portals not implemented",
@@ -298,11 +292,12 @@ impl FeExecuteMessage {
struct FeBindMessage {}
impl FeBindMessage {
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
pub fn parse(body: Bytes) -> Result<FeMessage> {
let mut buf = body.clone();
let portal_name = read_null_terminated(&mut buf)?;
let _pstmt_name = read_null_terminated(&mut buf)?;
if !portal_name.is_empty() {
if portal_name.len() != 0 {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"named portals not implemented",
@@ -311,7 +306,7 @@ impl FeBindMessage {
// FIXME: see FeParseMessage::parse
/*
if !pstmt_name.is_empty() {
if pstmt_name.len() != 0 {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"named prepared statements not implemented",
@@ -328,7 +323,8 @@ impl FeBindMessage {
struct FeCloseMessage {}
impl FeCloseMessage {
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
pub fn parse(body: Bytes) -> Result<FeMessage> {
let mut buf = body.clone();
let _kind = buf.get_u8();
let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
@@ -339,40 +335,37 @@ impl FeCloseMessage {
}
impl FeMessage {
pub fn read(stream: &mut dyn Read) -> Result<Option<FeMessage>> {
// Each libpq message begins with a message type byte, followed by message length
// If the client closes the connection, return None. But if the client closes the
// connection in the middle of a message, we will return an error.
let tag = match stream.read_u8() {
Ok(b) => b,
Err(err) => {
if err.kind() == std::io::ErrorKind::UnexpectedEof {
return Ok(None);
} else {
return Err(err);
}
}
};
let len = stream.read_u32::<BE>()?;
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
if buf.len() < 5 {
let to_read = 5 - buf.len();
buf.reserve(to_read);
return Ok(None);
}
let tag = buf[0];
let len = BigEndian::read_u32(&buf[1..5]);
// The message length includes itself, so it better be at least 4
if len < 4 {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"invalid message length: parsing u32",
));
}
let bodylen = len - 4;
// Read message body
let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
stream.read_exact(&mut body_buf)?;
let total_len = len as usize + 1;
if buf.len() < total_len {
let to_read = total_len - buf.len();
buf.reserve(to_read);
return Ok(None);
}
let mut body = Bytes::from(body_buf);
let mut body = buf.split_to(total_len);
body.advance(5);
let mut body = body.freeze();
// Parse it
match tag {
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))),
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body: body }))),
b'P' => Ok(Some(FeParseMessage::parse(body)?)),
b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
@@ -388,24 +381,28 @@ impl FeMessage {
relnode: body.get_u32(),
forknum: body.get_u8(),
blkno: body.get_u32(),
lsn: Lsn::from(body.get_u64()),
lsn: body.get_u64(),
};
// TODO: consider using protobuf or serde bincode for less error prone
// serialization.
match smgr_tag {
0 => Ok(Some(FeMessage::ZenithExistsRequest(zreq))),
1 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))),
2 => Ok(Some(FeMessage::ZenithReadRequest(zreq))),
1 => Ok(Some(FeMessage::ZenithTruncRequest(zreq))),
2 => Ok(Some(FeMessage::ZenithUnlinkRequest(zreq))),
3 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))),
4 => Ok(Some(FeMessage::ZenithReadRequest(zreq))),
5 => Ok(Some(FeMessage::ZenithCreateRequest(zreq))),
6 => Ok(Some(FeMessage::ZenithExtendRequest(zreq))),
_ => Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!("unknown smgr message tag: {},'{:?}'", smgr_tag, body),
format!("unknown smgr message tag: {},'{:?}'", smgr_tag, buf),
)),
}
}
tag => Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!("unknown message tag: {},'{:?}'", tag, body),
format!("unknown message tag: {},'{:?}'", tag, buf),
)),
}
}
@@ -413,233 +410,269 @@ impl FeMessage {
///////////////////////////////////////////////////////////////////////////////
///
/// Main loop of the page service.
///
/// Listens for connections, and launches a new handler thread for each.
///
pub fn thread_main(conf: &PageServerConf) {
// Create a new thread pool
//
// FIXME: It would be nice to keep this single-threaded for debugging purposes,
// but that currently leads to a deadlock: if a GetPage@LSN request arrives
// for an LSN that hasn't been received yet, the thread gets stuck waiting for
// the WAL to arrive. If the WAL receiver hasn't been launched yet, i.e
// we haven't received a "callmemaybe" request yet to tell us where to get the
// WAL, we will not have a thread available to process the "callmemaybe"
// request when it does arrive. Using a thread pool alleviates the problem so
// that it doesn't happen in the tests anymore, but in principle it could still
// happen if we receive enough GetPage@LSN requests to consume all of the
// available threads.
//let runtime = runtime::Builder::new_current_thread().enable_all().build().unwrap();
let runtime = runtime::Runtime::new().unwrap();
info!("Starting page server on {}", conf.listen_addr);
let listener = TcpListener::bind(conf.listen_addr).unwrap();
let runtime_ref = Arc::new(runtime);
loop {
let (socket, peer_addr) = listener.accept().unwrap();
debug!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
let mut conn_handler = Connection::new(conf.clone(), socket);
runtime_ref.clone().block_on(async {
let listener = TcpListener::bind(conf.listen_addr).await.unwrap();
thread::spawn(move || {
if let Err(err) = conn_handler.run() {
error!("error: {}", err);
}
});
}
loop {
let (socket, peer_addr) = listener.accept().await.unwrap();
debug!("accepted connection from {}", peer_addr);
let mut conn_handler = Connection::new(conf.clone(), socket, &runtime_ref);
task::spawn(async move {
if let Err(err) = conn_handler.run().await {
error!("error: {}", err);
}
});
}
});
}
#[derive(Debug)]
struct Connection {
stream_in: BufReader<TcpStream>,
stream: BufWriter<TcpStream>,
buffer: BytesMut,
init_done: bool,
conf: PageServerConf,
runtime: Arc<Runtime>,
}
impl Connection {
pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection {
pub fn new(conf: PageServerConf, socket: TcpStream, runtime: &Arc<Runtime>) -> Connection {
Connection {
stream_in: BufReader::new(socket.try_clone().unwrap()),
stream: BufWriter::new(socket),
buffer: BytesMut::with_capacity(10 * 1024),
init_done: false,
conf,
runtime: Arc::clone(runtime),
}
}
//
// Read full message or return None if connection is closed
//
fn read_message(&mut self) -> Result<Option<FeMessage>> {
if !self.init_done {
FeStartupMessage::read(&mut self.stream_in)
} else {
FeMessage::read(&mut self.stream_in)
async fn read_message(&mut self) -> Result<Option<FeMessage>> {
loop {
if let Some(message) = self.parse_message()? {
return Ok(Some(message));
}
if self.stream.read_buf(&mut self.buffer).await? == 0 {
if self.buffer.is_empty() {
return Ok(None);
} else {
return Err(io::Error::new(
io::ErrorKind::Other,
"connection reset by peer",
));
}
}
}
}
fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<()> {
fn parse_message(&mut self) -> Result<Option<FeMessage>> {
if !self.init_done {
FeStartupMessage::parse(&mut self.buffer)
} else {
FeMessage::parse(&mut self.buffer)
}
}
async fn write_message_noflush(&mut self, message: &BeMessage) -> io::Result<()> {
match message {
BeMessage::AuthenticationOk => {
self.stream.write_u8(b'R')?;
self.stream.write_i32::<BE>(4 + 4)?;
self.stream.write_i32::<BE>(0)?;
self.stream.write_u8(b'R').await?;
self.stream.write_i32(4 + 4).await?;
self.stream.write_i32(0).await?;
}
BeMessage::ReadyForQuery => {
self.stream.write_u8(b'Z')?;
self.stream.write_i32::<BE>(4 + 1)?;
self.stream.write_u8(b'I')?;
self.stream.write_u8(b'Z').await?;
self.stream.write_i32(4 + 1).await?;
self.stream.write_u8(b'I').await?;
}
BeMessage::ParseComplete => {
self.stream.write_u8(b'1')?;
self.stream.write_i32::<BE>(4)?;
self.stream.write_u8(b'1').await?;
self.stream.write_i32(4).await?;
}
BeMessage::BindComplete => {
self.stream.write_u8(b'2')?;
self.stream.write_i32::<BE>(4)?;
self.stream.write_u8(b'2').await?;
self.stream.write_i32(4).await?;
}
BeMessage::CloseComplete => {
self.stream.write_u8(b'3')?;
self.stream.write_i32::<BE>(4)?;
self.stream.write_u8(b'3').await?;
self.stream.write_i32(4).await?;
}
BeMessage::NoData => {
self.stream.write_u8(b'n')?;
self.stream.write_i32::<BE>(4)?;
self.stream.write_u8(b'n').await?;
self.stream.write_i32(4).await?;
}
BeMessage::ParameterDescription => {
self.stream.write_u8(b't')?;
self.stream.write_i32::<BE>(6)?;
self.stream.write_u8(b't').await?;
self.stream.write_i32(6).await?;
// we don't support params, so always 0
self.stream.write_i16::<BE>(0)?;
self.stream.write_i16(0).await?;
}
BeMessage::RowDescription => {
// XXX
let b = Bytes::from("data\0");
let mut b = Bytes::from("data\0");
self.stream.write_u8(b'T')?;
self.stream.write_u8(b'T').await?;
self.stream
.write_i32::<BE>(4 + 2 + b.len() as i32 + 3 * (4 + 2))?;
.write_i32(4 + 2 + b.len() as i32 + 3 * (4 + 2))
.await?;
self.stream.write_i16::<BE>(1)?;
self.stream.write_all(&b)?;
self.stream.write_i32::<BE>(0)?; /* table oid */
self.stream.write_i16::<BE>(0)?; /* attnum */
self.stream.write_i32::<BE>(25)?; /* TEXTOID */
self.stream.write_i16::<BE>(-1)?; /* typlen */
self.stream.write_i32::<BE>(0)?; /* typmod */
self.stream.write_i16::<BE>(0)?; /* format code */
self.stream.write_i16(1).await?;
self.stream.write_all(&mut b).await?;
self.stream.write_i32(0).await?; /* table oid */
self.stream.write_i16(0).await?; /* attnum */
self.stream.write_i32(25).await?; /* TEXTOID */
self.stream.write_i16(-1).await?; /* typlen */
self.stream.write_i32(0).await?; /* typmod */
self.stream.write_i16(0).await?; /* format code */
}
// XXX: accept some text data
BeMessage::DataRow => {
// XXX
let b = Bytes::from("hello world");
let mut b = Bytes::from("hello world");
self.stream.write_u8(b'D')?;
self.stream.write_i32::<BE>(4 + 2 + 4 + b.len() as i32)?;
self.stream.write_u8(b'D').await?;
self.stream.write_i32(4 + 2 + 4 + b.len() as i32).await?;
self.stream.write_i16::<BE>(1)?;
self.stream.write_i32::<BE>(b.len() as i32)?;
self.stream.write_all(&b)?;
self.stream.write_i16(1).await?;
self.stream.write_i32(b.len() as i32).await?;
self.stream.write_all(&mut b).await?;
}
BeMessage::ControlFile => {
// TODO pass checkpoint and xid info in this message
let b = Bytes::from("hello pg_control");
let mut b = Bytes::from("hello pg_control");
self.stream.write_u8(b'D')?;
self.stream.write_i32::<BE>(4 + 2 + 4 + b.len() as i32)?;
self.stream.write_u8(b'D').await?;
self.stream.write_i32(4 + 2 + 4 + b.len() as i32).await?;
self.stream.write_i16::<BE>(1)?;
self.stream.write_i32::<BE>(b.len() as i32)?;
self.stream.write_all(&b)?;
self.stream.write_i16(1).await?;
self.stream.write_i32(b.len() as i32).await?;
self.stream.write_all(&mut b).await?;
}
BeMessage::CommandComplete => {
let b = Bytes::from("SELECT 1\0");
let mut b = Bytes::from("SELECT 1\0");
self.stream.write_u8(b'C')?;
self.stream.write_i32::<BE>(4 + b.len() as i32)?;
self.stream.write_all(&b)?;
self.stream.write_u8(b'C').await?;
self.stream.write_i32(4 + b.len() as i32).await?;
self.stream.write_all(&mut b).await?;
}
BeMessage::ZenithStatusResponse(resp) => {
self.stream.write_u8(b'd')?;
self.stream.write_u32::<BE>(4 + 1 + 1 + 4)?;
self.stream.write_u8(100)?; /* tag from pagestore_client.h */
self.stream.write_u8(resp.ok as u8)?;
self.stream.write_u32::<BE>(resp.n_blocks)?;
self.stream.write_u8(b'd').await?;
self.stream.write_u32(4 + 1 + 1 + 4).await?;
self.stream.write_u8(100).await?; /* tag from pagestore_client.h */
self.stream.write_u8(resp.ok as u8).await?;
self.stream.write_u32(resp.n_blocks).await?;
}
BeMessage::ZenithNblocksResponse(resp) => {
self.stream.write_u8(b'd')?;
self.stream.write_u32::<BE>(4 + 1 + 1 + 4)?;
self.stream.write_u8(101)?; /* tag from pagestore_client.h */
self.stream.write_u8(resp.ok as u8)?;
self.stream.write_u32::<BE>(resp.n_blocks)?;
self.stream.write_u8(b'd').await?;
self.stream.write_u32(4 + 1 + 1 + 4).await?;
self.stream.write_u8(101).await?; /* tag from pagestore_client.h */
self.stream.write_u8(resp.ok as u8).await?;
self.stream.write_u32(resp.n_blocks).await?;
}
BeMessage::ZenithReadResponse(resp) => {
self.stream.write_u8(b'd')?;
self.stream.write_u8(b'd').await?;
self.stream
.write_u32::<BE>(4 + 1 + 1 + 4 + resp.page.len() as u32)?;
self.stream.write_u8(102)?; /* tag from pagestore_client.h */
self.stream.write_u8(resp.ok as u8)?;
self.stream.write_u32::<BE>(resp.n_blocks)?;
self.stream.write_all(&resp.page.clone())?;
.write_u32(4 + 1 + 1 + 4 + resp.page.len() as u32)
.await?;
self.stream.write_u8(102).await?; /* tag from pagestore_client.h */
self.stream.write_u8(resp.ok as u8).await?;
self.stream.write_u32(resp.n_blocks).await?;
self.stream.write_all(&mut resp.page.clone()).await?;
}
}
Ok(())
}
fn write_message(&mut self, message: &BeMessage) -> io::Result<()> {
self.write_message_noflush(message)?;
self.stream.flush()
async fn write_message(&mut self, message: &BeMessage) -> io::Result<()> {
self.write_message_noflush(message).await?;
self.stream.flush().await
}
fn run(&mut self) -> Result<()> {
async fn run(&mut self) -> Result<()> {
let mut unnamed_query_string = Bytes::new();
loop {
let msg = self.read_message()?;
trace!("got message {:?}", msg);
let msg = self.read_message().await?;
info!("got message {:?}", msg);
match msg {
Some(FeMessage::StartupMessage(m)) => {
trace!("got message {:?}", m);
match m.kind {
StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
let b = Bytes::from("N");
self.stream.write_all(&b)?;
self.stream.flush()?;
let mut b = Bytes::from("N");
self.stream.write_all(&mut b).await?;
self.stream.flush().await?;
}
StartupRequestCode::Normal => {
self.write_message_noflush(&BeMessage::AuthenticationOk)?;
self.write_message(&BeMessage::ReadyForQuery)?;
self.write_message_noflush(&BeMessage::AuthenticationOk)
.await?;
self.write_message(&BeMessage::ReadyForQuery).await?;
self.init_done = true;
}
StartupRequestCode::Cancel => return Ok(()),
}
}
Some(FeMessage::Query(m)) => {
self.process_query(m.body)?;
self.process_query(m.body).await?;
}
Some(FeMessage::Parse(m)) => {
unnamed_query_string = m.query_string;
self.write_message(&BeMessage::ParseComplete)?;
self.write_message(&BeMessage::ParseComplete).await?;
}
Some(FeMessage::Describe(_)) => {
self.write_message_noflush(&BeMessage::ParameterDescription)?;
self.write_message(&BeMessage::NoData)?;
self.write_message_noflush(&BeMessage::ParameterDescription)
.await?;
self.write_message(&BeMessage::NoData).await?;
}
Some(FeMessage::Bind(_)) => {
self.write_message(&BeMessage::BindComplete)?;
self.write_message(&BeMessage::BindComplete).await?;
}
Some(FeMessage::Close(_)) => {
self.write_message(&BeMessage::CloseComplete)?;
self.write_message(&BeMessage::CloseComplete).await?;
}
Some(FeMessage::Execute(_)) => {
self.process_query(unnamed_query_string.clone())?;
self.process_query(unnamed_query_string.clone()).await?;
}
Some(FeMessage::Sync) => {
self.write_message(&BeMessage::ReadyForQuery)?;
self.write_message(&BeMessage::ReadyForQuery).await?;
}
Some(FeMessage::Terminate) => {
break;
@@ -658,7 +691,7 @@ impl Connection {
Ok(())
}
fn process_query(&mut self, query_string: Bytes) -> Result<()> {
async fn process_query(&mut self, query_string: Bytes) -> Result<()> {
debug!("process query {:?}", query_string);
// remove null terminator, if any
@@ -668,13 +701,13 @@ impl Connection {
}
if query_string.starts_with(b"controlfile") {
self.handle_controlfile()
self.handle_controlfile().await
} else if query_string.starts_with(b"pagestream ") {
let (_l, r) = query_string.split_at("pagestream ".len());
let timelineid_str = String::from_utf8(r.to_vec()).unwrap();
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
self.handle_pagerequests(timelineid)
self.handle_pagerequests(timelineid).await
} else if query_string.starts_with(b"basebackup ") {
let (_l, r) = query_string.split_at("basebackup ".len());
let r = r.to_vec();
@@ -683,18 +716,21 @@ impl Connection {
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
// Check that the timeline exists
self.handle_basebackup_request(timelineid)?;
self.write_message_noflush(&BeMessage::CommandComplete)?;
self.write_message(&BeMessage::ReadyForQuery)
self.handle_basebackup_request(timelineid).await?;
self.write_message_noflush(&BeMessage::CommandComplete)
.await?;
self.write_message(&BeMessage::ReadyForQuery).await
} else if query_string.starts_with(b"callmemaybe ") {
let query_str = String::from_utf8(query_string.to_vec()).unwrap();
let query_str = String::from_utf8(query_string.to_vec())
.unwrap()
.to_string();
// callmemaybe <zenith timelineid as hex string> <connstr>
let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap();
let caps = re.captures(&query_str);
let caps = caps.unwrap();
let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str()).unwrap();
let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str().clone()).unwrap();
let connstr: String = String::from(caps.get(2).unwrap().as_str());
// Check that the timeline exists
@@ -707,29 +743,36 @@ impl Connection {
walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr);
self.write_message_noflush(&BeMessage::CommandComplete)?;
self.write_message(&BeMessage::ReadyForQuery)
self.write_message_noflush(&BeMessage::CommandComplete)
.await?;
self.write_message(&BeMessage::ReadyForQuery).await
} else if query_string.starts_with(b"status") {
self.write_message_noflush(&BeMessage::RowDescription)?;
self.write_message_noflush(&BeMessage::DataRow)?;
self.write_message_noflush(&BeMessage::CommandComplete)?;
self.write_message(&BeMessage::ReadyForQuery)
self.write_message_noflush(&BeMessage::RowDescription)
.await?;
self.write_message_noflush(&BeMessage::DataRow).await?;
self.write_message_noflush(&BeMessage::CommandComplete)
.await?;
self.write_message(&BeMessage::ReadyForQuery).await
} else {
self.write_message_noflush(&BeMessage::RowDescription)?;
self.write_message_noflush(&BeMessage::DataRow)?;
self.write_message_noflush(&BeMessage::CommandComplete)?;
self.write_message(&BeMessage::ReadyForQuery)
self.write_message_noflush(&BeMessage::RowDescription)
.await?;
self.write_message_noflush(&BeMessage::DataRow).await?;
self.write_message_noflush(&BeMessage::CommandComplete)
.await?;
self.write_message(&BeMessage::ReadyForQuery).await
}
}
fn handle_controlfile(&mut self) -> Result<()> {
self.write_message_noflush(&BeMessage::RowDescription)?;
self.write_message_noflush(&BeMessage::ControlFile)?;
self.write_message_noflush(&BeMessage::CommandComplete)?;
self.write_message(&BeMessage::ReadyForQuery)
async fn handle_controlfile(&mut self) -> Result<()> {
self.write_message_noflush(&BeMessage::RowDescription)
.await?;
self.write_message_noflush(&BeMessage::ControlFile).await?;
self.write_message_noflush(&BeMessage::CommandComplete)
.await?;
self.write_message(&BeMessage::ReadyForQuery).await
}
fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
async fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
// Check that the timeline exists
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
if pcache.is_err() {
@@ -740,17 +783,17 @@ impl Connection {
let pcache = pcache.unwrap();
/* switch client to COPYBOTH */
self.stream.write_u8(b'W')?;
self.stream.write_i32::<BE>(4 + 1 + 2)?;
self.stream.write_u8(0)?; /* copy_is_binary */
self.stream.write_i16::<BE>(0)?; /* numAttributes */
self.stream.flush()?;
self.stream.write_u8(b'W').await?;
self.stream.write_i32(4 + 1 + 2).await?;
self.stream.write_u8(0).await?; /* copy_is_binary */
self.stream.write_i16(0).await?; /* numAttributes */
self.stream.flush().await?;
loop {
let message = self.read_message()?;
let message = self.read_message().await?;
if let Some(m) = &message {
trace!("query({:?}): {:?}", timelineid, m);
info!("query({:?}): {:?}", timelineid, m);
};
if message.is_none() {
@@ -767,12 +810,27 @@ impl Connection {
forknum: req.forknum,
};
let exist = pcache.relsize_exist(&tag, req.lsn).unwrap_or(false);
let exist = pcache.relsize_exist(&tag);
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
ok: exist,
n_blocks: 0,
}))?
}))
.await?
}
Some(FeMessage::ZenithTruncRequest(_)) => {
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
ok: true,
n_blocks: 0,
}))
.await?
}
Some(FeMessage::ZenithUnlinkRequest(_)) => {
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
ok: true,
n_blocks: 0,
}))
.await?
}
Some(FeMessage::ZenithNblocksRequest(req)) => {
let tag = page_cache::RelTag {
@@ -782,21 +840,20 @@ impl Connection {
forknum: req.forknum,
};
let n_blocks = pcache.relsize_get(&tag, req.lsn).unwrap_or(0);
let n_blocks = pcache.relsize_get(&tag);
self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse {
ok: true,
n_blocks,
}))?
}))
.await?
}
Some(FeMessage::ZenithReadRequest(req)) => {
let buf_tag = page_cache::BufferTag {
rel: page_cache::RelTag {
spcnode: req.spcnode,
dbnode: req.dbnode,
relnode: req.relnode,
forknum: req.forknum,
},
spcnode: req.spcnode,
dbnode: req.dbnode,
relnode: req.relnode,
forknum: req.forknum,
blknum: req.blkno,
};
@@ -817,14 +874,46 @@ impl Connection {
}
};
self.write_message(&msg)?
self.write_message(&msg).await?
}
Some(FeMessage::ZenithCreateRequest(req)) => {
let tag = page_cache::RelTag {
spcnode: req.spcnode,
dbnode: req.dbnode,
relnode: req.relnode,
forknum: req.forknum,
};
pcache.relsize_inc(&tag, 0);
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
ok: true,
n_blocks: 0,
}))
.await?
}
Some(FeMessage::ZenithExtendRequest(req)) => {
let tag = page_cache::RelTag {
spcnode: req.spcnode,
dbnode: req.dbnode,
relnode: req.relnode,
forknum: req.forknum,
};
pcache.relsize_inc(&tag, req.blkno + 1);
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
ok: true,
n_blocks: 0,
}))
.await?
}
_ => {}
}
}
}
fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
async fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
// check that the timeline exists
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
if pcache.is_err() {
@@ -835,11 +924,11 @@ impl Connection {
/* switch client to COPYOUT */
let stream = &mut self.stream;
stream.write_u8(b'H')?;
stream.write_i32::<BE>(4 + 1 + 2)?;
stream.write_u8(0)?; /* copy_is_binary */
stream.write_i16::<BE>(0)?; /* numAttributes */
stream.flush()?;
stream.write_u8(b'H').await?;
stream.write_i32(4 + 1 + 2).await?;
stream.write_u8(0).await?; /* copy_is_binary */
stream.write_i16(0).await?; /* numAttributes */
stream.flush().await?;
info!("sent CopyOut");
/* Send a tarball of the latest snapshot on the timeline */
@@ -847,45 +936,72 @@ impl Connection {
// find latest snapshot
let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
basebackup::send_snapshot_tarball(&mut CopyDataSink { stream }, timelineid, snapshotlsn)?;
// Stream it
let (s, mut r) = mpsc::channel(5);
let f_tar = task::spawn_blocking(move || {
basebackup::send_snapshot_tarball(&mut CopyDataSink(s), timelineid, snapshotlsn)?;
Ok(())
});
let f_tar2 = async {
let joinres = f_tar.await;
if joinres.is_err() {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
joinres.unwrap_err(),
));
}
return joinres.unwrap();
};
let f_pump = async move {
loop {
let buf = r.recv().await;
if buf.is_none() {
break;
}
let mut buf = buf.unwrap();
// CopyData
stream.write_u8(b'd').await?;
stream.write_u32((4 + buf.len()) as u32).await?;
stream.write_all(&mut buf).await?;
trace!("CopyData sent for {} bytes!", buf.len());
// FIXME: flush isn't really required, but makes it easier
// to view in wireshark
stream.flush().await?;
}
Ok(())
};
tokio::try_join!(f_tar2, f_pump)?;
// CopyDone
self.stream.write_u8(b'c')?;
self.stream.write_u32::<BE>(4)?;
self.stream.flush()?;
self.stream.write_u8(b'c').await?;
self.stream.write_u32(4).await?;
self.stream.flush().await?;
debug!("CopyDone sent!");
// FIXME: I'm getting an error from the tokio copyout driver without this.
// I think it happens when the CommandComplete, CloseComplete and ReadyForQuery
// are sent in the same TCP packet as the CopyDone. I don't understand why.
thread::sleep(Duration::from_secs(1));
thread::sleep(std::time::Duration::from_secs(1));
Ok(())
}
}
///
/// A std::io::Write implementation that wraps all data written to it in CopyData
/// messages.
///
struct CopyDataSink<'a> {
stream: &'a mut BufWriter<TcpStream>,
}
struct CopyDataSink(mpsc::Sender<Bytes>);
impl<'a> std::io::Write for CopyDataSink<'a> {
impl std::io::Write for CopyDataSink {
fn write(&mut self, data: &[u8]) -> std::result::Result<usize, std::io::Error> {
// CopyData
// FIXME: if the input is large, we should split it into multiple messages.
// Not sure what the threshold should be, but the ultimate hard limit is that
// the length cannot exceed u32.
self.stream.write_u8(b'd')?;
self.stream.write_u32::<BE>((4 + data.len()) as u32)?;
self.stream.write_all(&data)?;
trace!("CopyData sent for {} bytes!", data.len());
let buf = Bytes::copy_from_slice(data);
// FIXME: flush isn't really required, but makes it easier
// to view in wireshark
self.stream.flush()?;
if let Err(e) = self.0.blocking_send(buf) {
return Err(io::Error::new(io::ErrorKind::Other, e));
}
Ok(data.len())
}

View File

@@ -0,0 +1,53 @@
// Constants mirrored from PostgreSQL headers. Each group below names the
// header it was taken from; the values must stay in sync with those headers.

// From pg_tablespace_d.h
//
// Tablespace OIDs (default and global).
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
pub const GLOBALTABLESPACE_OID: u32 = 1664;
// Special values for non-rel files' tags: pseudo fork numbers, one per kind of
// non-relation file (control file, filenode map, xact, multixact offsets/members).
// NOTE(review): presumably chosen so they never collide with real fork numbers - confirm.
//TODO maybe use enum?
pub const PG_CONTROLFILE_FORKNUM: u32 = 42;
pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
pub const PG_XACT_FORKNUM: u32 = 44;
pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;
//
// constants from clog.h
//
// Each transaction status takes CLOG_BITS_PER_XACT (2) bits, so one byte holds
// CLOG_XACTS_PER_BYTE (4) statuses.
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
// Statuses per page: 8192 bytes per page times statuses per byte.
// NOTE(review): 8192 is presumably BLCKSZ - confirm.
pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
pub const CLOG_BITS_PER_XACT: u8 = 2;
// Mask covering one transaction's status bits: (1 << 2) - 1 == 0x03.
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
// Transaction status values; each fits in CLOG_BITS_PER_XACT bits.
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
// CLOG WAL record opcodes.
pub const CLOG_ZEROPAGE: u8 = 0x00;
pub const CLOG_TRUNCATE: u8 = 0x10;
// From xact.h
// Opcodes of transaction commit/abort WAL records (see XLOG_XACT_OPMASK below).
pub const XLOG_XACT_COMMIT: u8 = 0x00;
pub const XLOG_XACT_ABORT: u8 = 0x20;
/* mask for filtering opcodes out of xl_info */
pub const XLOG_XACT_OPMASK: u8 = 0x70;
/* does this record have a 'xinfo' field or not */
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
/*
* The following flags, stored in xinfo, determine which information is
* contained in commit/abort records.
*/
// Each flag occupies its own bit (1, 2, 4), so they can be OR-ed together.
pub const XACT_XINFO_HAS_DBINFO: u32 = 1;
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 2;
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 4;
// From pg_control.h and rmgrlist.h
pub const XLOG_SWITCH: u8 = 0x40;
// Resource manager IDs.
pub const RM_XLOG_ID: u8 = 0;
pub const RM_XACT_ID: u8 = 1;
pub const RM_CLOG_ID: u8 = 3;
// pub const RM_MULTIXACT_ID:u8 = 6;
// from xlogreader.h
pub const XLR_INFO_MASK: u8 = 0x0F;

View File

@@ -12,8 +12,10 @@
use log::*;
use regex::Regex;
use std::fmt;
use std::cmp::max;
use std::error::Error;
use std::fs;
use std::fs::File;
use std::io::Read;
@@ -27,14 +29,9 @@ use bytes::Bytes;
use crate::page_cache;
use crate::page_cache::BufferTag;
use crate::page_cache::PageCache;
use crate::page_cache::RelTag;
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
use crate::PageServerConf;
use crate::ZTimelineId;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::FilePathError;
use zenith_utils::lsn::Lsn;
// From pg_tablespace_d.h
//
@@ -61,21 +58,20 @@ pub fn restore_timeline(
.join(timeline.to_string())
.join("snapshots");
let mut last_snapshot_lsn: Lsn = Lsn(0);
let mut last_snapshot_lsn: u64 = 0;
for direntry in fs::read_dir(&snapshotspath).unwrap() {
let direntry = direntry?;
let filename = direntry.file_name();
let lsn = Lsn::from_filename(&filename)?;
let filename = direntry.file_name().to_str().unwrap().to_owned();
let lsn = u64::from_str_radix(&filename, 16)?;
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
// FIXME: pass filename as Path instead of str?
let filename_str = filename.into_string().unwrap();
restore_snapshot(conf, pcache, timeline, &filename_str)?;
info!("restored snapshot at {:?}", filename_str);
restore_snapshot(conf, pcache, timeline, &filename)?;
info!("restored snapshot at {}", filename);
}
if last_snapshot_lsn == Lsn(0) {
if last_snapshot_lsn == 0 {
error!(
"could not find valid snapshot in {}",
snapshotspath.display()
@@ -170,17 +166,7 @@ fn restore_snapshot(
}
}
}
for entry in fs::read_dir(snapshotpath.join("pg_xact"))? {
let entry = entry?;
restore_nonrelfile(
conf,
pcache,
timeline,
snapshot,
pg_constants::PG_XACT_FORKNUM,
&entry.path(),
)?;
}
// TODO: Scan pg_tblspc
Ok(())
@@ -195,33 +181,33 @@ fn restore_relfile(
dboid: u32,
path: &Path,
) -> Result<()> {
let lsn = Lsn::from_hex(snapshot)?;
let lsn = u64::from_str_radix(snapshot, 16)?;
// Does it look like a relation file?
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
if p.is_err() {
let e = p.unwrap_err();
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
return Err(e.into());
return Err(e)?;
}
let (relnode, forknum, segno) = p.unwrap();
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
// FIXME: use constants (BLCKSZ)
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192);
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
let tag = BufferTag {
rel: RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode,
forknum,
},
blknum,
let tag = page_cache::BufferTag {
spcnode: spcoid,
dbnode: dboid,
relnode: relnode,
forknum: forknum as u8,
blknum: blknum,
};
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
/*
@@ -247,62 +233,13 @@ fn restore_relfile(
blknum += 1;
}
Ok(())
}
fn restore_nonrelfile(
_conf: &PageServerConf,
pcache: &PageCache,
_timeline: ZTimelineId,
snapshot: &str,
forknum: u8,
path: &Path,
) -> Result<()> {
let lsn = Lsn::from_hex(snapshot)?;
// Does it look like a relation file?
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
let tag = BufferTag {
rel: RelTag {
spcnode: 0,
dbnode: 0,
relnode: 0,
forknum,
},
blknum,
};
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
/*
if oldest_lsn == 0 || p.lsn < oldest_lsn {
oldest_lsn = p.lsn;
}
*/
}
// TODO: UnexpectedEof is expected
Err(e) => match e.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
break;
}
_ => {
error!("error reading file: {:?} ({})", path, e);
break;
}
},
};
blknum += 1;
}
let tag = page_cache::RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode: relnode,
forknum: forknum as u8,
};
pcache.relsize_inc(&tag, blknum);
Ok(())
}
@@ -313,16 +250,15 @@ fn restore_wal(
_conf: &PageServerConf,
pcache: &PageCache,
timeline: ZTimelineId,
startpoint: Lsn,
startpoint: u64,
) -> Result<()> {
let walpath = format!("timelines/{}/wal", timeline);
let mut waldecoder = WalStreamDecoder::new(startpoint);
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
const SEG_SIZE: u64 = 16 * 1024 * 1024;
let mut segno = startpoint.segment_number(SEG_SIZE);
let mut offset = startpoint.segment_offset(SEG_SIZE);
let mut last_lsn = Lsn(0);
let mut segno = XLByteToSeg(startpoint, 16 * 1024 * 1024);
let mut offset = XLogSegmentOffset(startpoint, 16 * 1024 * 1024);
let mut last_lsn = 0;
loop {
// FIXME: assume postgresql tli 1 for now
let filename = XLogFileName(1, segno, 16 * 1024 * 1024);
@@ -330,17 +266,18 @@ fn restore_wal(
// It could be as .partial
if !PathBuf::from(&path).exists() {
path += ".partial";
path = path + ".partial";
}
// Slurp the WAL file
let open_result = File::open(&path);
if let Err(e) = &open_result {
if let Err(e) = open_result {
if e.kind() == std::io::ErrorKind::NotFound {
break;
}
return Err(e)?;
}
let mut file = open_result?;
let mut file = open_result.unwrap();
if offset > 0 {
file.seek(SeekFrom::Start(offset as u64))?;
@@ -370,20 +307,18 @@ fn restore_wal(
// so having multiple copies of it doesn't cost that much)
for blk in decoded.blocks.iter() {
let tag = BufferTag {
rel: RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
},
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
blknum: blk.blkno,
};
let rec = page_cache::WALRecord {
lsn,
lsn: lsn,
will_init: blk.will_init || blk.apply_image,
truncate: false,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
main_data_offset: decoded.main_data_offset,
};
pcache.put_wal_record(tag, rec);
@@ -403,17 +338,117 @@ fn restore_wal(
segno += 1;
offset = 0;
}
info!("reached end of WAL at {}", last_lsn);
info!(
"reached end of WAL at {:X}/{:X}",
last_lsn >> 32,
last_lsn & 0xffffffff
);
Ok(())
}
// FIXME: copied from xlog_utils.rs
/// Length of a WAL segment file name: three 8-digit hexadecimal fields
/// (timeline, log id, segment), as produced by `XLogFileName`.
pub const XLOG_FNAME_LEN: usize = 24;
/// A position in the WAL stream (LSN).
pub type XLogRecPtr = u64;
/// Sequence number of a WAL segment.
pub type XLogSegNo = u64;
/// PostgreSQL timeline identifier.
pub type TimeLineID = u32;
#[allow(non_snake_case)]
/// Returns the byte offset of `xlogptr` inside its WAL segment.
///
/// `wal_segsz_bytes` is the segment size; the computation masks with
/// `size - 1`, so it is only meaningful for power-of-two sizes.
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
    let low_bits = xlogptr as u32;
    let offset_mask = wal_segsz_bytes as u32 - 1;
    low_bits & offset_mask
}
#[allow(non_snake_case)]
/// Returns the number of the WAL segment that contains `xlogptr`,
/// given a segment size of `wal_segsz_bytes` bytes.
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
    let segment_size = wal_segsz_bytes as u64;
    xlogptr / segment_size
}
#[allow(non_snake_case)]
/// Builds the 24-character WAL segment file name for segment `logSegNo`
/// on timeline `tli`.
///
/// The name is three zero-padded, upper-case, 8-digit hexadecimal fields:
/// the timeline, then the segment number divided by, and modulo, the
/// number of segments per xlog id.
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
    let segs_per_id = XLogSegmentsPerXLogId(wal_segsz_bytes);
    let log_id = logSegNo / segs_per_id;
    let seg_in_log = logSegNo % segs_per_id;
    format!("{:>08X}{:>08X}{:>08X}", tli, log_id, seg_in_log)
}
#[allow(non_snake_case)]
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
}
#[allow(non_snake_case)]
/// Parses a WAL segment file name back into `(segment number, timeline)`.
///
/// `fname` must start with three 8-digit hexadecimal fields (timeline,
/// log id, segment); `wal_seg_size` is the segment size in bytes.
///
/// # Panics
///
/// Panics if `fname` is shorter than 24 bytes or any field is not valid
/// hexadecimal.
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
    // Decode one fixed-width hexadecimal field of the file name.
    let hex_field =
        |range: std::ops::Range<usize>| u32::from_str_radix(&fname[range], 16).unwrap();

    let tli = hex_field(0..8);
    let log = hex_field(8..16) as XLogSegNo;
    let seg = hex_field(16..24) as XLogSegNo;

    let segno = log * XLogSegmentsPerXLogId(wal_seg_size) + seg;
    (segno, tli)
}
#[allow(non_snake_case)]
/// Returns `true` if `fname` looks like a WAL segment file name: exactly
/// `XLOG_FNAME_LEN` bytes long and made up only of ASCII hex digits.
pub fn IsXLogFileName(fname: &str) -> bool {
    if fname.len() != XLOG_FNAME_LEN {
        return false;
    }
    // A string is all ASCII hex digits exactly when every byte is one.
    fname.bytes().all(|b| b.is_ascii_hexdigit())
}
#[allow(non_snake_case)]
pub fn IsPartialXLogFileName(fname: &str) -> bool {
if let Some(basefname) = fname.strip_suffix(".partial") {
IsXLogFileName(basefname)
} else {
false
}
}
#[derive(Debug, Clone)]
struct FilePathError {
msg: String,
}
impl Error for FilePathError {
fn description(&self) -> &str {
&self.msg
}
}
impl FilePathError {
fn new(msg: &str) -> FilePathError {
FilePathError {
msg: msg.to_string(),
}
}
}
impl From<core::num::ParseIntError> for FilePathError {
fn from(e: core::num::ParseIntError) -> Self {
return FilePathError {
msg: format!("invalid filename: {}", e),
};
}
}
impl fmt::Display for FilePathError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "invalid filename")
}
}
/// Maps the optional fork-name suffix of a relation file name to its fork
/// number: no suffix is the main fork (0), then "fsm" = 1, "vm" = 2,
/// "init" = 3.
///
/// # Errors
///
/// Returns a `FilePathError` ("invalid forkname") for any other name.
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
    // "main" is not in filenames, it's implicit if the fork name is not present
    let name = match forkname {
        Some(name) => name,
        None => return Ok(0),
    };

    let forknum = match name {
        "fsm" => 1,
        "vm" => 2,
        "init" => 3,
        _ => return Err(FilePathError::new("invalid forkname")),
    };
    Ok(forknum)
}
#[derive(Debug)]
struct ParsedBaseImageFileName {
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
pub forknum: u8,
pub forknum: u32,
pub segno: u32,
pub lsn: u64,
@@ -425,7 +460,7 @@ struct ParsedBaseImageFileName {
// <oid>.<segment number>
// <oid>_<fork name>.<segment number>
fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
let caps = re
@@ -435,8 +470,13 @@ fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
let relnode_str = caps.name("relnode").unwrap().as_str();
let relnode = u32::from_str_radix(relnode_str, 10)?;
let forkname = caps.name("forkname").map(|f| f.as_str());
let forknum = postgres_ffi::forkname_to_forknum(forkname)?;
let forkname_match = caps.name("forkname");
let forkname = if forkname_match.is_none() {
None
} else {
Some(forkname_match.unwrap().as_str())
};
let forknum = forkname_to_forknum(forkname)?;
let segno_match = caps.name("segno");
let segno = if segno_match.is_none() {
@@ -445,5 +485,5 @@ fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
};
Ok((relnode, forknum, segno))
return Ok((relnode, forknum, segno));
}

View File

@@ -38,9 +38,12 @@ pub fn restore_main(conf: &PageServerConf) {
let result = restore_chunk(conf).await;
match result {
Ok(_) => {}
Ok(_) => {
return;
}
Err(err) => {
error!("S3 error: {}", err);
return;
}
}
});
@@ -133,12 +136,50 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
const DEFAULTTABLESPACE_OID: u32 = 1663;
const GLOBALTABLESPACE_OID: u32 = 1664;
/// Error describing why an object path or file name could not be parsed
/// as a relation data file.
///
/// The reason is stored in `msg` and printed by `Display`, so callers that
/// log the error see the specific cause.
#[derive(Debug)]
struct FilePathError {
    msg: String,
}

impl FilePathError {
    /// Creates an error carrying `msg` as its reason.
    fn new(msg: &str) -> FilePathError {
        FilePathError {
            msg: msg.to_string(),
        }
    }
}

/// Lets `?` convert a failed integer parse of a path component, keeping
/// the parser's own explanation in the message.
impl From<std::num::ParseIntError> for FilePathError {
    fn from(e: std::num::ParseIntError) -> Self {
        FilePathError {
            msg: format!("invalid filename: {}", e),
        }
    }
}

impl fmt::Display for FilePathError {
    /// Writes the stored reason. Previously this always printed the constant
    /// "invalid filename", hiding messages such as "tablespaces not supported".
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.msg)
    }
}
/// Translates the optional fork-name suffix of a relation file name into a
/// fork number. A missing suffix is the main fork (0); "fsm", "vm" and
/// "init" map to 1, 2 and 3.
///
/// # Errors
///
/// Any other suffix yields a `FilePathError` with "invalid forkname".
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
    // Forks that carry an explicit suffix, paired with their numbers.
    const NAMED_FORKS: [(&str, u32); 3] = [("fsm", 1), ("vm", 2), ("init", 3)];

    match forkname {
        // "main" is not in filenames, it's implicit if the fork name is not present
        None => Ok(0),
        Some(name) => NAMED_FORKS
            .iter()
            .find(|(known, _)| *known == name)
            .map(|&(_, forknum)| forknum)
            .ok_or_else(|| FilePathError::new("invalid forkname")),
    }
}
#[derive(Debug)]
struct ParsedBaseImageFileName {
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
pub forknum: u8,
pub forknum: u32,
pub segno: u32,
pub lsn: u64,
@@ -150,7 +191,7 @@ struct ParsedBaseImageFileName {
// <oid>.<segment number>
// <oid>_<fork name>.<segment number>
fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();
let caps = re
@@ -158,23 +199,28 @@ fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
let relnode_str = caps.name("relnode").unwrap().as_str();
let relnode: u32 = relnode_str.parse()?;
let relnode = u32::from_str_radix(relnode_str, 10)?;
let forkname = caps.name("forkname").map(|f| f.as_str());
let forkname_match = caps.name("forkname");
let forkname = if forkname_match.is_none() {
None
} else {
Some(forkname_match.unwrap().as_str())
};
let forknum = forkname_to_forknum(forkname)?;
let segno_match = caps.name("segno");
let segno = if segno_match.is_none() {
0
} else {
segno_match.unwrap().as_str().parse::<u32>()?
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
};
let lsn_hi: u64 = caps.name("lsnhi").unwrap().as_str().parse()?;
let lsn_lo: u64 = caps.name("lsnlo").unwrap().as_str().parse()?;
let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
let lsn = lsn_hi << 32 | lsn_lo;
Ok((relnode, forknum, segno, lsn))
return Ok((relnode, forknum, segno, lsn));
}
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
@@ -198,20 +244,20 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
if let Some(fname) = path.strip_prefix("global/") {
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
Ok(ParsedBaseImageFileName {
return Ok(ParsedBaseImageFileName {
spcnode: GLOBALTABLESPACE_OID,
dbnode: 0,
relnode,
forknum,
segno,
lsn,
})
});
} else if let Some(dbpath) = path.strip_prefix("base/") {
let mut s = dbpath.split("/");
let dbnode_str = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
let dbnode: u32 = dbnode_str.parse()?;
let dbnode = u32::from_str_radix(dbnode_str, 10)?;
let fname = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
@@ -221,19 +267,19 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
Ok(ParsedBaseImageFileName {
return Ok(ParsedBaseImageFileName {
spcnode: DEFAULTTABLESPACE_OID,
dbnode,
relnode,
forknum,
segno,
lsn,
})
});
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
// TODO
Err(FilePathError::new("tablespaces not supported"))
return Err(FilePathError::new("tablespaces not supported"));
} else {
Err(FilePathError::new("invalid relation data file name"))
return Err(FilePathError::new("invalid relation data file name"));
}
}
@@ -256,18 +302,17 @@ async fn slurp_base_file(
let mut bytes = BytesMut::from(data.as_slice()).freeze();
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
// FIXME: use constants (BLCKSZ)
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
let pcache = page_cache::get_pagecache(conf, sys_id);
while bytes.remaining() >= 8192 {
let tag = page_cache::BufferTag {
rel: page_cache::RelTag {
spcnode: parsed.spcnode,
dbnode: parsed.dbnode,
relnode: parsed.relnode,
forknum: parsed.forknum,
},
spcnode: parsed.spcnode,
dbnode: parsed.dbnode,
relnode: parsed.relnode,
forknum: parsed.forknum as u8,
blknum,
};

View File

@@ -31,7 +31,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
{
return true;
}
false
return false;
})
.fuse();
@@ -41,7 +41,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
{
return true;
}
false
return false;
})
.fuse();
@@ -52,7 +52,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
{
return true;
}
false
return false;
})
.fuse();
@@ -65,7 +65,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
{
return true;
}
false
return false;
})
.fuse();
@@ -84,11 +84,11 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
return true;
}
false
return false;
})
.fuse();
let logger = slog::Logger::root(drain, slog::o!());
slog_scope::set_global_logger(logger)
return slog_scope::set_global_logger(logger);
}
pub fn ui_main() -> Result<(), Box<dyn Error>> {
@@ -240,9 +240,7 @@ fn get_metric_u64(title: &str, value: u64) -> Spans {
])
}
// This is not used since LSNs were removed from page cache stats.
// Maybe it will be used in the future?
fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
fn get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
Spans::from(vec![
Span::styled(format!("{:<20}", title), Style::default()),
Span::raw(": "),
@@ -250,6 +248,13 @@ fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
])
}
// FIXME: We really should define a datatype for LSNs, with Display trait and
// helper functions. There's one in tokio-postgres, but I don't think we want
// to rely on that.
fn format_lsn(lsn: u64) -> String {
return format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff);
}
impl tui::widgets::Widget for MetricsWidget {
fn render(self, area: Rect, buf: &mut Buffer) {
let block = Block::default()
@@ -263,19 +268,14 @@ impl tui::widgets::Widget for MetricsWidget {
let mut lines: Vec<Spans> = Vec::new();
let page_cache_stats = crate::page_cache::get_stats();
// This is not used since LSNs were removed from page cache stats.
// Maybe it will be used in the future?
/*
let lsnrange = format!(
"{} - {}",
page_cache_stats.first_valid_lsn, page_cache_stats.last_valid_lsn
format_lsn(page_cache_stats.first_valid_lsn),
format_lsn(page_cache_stats.last_valid_lsn)
);
let last_valid_recordlsn_str = page_cache_stats.last_record_lsn.to_string();
let last_valid_recordlsn_str = format_lsn(page_cache_stats.last_record_lsn);
lines.push(get_metric_str("Valid LSN range", &lsnrange));
lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str));
*/
lines.push(get_metric_u64(
"# of cache entries",
page_cache_stats.num_entries,

View File

@@ -76,8 +76,8 @@ impl Events {
};
Events {
rx,
input_handle,
ignore_exit_key,
input_handle,
tick_handle,
}
}

View File

@@ -51,7 +51,7 @@ impl Drain for TuiLogger {
events.pop_back();
}
Ok(())
return Ok(());
}
}

View File

@@ -1,11 +1,10 @@
use crate::pg_constants;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::XLogRecord;
use std::cmp::min;
use std::str;
use thiserror::Error;
use zenith_utils::lsn::Lsn;
const XLOG_BLCKSZ: u32 = 8192;
// FIXME: this is configurable in PostgreSQL, 16 MB is the default
const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
@@ -41,9 +40,9 @@ const SizeOfXLogLongPHD: usize = (2 + 2 + 4 + 8 + 4) + 4 + 8 + 4 + 4;
#[allow(dead_code)]
pub struct WalStreamDecoder {
lsn: Lsn,
lsn: u64,
startlsn: Lsn, // LSN where this record starts
startlsn: u64, // LSN where this record starts
contlen: u32,
padlen: u32,
@@ -56,7 +55,7 @@ pub struct WalStreamDecoder {
#[error("{msg} at {lsn}")]
pub struct WalDecodeError {
msg: String,
lsn: Lsn,
lsn: u64,
}
//
@@ -64,11 +63,11 @@ pub struct WalDecodeError {
// FIXME: This isn't a proper rust stream
//
impl WalStreamDecoder {
pub fn new(lsn: Lsn) -> WalStreamDecoder {
pub fn new(lsn: u64) -> WalStreamDecoder {
WalStreamDecoder {
lsn,
startlsn: Lsn(0),
startlsn: 0,
contlen: 0,
padlen: 0,
@@ -89,10 +88,10 @@ impl WalStreamDecoder {
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
///
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
pub fn poll_decode(&mut self) -> Result<Option<(u64, Bytes)>, WalDecodeError> {
loop {
// parse and verify page boundaries as we go
if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
if self.lsn % WAL_SEGMENT_SIZE == 0 {
// parse long header
if self.inputbuf.remaining() < SizeOfXLogLongPHD {
@@ -100,7 +99,7 @@ impl WalStreamDecoder {
}
let hdr = self.decode_XLogLongPageHeaderData();
if hdr.std.xlp_pageaddr != self.lsn.0 {
if hdr.std.xlp_pageaddr != self.lsn {
return Err(WalDecodeError {
msg: "invalid xlog segment header".into(),
lsn: self.lsn,
@@ -110,13 +109,15 @@ impl WalStreamDecoder {
self.lsn += SizeOfXLogLongPHD as u64;
continue;
} else if self.lsn.block_offset() == 0 {
} else if self.lsn % (XLOG_BLCKSZ as u64) == 0 {
// parse page header
if self.inputbuf.remaining() < SizeOfXLogShortPHD {
return Ok(None);
}
let hdr = self.decode_XLogPageHeaderData();
if hdr.xlp_pageaddr != self.lsn.0 {
if hdr.xlp_pageaddr != self.lsn {
return Err(WalDecodeError {
msg: "invalid xlog page header".into(),
lsn: self.lsn,
@@ -161,7 +162,7 @@ impl WalStreamDecoder {
continue;
} else {
// we're continuing a record, possibly from previous page.
let pageleft = self.lsn.remaining_in_block() as u32;
let pageleft: u32 = XLOG_BLCKSZ - (self.lsn % (XLOG_BLCKSZ as u64)) as u32;
// read the rest of the record, or as much as fits on this page.
let n = min(self.contlen, pageleft) as usize;
@@ -178,17 +179,20 @@ impl WalStreamDecoder {
let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new());
let recordbuf = recordbuf.freeze();
let mut buf = recordbuf.clone();
// XLOG_SWITCH records are special. If we see one, we need to skip
// to the next WAL segment.
let xlogrec = XLogRecord::from_bytes(&mut buf);
if xlogrec.is_xlog_switch_record() {
trace!("saw xlog switch record at {}", self.lsn);
self.padlen = self.lsn.calc_padding(WAL_SEGMENT_SIZE) as u32;
} else {
// Pad to an 8-byte boundary
self.padlen = self.lsn.calc_padding(8u32) as u32;
if is_xlog_switch_record(&recordbuf) {
trace!(
"saw xlog switch record at {:X}/{:X}",
(self.lsn >> 32),
self.lsn & 0xffffffff
);
self.padlen = (WAL_SEGMENT_SIZE - (self.lsn % WAL_SEGMENT_SIZE)) as u32;
}
if self.lsn % 8 != 0 {
self.padlen = 8 - (self.lsn % 8) as u32;
}
let result = (self.lsn, recordbuf);
@@ -223,7 +227,7 @@ impl WalStreamDecoder {
// FIXME: check that hdr.xlp_rem_len matches self.contlen
//println!("next xlog page (xlp_rem_len: {})", hdr.xlp_rem_len);
hdr
return hdr;
}
#[allow(non_snake_case)]
@@ -235,10 +239,36 @@ impl WalStreamDecoder {
xlp_xlog_blcksz: self.inputbuf.get_u32_le(),
};
hdr
return hdr;
}
}
// FIXME:
const BLCKSZ: u16 = 8192;
//
// Constants from xlogrecord.h
//
const XLR_MAX_BLOCK_ID: u8 = 32;
const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
const XLR_BLOCK_ID_ORIGIN: u8 = 253;
const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
const BKPBLOCK_FORK_MASK: u8 = 0x0F;
const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
const BKPBLOCK_HAS_DATA: u8 = 0x20;
const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
/* Information stored in bimg_info */
const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
#[allow(dead_code)]
pub struct DecodedBkpBlock {
/* Is this block ref in use? */
@@ -298,147 +328,36 @@ impl DecodedBkpBlock {
const SizeOfXLogRecord: u32 = 24;
/// Result of decoding a single WAL record with `decode_wal_record`.
pub struct DecodedWALRecord {
/// `xl_info` byte taken from the record's XLogRecord header.
pub xl_info: u8,
/// `xl_rmid` (resource manager id) byte taken from the record's XLogRecord header.
pub xl_rmid: u8,
pub record: Bytes, // raw XLogRecord
/// Block references found in the record, plus any extra entries that
/// `decode_wal_record` synthesizes for special record types (e.g. CLOG/XACT).
pub blocks: Vec<DecodedBkpBlock>,
/// Offset of the main data inside `record`; `decode_wal_record` computes it
/// as `xl_tot_len - main_data_len`.
pub main_data_offset: usize,
}
pub type Oid = u32;
pub type TransactionId = u32;
pub type BlockNumber = u32;
pub type OffsetNumber = u16;
// Is this record an XLOG_SWITCH record? They need some special processing,
// so we need to check for that before the rest of the parsing.
//
// FIXME: refactor this and decode_wal_record() below to avoid the duplication.
fn is_xlog_switch_record(rec: &Bytes) -> bool {
let mut buf = rec.clone();
#[repr(C)]
#[derive(Debug, Clone, Copy)]
// FIXME: assume little-endian here
let _xl_tot_len = buf.get_u32_le();
let _xl_xid = buf.get_u32_le();
let _xl_prev = buf.get_u64_le();
let xl_info = buf.get_u8();
let xl_rmid = buf.get_u8();
buf.advance(2); // 2 bytes of padding
let _xl_crc = buf.get_u32_le();
return xl_info == pg_constants::XLOG_SWITCH && xl_rmid == pg_constants::RM_XLOG_ID;
}
#[derive(Clone, Copy)]
pub struct RelFileNode {
pub spcnode: Oid, /* tablespace */
pub dbnode: Oid, /* database */
pub relnode: Oid, /* relation */
}
/// Payload of an smgr truncate WAL record, as extracted by
/// `XlSmgrTruncate::decode`.
#[repr(C)]
#[derive(Debug)]
pub struct XlSmgrTruncate {
    /// Block number read from the start of the payload.
    pub blkno: BlockNumber,
    /// Relation the record refers to (tablespace / database / relation).
    pub rnode: RelFileNode,
    /// Flag bits read after the relation identifier.
    pub flags: u32,
}

impl XlSmgrTruncate {
    /// Extracts the truncate payload from an already decoded WAL record.
    ///
    /// Operates on a clone of `decoded.record`, so the record itself is left
    /// untouched. All integers are read as little-endian.
    pub fn decode(decoded: &DecodedWALRecord) -> XlSmgrTruncate {
        // Skip the fixed-size XLogRecord header plus two more bytes.
        // NOTE(review): the extra two bytes are presumably the short main-data
        // header (id byte + length byte) — confirm that truncate records never
        // carry block references or a long data header.
        let header_len = (SizeOfXLogRecord + 2) as usize;
        let mut payload = decoded.record.clone();
        payload.advance(header_len);

        let blkno = payload.get_u32_le();
        let spcnode = payload.get_u32_le(); /* tablespace */
        let dbnode = payload.get_u32_le(); /* database */
        let relnode = payload.get_u32_le(); /* relation */
        let flags = payload.get_u32_le();

        XlSmgrTruncate {
            blkno,
            rnode: RelFileNode {
                spcnode,
                dbnode,
                relnode,
            },
            flags,
        }
    }
}
/// Payload of a "create database" WAL record, as extracted by
/// `XlCreateDatabase::decode`.
#[repr(C)]
#[derive(Debug)]
pub struct XlCreateDatabase {
    /// OID of the database being created.
    pub db_id: Oid,
    /// OID of the tablespace of the new database.
    pub tablespace_id: Oid,
    /// OID of the source database.
    pub src_db_id: Oid,
    /// OID of the source database's tablespace.
    pub src_tablespace_id: Oid,
}

impl XlCreateDatabase {
    /// Extracts the create-database payload from an already decoded WAL record.
    ///
    /// Operates on a clone of `decoded.record`; the four OIDs are read in
    /// order as little-endian `u32` values.
    pub fn decode(decoded: &DecodedWALRecord) -> XlCreateDatabase {
        // Skip the fixed-size XLogRecord header plus two more bytes.
        // NOTE(review): the extra two bytes are presumably the short main-data
        // header (id byte + length byte) — confirm against the record layout.
        let header_len = (SizeOfXLogRecord + 2) as usize;
        let mut payload = decoded.record.clone();
        payload.advance(header_len);

        let db_id = payload.get_u32_le();
        let tablespace_id = payload.get_u32_le();
        let src_db_id = payload.get_u32_le();
        let src_tablespace_id = payload.get_u32_le();

        XlCreateDatabase {
            db_id,
            tablespace_id,
            src_db_id,
            src_tablespace_id,
        }
    }
}
/// Leading fields of a heap insert record's main data, as read by
/// `XlHeapInsert::decode`.
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapInsert {
    /// Offset number, read as a little-endian `u16`.
    pub offnum: OffsetNumber,
    /// Flag bits (a single byte following `offnum`).
    pub flags: u8,
}

impl XlHeapInsert {
    /// Reads an `XlHeapInsert` from the front of `buf`, consuming three bytes:
    /// a little-endian `u16` offset number followed by the one-byte flags.
    pub fn decode(buf: &mut Bytes) -> XlHeapInsert {
        let offnum = buf.get_u16_le();
        let flags = buf.get_u8();
        XlHeapInsert { offnum, flags }
    }
}
/// Leading fields of a heap multi-insert record's main data, as read by
/// `XlHeapMultiInsert::decode`.
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapMultiInsert {
    /// Flag bits (first byte of the main data).
    pub flags: u8,
    /// Number of tuples, read as a little-endian `u16`.
    pub ntuples: u16,
}

impl XlHeapMultiInsert {
    /// Reads an `XlHeapMultiInsert` from the front of `buf`, consuming four
    /// bytes: the flags byte, one alignment padding byte, and the
    /// little-endian `u16` tuple count.
    pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert {
        let flags = buf.get_u8();
        // PostgreSQL writes the C struct `xl_heap_multi_insert` verbatim, and
        // the `uint16 ntuples` member is 2-byte aligned, so one padding byte
        // sits between `flags` and `ntuples`. Skip it; otherwise `ntuples`
        // would be decoded from the wrong offset.
        let _padding = buf.get_u8();
        let ntuples = buf.get_u16_le();
        XlHeapMultiInsert { flags, ntuples }
    }
}
/// Leading fields of a heap delete record's main data, as read by
/// `XlHeapDelete::decode`.
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapDelete {
    /// Transaction id, read as a little-endian `u32`.
    pub xmax: TransactionId,
    /// Offset number, read as a little-endian `u16`.
    pub offnum: OffsetNumber,
    /// Infomask bits byte.
    pub infobits_set: u8,
    /// Flag bits byte.
    pub flags: u8,
}

impl XlHeapDelete {
    /// Reads an `XlHeapDelete` from the front of `buf`, consuming eight bytes
    /// in field order; multi-byte fields are little-endian.
    pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
        let xmax = buf.get_u32_le();
        let offnum = buf.get_u16_le();
        let infobits_set = buf.get_u8();
        let flags = buf.get_u8();
        XlHeapDelete {
            xmax,
            offnum,
            infobits_set,
            flags,
        }
    }
}
/// Leading fields of a heap update record's main data, as read by
/// `XlHeapUpdate::decode` (fields are consumed in declaration order,
/// multi-byte values little-endian).
#[repr(C)]
#[derive(Debug)]
pub struct XlHeapUpdate {
// Fields describing the old tuple version.
pub old_xmax: TransactionId,
pub old_offnum: OffsetNumber,
pub old_infobits_set: u8,
// Flag bits; `decode_wal_record` tests these against the
// XLH_UPDATE_*_ALL_VISIBLE_CLEARED constants.
pub flags: u8,
// Fields describing the new tuple version.
pub new_xmax: TransactionId,
pub new_offnum: OffsetNumber,
}
impl XlHeapUpdate {
pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
XlHeapUpdate {
old_xmax: buf.get_u32_le(),
old_offnum: buf.get_u16_le(),
old_infobits_set: buf.get_u8(),
flags: buf.get_u8(),
new_xmax: buf.get_u32_le(),
new_offnum: buf.get_u16_le(),
}
}
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
}
//
@@ -448,10 +367,10 @@ impl XlHeapUpdate {
// The overall layout of an XLOG record is:
// Fixed-size header (XLogRecord struct)
// XLogRecordBlockHeader struct
// If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows
// If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an
// If BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows
// If BKPIMAGE_HAS_HOLE and BKPIMAGE_IS_COMPRESSED, an
// XLogRecordBlockCompressHeader struct follows.
// If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows
// If BKPBLOCK_SAME_REL is not set, a RelFileNode follows
// BlockNumber follows
// XLogRecordBlockHeader struct
// ...
@@ -460,26 +379,32 @@ impl XlHeapUpdate {
// block data
// ...
// main data
pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
pub fn decode_wal_record(rec: Bytes) -> DecodedWALRecord {
let mut rnode_spcnode: u32 = 0;
let mut rnode_dbnode: u32 = 0;
let mut rnode_relnode: u32 = 0;
let mut got_rnode = false;
let mut buf = record.clone();
let mut buf = rec.clone();
// 1. Parse XLogRecord struct
// FIXME: assume little-endian here
let xlogrec = XLogRecord::from_bytes(&mut buf);
let xl_tot_len = buf.get_u32_le();
let xl_xid = buf.get_u32_le();
let xl_prev = buf.get_u64_le();
let xl_info = buf.get_u8();
let xl_rmid = buf.get_u8();
buf.advance(2); // 2 bytes of padding
let _xl_crc = buf.get_u32_le();
trace!(
"decode_wal_record xl_rmid = {} xl_info = {}",
xlogrec.xl_rmid,
xlogrec.xl_info
xl_rmid,
xl_info
);
let remaining = xlogrec.xl_tot_len - SizeOfXLogRecord;
let remaining = xl_tot_len - SizeOfXLogRecord;
if buf.remaining() != remaining as usize {
//TODO error
@@ -498,29 +423,29 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
let block_id = buf.get_u8();
match block_id {
pg_constants::XLR_BLOCK_ID_DATA_SHORT => {
XLR_BLOCK_ID_DATA_SHORT => {
/* XLogRecordDataHeaderShort */
main_data_len = buf.get_u8() as u32;
datatotal += main_data_len;
}
pg_constants::XLR_BLOCK_ID_DATA_LONG => {
XLR_BLOCK_ID_DATA_LONG => {
/* XLogRecordDataHeaderLong */
main_data_len = buf.get_u32_le();
datatotal += main_data_len;
}
pg_constants::XLR_BLOCK_ID_ORIGIN => {
XLR_BLOCK_ID_ORIGIN => {
// RepOriginId is uint16
buf.advance(2);
}
pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => {
XLR_BLOCK_ID_TOPLEVEL_XID => {
// TransactionId is uint32
buf.advance(4);
}
0..=pg_constants::XLR_MAX_BLOCK_ID => {
0..=XLR_MAX_BLOCK_ID => {
/* XLogRecordBlockHeader */
let mut blk = DecodedBkpBlock::new();
let fork_flags: u8;
@@ -537,11 +462,11 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
max_block_id = block_id;
fork_flags = buf.get_u8();
blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
blk.forknum = fork_flags & BKPBLOCK_FORK_MASK;
blk.flags = fork_flags;
blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0;
blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0;
blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0;
blk.has_image = (fork_flags & BKPBLOCK_HAS_IMAGE) != 0;
blk.has_data = (fork_flags & BKPBLOCK_HAS_DATA) != 0;
blk.will_init = (fork_flags & BKPBLOCK_WILL_INIT) != 0;
blk.data_len = buf.get_u16_le();
/* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */
@@ -554,16 +479,16 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
blk.hole_offset = buf.get_u16_le();
blk.bimg_info = buf.get_u8();
blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0;
blk.apply_image = (blk.bimg_info & BKPIMAGE_APPLY) != 0;
if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 {
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 {
if blk.bimg_info & BKPIMAGE_IS_COMPRESSED != 0 {
if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0 {
blk.hole_length = buf.get_u16_le();
} else {
blk.hole_length = 0;
}
} else {
blk.hole_length = pg_constants::BLCKSZ - blk.bimg_len;
blk.hole_length = BLCKSZ - blk.bimg_len;
}
datatotal += blk.bimg_len as u32;
blocks_total_len += blk.bimg_len as u32;
@@ -572,15 +497,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
* cross-check that hole_offset > 0, hole_length > 0 and
* bimg_len < BLCKSZ if the HAS_HOLE flag is set.
*/
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0
&& (blk.hole_offset == 0
|| blk.hole_length == 0
|| blk.bimg_len == pg_constants::BLCKSZ)
if blk.bimg_info & BKPIMAGE_HAS_HOLE != 0
&& (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ)
{
// TODO
/*
report_invalid_record(state,
"pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
"BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X",
(unsigned int) blk->hole_offset,
(unsigned int) blk->hole_length,
(unsigned int) blk->bimg_len,
@@ -593,13 +516,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
* cross-check that hole_offset == 0 and hole_length == 0 if
* the HAS_HOLE flag is not set.
*/
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0
if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0
&& (blk.hole_offset != 0 || blk.hole_length != 0)
{
// TODO
/*
report_invalid_record(state,
"pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
"BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X",
(unsigned int) blk->hole_offset,
(unsigned int) blk->hole_length,
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
@@ -611,13 +534,11 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
* cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED
* flag is set.
*/
if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0)
&& blk.bimg_len == pg_constants::BLCKSZ
{
if (blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0) && blk.bimg_len == BLCKSZ {
// TODO
/*
report_invalid_record(state,
"pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X",
"BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X",
(unsigned int) blk->bimg_len,
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
goto err;
@@ -628,21 +549,21 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
* cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor
* IS_COMPRESSED flag is set.
*/
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0
&& blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0
&& blk.bimg_len != pg_constants::BLCKSZ
if blk.bimg_info & BKPIMAGE_HAS_HOLE == 0
&& blk.bimg_info & BKPIMAGE_IS_COMPRESSED == 0
&& blk.bimg_len != BLCKSZ
{
// TODO
/*
report_invalid_record(state,
"neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X",
"neither BKPIMAGE_HAS_HOLE nor BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X",
(unsigned int) blk->data_len,
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
goto err;
*/
}
}
if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 {
if fork_flags & BKPBLOCK_SAME_REL == 0 {
rnode_spcnode = buf.get_u32_le();
rnode_dbnode = buf.get_u32_le();
rnode_relnode = buf.get_u32_le();
@@ -651,7 +572,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
// TODO
/*
report_invalid_record(state,
"pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
"BKPBLOCK_SAME_REL set but no previous rel at %X/%X",
(uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr);
goto err; */
}
@@ -682,7 +603,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
// We don't need them, so just skip blocks_total_len bytes
buf.advance(blocks_total_len as usize);
let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;
let main_data_offset = (xl_tot_len - main_data_len) as usize;
// 4. Decode main_data
if main_data_len > 0 {
@@ -690,253 +611,46 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
}
//5. Handle special CLOG and XACT records
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
if xl_rmid == pg_constants::RM_CLOG_ID {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::PG_XACT_FORKNUM;
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
blk.blkno = buf.get_i32_le() as u32;
blk.will_init = true;
trace!("RM_CLOG_ID updates block {}", blk.blkno);
blocks.push(blk);
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
} else if xl_rmid == pg_constants::RM_XACT_ID {
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::PG_XACT_FORKNUM;
blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
trace!(
"XLOG_XACT_COMMIT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
xlogrec.xl_prev & 0xffffffff,
xlogrec.xl_xid,
blk.blkno,
main_data_len
"XLOG_XACT_COMMIT xl_prev {:X}/{:X} xid {} updates block {}",
(xl_prev >> 32),
xl_prev & 0xffffffff,
xl_xid,
blk.blkno
);
blocks.push(blk);
//parse commit record to extract subtrans entries
// xl_xact_commit starts with time of commit
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
let mut prev_blkno = u32::MAX;
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if prev_blkno != blkno {
prev_blkno = blkno;
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::PG_XACT_FORKNUM;
blk.blkno = blkno;
blocks.push(blk);
}
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
//TODO handle this too?
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
let _xid = buf.get_u32_le();
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
//TODO handle this to be able to restore pg_twophase on node start
}
//TODO parse commit record to extract subtrans entries
} else if info == pg_constants::XLOG_XACT_ABORT {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::PG_XACT_FORKNUM;
blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
blk.blkno = xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
trace!(
"XLOG_XACT_ABORT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
xlogrec.xl_prev & 0xffffffff,
xlogrec.xl_xid,
blk.blkno,
main_data_len
"XLOG_XACT_ABORT xl_prev {:X}/{:X} xid {} updates block {}",
(xl_prev >> 32),
xl_prev & 0xffffffff,
xl_xid,
blk.blkno
);
blocks.push(blk);
//parse abort record to extract subtrans entries
// xl_xact_abort starts with time of commit
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
let mut prev_blkno = u32::MAX;
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
if prev_blkno != blkno {
prev_blkno = blkno;
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::PG_XACT_FORKNUM;
blk.blkno = blkno;
blocks.push(blk);
}
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
//TODO save these too
trace!(
"XLOG_XACT_ABORT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 {
let _xid = buf.get_u32_le();
trace!("XLOG_XACT_ABORT-XACT_XINFO_HAS_TWOPHASE");
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_DBASE_ID {
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::XLOG_DBASE_CREATE {
//buf points to main_data
let db_id = buf.get_u32_le();
let tablespace_id = buf.get_u32_le();
let src_db_id = buf.get_u32_le();
let src_tablespace_id = buf.get_u32_le();
trace!(
"XLOG_DBASE_CREATE tablespace_id/db_id {}/{} src_db_id {}/{}",
tablespace_id,
db_id,
src_tablespace_id,
src_db_id
);
// in postgres it is implemented as copydir
// we need to copy all pages in page_cache
} else {
trace!("XLOG_DBASE_DROP is not handled yet");
}
} else if xlogrec.xl_rmid == pg_constants::RM_TBLSPC_ID {
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::XLOG_TBLSPC_CREATE {
//buf points to main_data
let ts_id = buf.get_u32_le();
let ts_path = str::from_utf8(&buf).unwrap();
trace!("XLOG_TBLSPC_CREATE ts_id {} ts_path {}", ts_id, ts_path);
} else {
trace!("XLOG_TBLSPC_DROP is not handled yet");
}
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
if info == pg_constants::XLOG_HEAP_INSERT {
let xlrec = XlHeapInsert::decode(&mut buf);
if (xlrec.flags
& (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
!= 0
{
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
}
} else if info == pg_constants::XLOG_HEAP_DELETE {
let xlrec = XlHeapDelete::decode(&mut buf);
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
}
} else if info == pg_constants::XLOG_HEAP_UPDATE
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
{
let xlrec = XlHeapUpdate::decode(&mut buf);
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
}
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
&& blocks.len() > 1
{
let mut blk = DecodedBkpBlock::new();
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
blk.rnode_spcnode = blocks[1].rnode_spcnode;
blk.rnode_dbnode = blocks[1].rnode_dbnode;
blk.rnode_relnode = blocks[1].rnode_relnode;
blocks.push(blk);
}
}
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
let xlrec = XlHeapMultiInsert::decode(&mut buf);
if (xlrec.flags
& (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
!= 0
{
let mut blk = DecodedBkpBlock::new();
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
blk.blkno = blkno;
blk.rnode_spcnode = blocks[0].rnode_spcnode;
blk.rnode_dbnode = blocks[0].rnode_dbnode;
blk.rnode_relnode = blocks[0].rnode_relnode;
blocks.push(blk);
}
//TODO parse abort record to extract subtrans entries
}
}
DecodedWALRecord {
xl_info: xlogrec.xl_info,
xl_rmid: xlogrec.xl_rmid,
record,
record: rec,
blocks,
main_data_offset,
main_data_offset: main_data_offset,
}
}

View File

@@ -7,15 +7,13 @@
//!
use crate::page_cache;
use crate::page_cache::{BufferTag, RelTag};
use crate::waldecoder::*;
use crate::page_cache::BufferTag;
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
use crate::PageServerConf;
use crate::ZTimelineId;
use anyhow::Error;
use lazy_static::lazy_static;
use log::*;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::*;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use std::collections::HashMap;
@@ -26,13 +24,11 @@ use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Mutex;
use std::thread;
use std::thread::sleep;
use std::time::Duration;
use tokio::runtime::Runtime;
use tokio::runtime;
use tokio::time::{sleep, Duration};
use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
use tokio_stream::StreamExt;
use zenith_utils::lsn::Lsn;
//
// We keep one WAL Receiver active per timeline.
@@ -96,38 +92,30 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
timelineid
);
// We need a tokio runtime to call the rust-postgres copy_both function.
// Most functions in the rust-postgres driver have a blocking wrapper,
// but copy_both does not (TODO: the copy_both support is still work-in-progress
// as of this writing. Check later if that has changed, or implement the
// wrapper ourselves in rust-postgres)
let runtime = tokio::runtime::Builder::new_current_thread()
let runtime = runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
//
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
// and start streaming WAL from it. If the connection is lost, keep retrying.
//
loop {
// Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
runtime.block_on(async {
loop {
// Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
let res = walreceiver_main(&runtime, conf, timelineid, &wal_producer_connstr);
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await;
if let Err(e) = res {
info!(
"WAL streaming connection failed ({}), retrying in 1 second",
e
);
sleep(Duration::from_secs(1));
if let Err(e) = res {
info!(
"WAL streaming connection failed ({}), retrying in 1 second",
e
);
sleep(Duration::from_secs(1)).await;
}
}
}
});
}
fn walreceiver_main(
runtime: &Runtime,
async fn walreceiver_main(
conf: &PageServerConf,
timelineid: ZTimelineId,
wal_producer_connstr: &str,
@@ -135,21 +123,20 @@ fn walreceiver_main(
// Connect to the database in replication mode.
info!("connecting to {:?}", wal_producer_connstr);
let connect_cfg = format!("{} replication=true", wal_producer_connstr);
let (rclient, connection) = runtime.block_on(tokio_postgres::connect(&connect_cfg, NoTls))?;
let (rclient, connection) = tokio_postgres::connect(&connect_cfg, NoTls).await?;
info!("connected!");
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
runtime.spawn(async move {
tokio::spawn(async move {
if let Err(e) = connection.await {
error!("connection error: {}", e);
}
});
let identify = identify_system(runtime, &rclient)?;
let identify = identify_system(&rclient).await?;
info!("{:?}", identify);
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
let end_of_wal = u64::from(identify.xlogpos);
let mut caught_up = false;
let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
@@ -159,7 +146,7 @@ fn walreceiver_main(
//
let mut startpoint = pcache.get_last_valid_lsn();
let last_valid_lsn = pcache.get_last_valid_lsn();
if startpoint == Lsn(0) {
if startpoint == 0 {
// If we start here with identify.xlogpos we will have a race condition with
// postgres start: an insert into postgres may request a page that was modified
// at an lsn smaller than identify.xlogpos.
@@ -168,37 +155,45 @@ fn walreceiver_main(
// different like having 'initdb' method on a pageserver (or importing some shared
// empty database snapshot), so for now I just put start of first segment which
// seems to be a valid record.
pcache.init_valid_lsn(Lsn(0x0100_0000));
startpoint = Lsn(0x0100_0000);
pcache.init_valid_lsn(0x_1_000_000_u64);
startpoint = 0x_1_000_000_u64;
} else {
// There might be some padding after the last full record, skip it.
//
// FIXME: It probably would be better to always start streaming from the beginning
// of the page, or the segment, so that we could check the page/segment headers
// too. Just for the sake of paranoia.
startpoint += startpoint.calc_padding(8u32);
if startpoint % 8 != 0 {
startpoint += 8 - (startpoint % 8);
}
}
debug!(
"last_valid_lsn {} starting replication from {} for timeline {}, server is at {}...",
last_valid_lsn, startpoint, timelineid, end_of_wal
"last_valid_lsn {:X}/{:X} starting replication from {:X}/{:X} for timeline {}, server is at {:X}/{:X}...",
(last_valid_lsn >> 32),
(last_valid_lsn & 0xffffffff),
(startpoint >> 32),
(startpoint & 0xffffffff),
timelineid,
(end_of_wal >> 32),
(end_of_wal & 0xffffffff)
);
let startpoint = PgLsn::from(startpoint);
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
let copy_stream = runtime.block_on(rclient.copy_both_simple::<bytes::Bytes>(&query))?;
let copy_stream = rclient.copy_both_simple::<bytes::Bytes>(&query).await?;
let physical_stream = ReplicationStream::new(copy_stream);
tokio::pin!(physical_stream);
let mut waldecoder = WalStreamDecoder::new(startpoint);
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
while let Some(replication_message) = runtime.block_on(physical_stream.next()) {
while let Some(replication_message) = physical_stream.next().await {
match replication_message? {
ReplicationMessage::XLogData(xlog_data) => {
// Pass the WAL data to the decoder, and see if we can decode
// more records as a result.
let data = xlog_data.data();
let startlsn = Lsn::from(xlog_data.wal_start());
let startlsn = xlog_data.wal_start();
let endlsn = startlsn + data.len() as u64;
write_wal_file(
@@ -208,7 +203,13 @@ fn walreceiver_main(
data,
)?;
trace!("received XLogData between {} and {}", startlsn, endlsn);
trace!(
"received XLogData between {:X}/{:X} and {:X}/{:X}",
(startlsn >> 32),
(startlsn & 0xffffffff),
(endlsn >> 32),
(endlsn & 0xffffffff)
);
waldecoder.feed_bytes(data);
@@ -221,63 +222,22 @@ fn walreceiver_main(
// so having multiple copies of it doesn't cost that much)
for blk in decoded.blocks.iter() {
let tag = BufferTag {
rel: RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
},
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
blknum: blk.blkno,
};
let rec = page_cache::WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
truncate: false,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
main_data_offset: decoded.main_data_offset,
};
pcache.put_wal_record(tag, rec);
}
// include truncate wal record in all pages
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&decoded);
if (truncate.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
let tag = BufferTag {
rel: RelTag {
spcnode: truncate.rnode.spcnode,
dbnode: truncate.rnode.dbnode,
relnode: truncate.rnode.relnode,
forknum: pg_constants::MAIN_FORKNUM,
},
blknum: truncate.blkno,
};
let rec = page_cache::WALRecord {
lsn,
will_init: false,
truncate: true,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
pcache.put_rel_wal_record(tag, rec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_DBASE_CREATE
{
let createdb = XlCreateDatabase::decode(&decoded);
pcache.create_database(
lsn,
createdb.db_id,
createdb.tablespace_id,
createdb.src_db_id,
createdb.src_tablespace_id,
)?;
}
// Now that this record has been handled, let the page cache know that
// it is up-to-date to this LSN
pcache.advance_last_record_lsn(lsn);
@@ -295,7 +255,11 @@ fn walreceiver_main(
pcache.advance_last_valid_lsn(endlsn);
if !caught_up && endlsn >= end_of_wal {
info!("caught up at LSN {}", endlsn);
info!(
"caught up at LSN {:X}/{:X}",
(endlsn >> 32),
(endlsn & 0xffffffff)
);
caught_up = true;
}
}
@@ -313,24 +277,23 @@ fn walreceiver_main(
);
if reply_requested {
// TODO: More thought should go into what values are sent here.
let last_lsn = PgLsn::from(u64::from(pcache.get_last_valid_lsn()));
let last_lsn = PgLsn::from(pcache.get_last_valid_lsn());
let write_lsn = last_lsn;
let flush_lsn = last_lsn;
let apply_lsn = PgLsn::INVALID;
let ts = PgTimestamp::now()?;
const NO_REPLY: u8 = 0u8;
runtime.block_on(
physical_stream
.as_mut()
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY),
)?;
physical_stream
.as_mut()
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)
.await?;
}
}
_ => (),
}
}
Ok(())
return Ok(());
}
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
@@ -353,12 +316,9 @@ pub struct IdentifySystem {
pub struct IdentifyError;
/// Run the postgres `IDENTIFY_SYSTEM` command
pub fn identify_system(
runtime: &Runtime,
client: &tokio_postgres::Client,
) -> Result<IdentifySystem, Error> {
pub async fn identify_system(client: &tokio_postgres::Client) -> Result<IdentifySystem, Error> {
let query_str = "IDENTIFY_SYSTEM";
let response = runtime.block_on(client.simple_query(query_str))?;
let response = client.simple_query(query_str).await?;
// get(N) from row, then parse it as some destination type.
fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
@@ -383,8 +343,64 @@ pub fn identify_system(
}
}
pub const XLOG_FNAME_LEN: usize = 24;
pub const XLOG_BLCKSZ: usize = 8192;
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
pub const XLOG_PAGE_MAGIC: u16 = 0xD109;
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
/// 64-bit WAL pointer; the helpers below treat it as a byte position that is
/// split into fixed-size segments.
pub type XLogRecPtr = u64;
/// Timeline identifier; formatted as the first field of a WAL file name.
pub type TimeLineID = u32;
/// Timestamp value stored as an unsigned 64-bit integer.
pub type TimestampTz = u64;
/// Sequential number of a WAL segment.
pub type XLogSegNo = u64;

/// Returns the offset of `xlogptr` inside its segment.
///
/// Only the low 32 bits of `xlogptr` take part, and they are masked with
/// `wal_segsz_bytes - 1`, so the result is a true offset only when the
/// segment size is a power of two (assumed, not checked here).
#[allow(non_snake_case)]
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
    let offset_mask = wal_segsz_bytes as u32 - 1;
    xlogptr as u32 & offset_mask
}

/// Returns how many segments of `wal_segsz_bytes` bytes fit into 4 GiB
/// (`0x1_0000_0000` bytes).
#[allow(non_snake_case)]
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
    let segment_size = wal_segsz_bytes as u64;
    (0x100000000u64 / segment_size) as XLogSegNo
}

/// Returns the number of the segment that contains `xlogptr`.
#[allow(non_snake_case)]
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
    let segment_size = wal_segsz_bytes as u64;
    xlogptr / segment_size
}

/// Inverse of the segment split: combines a segment number and an offset
/// within that segment back into a WAL pointer.
#[allow(non_snake_case)]
pub fn XLogSegNoOffsetToRecPtr(
    segno: XLogSegNo,
    offset: u32,
    wal_segsz_bytes: usize,
) -> XLogRecPtr {
    let segment_start = segno * (wal_segsz_bytes as u64);
    segment_start + u64::from(offset)
}

/// Builds a WAL segment file name: three zero-padded, 8-digit, upper-case hex
/// fields holding the timeline, `logSegNo / segments-per-id` and
/// `logSegNo % segments-per-id`.
#[allow(non_snake_case)]
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
    let log_id = logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes);
    let seg_in_log = logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes);
    format!("{:>08X}{:>08X}{:>08X}", tli, log_id, seg_in_log)
}

/// Parses a WAL segment file name back into `(segment number, timeline)`.
///
/// The first 24 bytes of `fname` are read as three 8-digit hex fields.
///
/// # Panics
///
/// Panics if `fname` is shorter than 24 bytes or if any of the three fields
/// is not a valid hexadecimal `u32`.
#[allow(non_snake_case)]
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
    let parse_hex = |field: &str| u32::from_str_radix(field, 16).unwrap();

    let tli = parse_hex(&fname[0..8]);
    let log = XLogSegNo::from(parse_hex(&fname[8..16]));
    let seg = XLogSegNo::from(parse_hex(&fname[16..24]));

    (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
}
fn write_wal_file(
startpos: Lsn,
startpos: XLogRecPtr,
timeline: ZTimelineId,
wal_seg_size: usize,
buf: &[u8],
@@ -393,12 +409,12 @@ fn write_wal_file(
let mut bytes_written: usize = 0;
let mut partial;
let mut start_pos = startpos;
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ];
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
/* Extract WAL location for this block */
let mut xlogoff = start_pos.segment_offset(wal_seg_size as u64) as usize;
let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
while bytes_left != 0 {
let bytes_to_write;
@@ -414,7 +430,7 @@ fn write_wal_file(
}
/* Open file */
let segno = start_pos.segment_number(wal_seg_size as u64);
let segno = XLByteToSeg(start_pos, wal_seg_size);
let wal_file_name = XLogFileName(
1, // FIXME: always use Postgres timeline 1
segno,
@@ -466,7 +482,7 @@ fn write_wal_file(
xlogoff += bytes_to_write;
/* Did we reach the end of a WAL segment? */
if start_pos.segment_offset(wal_seg_size as u64) == 0 {
if XLogSegmentOffset(start_pos, wal_seg_size) == 0 {
xlogoff = 0;
if partial {
fs::rename(&wal_file_partial_path, &wal_file_path)?;

View File

@@ -14,341 +14,221 @@
// TODO: Even though the postgres code runs in a separate process,
// it's not a secure sandbox.
//
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use std::assert;
use std::cell::RefCell;
use std::fs;
use std::fs::OpenOptions;
use std::io::prelude::*;
use std::io::Error;
use std::path::PathBuf;
use std::process::Stdio;
use std::sync::mpsc;
use std::sync::Mutex;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use tokio::io::AsyncBufReadExt;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::{ChildStdin, ChildStdout, Command};
use tokio::process::{Child, ChildStdin, ChildStdout, Command};
use tokio::runtime::Runtime;
use tokio::time::timeout;
use zenith_utils::lsn::Lsn;
use crate::page_cache::BufferTag;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crate::page_cache;
use crate::page_cache::CacheEntry;
use crate::page_cache::WALRecord;
use crate::PageServerConf;
use crate::ZTimelineId;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::XLogRecord;
use crate::{page_cache::BufferTag, pg_constants, PageServerConf};
static TIMEOUT: Duration = Duration::from_secs(20);
///
/// A WAL redo manager consists of two parts: WalRedoManager, and
/// WalRedoManagerInternal. WalRedoManager is the public struct
/// that can be used to send redo requests to the manager.
/// WalRedoManagerInternal is used by the manager thread itself.
///
pub struct WalRedoManager {
request_tx: Mutex<mpsc::Sender<WalRedoRequest>>,
}
//
// Main entry point for the WAL applicator thread.
//
pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
info!("WAL redo thread started {}", timelineid);
struct WalRedoManagerInternal {
_conf: PageServerConf,
timelineid: ZTimelineId,
// We block on waiting for requests on the walredo request channel, but
// use async I/O to communicate with the child process. Initialize the
// runtime for the async part.
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
request_rx: mpsc::Receiver<WalRedoRequest>,
}
let pcache = page_cache::get_pagecache(conf, timelineid).unwrap();
#[derive(Debug)]
struct WalRedoRequest {
tag: BufferTag,
lsn: Lsn,
// Loop forever, handling requests as they come.
let walredo_channel_receiver = &pcache.walredo_receiver;
loop {
let mut process: WalRedoProcess;
let datadir = format!("wal-redo/{}", timelineid);
base_img: Option<Bytes>,
records: Vec<WALRecord>,
response_channel: mpsc::Sender<Result<Bytes, WalRedoError>>,
}
/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
#[error(transparent)]
IoError(#[from] std::io::Error),
}
///
/// Public interface of WAL redo manager
///
impl WalRedoManager {
///
/// Create a new WalRedoManager.
///
/// This only initializes the struct. You need to call WalRedoManager::launch to
/// start the thread that processes the requests.
pub fn new(conf: &PageServerConf, timelineid: ZTimelineId) -> WalRedoManager {
let (tx, rx) = mpsc::channel();
//
// Launch the WAL redo thread
//
// Get mutable references to the values that we need to pass to the
// thread.
let request_rx = rx;
let conf_copy = conf.clone();
// Currently, the join handle is not saved anywhere and we
// won't try restart the thread if it dies.
let _walredo_thread = std::thread::Builder::new()
.name("WAL redo thread".into())
.spawn(move || {
let mut internal = WalRedoManagerInternal {
_conf: conf_copy,
timelineid,
request_rx,
};
internal.wal_redo_main();
})
.unwrap();
WalRedoManager {
request_tx: Mutex::new(tx),
info!("launching WAL redo postgres process {}", timelineid);
{
let _guard = runtime.enter();
process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
}
}
///
/// Request the WAL redo manager to apply WAL records, to reconstruct the page image
/// of the given page version.
///
pub fn request_redo(
&self,
tag: BufferTag,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
// Create a channel where to receive the response
let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
// Pretty arbitrarily, reuse the same Postgres process for 100 requests.
// After that, kill it and start a new one. This is mostly to avoid
// using up all shared buffers in Postgres's shared buffer cache; we don't
// want to write any pages to disk in the WAL redo process.
for _i in 1..100 {
let request = walredo_channel_receiver.recv().unwrap();
let request = WalRedoRequest {
tag,
lsn,
base_img,
records,
response_channel: tx,
};
self.request_tx
.lock()
.unwrap()
.send(request)
.expect("could not send WAL redo request");
rx.recv()
.expect("could not receive response to WAL redo request")
}
}
///
/// WAL redo thread
///
impl WalRedoManagerInternal {
//
// Main entry point for the WAL applicator thread.
//
fn wal_redo_main(&mut self) {
info!("WAL redo thread started {}", self.timelineid);
// We block on waiting for requests on the walredo request channel, but
// use async I/O to communicate with the child process. Initialize the
// runtime for the async part.
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let process: WalRedoProcess;
let datadir = format!("wal-redo/{}", self.timelineid);
info!("launching WAL redo postgres process {}", self.timelineid);
process = runtime.block_on(WalRedoProcess::launch(&datadir)).unwrap();
info!("WAL redo postgres started");
// Loop forever, handling requests as they come.
loop {
let request = self.request_rx.recv().unwrap();
let result = runtime.block_on(self.handle_apply_request(&process, &request));
let result_ok = result.is_ok();
// Send the result to the requester
let _ = request.response_channel.send(result);
if !result_ok {
error!("wal-redo-postgres filed to apply request {:?}", request);
let result = handle_apply_request(&pcache, &process, &runtime, request);
if result.is_err() {
// On error, kill the process.
break;
}
}
info!("killing WAL redo postgres process");
let _ = runtime.block_on(process.stdin.get_mut().shutdown());
let mut child = process.child;
drop(process.stdin);
let _ = runtime.block_on(child.wait());
}
}
fn transaction_id_set_status_bit(
xl_info: u8,
xl_rmid: u8,
xl_xid: u32,
record: WALRecord,
page: &mut BytesMut,
) {
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
let mut status = 0;
if info == pg_constants::XLOG_XACT_COMMIT {
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
} else if info == pg_constants::XLOG_XACT_ABORT {
status = pg_constants::TRANSACTION_STATUS_ABORTED;
} else {
trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {:X}/{:X} main_data_offset {}, rec.len {}",
status,
record.lsn >> 32,
record.lsn & 0xffffffff,
record.main_data_offset, record.rec.len());
return;
}
fn transaction_id_set_status_bit(&self, xid: u32, status: u8, page: &mut BytesMut) {
trace!(
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
status
);
trace!("handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort) lsn {:X}/{:X} main_data_offset {}, rec.len {}",
status,
record.lsn >> 32,
record.lsn & 0xffffffff,
record.main_data_offset, record.rec.len());
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let byteno: usize = ((xl_rmid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE)
* pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
let byteptr = &mut page[byteno..byteno + 1];
let bshift: u8 = ((xl_xid % pg_constants::CLOG_XACTS_PER_BYTE)
* pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
page[byteno] =
(page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
}
let mut curval = byteptr[0];
curval = (curval >> bshift) & pg_constants::CLOG_XACT_BITMASK;
///
/// Process one request for WAL redo.
///
async fn handle_apply_request(
&self,
process: &WalRedoProcess,
request: &WalRedoRequest,
) -> Result<Bytes, WalRedoError> {
let tag = request.tag;
let lsn = request.lsn;
let base_img = request.base_img.clone();
let records = &request.records;
let mut byteval = [0];
byteval[0] = curval;
byteval[0] &= !(((1 << pg_constants::CLOG_BITS_PER_XACT as u8) - 1) << bshift);
byteval[0] |= status << bshift;
let nrecords = records.len();
byteptr.copy_from_slice(&byteval);
trace!(
"xl_xid {} byteno {} curval {} byteval {}",
xl_xid,
byteno,
curval,
byteval[0]
);
}
let start = Instant::now();
fn handle_apply_request(
pcache: &page_cache::PageCache,
process: &WalRedoProcess,
runtime: &Runtime,
entry_rc: Arc<CacheEntry>,
) -> Result<(), Error> {
let tag = entry_rc.key.tag;
let lsn = entry_rc.key.lsn;
let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
let apply_result: Result<Bytes, Error>;
if tag.rel.forknum == pg_constants::PG_XACT_FORKNUM {
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
page.extend_from_slice(&fpi[..]);
} else {
page.extend_from_slice(&ZERO_PAGE);
}
for record in records {
let mut buf = record.rec.clone();
let mut entry = entry_rc.content.lock().unwrap();
entry.apply_pending = false;
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
let nrecords = records.len();
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
}
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::CLOG_ZEROPAGE {
page.clone_from_slice(&ZERO_PAGE);
}
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
let mut status = 0;
if info == pg_constants::XLOG_XACT_COMMIT {
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
self.transaction_id_set_status_bit(xlogrec.xl_xid, status, &mut page);
//handle subtrans
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag.blknum == blkno {
status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
self.transaction_id_set_status_bit(subxact, status, &mut page);
}
}
}
} else if info == pg_constants::XLOG_XACT_ABORT {
status = pg_constants::TRANSACTION_STATUS_ABORTED;
self.transaction_id_set_status_bit(xlogrec.xl_xid, status, &mut page);
//handle subtrans
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag.blknum == blkno {
status = pg_constants::TRANSACTION_STATUS_ABORTED;
self.transaction_id_set_status_bit(subxact, status, &mut page);
}
}
}
} else {
trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
status,
record.lsn,
record.main_data_offset, record.rec.len());
}
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
if tag.forknum == pg_constants::PG_XACT_FORKNUM as u8 {
//TODO use base image if any
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let zero_page_bytes: &[u8] = &ZERO_PAGE;
let mut page = BytesMut::from(zero_page_bytes);
for record in records {
let mut buf = record.rec.clone();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let _xl_tot_len = buf.get_u32_le();
let xl_xid = buf.get_u32_le();
let _xl_prev = buf.get_u64_le();
let xl_info = buf.get_u8();
let xl_rmid = buf.get_u8();
buf.advance(2); // 2 bytes of padding
let _xl_crc = buf.get_u32_le();
if xl_rmid == pg_constants::RM_CLOG_ID {
let info = xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::CLOG_ZEROPAGE {
page.clone_from_slice(zero_page_bytes);
trace!("handle_apply_request for RM_CLOG_ID-CLOG_ZEROPAGE lsn {:X}/{:X} main_data_offset {}, rec.len {}",
record.lsn >> 32,
record.lsn & 0xffffffff,
record.main_data_offset, record.rec.len());
}
} else if xl_rmid == pg_constants::RM_XACT_ID {
transaction_id_set_status_bit(xl_info, xl_rmid, xl_xid, record, &mut page);
}
apply_result = Ok::<Bytes, Error>(page.freeze());
} else {
apply_result = process.apply_wal_records(tag, base_img, records).await;
}
let duration = start.elapsed();
let result: Result<Bytes, WalRedoError>;
trace!(
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
if let Err(e) = apply_result {
error!("could not apply WAL records: {}", e);
result = Err(WalRedoError::IoError(e));
} else {
let img = apply_result.unwrap();
result = Ok(img);
}
// The caller is responsible for sending the response
result
apply_result = Ok::<Bytes, Error>(page.freeze());
} else {
apply_result = process.apply_wal_records(runtime, tag, base_img, records);
}
let duration = start.elapsed();
let result;
debug!(
"applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}",
nrecords,
duration.as_millis(),
lsn >> 32,
lsn & 0xffff_ffff
);
if let Err(e) = apply_result {
error!("could not apply WAL records: {}", e);
result = Err(e);
} else {
entry.page_image = Some(apply_result.unwrap());
pcache
.num_page_images
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
result = Ok(());
}
// Wake up the requester, whether the operation succeeded or not.
entry_rc.walredo_condvar.notify_all();
return result;
}
struct WalRedoProcess {
child: Child,
stdin: RefCell<ChildStdin>,
stdout: RefCell<ChildStdout>,
}
@@ -360,14 +240,16 @@ impl WalRedoProcess {
// Tests who run pageserver binary are setting proper PG_BIN_DIR
// and PG_LIB_DIR so that WalRedo would start right postgres. We may later
// switch to setting same things in pageserver config file.
async fn launch(datadir: &str) -> Result<WalRedoProcess, Error> {
fn launch(datadir: &str, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
// Create empty data directory for wal-redo postgres deleting old one.
fs::remove_dir_all(datadir).ok();
let initdb = Command::new("initdb")
.args(&["-D", datadir])
.arg("-N")
.output()
.await
let initdb = runtime
.block_on(
Command::new("initdb")
.args(&["-D", datadir])
.arg("-N")
.output(),
)
.expect("failed to execute initdb");
if !initdb.status.success() {
@@ -376,14 +258,8 @@ impl WalRedoProcess {
std::str::from_utf8(&initdb.stdout).unwrap(),
std::str::from_utf8(&initdb.stderr).unwrap()
);
} else {
// Limit shared cache for wal-redo-postgres
let mut config = OpenOptions::new()
.append(true)
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
config.write_all(b"shared_buffers=128kB\n")?;
config.write_all(b"fsync=off\n")?;
}
// Start postgres itself
let mut child = Command::new("postgres")
.arg("--wal-redo")
@@ -414,7 +290,7 @@ impl WalRedoProcess {
if res.unwrap() == 0 {
break;
}
error!("wal-redo-postgres: {}", line.trim());
debug!("wal-redo-postgres: {}", line.trim());
line.clear();
}
Ok::<(), Error>(())
@@ -422,6 +298,7 @@ impl WalRedoProcess {
tokio::spawn(f_stderr);
Ok(WalRedoProcess {
child,
stdin: RefCell::new(stdin),
stdout: RefCell::new(stdout),
})
@@ -431,91 +308,97 @@ impl WalRedoProcess {
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
async fn apply_wal_records(
fn apply_wal_records(
&self,
runtime: &Runtime,
tag: BufferTag,
base_img: Option<Bytes>,
records: &Vec<WALRecord>,
) -> Result<Bytes, std::io::Error> {
records: Vec<WALRecord>,
) -> Result<Bytes, Error> {
let mut stdin = self.stdin.borrow_mut();
let mut stdout = self.stdout.borrow_mut();
// We do three things simultaneously: send the old base image and WAL records to
// the child process's stdin, read the result from child's stdout, and forward any logging
// information that the child writes to its stderr to the page server's log.
//
// 'f_stdin' handles writing the base image and WAL records to the child process.
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
// tokio runtime in the 'launch' function already, forwards the logging.
let f_stdin = async {
// Send base image, if any. (If the record initializes the page, previous page
// version is not needed.)
timeout(
TIMEOUT,
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
)
.await??;
if base_img.is_some() {
return runtime.block_on(async {
//
// This async block sends all the commands to the process.
//
// For reasons I don't understand, this needs to be a "move" block;
// otherwise the stdin pipe doesn't get closed, despite the shutdown()
// call.
//
let f_stdin = async {
// Send base image, if any. (If the record initializes the page, previous page
// version is not needed.)
timeout(
TIMEOUT,
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
)
.await??;
}
if base_img.is_some() {
timeout(
TIMEOUT,
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
)
.await??;
}
// Send WAL records.
for rec in records.iter() {
let r = rec.clone();
// Send WAL records.
for rec in records.iter() {
let r = rec.clone();
stdin
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;
stdin
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
// r.lsn >> 32, r.lsn & 0xffff_ffff);
}
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
// r.lsn >> 32, r.lsn & 0xffff_ffff);
}
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
// Send GetPage command to get the result back
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
timeout(TIMEOUT, stdin.flush()).await??;
//debug!("sent GetPage for {}", tag.blknum);
Ok::<(), Error>(())
};
// Send GetPage command to get the result back
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
timeout(TIMEOUT, stdin.flush()).await??;
//debug!("sent GetPage for {}", tag.blknum);
Ok::<(), Error>(())
};
// Read back new page image
let f_stdout = async {
let mut buf = [0u8; 8192];
// Read back new page image
let f_stdout = async {
let mut buf = [0u8; 8192];
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
//debug!("got response for {}", tag.blknum);
Ok::<[u8; 8192], Error>(buf)
};
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
//debug!("got response for {}", tag.blknum);
Ok::<[u8; 8192], Error>(buf)
};
let res = futures::try_join!(f_stdout, f_stdin)?;
// Kill the process. This closes its stdin, which should signal the process
// to terminate. TODO: SIGKILL if needed
//child.wait();
let buf = res.0;
let res = futures::try_join!(f_stdout, f_stdin)?;
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
let buf = res.0;
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
});
}
}
// Functions for constructing messages to send to the postgres WAL redo
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
// explanation of the protocol.
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
let len = 4 + 5 * 4;
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'B');
buf.put_u8('B' as u8);
buf.put_u32(len as u32);
tag.pack(&mut buf);
buf.put_u32(tag.spcnode);
buf.put_u32(tag.dbnode);
buf.put_u32(tag.relnode);
buf.put_u32(tag.forknum as u32);
buf.put_u32(tag.blknum);
assert!(buf.len() == 1 + len);
buf.freeze()
return buf.freeze();
}
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
@@ -524,39 +407,47 @@ fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
let len = 4 + 5 * 4 + base_img.len();
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'P');
buf.put_u8('P' as u8);
buf.put_u32(len as u32);
tag.pack(&mut buf);
buf.put_u32(tag.spcnode);
buf.put_u32(tag.dbnode);
buf.put_u32(tag.relnode);
buf.put_u32(tag.forknum as u32);
buf.put_u32(tag.blknum);
buf.put(base_img);
assert!(buf.len() == 1 + len);
buf.freeze()
return buf.freeze();
}
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes {
let len = 4 + 8 + rec.len();
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'A');
buf.put_u8('A' as u8);
buf.put_u32(len as u32);
buf.put_u64(endlsn.0);
buf.put_u64(endlsn);
buf.put(rec);
assert!(buf.len() == 1 + len);
buf.freeze()
return buf.freeze();
}
fn build_get_page_msg(tag: BufferTag) -> Bytes {
let len = 4 + 5 * 4;
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'G');
buf.put_u8('G' as u8);
buf.put_u32(len as u32);
tag.pack(&mut buf);
buf.put_u32(tag.spcnode);
buf.put_u32(tag.dbnode);
buf.put_u32(tag.relnode);
buf.put_u32(tag.forknum as u32);
buf.put_u32(tag.blknum);
assert!(buf.len() == 1 + len);
buf.freeze()
return buf.freeze();
}

33
pgbuild.sh Executable file
View File

@@ -0,0 +1,33 @@
#!/bin/sh
#
# Purpose of this script is to build and install postgres in a local directory
# so that zenith integration tests can find the pg binaries and support files.
#
# ./pgbuild.sh does the following:
#
# 1) runs an out-of-source build of postgres in the REPO_ROOT/tmp_install/build
#    directory (tmp_install is reused here since it is already present in
#    .gitignore)
#
# 2) installs postgres to REPO_ROOT/tmp_install/
#

# Halt immediately if any command fails
set -e

# Resolve the directory containing this script to an absolute path.
# CDPATH is cleared so that `cd` cannot print a directory name into the
# command substitution, and every expansion is quoted so that a checkout
# path containing spaces or glob characters still works.
REPO_ROOT=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)

# configure
echo "Configuring postgres build"
mkdir -p "$REPO_ROOT/tmp_install/build"
cd "$REPO_ROOT/tmp_install/build"
# --prefix=/ together with DESTDIR below places the install under tmp_install.
../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
    --enable-depend --with-libxml --prefix=/ > configure.log

# compile
echo "Compiling postgres"
make -j8 -s

# DESTDIR is read by `make install`; assign and export separately so the
# value is never subject to word splitting in any POSIX shell.
DESTDIR="$REPO_ROOT/tmp_install"
export DESTDIR
echo "Installing postgres to $DESTDIR"
make install -s

View File

@@ -14,7 +14,6 @@ byteorder = "1.4.3"
anyhow = "1.0"
crc32c = "0.6.0"
hex = "0.4.3"
log = "0.4.14"
[build-dependencies]
bindgen = "0.57"
bindgen = "0.53.1"

View File

@@ -1,3 +0,0 @@
This module contains utility functions for interacting with PostgreSQL
file formats.

View File

@@ -24,7 +24,7 @@ fn main() {
// Path to the server include dir. It is in tmp_install/include/server, if you did
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
// and used DESTDIR to move it into tmp_install, then it's in
// tmp_install/include/postgres/server
// tmp_install/include/postgres/server (that's how the pgbuild.sh script does it).
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
// but this will do for now.
.clang_arg("-I../tmp_install/include/server")

View File

@@ -3,12 +3,7 @@
#![allow(non_snake_case)]
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
pub mod pg_constants;
pub mod xlog_utils;
use bytes::{Buf, Bytes, BytesMut};
use std::error::Error;
use std::fmt;
// sizeof(ControlFileData)
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
@@ -23,13 +18,13 @@ impl ControlFileData {
controlfile =
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
controlfile
return controlfile;
}
}
pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
pub fn decode_pg_control(buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
buf.copy_to_slice(&mut b);
buf.clone().copy_to_slice(&mut b);
let controlfile: ControlFileData;
@@ -51,50 +46,6 @@ pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Erro
Ok(controlfile)
}
/// Error carrying a human-readable message about a file name or fork name
/// that could not be interpreted.
///
/// Built either from a literal message via [`FilePathError::new`] or from an
/// integer parse failure via `From<ParseIntError>`.
#[derive(Debug, Clone)]
pub struct FilePathError {
    msg: String,
}

impl Error for FilePathError {
    // Legacy accessor, kept so existing callers of `description()` still
    // receive the stored message.
    fn description(&self) -> &str {
        &self.msg
    }
}

impl FilePathError {
    /// Creates an error holding a copy of `msg`.
    pub fn new(msg: &str) -> FilePathError {
        FilePathError {
            msg: msg.to_string(),
        }
    }
}

impl From<core::num::ParseIntError> for FilePathError {
    /// Wraps an integer parse failure, prefixing it with "invalid filename: ".
    fn from(e: core::num::ParseIntError) -> Self {
        FilePathError {
            msg: format!("invalid filename: {}", e),
        }
    }
}

impl fmt::Display for FilePathError {
    /// Writes the stored message.
    ///
    /// Previously this always printed the fixed text "invalid filename",
    /// which hid both the "invalid forkname" case and the parse-error detail.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str(&self.msg)
    }
}
/// Maps the optional fork suffix of a file name to its fork number.
///
/// `None` selects the main fork; `"fsm"`, `"vm"` and `"init"` select the
/// corresponding `pg_constants` fork numbers.
///
/// # Errors
///
/// Returns a `FilePathError` with the message "invalid forkname" for any
/// other suffix.
pub fn forkname_to_forknum(forkname: Option<&str>) -> Result<u8, FilePathError> {
    let forknum = match forkname {
        // "main" is not in filenames, it's implicit if the fork name is not present
        None => pg_constants::MAIN_FORKNUM,
        Some("fsm") => pg_constants::FSM_FORKNUM,
        Some("vm") => pg_constants::VISIBILITYMAP_FORKNUM,
        Some("init") => pg_constants::INIT_FORKNUM,
        Some(_) => return Err(FilePathError::new("invalid forkname")),
    };
    Ok(forknum)
}
pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
let b: [u8; SIZEOF_CONTROLDATA];
@@ -112,5 +63,5 @@ pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
// Fill the rest of the control file with zeros.
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
buf.into()
return buf.into();
}

View File

@@ -1,130 +0,0 @@
// From pg_tablespace_d.h
//
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
pub const GLOBALTABLESPACE_OID: u32 = 1664;
//TODO maybe use enum?
pub const MAIN_FORKNUM: u8 = 0;
pub const FSM_FORKNUM: u8 = 1;
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
pub const INIT_FORKNUM: u8 = 3;
//Special values for non-rel files' tags
pub const PG_CONTROLFILE_FORKNUM: u8 = 42;
pub const PG_FILENODEMAP_FORKNUM: u8 = 43;
pub const PG_XACT_FORKNUM: u8 = 44;
pub const PG_MXACT_OFFSETS_FORKNUM: u8 = 45;
pub const PG_MXACT_MEMBERS_FORKNUM: u8 = 46;
//
// constants from clog.h
//
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
pub const CLOG_BITS_PER_XACT: u8 = 2;
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
//
// Constants from visibilitymap.h
//
pub const SIZE_OF_PAGE_HEADER: u16 = 24;
pub const BITS_PER_HEAPBLOCK: u16 = 2;
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
pub const CLOG_ZEROPAGE: u8 = 0x00;
pub const CLOG_TRUNCATE: u8 = 0x10;
// From xact.h
pub const XLOG_XACT_COMMIT: u8 = 0x00;
pub const XLOG_XACT_ABORT: u8 = 0x20;
// From slru.h
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
/* mask for filtering opcodes out of xl_info */
pub const XLOG_XACT_OPMASK: u8 = 0x70;
/* does this record have a 'xinfo' field or not */
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
/*
* The following flags, stored in xinfo, determine which information is
* contained in commit/abort records.
*/
pub const XACT_XINFO_HAS_DBINFO: u32 = 1u32 << 0;
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
// pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
// pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
// From pg_control.h and rmgrlist.h
pub const XLOG_SWITCH: u8 = 0x40;
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
// From heapam_xlog.h
pub const XLOG_HEAP_INSERT: u8 = 0x00;
pub const XLOG_HEAP_DELETE: u8 = 0x10;
pub const XLOG_HEAP_UPDATE: u8 = 0x20;
pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const RM_XLOG_ID: u8 = 0;
pub const RM_XACT_ID: u8 = 1;
pub const RM_SMGR_ID: u8 = 2;
pub const RM_CLOG_ID: u8 = 3;
pub const RM_DBASE_ID: u8 = 4;
pub const RM_TBLSPC_ID: u8 = 5;
pub const RM_MULTIXACT_ID: u8 = 6;
pub const RM_RELMAP_ID: u8 = 7;
pub const RM_STANDBY_ID: u8 = 8;
pub const RM_HEAP2_ID: u8 = 9;
pub const RM_HEAP_ID: u8 = 10;
// from xlogreader.h
pub const XLR_INFO_MASK: u8 = 0x0F;
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
// from dbcommands_xlog.h
pub const XLOG_DBASE_CREATE: u8 = 0x00;
pub const XLOG_DBASE_DROP: u8 = 0x10;
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
pub const SIZEOF_XLOGRECORD: u32 = 24;
// FIXME:
pub const BLCKSZ: u16 = 8192;
//
// from xlogrecord.h
//
pub const XLR_MAX_BLOCK_ID: u8 = 32;
pub const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
pub const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
pub const XLR_BLOCK_ID_ORIGIN: u8 = 253;
pub const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
pub const BKPBLOCK_FORK_MASK: u8 = 0x0F;
pub const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
pub const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
pub const BKPBLOCK_HAS_DATA: u8 = 0x20;
pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
/* Information stored in bimg_info */
pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */

2373
walkeeper/Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -20,7 +20,6 @@ slog = "2.7.0"
log = "0.4.14"
clap = "2.33.0"
daemonize = "0.4.1"
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
tokio = { version = "1.3.0", features = ["full"] }
tokio-stream = { version = "0.1.4" }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
@@ -28,9 +27,6 @@ postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
anyhow = "1.0"
crc32c = "0.6.0"
parse_duration = "*"
walkdir = "2"
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
pageserver = { path = "../pageserver" }
postgres_ffi = { path = "../postgres_ffi" }

View File

@@ -3,11 +3,10 @@
//
use daemonize::Daemonize;
use log::*;
use parse_duration::parse;
use std::io;
use std::path::{Path, PathBuf};
use std::path::Path;
use std::path::PathBuf;
use std::thread;
use std::time::Duration;
use std::{fs::File, fs::OpenOptions};
use anyhow::Result;
@@ -15,7 +14,6 @@ use clap::{App, Arg};
use slog::Drain;
use walkeeper::s3_offload;
use walkeeper::wal_service;
use walkeeper::WalAcceptorConf;
@@ -47,13 +45,8 @@ fn main() -> Result<()> {
Arg::with_name("pageserver")
.short("p")
.long("pageserver")
.takes_value(true),
)
.arg(
Arg::with_name("ttl")
.long("ttl")
.takes_value(true)
.help("interval for keeping WAL as walkeeper node, after which them will be uploaded to S3 and removed locally"),
.help("address ip:port of pageserver with which wal_acceptor should establish connection"),
)
.arg(
Arg::with_name("daemonize")
@@ -76,12 +69,11 @@ fn main() -> Result<()> {
let mut conf = WalAcceptorConf {
data_dir: PathBuf::from("./"),
systemid,
systemid: systemid,
daemonize: false,
no_sync: false,
pageserver_addr: None,
listen_addr: "127.0.0.1:5454".parse()?,
ttl: None,
};
if let Some(dir) = arg_matches.value_of("datadir") {
@@ -107,10 +99,6 @@ fn main() -> Result<()> {
conf.pageserver_addr = Some(addr.parse().unwrap());
}
if let Some(ttl) = arg_matches.value_of("ttl") {
conf.ttl = Some::<Duration>(parse(ttl)?);
}
start_wal_acceptor(conf)
}
@@ -150,19 +138,6 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
}
let mut threads = Vec::new();
if conf.ttl.is_some() {
let s3_conf = conf.clone();
let s3_offload_thread = thread::Builder::new()
.name("S3 offload thread".into())
.spawn(|| {
// thread code
s3_offload::thread_main(s3_conf);
})
.unwrap();
threads.push(s3_offload_thread);
}
let wal_acceptor_thread = thread::Builder::new()
.name("WAL acceptor thread".into())
.spawn(|| {

View File

@@ -1,11 +1,10 @@
//
use std::net::SocketAddr;
use std::path::PathBuf;
use std::time::Duration;
mod pq_protocol;
pub mod s3_offload;
pub mod wal_service;
pub mod xlog_utils;
use crate::pq_protocol::SystemId;
@@ -17,5 +16,4 @@ pub struct WalAcceptorConf {
pub no_sync: bool,
pub listen_addr: SocketAddr,
pub pageserver_addr: Option<SocketAddr>,
pub ttl: Option<Duration>,
}

View File

@@ -40,7 +40,6 @@ pub struct FeStartupMessage {
pub version: u32,
pub kind: StartupRequestCode,
pub timelineid: ZTimelineId,
pub appname: Option<String>,
}
#[derive(Debug)]
@@ -87,17 +86,15 @@ impl FeStartupMessage {
let params = params_str.split('\0');
let mut options = false;
let mut timelineid: Option<ZTimelineId> = None;
let mut appname: Option<String> = None;
for p in params {
if p == "options" {
options = true;
} else if options {
for opt in p.split(' ') {
if let Some(ztimelineid_str) = opt.strip_prefix("ztimelineid=") {
if opt.starts_with("ztimelineid=") {
// FIXME: rethrow parsing error, don't unwrap
timelineid = Some(ZTimelineId::from_str(ztimelineid_str).unwrap());
} else if let Some(val) = opt.strip_prefix("application_name=") {
appname = Some(val.to_string());
timelineid = Some(ZTimelineId::from_str(&opt[12..]).unwrap());
break;
}
}
break;
@@ -114,7 +111,6 @@ impl FeStartupMessage {
Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
version,
kind,
appname,
timelineid: timelineid.unwrap(),
})))
}

View File

@@ -1,106 +0,0 @@
//
// Offload old WAL segments to S3 and remove them locally
//
use anyhow::Result;
use log::*;
use postgres_ffi::xlog_utils::*;
use s3::bucket::Bucket;
use s3::creds::Credentials;
use s3::region::Region;
use std::collections::HashSet;
use std::env;
use std::fs::{self, File};
use std::io::prelude::*;
use std::iter::FromIterator;
use std::path::PathBuf;
use std::time::SystemTime;
use tokio::runtime;
use tokio::time::sleep;
use walkdir::WalkDir;
use crate::WalAcceptorConf;
/// Entry point of the S3 offload thread.
///
/// Builds a dedicated current-thread Tokio runtime and drives the offload
/// loop on it. Panics if the runtime cannot be built or if the loop returns
/// an error.
pub fn thread_main(conf: WalAcceptorConf) {
    // FIXME: keep it single-threaded for now, make it easier to debug with gdb,
    // and we're not concerned with performance yet.
    let mut builder = runtime::Builder::new_current_thread();
    builder.enable_all();
    let offload_runtime = builder.build().unwrap();

    info!("Starting S3 offload task");

    let offload_task = async {
        main_loop(&conf).await.unwrap();
    };
    offload_runtime.block_on(offload_task);
}
/// Uploads expired WAL segments found under `dir_path` to `bucket` and removes
/// the local copies.
///
/// A file is offloaded when it is a regular file, its name passes
/// `IsXLogFileName`, its creation time is at least `conf.ttl` in the past, and
/// its key (`walarchive/` + path relative to `conf.data_dir`) is not already
/// in `listing`. Returns the number of files uploaded.
///
/// # Errors
///
/// Directory-walk, metadata, read, upload and delete failures are returned to
/// the caller instead of panicking, as is a path that is not under
/// `conf.data_dir` or is not valid UTF-8.
///
/// # Panics
///
/// Panics if `conf.ttl` is `None` (presumably the offload thread is only
/// started when a TTL is configured; confirm against the caller) or if
/// `now - ttl` is not representable as a `SystemTime`.
async fn offload_files(
    bucket: &Bucket,
    listing: &HashSet<String>,
    dir_path: &PathBuf,
    conf: &WalAcceptorConf,
) -> Result<u64> {
    let ttl = conf.ttl.expect("S3 offload requires conf.ttl to be set");
    let horizon = SystemTime::now() - ttl;
    let mut n: u64 = 0;
    for entry in WalkDir::new(dir_path) {
        let entry = entry?;
        let path = entry.path();
        if !path.is_file() {
            continue;
        }

        // A name that is not valid UTF-8 is skipped rather than treated as a
        // fatal error, since it cannot be handed to the name check at all.
        let is_wal_segment = entry
            .file_name()
            .to_str()
            .map_or(false, |name| IsXLogFileName(name));
        if !is_wal_segment {
            continue;
        }

        // Too young to offload yet.
        if entry.metadata()?.created()? > horizon {
            continue;
        }

        let relpath = path.strip_prefix(&conf.data_dir)?;
        let relpath = relpath.to_str().ok_or_else(|| {
            std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!("WAL path {} is not valid UTF-8", path.display()),
            )
        })?;
        let s3path = format!("walarchive/{}", relpath);

        // NOTE(review): a segment already present in the listing is never
        // removed locally; confirm that this is intended.
        if listing.contains(&s3path) {
            continue;
        }

        // Upload first, delete second: the local copy must survive a failed upload.
        let content = fs::read(path)?;
        bucket.put_object(s3path, &content).await?;
        fs::remove_file(path)?;
        n += 1;
    }
    Ok(n)
}
/// Periodically offloads old WAL segments to S3; loops until an error occurs.
///
/// Connection settings are read from the `S3_REGION`, `S3_ENDPOINT`,
/// `S3_ACCESSKEY` and `S3_SECRET` environment variables; a missing variable
/// or unusable credentials cause a panic. Each round lists the `walarchive/`
/// prefix, offloads whatever is not listed yet, then sleeps for `conf.ttl`
/// (panicking if it is `None`).
async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
    let s3_region = env::var("S3_REGION").unwrap();
    let s3_endpoint = env::var("S3_ENDPOINT").unwrap();
    let region = Region::Custom {
        region: s3_region,
        endpoint: s3_endpoint,
    };

    let access_key = env::var("S3_ACCESSKEY").unwrap();
    let secret_key = env::var("S3_SECRET").unwrap();
    let credentials = Credentials::new(
        Some(access_key.as_str()),
        Some(secret_key.as_str()),
        None,
        None,
        None,
    )
    .unwrap();

    // Create Bucket in REGION for BUCKET
    let bucket = Bucket::new_with_path_style("zenith-testbucket", region, credentials)?;

    loop {
        // List out contents of directory
        let results = bucket
            .list("walarchive/".to_string(), Some("".to_string()))
            .await?;

        // Gather the keys of every listed object so they are not uploaded again.
        let mut listing = HashSet::new();
        for page in results.iter() {
            for object in page.contents.iter() {
                listing.insert(object.key.clone());
            }
        }

        let n = offload_files(&bucket, &listing, &conf.data_dir, conf).await?;
        info!("Offload {} files to S3", n);
        sleep(conf.ttl.unwrap()).await;
    }
}

View File

@@ -29,9 +29,9 @@ use tokio::task;
use tokio_postgres::{connect, Error, NoTls};
use crate::pq_protocol::*;
use crate::xlog_utils::*;
use crate::WalAcceptorConf;
use pageserver::ZTimelineId;
use postgres_ffi::xlog_utils::*;
type FullTransactionId = u64;
@@ -158,12 +158,11 @@ pub struct Timeline {
#[derive(Debug)]
struct Connection {
timeline: Option<Arc<Timeline>>,
stream: TcpStream, /* Postgres connection */
inbuf: BytesMut, /* input buffer */
outbuf: BytesMut, /* output buffer */
init_done: bool, /* startup packet proceeded */
appname: Option<String>, /* assigned application name */
conf: WalAcceptorConf, /* wal acceptor configuration */
stream: TcpStream, /* Postgres connection */
inbuf: BytesMut, /* input buffer */
outbuf: BytesMut, /* output buffer */
init_done: bool, /* startup packet proceeded */
conf: WalAcceptorConf, /* wal acceptor configuration */
}
/*
@@ -194,14 +193,14 @@ fn parse_hex_str(s: &str) -> Result<u64> {
impl Serializer for NodeId {
fn pack(&self, buf: &mut BytesMut) {
buf.put_u64_le(self.term);
buf.put_u128(self.uuid); // use big endian to provide compatibility with memcmp
buf.put_u128_le(self.uuid);
buf.put_u64(self.term); // use big endian to provide compatibility with memcmp
}
fn unpack(buf: &mut BytesMut) -> NodeId {
NodeId {
term: buf.get_u64_le(),
uuid: buf.get_u128(), // use big endian to provide compatibility with memcmp
uuid: buf.get_u128_le(),
term: buf.get_u64(), // use big endian to provide compatibility with memcmp
}
}
}
@@ -445,7 +444,7 @@ impl Timeline {
fn get_hs_feedback(&self) -> HotStandbyFeedback {
let shared_state = self.mutex.lock().unwrap();
shared_state.hs_feedback
return shared_state.hs_feedback;
}
// Load and lock control file (prevent running more than one instance of safekeeper)
@@ -528,7 +527,7 @@ impl Timeline {
let file = shared_state.control_file.as_mut().unwrap();
file.seek(SeekFrom::Start(0))?;
file.write_all(&buf[..])?;
file.write_all(&mut buf[..])?;
if sync {
file.sync_all()?;
}
@@ -544,7 +543,6 @@ impl Connection {
inbuf: BytesMut::with_capacity(10 * 1024),
outbuf: BytesMut::with_capacity(10 * 1024),
init_done: false,
appname: None,
conf: conf.clone(),
}
}
@@ -556,7 +554,7 @@ impl Connection {
async fn run(&mut self) -> Result<()> {
self.inbuf.resize(4, 0u8);
self.stream.read_exact(&mut self.inbuf[0..4]).await?;
let startup_pkg_len = BigEndian::read_u32(&self.inbuf[0..4]);
let startup_pkg_len = BigEndian::read_u32(&mut self.inbuf[0..4]);
if startup_pkg_len == 0 {
self.receive_wal().await?; // internal protocol between wal_proposer and wal_acceptor
} else {
@@ -857,7 +855,6 @@ impl Connection {
self.send().await?;
self.init_done = true;
self.set_timeline(m.timelineid)?;
self.appname = m.appname;
}
StartupRequestCode::Cancel => return Ok(()),
}
@@ -941,7 +938,7 @@ impl Connection {
let mut caps = re.captures_iter(str::from_utf8(&cmd[..]).unwrap());
let cap = caps.next().unwrap();
let mut start_pos: XLogRecPtr = (parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])?;
let mut stop_pos: XLogRecPtr = if let Some(cap) = caps.next() {
let stop_pos: XLogRecPtr = if let Some(cap) = caps.next() {
(parse_hex_str(&cap[1])? << 32) | parse_hex_str(&cap[2])?
} else {
0
@@ -954,9 +951,6 @@ impl Connection {
if start_pos == 0 {
start_pos = wal_end;
}
if stop_pos == 0 && self.appname == Some("wal_proposer_recovery".to_string()) {
stop_pos = wal_end;
}
info!(
"Start replication from {:X}/{:>08X} till {:X}/{:>08X}",
(start_pos >> 32) as u32,
@@ -1003,12 +997,12 @@ impl Connection {
// Try to fetch replica's feedback
match self.stream.try_read_buf(&mut self.inbuf) {
Ok(0) => break,
Ok(_) => {
if let Some(FeMessage::CopyData(m)) = self.parse_message()? {
self.timeline()
.add_hs_feedback(HotStandbyFeedback::parse(&m.body))
}
}
Ok(_) => match self.parse_message()? {
Some(FeMessage::CopyData(m)) => self
.timeline()
.add_hs_feedback(HotStandbyFeedback::parse(&m.body)),
_ => {}
},
Err(ref e) if e.kind() == io::ErrorKind::WouldBlock => {}
Err(e) => {
return Err(e);
@@ -1108,7 +1102,7 @@ impl Connection {
let mut bytes_written: usize = 0;
let mut partial;
let mut start_pos = startpos;
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ];
/* Extract WAL location for this block */
let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;

View File

@@ -1,15 +1,4 @@
//
// This file contains common utilities for dealing with PostgreSQL WAL files and
// LSNs.
//
// Many of these functions have been copied from PostgreSQL, and rewritten in
// Rust. That's why they don't follow the usual Rust naming conventions, they
// have been named the same as the corresponding PostgreSQL functions instead.
//
use crate::pg_constants;
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, Bytes};
use crc32c::*;
use log::*;
use std::cmp::min;
@@ -34,17 +23,17 @@ pub type XLogSegNo = u64;
#[allow(non_snake_case)]
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
(xlogptr as u32) & (wal_segsz_bytes as u32 - 1)
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
}
#[allow(non_snake_case)]
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
}
#[allow(non_snake_case)]
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
xlogptr / wal_segsz_bytes as u64
return xlogptr / wal_segsz_bytes as u64;
}
#[allow(non_snake_case)]
@@ -53,7 +42,7 @@ pub fn XLogSegNoOffsetToRecPtr(
offset: u32,
wal_segsz_bytes: usize,
) -> XLogRecPtr {
segno * (wal_segsz_bytes as u64) + (offset as u64)
return segno * (wal_segsz_bytes as u64) + (offset as u64);
}
#[allow(non_snake_case)]
@@ -71,7 +60,7 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
}
#[allow(non_snake_case)]
@@ -81,7 +70,7 @@ pub fn IsXLogFileName(fname: &str) -> bool {
#[allow(non_snake_case)]
pub fn IsPartialXLogFileName(fname: &str) -> bool {
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
return fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8]);
}
pub fn get_current_timestamp() -> TimestampTz {
@@ -192,12 +181,9 @@ fn find_end_of_wal_segment(
}
}
}
last_valid_rec_pos as u32
return last_valid_rec_pos as u32;
}
///
/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL.
///
pub fn find_end_of_wal(
data_dir: &Path,
wal_seg_size: usize,
@@ -251,7 +237,7 @@ pub fn find_end_of_wal(
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
return (high_ptr, high_tli);
}
(0, 0)
return (0, 0);
}
pub fn main() {
@@ -266,39 +252,3 @@ pub fn main() {
tli
);
}
//
// Xlog record parsing routines
// TODO move here other related code from waldecoder.rs
//
/// Header fields of a WAL record, as decoded by [`XLogRecord::from_bytes`].
#[repr(C)]
#[derive(Debug)]
pub struct XLogRecord {
pub xl_tot_len: u32,
pub xl_xid: u32,
pub xl_prev: u64,
pub xl_info: u8,
pub xl_rmid: u8,
pub xl_crc: u32,
}
impl XLogRecord {
/// Decodes a record header from the front of `buf`, consuming the bytes it reads.
///
/// All multi-byte fields are read as little-endian. The reads add up to
/// 24 bytes: 4 + 4 + 8 + 1 + 1, two skipped bytes, then 4 for the CRC.
/// NOTE(review): the `bytes::Buf` getters presumably panic when `buf` is
/// shorter than that; confirm callers check the length first.
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
XLogRecord {
xl_tot_len: buf.get_u32_le(),
xl_xid: buf.get_u32_le(),
xl_prev: buf.get_u64_le(),
xl_info: buf.get_u8(),
xl_rmid: buf.get_u8(),
xl_crc: {
// Skip the two bytes between xl_rmid and xl_crc
// (presumably alignment padding; confirm against xlogrecord.h).
buf.advance(2);
buf.get_u32_le()
},
}
}
// Is this record an XLOG_SWITCH record? They need some special processing,
/// Returns true when `xl_info` equals `XLOG_SWITCH` and `xl_rmid` equals `RM_XLOG_ID`.
pub fn is_xlog_switch_record(&self) -> bool {
self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID
}
}

View File

@@ -34,7 +34,10 @@ fn main() -> Result<()> {
.required(true);
let matches = App::new("zenith")
.about("Zenith CLI")
.subcommand(SubCommand::with_name("init").about("Initialize a new Zenith repository"))
.subcommand(
SubCommand::with_name("init")
.about("Initialize a new Zenith repository in current directory"),
)
.subcommand(
SubCommand::with_name("branch")
.about("Create a new branch")
@@ -73,10 +76,10 @@ fn main() -> Result<()> {
// all other commands would need config
let repopath = zenith_repo_dir();
let repopath = PathBuf::from(zenith_repo_dir());
if !repopath.exists() {
bail!(
"Zenith repository does not exist in {}.\n\
"Zenith repository does not exists in {}.\n\
Set ZENITH_REPO_DIR or initialize a new repository with 'zenith init'",
repopath.display()
);
@@ -183,7 +186,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let node = cplane
.nodes
.get(name)
.ok_or_else(|| anyhow!("postgres {} is not found", name))?;
.ok_or(anyhow!("postgres {} is not found", name))?;
node.start()?;
}
("stop", Some(sub_m)) => {
@@ -191,7 +194,7 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
let node = cplane
.nodes
.get(name)
.ok_or_else(|| anyhow!("postgres {} is not found", name))?;
.ok_or(anyhow!("postgres {} is not found", name))?;
node.stop()?;
}
@@ -274,19 +277,19 @@ fn list_branches() -> Result<()> {
//
//
fn parse_point_in_time(s: &str) -> Result<local_env::PointInTime> {
let mut strings = s.split('@');
let mut strings = s.split("@");
let name = strings.next().unwrap();
let lsn: Option<u64>;
if let Some(lsnstr) = strings.next() {
let mut s = lsnstr.split('/');
let mut s = lsnstr.split("/");
let lsn_hi: u64 = s
.next()
.ok_or_else(|| anyhow!("invalid LSN in point-in-time specification"))?
.ok_or(anyhow!("invalid LSN in point-in-time specification"))?
.parse()?;
let lsn_lo: u64 = s
.next()
.ok_or_else(|| anyhow!("invalid LSN in point-in-time specification"))?
.ok_or(anyhow!("invalid LSN in point-in-time specification"))?
.parse()?;
lsn = Some(lsn_hi << 32 | lsn_lo);
} else {
@@ -309,8 +312,11 @@ fn parse_point_in_time(s: &str) -> Result<local_env::PointInTime> {
let pointstr = fs::read_to_string(branchpath)?;
let mut result = parse_point_in_time(&pointstr)?;
result.lsn = lsn.unwrap_or(0);
if lsn.is_some() {
result.lsn = lsn.unwrap();
} else {
result.lsn = 0;
}
return Ok(result);
}

View File

@@ -5,4 +5,3 @@ authors = ["Eric Seppanen <eric@zenith.tech>"]
edition = "2018"
[dependencies]
thiserror = "1"

View File

@@ -1,10 +1,2 @@
//! zenith_utils is intended to be a place to put code that is shared
//! between other crates in this repository.
/// `Lsn` type implements common tasks on Log Sequence Numbers
pub mod lsn;
/// SeqWait allows waiting for a future sequence number to arrive
pub mod seqwait;
// Async version of SeqWait. Currently unused.
// pub mod seqwait_async;

View File

@@ -1,251 +0,0 @@
#![warn(missing_docs)]
use std::fmt;
use std::ops::{Add, AddAssign};
use std::path::Path;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
/// Transaction log block size in bytes
pub const XLOG_BLCKSZ: u32 = 8192;
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
#[derive(Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd)]
pub struct Lsn(pub u64);
/// We tried to parse an LSN from a string, but failed
#[derive(Debug, PartialEq, thiserror::Error)]
#[error("LsnParseError")]
pub struct LsnParseError;
impl Lsn {
    /// Maximum possible value for an LSN
    pub const MAX: Lsn = Lsn(u64::MAX);

    /// Subtract a number, returning None on overflow.
    pub fn checked_sub<T: Into<u64>>(self, other: T) -> Option<Lsn> {
        self.0.checked_sub(other.into()).map(Lsn)
    }

    /// Parse an LSN from a filename in the form `0000000000000000`
    ///
    /// Fails with `LsnParseError` if the name is not valid UTF-8 or is not a
    /// hexadecimal number that fits in 64 bits.
    pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
    where
        F: AsRef<Path>,
    {
        let filename = filename.as_ref().to_str().ok_or(LsnParseError)?;
        Lsn::from_hex(filename)
    }

    /// Parse an LSN from a string in the form `0000000000000000`
    pub fn from_hex<S>(s: S) -> Result<Self, LsnParseError>
    where
        S: AsRef<str>,
    {
        u64::from_str_radix(s.as_ref(), 16)
            .map(Lsn)
            .map_err(|_| LsnParseError)
    }

    /// Compute the offset into a segment
    ///
    /// Panics if `seg_sz` is zero.
    pub fn segment_offset(self, seg_sz: u64) -> u64 {
        self.0 % seg_sz
    }

    /// Compute the segment number
    ///
    /// Panics if `seg_sz` is zero.
    pub fn segment_number(self, seg_sz: u64) -> u64 {
        self.0 / seg_sz
    }

    /// Compute the offset into a block
    pub fn block_offset(self) -> u64 {
        const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
        self.0 % BLCKSZ
    }

    /// Compute the bytes remaining in this block
    ///
    /// If the LSN is already at the block boundary, it will return `XLOG_BLCKSZ`.
    pub fn remaining_in_block(self) -> u64 {
        const BLCKSZ: u64 = XLOG_BLCKSZ as u64;
        BLCKSZ - (self.0 % BLCKSZ)
    }

    /// Compute the bytes remaining to fill a chunk of some size
    ///
    /// If the LSN is already at the chunk boundary, it will return 0.
    /// Works for any non-zero chunk size, not only powers of two.
    /// Panics if `sz` is zero.
    pub fn calc_padding<T: Into<u64>>(self, sz: T) -> u64 {
        let sz: u64 = sz.into();
        // `self.0 % sz` is always below `sz`, so the subtraction cannot
        // overflow. The outer modulo turns a full chunk into 0 when the LSN
        // is already aligned. (Subtracting first with wrapping arithmetic is
        // only correct when `sz` divides 2^64.)
        (sz - self.0 % sz) % sz
    }
}
impl From<u64> for Lsn {
/// Wraps a raw 64-bit position as an `Lsn`.
fn from(n: u64) -> Self {
Lsn(n)
}
}
impl From<Lsn> for u64 {
/// Extracts the raw 64-bit position from an `Lsn`.
fn from(lsn: Lsn) -> u64 {
lsn.0
}
}
impl FromStr for Lsn {
    type Err = LsnParseError;

    /// Parse an LSN from a string in the form `00000000/00000000`
    ///
    /// Exactly one '/' is required and each side must be a hexadecimal number
    /// that fits in 32 bits. For input without a '/', use `Lsn::from_hex`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let parse_half = |digits: &str| {
            u32::from_str_radix(digits, 16)
                .map(u64::from)
                .or(Err(LsnParseError))
        };

        let mut halves = s.split('/');
        match (halves.next(), halves.next(), halves.next()) {
            (Some(hi), Some(lo), None) => {
                let hi = parse_half(hi)?;
                let lo = parse_half(lo)?;
                Ok(Lsn(hi << 32 | lo))
            }
            _ => Err(LsnParseError),
        }
    }
}
impl fmt::Display for Lsn {
    /// Formats the LSN as `HI/LO`: the upper and lower 32 bits in upper-case
    /// hexadecimal, without zero padding.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let raw = self.0;
        let hi = raw >> 32;
        let lo = raw & 0xffffffff;
        write!(f, "{:X}/{:X}", hi, lo)
    }
}
/// Advances an `Lsn` by a number of bytes.
///
/// # Panics
///
/// Panics if the result does not fit in 64 bits.
impl Add<u64> for Lsn {
type Output = Lsn;
fn add(self, other: u64) -> Self::Output {
// panic if the addition overflows.
Lsn(self.0.checked_add(other).unwrap())
}
}
/// Advances an `Lsn` in place by a number of bytes.
///
/// # Panics
///
/// Panics if the result does not fit in 64 bits.
impl AddAssign<u64> for Lsn {
fn add_assign(&mut self, other: u64) {
// panic if the addition overflows.
self.0 = self.0.checked_add(other).unwrap();
}
}
/// An [`Lsn`] that can be accessed atomically.
pub struct AtomicLsn {
    inner: AtomicU64,
}

impl AtomicLsn {
    /// Creates a new atomic `Lsn`.
    pub fn new(val: u64) -> Self {
        AtomicLsn {
            inner: AtomicU64::new(val),
        }
    }

    /// Atomically retrieve the `Lsn` value from memory.
    pub fn load(&self) -> Lsn {
        Lsn(self.inner.load(Ordering::Acquire))
    }

    /// Atomically store a new `Lsn` value to memory.
    pub fn store(&self, lsn: Lsn) {
        self.inner.store(lsn.0, Ordering::Release);
    }

    /// Adds to the current value, returning the previous value.
    ///
    /// # Panics
    ///
    /// Panics on overflow. The stored value is left untouched in that case:
    /// the addition is checked before anything is written.
    pub fn fetch_add(&self, val: u64) -> Lsn {
        let prev = self
            .inner
            .fetch_update(Ordering::AcqRel, Ordering::Acquire, |current| {
                current.checked_add(val)
            })
            .unwrap_or_else(|_| panic!("AtomicLsn overflow"));
        Lsn(prev)
    }

    /// Atomically sets the Lsn to the max of old and new value, returning the old value.
    pub fn fetch_max(&self, lsn: Lsn) -> Lsn {
        let prev = self.inner.fetch_max(lsn.0, Ordering::AcqRel);
        Lsn(prev)
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_lsn_strings() {
assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
assert_eq!("aaaa/bbbb".parse(), Ok(Lsn(0x0000AAAA0000BBBB)));
assert_eq!("1/A".parse(), Ok(Lsn(0x000000010000000A)));
assert_eq!("0/0".parse(), Ok(Lsn(0)));
"ABCDEFG/12345678".parse::<Lsn>().unwrap_err();
"123456789/AAAA5555".parse::<Lsn>().unwrap_err();
"12345678/AAAA55550".parse::<Lsn>().unwrap_err();
"-1/0".parse::<Lsn>().unwrap_err();
"1/-1".parse::<Lsn>().unwrap_err();
assert_eq!(format!("{}", Lsn(0x12345678AAAA5555)), "12345678/AAAA5555");
assert_eq!(format!("{}", Lsn(0x000000010000000A)), "1/A");
assert_eq!(
Lsn::from_hex("12345678AAAA5555"),
Ok(Lsn(0x12345678AAAA5555))
);
assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
}
#[test]
fn test_lsn_math() {
assert_eq!(Lsn(1234) + 11u64, Lsn(1245));
assert_eq!(
{
let mut lsn = Lsn(1234);
lsn += 11u64;
lsn
},
Lsn(1245)
);
assert_eq!(Lsn(1234).checked_sub(1233u64), Some(Lsn(1)));
assert_eq!(Lsn(1234).checked_sub(1235u64), None);
let seg_sz = 16u64 * 1024 * 1024;
assert_eq!(Lsn(0x1000007).segment_offset(seg_sz), 7u64);
assert_eq!(Lsn(0x1000007).segment_number(seg_sz), 1u64);
assert_eq!(Lsn(0x4007).block_offset(), 7u64);
assert_eq!(Lsn(0x4000).block_offset(), 0u64);
assert_eq!(Lsn(0x4007).remaining_in_block(), 8185u64);
assert_eq!(Lsn(0x4000).remaining_in_block(), 8192u64);
assert_eq!(Lsn(0xffff01).calc_padding(seg_sz), 255u64);
assert_eq!(Lsn(0x2000000).calc_padding(seg_sz), 0u64);
assert_eq!(Lsn(0xffff01).calc_padding(8u32), 7u64);
assert_eq!(Lsn(0xffff00).calc_padding(8u32), 0u64);
}
#[test]
fn test_atomic_lsn() {
let lsn = AtomicLsn::new(0);
assert_eq!(lsn.fetch_add(1234), Lsn(0));
assert_eq!(lsn.load(), Lsn(1234));
lsn.store(Lsn(5678));
assert_eq!(lsn.load(), Lsn(5678));
assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
}
}

View File

@@ -1,264 +0,0 @@
#![warn(missing_docs)]
use std::cmp::{Eq, Ordering, PartialOrd};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
use std::sync::mpsc::{channel, Receiver, Sender};
use std::sync::Mutex;
use std::time::Duration;
/// An error happened while waiting for a number
#[derive(Debug, PartialEq, thiserror::Error)]
#[error("SeqWaitError")]
pub enum SeqWaitError {
/// The wait timeout was reached
Timeout,
/// [`SeqWait::shutdown`] was called
Shutdown,
}
/// Internal components of a `SeqWait`
struct SeqWaitInt<T>
where
T: Ord,
{
waiters: BinaryHeap<Waiter<T>>,
current: T,
shutdown: bool,
}
/// A caller parked until a given number arrives.
struct Waiter<T>
where
    T: Ord,
{
    wake_num: T,              // wake me when this number arrives ...
    wake_channel: Sender<()>, // ... by sending a message to this channel
}

// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
// to get that: the waiter with the smallest wake_num compares as the greatest.
impl<T: Ord> Ord for Waiter<T> {
    fn cmp(&self, other: &Self) -> Ordering {
        self.wake_num.cmp(&other.wake_num).reverse()
    }
}

impl<T: Ord> PartialOrd for Waiter<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        self.wake_num
            .partial_cmp(&other.wake_num)
            .map(Ordering::reverse)
    }
}

// Two waiters are equal when they wait for the same number; the channel is ignored.
impl<T: Ord> PartialEq for Waiter<T> {
    fn eq(&self, other: &Self) -> bool {
        self.wake_num == other.wake_num
    }
}

impl<T: Ord> Eq for Waiter<T> {}
/// A tool for waiting on a sequence number
///
/// This provides a way to wait the arrival of a number.
/// As soon as the number arrives by another caller calling
/// [`advance`], then the waiter will be woken up.
///
/// This implementation takes a blocking Mutex on both [`wait_for`]
/// and [`advance`], meaning there may be unexpected executor blocking
/// due to thread scheduling unfairness. There are probably better
/// implementations, but we can probably live with this for now.
///
/// [`wait_for`]: SeqWait::wait_for
/// [`advance`]: SeqWait::advance
///
pub struct SeqWait<T>
where
T: Ord,
{
internal: Mutex<SeqWaitInt<T>>,
}
impl<T> SeqWait<T>
where
    T: Ord + Debug + Copy,
{
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: T) -> Self {
        let internal = SeqWaitInt {
            waiters: BinaryHeap::new(),
            current: starting_num,
            shutdown: false,
        };
        SeqWait {
            internal: Mutex::new(internal),
        }
    }

    /// Shut down a `SeqWait`, causing all waiters (present and
    /// future) to return an error.
    ///
    /// Numbers that have already arrived can still be waited for successfully.
    pub fn shutdown(&self) {
        let waiters = {
            // Prevent new waiters; wake all those that exist.
            // Wake everyone with an error.
            let mut internal = self.internal.lock().unwrap();

            // Refuse any waiter that shows up from now on. Without this flag a
            // late waiter would be queued and never woken by shutdown.
            internal.shutdown = true;

            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)

            // Drop the lock as we exit this scope.
        };

        // When we drop the waiters list, each Receiver will
        // be woken with an error.
        // This drop doesn't need to be explicit; it's done
        // here to make it easier to read the code and understand
        // the order of events.
        drop(waiters);
    }

    /// Wait for a number to arrive
    ///
    /// This call won't complete until someone has called `advance`
    /// with a number greater than or equal to the one we're waiting for.
    ///
    /// Returns [`SeqWaitError::Shutdown`] if the `SeqWait` is shut down
    /// before the number arrives.
    pub fn wait_for(&self, num: T) -> Result<(), SeqWaitError> {
        match self.queue_for_wait(num) {
            Ok(None) => Ok(()),
            Ok(Some(rx)) => rx.recv().map_err(|_| SeqWaitError::Shutdown),
            Err(e) => Err(e),
        }
    }

    /// Wait for a number to arrive
    ///
    /// This call won't complete until someone has called `advance`
    /// with a number greater than or equal to the one we're waiting for.
    ///
    /// If that hasn't happened after the specified timeout duration,
    /// [`SeqWaitError::Timeout`] will be returned.
    pub fn wait_for_timeout(&self, num: T, timeout_duration: Duration) -> Result<(), SeqWaitError> {
        match self.queue_for_wait(num) {
            Ok(None) => Ok(()),
            Ok(Some(rx)) => rx.recv_timeout(timeout_duration).map_err(|e| match e {
                std::sync::mpsc::RecvTimeoutError::Timeout => SeqWaitError::Timeout,
                std::sync::mpsc::RecvTimeoutError::Disconnected => SeqWaitError::Shutdown,
            }),
            Err(e) => Err(e),
        }
    }

    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
    ///
    /// Fails with [`SeqWaitError::Shutdown`] if the number has not arrived and
    /// `shutdown` has been called.
    fn queue_for_wait(&self, num: T) -> Result<Option<Receiver<()>>, SeqWaitError> {
        let mut internal = self.internal.lock().unwrap();
        if internal.current >= num {
            return Ok(None);
        }
        if internal.shutdown {
            return Err(SeqWaitError::Shutdown);
        }

        // Create a new channel.
        let (tx, rx) = channel();
        internal.waiters.push(Waiter {
            wake_num: num,
            wake_channel: tx,
        });
        // Drop the lock as we exit this scope.
        Ok(Some(rx))
    }

    /// Announce a new number has arrived
    ///
    /// All waiters at this value or below will be woken.
    /// A number that is not greater than the current one is ignored.
    ///
    /// Returns the old number.
    pub fn advance(&self, num: T) -> T {
        let old_value;
        let wake_these = {
            let mut internal = self.internal.lock().unwrap();

            old_value = internal.current;
            if old_value >= num {
                return old_value;
            }
            internal.current = num;

            // Pop all waiters <= num from the heap. Collect them in a vector, and
            // wake them up after releasing the lock.
            let mut wake_these = Vec::new();
            while let Some(n) = internal.waiters.peek() {
                if n.wake_num > num {
                    break;
                }
                wake_these.push(internal.waiters.pop().unwrap().wake_channel);
            }
            wake_these
        };

        for tx in wake_these {
            // This can fail if there are no receivers.
            // We don't care; discard the error.
            let _ = tx.send(());
        }
        old_value
    }

    /// Read the current value, without waiting.
    pub fn load(&self) -> T {
        self.internal.lock().unwrap().current
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use std::thread::sleep;
    use std::thread::spawn;
    use std::time::Duration;

    /// Exercises waiting, advancing and shutdown from several threads.
    #[test]
    fn seqwait() {
        let seq = Arc::new(SeqWait::new(0));
        let seq2 = Arc::clone(&seq);
        let seq3 = Arc::clone(&seq);

        // This thread is intentionally left detached: it may reach
        // `wait_for(999)` only after the main thread has already called
        // `shutdown`, and this test does not depend on how that late call
        // resolves.
        spawn(move || {
            seq2.wait_for(42).expect("wait_for 42");
            let old = seq2.advance(100);
            assert_eq!(old, 99);
            seq2.wait_for(999).expect_err("no 999");
        });

        let waiter = spawn(move || {
            seq3.wait_for(42).expect("wait_for 42");
            seq3.wait_for(0).expect("wait_for 0");
        });

        // Give both threads a chance to queue up before the first advance.
        sleep(Duration::from_secs(1));
        let old = seq.advance(99);
        assert_eq!(old, 0);
        seq.wait_for(100).expect("wait_for 100");

        // Calling advance with a smaller value is a no-op
        assert_eq!(seq.advance(98), 100);
        assert_eq!(seq.load(), 100);

        // Join so that a failed expectation inside the thread fails the
        // test instead of being lost with the detached thread.
        waiter.join().expect("waiter thread panicked");

        seq.shutdown();
    }

    /// A waiter with a short timeout gives up; a later advance is harmless.
    #[test]
    fn seqwait_timeout() {
        let seq = Arc::new(SeqWait::new(0));
        let seq2 = Arc::clone(&seq);

        let waiter = spawn(move || {
            let timeout = Duration::from_millis(1);
            let res = seq2.wait_for_timeout(42, timeout);
            assert_eq!(res, Err(SeqWaitError::Timeout));
        });

        // Joining both propagates an assertion failure from the thread and
        // guarantees the waiter has given up before we advance.
        waiter.join().expect("waiter thread panicked");

        // This will attempt to wake, but nothing will happen
        // because the waiter already dropped its Receiver.
        let old = seq.advance(99);
        assert_eq!(old, 0)
    }
}

View File

@@ -1,224 +0,0 @@
//!
//! Async version of 'seqwait.rs'
//!
//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs.
//!
#![warn(missing_docs)]
use std::collections::BTreeMap;
use std::fmt::Debug;
use std::mem;
use std::sync::Mutex;
use std::time::Duration;
use tokio::sync::watch::{channel, Receiver, Sender};
use tokio::time::timeout;
/// An error happened while waiting for a number
#[derive(Debug, PartialEq, thiserror::Error)]
#[error("SeqWaitError")]
pub enum SeqWaitError {
/// The wait timeout was reached
Timeout,
/// [`SeqWait::shutdown`] was called
Shutdown,
}
/// Mutable state of a `SeqWait`, kept behind its mutex.
struct SeqWaitInt<T: Ord> {
    // One watch channel per awaited number. Both halves are stored:
    // the sender to wake waiters, the receiver to hand out clones.
    waiters: BTreeMap<T, (Sender<()>, Receiver<()>)>,
    // The most recent number that has been announced.
    current: T,
    // When true, `wait_for` refuses to register new waiters.
    shutdown: bool,
}
/// Lets tasks wait for a sequence number to be reached.
///
/// A task awaits a particular number with [`wait_for`]; once some other
/// caller announces that number (or a higher one) through [`advance`],
/// the waiting task is woken up.
///
/// Both [`wait_for`] and [`advance`] take a blocking Mutex, so the
/// executor may occasionally be blocked in unexpected ways when thread
/// scheduling is unfair. Better implementations probably exist, but
/// this one is likely good enough for now.
///
/// [`wait_for`]: SeqWait::wait_for
/// [`advance`]: SeqWait::advance
///
pub struct SeqWait<T: Ord> {
    internal: Mutex<SeqWaitInt<T>>,
}
impl<T> SeqWait<T>
where
    T: Ord + Debug + Copy,
{
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: T) -> Self {
        let internal = SeqWaitInt {
            waiters: BTreeMap::new(),
            current: starting_num,
            shutdown: false,
        };
        SeqWait {
            internal: Mutex::new(internal),
        }
    }

    /// Shut down a `SeqWait`, causing all waiters (present and
    /// future) to return an error.
    ///
    /// Waiters that are already registered are woken with
    /// [`SeqWaitError::Shutdown`]. Later waits fail with the same error,
    /// unless the number they ask for has already been reached, in which
    /// case they still succeed immediately.
    pub fn shutdown(&self) {
        let waiters = {
            let mut internal = self.internal.lock().unwrap();

            // Refuse new waiters from now on. Without this flag, a waiter
            // that registers after shutdown would get a fresh channel and
            // could wait indefinitely instead of getting an error.
            internal.shutdown = true;

            // Steal the entire waiters map so that it can be dropped
            // outside of the lock.
            mem::take(&mut internal.waiters)

            // The lock is released as we exit this scope.
        };

        // Dropping the map drops every Sender, which wakes the waiting
        // Receivers with an error. The drop doesn't need to be explicit;
        // it's spelled out to make the order of events obvious.
        drop(waiters);
    }

    /// Wait for a number to arrive
    ///
    /// This call won't complete until someone has called `advance`
    /// with a number greater than or equal to the one we're waiting for.
    ///
    /// Returns [`SeqWaitError::Shutdown`] if the `SeqWait` is shut down
    /// before the number arrives.
    pub async fn wait_for(&self, num: T) -> Result<(), SeqWaitError> {
        let mut rx = {
            let mut internal = self.internal.lock().unwrap();
            if internal.current >= num {
                return Ok(());
            }
            if internal.shutdown {
                return Err(SeqWaitError::Shutdown);
            }

            // Reuse the channel already registered for this number, or
            // register a new one, and take our own clone of the receiver.
            let rx = internal
                .waiters
                .entry(num)
                .or_insert_with(|| channel(()))
                .1
                .clone();
            rx
            // The lock is released as we exit this scope, before awaiting.
        };

        // An Err from changed() means the sender was dropped.
        rx.changed().await.map_err(|_| SeqWaitError::Shutdown)
    }

    /// Wait for a number to arrive
    ///
    /// This call won't complete until someone has called `advance`
    /// with a number greater than or equal to the one we're waiting for.
    ///
    /// If that hasn't happened after the specified timeout duration,
    /// [`SeqWaitError::Timeout`] will be returned.
    pub async fn wait_for_timeout(
        &self,
        num: T,
        timeout_duration: Duration,
    ) -> Result<(), SeqWaitError> {
        timeout(timeout_duration, self.wait_for(num))
            .await
            .unwrap_or(Err(SeqWaitError::Timeout))
    }

    /// Announce a new number has arrived
    ///
    /// All waiters at this value or below will be woken.
    ///
    /// # Panics
    ///
    /// `advance` will panic if you send it a lower number than
    /// a previous call.
    pub fn advance(&self, num: T) {
        let wake_these = {
            let mut internal = self.internal.lock().unwrap();

            if internal.current > num {
                panic!(
                    "tried to advance backwards, from {:?} to {:?}",
                    internal.current, num
                );
            }
            internal.current = num;

            // split_off returns all the waiters at or above `num`, so
            // split and then swap: afterwards `split` holds everything
            // below `num` and the map keeps everything at or above it.
            let mut split = internal.waiters.split_off(&num);
            mem::swap(&mut split, &mut internal.waiters);

            // `split_off` left the waiter at exactly `num` on the staying
            // side; if it's there, move it over to be woken too.
            if let Some(sleeper) = internal.waiters.remove(&num) {
                split.insert(num, sleeper);
            }
            split
        };

        // Wake the waiters only after the lock has been released.
        for (_wake_num, (tx, _rx)) in wake_these {
            // A failed send only means that nobody is listening.
            // We don't care; discard the error.
            let _ = tx.send(());
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Arc;
    use tokio::time::{sleep, Duration};

    /// Exercises waiting, advancing and shutdown from several tasks.
    #[tokio::test]
    async fn seqwait() {
        let seq = Arc::new(SeqWait::new(0));
        let seq2 = Arc::clone(&seq);
        let seq3 = Arc::clone(&seq);

        // This task is intentionally not awaited: it may reach
        // `wait_for(999)` only after `shutdown` has already been called,
        // and this test does not depend on how that late call resolves.
        tokio::spawn(async move {
            seq2.wait_for(42).await.expect("wait_for 42");
            seq2.advance(100);
            seq2.wait_for(999).await.expect_err("no 999");
        });

        let waiter = tokio::spawn(async move {
            seq3.wait_for(42).await.expect("wait_for 42");
            seq3.wait_for(0).await.expect("wait_for 0");
        });

        // Give both tasks a chance to register before the first advance.
        sleep(Duration::from_secs(1)).await;
        seq.advance(99);
        seq.wait_for(100).await.expect("wait_for 100");

        // Await the handle so that a failed expectation inside the task
        // fails the test instead of being silently discarded.
        waiter.await.expect("waiter task panicked");

        seq.shutdown();
    }

    /// A waiter with a short timeout gives up; a later advance is harmless.
    #[tokio::test]
    async fn seqwait_timeout() {
        let seq = Arc::new(SeqWait::new(0));
        let seq2 = Arc::clone(&seq);

        let waiter = tokio::spawn(async move {
            let timeout = Duration::from_millis(1);
            let res = seq2.wait_for_timeout(42, timeout).await;
            assert_eq!(res, Err(SeqWaitError::Timeout));
        });

        // Awaiting the handle both propagates an assertion failure from the
        // task and guarantees the waiter has given up before we advance.
        waiter.await.expect("waiter task panicked");

        // This will attempt to wake, but nothing will happen
        // because the waiter already dropped its Receiver.
        seq.advance(99);
    }
}