Compare commits


1 Commit

Author: anastasia
SHA1: babd2339cc
Message: [issue #56] Fix race at postgres instance + walreceiver start. Uses postgres/vendor issue_56 branch.
         TODO: rebase on main
Date: 2021-04-22 15:51:44 +03:00
118 changed files with 17913 additions and 11338 deletions

View File

@@ -1,267 +0,0 @@
version: 2.1
orbs:
python: circleci/python@1.4.0
executors:
zenith-build-executor:
resource_class: xlarge
docker:
- image: cimg/rust:1.51.0
jobs:
# A job to build postgres
build-postgres:
executor: zenith-build-executor
steps:
# Check out the git repo (CircleCI doesn't have a flag to enable submodules here)
- checkout
# Grab the postgres git revision to build a cache key.
# Note this works even though the submodule hasn't been checked out yet.
- run:
name: Get postgres cache key
command: |
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
- restore_cache:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
# FIXME We could cache our own docker container, instead of installing packages every time.
- run:
name: apt install dependencies
command: |
if [ ! -e tmp_install/bin/postgres ]; then
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libcurl4-openssl-dev
fi
# Build postgres if the restore_cache didn't find a build.
# `make` can't figure out whether the cache is valid, since
# it only compares file timestamps.
- run:
name: build postgres
command: |
if [ ! -e tmp_install/bin/postgres ]; then
# "depth 1" saves some time by not cloning the whole repo
git submodule update --init --depth 1
make postgres
fi
- save_cache:
name: Save postgres cache
key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
paths:
- tmp_install
# A job to build zenith rust code
build-zenith:
executor: zenith-build-executor
parameters:
build_type:
type: enum
enum: ["debug", "release"]
steps:
- run:
name: apt install dependencies
command: |
sudo apt update
sudo apt install libssl-dev clang
# Check out the git repo (without submodules)
- checkout
# Grab the postgres git revision to build a cache key.
# Note this works even though the submodule hasn't been checked out yet.
- run:
name: Get postgres cache key
command: |
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
- restore_cache:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
- restore_cache:
name: Restore rust cache
keys:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
# Build the rust code, including test binaries
- run:
name: Rust build << parameters.build_type >>
command: |
export CARGO_INCREMENTAL=0
BUILD_TYPE="<< parameters.build_type >>"
if [[ $BUILD_TYPE == "debug" ]]; then
echo "Build in debug mode"
cargo build --bins --tests
elif [[ $BUILD_TYPE == "release" ]]; then
echo "Build in release mode"
cargo build --release --bins --tests
fi
- save_cache:
name: Save rust cache
key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
paths:
- ~/.cargo/registry
- ~/.cargo/git
- target
# Run rust unit tests
# FIXME: remove -p zenith_utils once integration tests are moved to python
- run: cargo test -p zenith_utils
# Install the rust binaries, for use by test jobs
# `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
# FIXME: this is a really silly way to install; maybe we should just output
# a tarball as an artifact? Or a .deb package?
- run:
name: cargo install
command: |
export CARGO_INCREMENTAL=0
BUILD_TYPE="<< parameters.build_type >>"
if [[ $BUILD_TYPE == "debug" ]]; then
echo "Install debug mode"
CARGO_FLAGS="--debug"
elif [[ $BUILD_TYPE == "release" ]]; then
echo "Install release mode"
# The default is release mode; there is no --release flag.
CARGO_FLAGS=""
fi
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith
# Install the postgres binaries, for use by test jobs
# FIXME: this is a silly way to do "install"; maybe just output a standard
# postgres package, whatever the favored form is (tarball? .deb package?)
# Note that pg_regress needs some build artifacts that probably aren't
# in the usual package...?
- run:
name: postgres install
command: |
cp -a tmp_install /tmp/zenith/pg_install
# Save the rust output binaries for other jobs in this workflow.
- persist_to_workspace:
root: /tmp/zenith
paths:
- "*"
run-pytest:
#description: "Run pytest"
executor: python/default
parameters:
# pytest args to specify the tests to run.
#
# This can be a test file name, e.g. 'test_pgbench.py', or a subdirectory,
# or '-k foobar' to run tests containing the string 'foobar'. See the pytest man page,
# section SPECIFYING TESTS / SELECTING TESTS, for details. (A local-invocation
# sketch follows this file.)
#
# Select the type of Rust build. Must be "release" or "debug".
build_type:
type: string
default: "debug"
# This parameter is required, to prevent the mistake of running all tests in one job.
test_selection:
type: string
default: ""
# Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
extra_params:
type: string
default: ""
needs_postgres_source:
type: boolean
default: false
steps:
- attach_workspace:
at: /tmp/zenith
- checkout
- when:
condition: << parameters.needs_postgres_source >>
steps:
- run: git submodule update --init --depth 1
- run: pip install pytest psycopg2
- run:
name: Run pytest
working_directory: test_runner
environment:
- ZENITH_BIN: /tmp/zenith/bin
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
- TEST_OUTPUT: /tmp/test_output
command: |
TEST_SELECTION="<< parameters.test_selection >>"
EXTRA_PARAMS="<< parameters.extra_params >>"
if [ -z "$TEST_SELECTION" ]; then
echo "test_selection must be set"
exit 1
fi
# Run the tests.
#
# The junit.xml file allows CircleCI to display more fine-grained test information
# in its "Tests" tab in the results page.
pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short $TEST_SELECTION $EXTRA_PARAMS
- run:
# CircleCI artifacts are preserved one file at a time, so skipping
# this step isn't a good idea. If you want to extract the
# pageserver state, perhaps a tarball would be a better idea.
name: Delete pageserver data
when: always
command: |
du -sh /tmp/test_output/*
for DIR in /tmp/test_output/*; do
mv $DIR/repo/pageserver.log $DIR/ || true # ignore errors
for PGDIR in $DIR/repo/pgdatadirs/pg?; do
echo "PGDIR: $PGDIR"
NEW_LOG="${PGDIR##*/}_log"
mv $PGDIR/log "$DIR/$NEW_LOG" || true # ignore errors
done
echo "rm $DIR/repo"
rm -rf $DIR/repo
done
du -sh /tmp/test_output/*
- store_artifacts:
path: /tmp/test_output
# The store_test_results step tells CircleCI where to find the junit.xml file.
- store_test_results:
path: /tmp/test_output
workflows:
build_and_test:
jobs:
- build-postgres
- build-zenith:
name: build-zenith-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
requires:
- build-postgres
- run-pytest:
name: pg_regress tests << matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
test_selection: batch_pg_regress
needs_postgres_source: true
requires:
- build-zenith-<< matrix.build_type >>
- run-pytest:
name: other tests << matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
test_selection: batch_others
requires:
- build-zenith-<< matrix.build_type >>
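A rough local equivalent of the run-pytest job above, for orientation only; the paths come from the job's environment block and the selections from the comments and workflow matrix, so adjust them to your checkout:
```sh
# Approximate local counterpart of the CI run-pytest job (illustrative, not part of the config)
cd test_runner
export ZENITH_BIN=/tmp/zenith/bin
export POSTGRES_DISTRIB_DIR=/tmp/zenith/pg_install
export TEST_OUTPUT=/tmp/test_output
pip install pytest psycopg2

# test_selection given as a directory of tests (as in the batch_pg_regress jobs)
pytest --tb=short batch_pg_regress

# test_selection via '-k' keyword, with extra_params '-s' to keep stdout visible
pytest --tb=short -k pgbench -s
```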

View File

@@ -1,45 +0,0 @@
name: Send Notifications
on:
push:
branches: [ main ]
jobs:
send-notifications:
timeout-minutes: 30
name: send commit notifications
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: true
fetch-depth: 2
- name: Form variables for notification message
id: git_info_grab
run: |
git_stat=$(git show --stat=50)
git_stat="${git_stat//'%'/'%25'}"
git_stat="${git_stat//$'\n'/'%0A'}"
git_stat="${git_stat//$'\r'/'%0D'}"
git_stat="${git_stat// /}" # space -> 'Space En', as github tends to eat ordinary spaces
echo "::set-output name=git_stat::$git_stat"
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
- name: Send notification
uses: appleboy/telegram-action@master
with:
to: ${{ secrets.TELEGRAM_TO }}
token: ${{ secrets.TELEGRAM_TOKEN }}
format: markdown
args: |
*@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
```
${{ steps.git_info_grab.outputs.git_stat }}
```

View File

@@ -1,36 +1,45 @@
name: Build and Test
name: regression check
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
on: [push]
jobs:
regression-check:
strategy:
matrix:
# If we want to duplicate this job for different
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
rust_toolchain: [stable]
os: [ubuntu-latest]
timeout-minutes: 30
timeout-minutes: 10
name: run regression test suite
runs-on: ${{ matrix.os }}
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: true
fetch-depth: 2
- name: install rust toolchain ${{ matrix.rust_toolchain }}
uses: actions-rs/toolchain@v1
- name: Form variables for notification message
id: git_info_grab
run: |
git_stat=$(git show --stat=50)
git_stat="${git_stat//'%'/'%25'}"
git_stat="${git_stat//$'\n'/'%0A'}"
git_stat="${git_stat//$'\r'/'%0D'}"
git_stat="${git_stat// /}" # space -> 'Space En', as github tends to eat ordinary spaces
echo "::set-output name=git_stat::$git_stat"
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
- name: Send notification
uses: appleboy/telegram-action@master
with:
profile: minimal
toolchain: ${{ matrix.rust_toolchain }}
override: true
to: ${{ secrets.TELEGRAM_TO }}
token: ${{ secrets.TELEGRAM_TOKEN }}
format: markdown
args: |
*@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
```
${{ steps.git_info_grab.outputs.git_stat }}
```
- name: Install postgres dependencies
run: |
@@ -52,7 +61,11 @@ jobs:
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: |
make postgres
./pgbuild.sh
- name: Install rust
run: |
sudo apt install -y cargo
- name: Cache cargo deps
id: cache_cargo
@@ -64,10 +77,10 @@ jobs:
target
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
- name: Run cargo build
- name: Build
run: |
cargo build --workspace --bins --examples --tests
cargo build
- name: Run cargo test
- name: Run test
run: |
cargo test -- --nocapture --test-threads=1
cargo test --test test_pageserver -- --nocapture --test-threads=1

.gitignore (vendored): 4 changes
View File

@@ -2,8 +2,4 @@
/tmp_check
/tmp_install
/tmp_check_cli
__pycache__/
test_output/
.vscode
/.zenith
/integration_tests/.zenith

Cargo.lock (generated): 806 changes

File diff suppressed because it is too large.

View File

@@ -7,5 +7,4 @@ members = [
"control_plane",
"postgres_ffi",
"zenith_utils",
"workspace_hack",
]

View File

@@ -1,57 +0,0 @@
#
# Top level Makefile to build Zenith and PostgreSQL
#
all: zenith postgres
# We don't want to run 'cargo build' in parallel with the postgres build,
# because interleaving cargo build output with postgres build output looks
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
# much parallelism. (Recursive invocation of postgres target still gets any
# '-j' flag from the command line, so 'make -j' is still useful.)
.NOTPARALLEL:
### Zenith Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
zenith: postgres-headers
cargo build
### PostgreSQL parts
tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
--enable-depend --with-libxml --prefix=$(abspath tmp_install) > configure.log)
# nicer alias for running 'configure'
postgres-configure: tmp_install/build/config.status
# Install the PostgreSQL header files into tmp_install/include
postgres-headers: postgres-configure
+@echo "Installing PostgreSQL headers"
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
# Compile and install PostgreSQL and contrib/zenith
postgres: postgres-configure
+@echo "Compiling PostgreSQL"
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+@echo "Compiling contrib/zenith"
(cd vendor/postgres/contrib/zenith && \
$(MAKE) PG_CONFIG=$(abspath tmp_install)/bin/pg_config install USE_PGXS=1)
postgres-clean:
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
# This doesn't remove the effects of 'configure'.
clean:
cd tmp_install/build && ${MAKE} clean
cargo clean
# This removes everything
distclean:
rm -rf tmp_install
cargo clean
.PHONY: postgres-configure postgres postgres-headers zenith

View File

@@ -8,82 +8,54 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus
```sh
git clone --recursive https://github.com/libzenith/zenith.git
cd zenith
make -j5
./pgbuild.sh # builds postgres and installs it to ./tmp_install
cargo build
```
2. Start pageserver and postgres on top of it (should be called from repo root):
2. Start pageserver and postgres on top of it (should be called from repo root):
```sh
# Create repository in .zenith with proper paths to binaries and data
# Create ~/.zenith with proper paths to binaries and data
# Later that will be the responsibility of a package install script
> ./target/debug/zenith init
<...>
new zenith repository was created in .zenith
>./target/debug/zenith init
# start pageserver
> ./target/debug/zenith start
Starting pageserver at '127.0.0.1:64000' in .zenith
Pageserver started
> ./target/debug/zenith pageserver start
Starting pageserver at '127.0.0.1:64000'
# start postgres on top on the pageserver
> ./target/debug/zenith pg start main
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
waiting for server to start.... done
# create and configure postgres data dir
> ./target/debug/zenith pg create
Creating new postgres: path=/Users/user/code/zenith/tmp_check_cli/compute/pg1 port=55432
Database initialized
# check list of running postgres instances
> ./target/debug/zenith pg list
BRANCH ADDRESS LSN STATUS
main 127.0.0.1:55432 0/1609610 running
# start it
> ./target/debug/zenith pg start pg1
# look up status and connection info
> ./target/debug/zenith pg list
NODE ADDRESS STATUS
pg1 127.0.0.1:55432 running
```
3. Now it is possible to connect to postgres and run some queries:
```sh
```
> psql -p55432 -h 127.0.0.1 postgres
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
INSERT 0 1
postgres=# select * from t;
key | value
-----+-------
1 | 1
(1 row)
```
4. And create branches and run postgres on them:
```sh
# create branch named migration_check
> ./target/debug/zenith branch migration_check main
Created branch 'migration_check' at 0/1609610
# check branches tree
> ./target/debug/zenith branch
main
┗━ @0/1609610: migration_check
# start postgres on that branch
> ./target/debug/zenith pg start migration_check
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
waiting for server to start.... done
# this new postgres instance will have all the data from 'main' postgres,
# but modifications will not affect the data in the original postgres
> psql -p55433 -h 127.0.0.1 postgres
postgres=# select * from t;
key | value
-----+-------
1 | 1
(1 row)
postgres=# insert into t values(2,2);
INSERT 0 1
```
## Running tests
```sh
git clone --recursive https://github.com/libzenith/zenith.git
make # builds also postgres and installs it to ./tmp_install
pytest
./pgbuild.sh # builds postgres and installs it to ./tmp_install
cargo test -- --test-threads=1
```
## Source tree layout
@@ -102,6 +74,11 @@ Depends on the modified 'postgres' binary for WAL redo.
Tests with different combinations of a Postgres compute node, WAL safekeeper and Page Server.
/mgmt-console:
Web UI to launch (modified) Postgres servers, using S3 as the backing store. Written in Python.
This is somewhat outdated, as it doesn't use the WAL safekeeper or Page Servers.
/vendor/postgres:
PostgreSQL source tree, with the modifications needed for Zenith.

View File

@@ -9,22 +9,19 @@ edition = "2018"
[dependencies]
rand = "0.8.3"
tar = "0.4.33"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
toml = "0.5"
lazy_static = "1.4"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
serde = ""
serde_derive = ""
toml = ""
lazy_static = ""
regex = "1"
anyhow = "1.0"
# hex = "0.4.3"
hex = "0.4.3"
bytes = "1.0.1"
# fs_extra = "1.2.0"
nix = "0.20"
# thiserror = "1"
url = "2.2.2"
fs_extra = "1.2.0"
pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
postgres_ffi = { path = "../postgres_ffi" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }

View File

@@ -1,4 +1,5 @@
use std::io::Write;
use std::fs::{self, OpenOptions};
use std::io::{Read, Write};
use std::net::SocketAddr;
use std::net::TcpStream;
use std::os::unix::fs::PermissionsExt;
@@ -6,20 +7,18 @@ use std::process::Command;
use std::sync::Arc;
use std::time::Duration;
use std::{collections::BTreeMap, path::PathBuf};
use std::{
fs::{self, OpenOptions},
io::Read,
};
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use tar;
use postgres::{Client, NoTls};
use crate::local_env::LocalEnv;
use crate::storage::{PageServerNode, WalProposerNode};
use pageserver::ZTimelineId;
use crate::storage::PageServerNode;
//
// ComputeControlPlane
//
@@ -37,7 +36,7 @@ impl ComputeControlPlane {
// it is running on the default port. Change that once the pageserver has its own config.
let pageserver = Arc::new(PageServerNode::from_env(&env));
let pgdatadirspath = &env.pg_data_dirs_path();
let pgdatadirspath = env.repo_path.join("pgdatadirs");
let nodes: Result<BTreeMap<_, _>> = fs::read_dir(&pgdatadirspath)
.with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
.into_iter()
@@ -80,10 +79,11 @@ impl ComputeControlPlane {
&mut self,
is_test: bool,
timelineid: ZTimelineId,
name: &str,
) -> Result<Arc<PostgresNode>> {
let node_id = self.nodes.len() as u32 + 1;
let node = Arc::new(PostgresNode {
name: name.to_owned(),
name: format!("pg{}", node_id),
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
env: self.env.clone(),
pageserver: Arc::clone(&self.pageserver),
@@ -97,66 +97,47 @@ impl ComputeControlPlane {
Ok(node)
}
pub fn new_test_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
let timeline_id = self
.pageserver
.branch_get_by_name(branch_name)
.expect("failed to get timeline_id")
.timeline_id;
let node = self.new_from_page_server(true, timeline_id, branch_name);
pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
let node = self.new_from_page_server(true, timelineid);
assert!(node.is_ok());
let node = node.unwrap();
// Configure the node to stream WAL directly to the pageserver
node.append_conf(
"postgresql.conf",
format!(
"shared_preload_libraries = zenith\n\
zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
"callmemaybe_connstring = '{}'\n", // FIXME escaping
node.connstr()
)
.as_str(),
)
.unwrap();
);
node
}
pub fn new_test_master_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
let timeline_id = self
.pageserver
.branch_get_by_name(branch_name)
.expect("failed to get timeline_id")
.timeline_id;
let node = self
.new_from_page_server(true, timeline_id, branch_name)
.unwrap();
pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
let node = self.new_from_page_server(true, timelineid).unwrap();
node.append_conf(
"postgresql.conf",
"synchronous_standby_names = 'safekeeper_proxy'\n",
)
.unwrap();
);
node
}
pub fn new_node(&mut self, branch_name: &str) -> Result<Arc<PostgresNode>> {
let timeline_id = self.pageserver.branch_get_by_name(branch_name)?.timeline_id;
let node = self.new_from_page_server(false, timeline_id, branch_name)?;
pub fn new_node(&mut self, timelineid: ZTimelineId) -> Result<Arc<PostgresNode>> {
let node = self.new_from_page_server(false, timelineid).unwrap();
// Configure the node to stream WAL directly to the pageserver
node.append_conf(
"postgresql.conf",
format!(
"shared_preload_libraries = zenith\n\
zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
"callmemaybe_connstring = '{}'\n", // FIXME escaping
node.connstr()
)
.as_str(),
)?;
);
Ok(node)
}
@@ -170,7 +151,7 @@ pub struct PostgresNode {
pub env: LocalEnv,
pageserver: Arc<PageServerNode>,
is_test: bool,
pub timelineid: ZTimelineId,
timelineid: ZTimelineId,
}
impl PostgresNode {
@@ -188,8 +169,6 @@ impl PostgresNode {
lazy_static! {
static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
static ref CONF_TIMELINE_RE: Regex =
Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
}
// parse data directory name
@@ -205,37 +184,26 @@ impl PostgresNode {
)
})?;
// parse port
let err_msg = format!(
"failed to find port definition in config file {}",
cfg_path.to_str().unwrap()
);
let port: u16 = CONF_PORT_RE
.captures(config.as_str())
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
.ok_or(anyhow::Error::msg(err_msg.clone() + " 1"))?
.iter()
.last()
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
.ok_or(anyhow::Error::msg(err_msg.clone() + " 2"))?
.ok_or(anyhow::Error::msg(err_msg.clone() + " 3"))?
.as_str()
.parse()
.with_context(|| err_msg)?;
// parse timeline
let err_msg = format!(
"failed to find timeline definition in config file {}",
cfg_path.to_str().unwrap()
);
let timelineid: ZTimelineId = CONF_TIMELINE_RE
.captures(config.as_str())
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
.iter()
.last()
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
.as_str()
.parse()
.with_context(|| err_msg)?;
// FIXME: What timeline is this server on? Would have to parse the postgresql.conf
// file for that, too. It's currently not needed for anything, but it would be
// nice to list the timeline in "zenith pg list"
let timelineid_buf = [0u8; 16];
let timelineid = ZTimelineId::from(timelineid_buf);
// ok now
Ok(PostgresNode {
@@ -252,6 +220,7 @@ impl PostgresNode {
// new data directory
pub fn init_from_page_server(&self) -> Result<()> {
let pgdata = self.pgdata();
println!(
"Extracting base backup to create postgres instance: path={} port={}",
pgdata.display(),
@@ -308,22 +277,20 @@ impl PostgresNode {
max_replication_slots = 10\n\
hot_standby = on\n\
shared_buffers = 1MB\n\
fsync = off\n\
max_connections = 100\n\
wal_sender_timeout = 0\n\
wal_level = replica\n\
listen_addresses = '{address}'\n\
port = {port}\n",
address = self.address.ip(),
port = self.address.port()
),
)?;
);
// Never clean up old WAL. TODO: We should use a replication
// slot or something proper, to prevent the compute node
// from removing WAL that hasn't been streamed to the safekeeper or
// page server yet. But this will do for now.
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;
self.append_conf("postgresql.conf", &format!("wal_keep_size='10TB'\n"));
// Connect it to the page server.
@@ -331,23 +298,19 @@ impl PostgresNode {
self.append_conf(
"postgresql.conf",
&format!(
"shared_preload_libraries = zenith \n\
zenith.page_server_connstring = 'host={} port={}'\n\
zenith.zenith_timeline='{}'\n",
"page_server_connstring = 'host={} port={}'\n\
zenith_timeline='{}'\n",
self.pageserver.address().ip(),
self.pageserver.address().port(),
self.timelineid
),
)?;
);
fs::create_dir_all(self.pgdata().join("pg_wal"))?;
fs::create_dir_all(self.pgdata().join("pg_wal").join("archive_status"))?;
self.pg_resetwal(&["-f"])?;
Ok(())
}
pub fn pgdata(&self) -> PathBuf {
self.env.pg_data_dir(&self.name)
fn pgdata(&self) -> PathBuf {
self.env.repo_path.join("pgdatadirs").join(&self.name)
}
pub fn status(&self) -> &str {
@@ -363,12 +326,13 @@ impl PostgresNode {
}
}
pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
pub fn append_conf(&self, config: &str, opts: &str) {
OpenOptions::new()
.append(true)
.open(self.pgdata().join(config).to_str().unwrap())?
.write_all(opts.as_bytes())?;
Ok(())
.open(self.pgdata().join(config).to_str().unwrap())
.unwrap()
.write_all(opts.as_bytes())
.unwrap();
}
fn pg_ctl(&self, args: &[&str]) -> Result<()> {
@@ -389,7 +353,6 @@ impl PostgresNode {
)
.env_clear()
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.status()
.with_context(|| "pg_ctl failed")?;
if !pg_ctl.success() {
@@ -398,19 +361,6 @@ impl PostgresNode {
Ok(())
}
fn pg_resetwal(&self, args: &[&str]) -> Result<()> {
let pg_resetwal_path = self.env.pg_bin_dir().join("pg_resetwal");
let pg_ctl = Command::new(pg_resetwal_path)
.args([&["-D", self.pgdata().to_str().unwrap()], args].concat())
.status()
.with_context(|| "pg_resetwal failed")?;
if !pg_ctl.success() {
anyhow::bail!("pg_resetwal failed");
}
Ok(())
}
pub fn start(&self) -> Result<()> {
println!("Starting postgres node at '{}'", self.connstr());
self.pg_ctl(&["start"])
@@ -420,16 +370,8 @@ impl PostgresNode {
self.pg_ctl(&["restart"])
}
pub fn stop(&self, destroy: bool) -> Result<()> {
self.pg_ctl(&["-m", "immediate", "stop"])?;
if destroy {
println!(
"Destroying postgres data directory '{}'",
self.pgdata().to_str().unwrap()
);
fs::remove_dir_all(&self.pgdata())?;
}
Ok(())
pub fn stop(&self) -> Result<()> {
self.pg_ctl(&["-m", "immediate", "stop"])
}
pub fn connstr(&self) -> String {
@@ -453,6 +395,56 @@ impl PostgresNode {
String::from_utf8(output.stdout).unwrap().trim().to_string()
}
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address.ip(),
self.address.port(),
db,
self.whoami()
);
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
println!("Running {}", sql);
client.query(sql, &[]).unwrap()
}
pub fn open_psql(&self, db: &str) -> Client {
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address.ip(),
self.address.port(),
db,
self.whoami()
);
Client::connect(connstring.as_str(), NoTls).unwrap()
}
pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
match Command::new(proxy_path.as_path())
.args(&["--ztimelineid", &self.timelineid.to_string()])
.args(&["-s", wal_acceptors])
.args(&["-h", &self.address.ip().to_string()])
.args(&["-p", &self.address.port().to_string()])
.arg("-v")
.stderr(
OpenOptions::new()
.create(true)
.append(true)
.open(self.pgdata().join("safekeeper_proxy.log"))
.unwrap(),
)
.spawn()
{
Ok(child) => WalProposerNode { pid: child.id() },
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
}
}
// TODO
pub fn pg_bench() {}
}
impl Drop for PostgresNode {
@@ -461,7 +453,7 @@ impl Drop for PostgresNode {
// and checking it here. But let's just clean datadirs on start.
fn drop(&mut self) {
if self.is_test {
let _ = self.stop(true);
let _ = self.stop();
}
}
}
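Putting the append_conf calls above together, the postgresql.conf of a compute node ends up looking roughly like this. This is only a sketch: the addresses, port, node name, and timeline are placeholders, and it uses the unprefixed GUC names from one side of the diff:
```sh
$ cat pgdatadirs/pg1/postgresql.conf    # illustrative values only
max_replication_slots = 10
hot_standby = on
shared_buffers = 1MB
fsync = off
max_connections = 100
wal_sender_timeout = 0
wal_level = replica
listen_addresses = '127.0.0.1'
port = 55432
wal_keep_size = '10TB'
page_server_connstring = 'host=127.0.0.1 port=64000'
zenith_timeline = '<timelineid>'
callmemaybe_connstring = 'host=127.0.0.1 port=55432 user=<whoami>'
```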

View File

@@ -6,26 +6,7 @@
// Intended to be used in integration tests and in CLI tools for
// local installations.
//
use anyhow::{anyhow, bail, Context, Result};
use std::fs;
use std::path::Path;
pub mod compute;
pub mod local_env;
pub mod storage;
/// Read a PID file
///
/// We expect a file that contains a single integer.
/// We return an i32 for compatibility with libc and nix.
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
let pid_str = fs::read_to_string(pidfile)
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
let pid: i32 = pid_str
.parse()
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
if pid < 1 {
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
}
Ok(pid)
}

View File

@@ -4,25 +4,33 @@
// Now it also provides an init method, which acts as a stub for a proper installation
// script that will use local paths.
//
use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use anyhow::Context;
use bytes::Bytes;
use rand::Rng;
use std::env;
use std::fs;
use std::path::PathBuf;
use std::{collections::BTreeMap, env};
use url::Url;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
pub type Remotes = BTreeMap<String, String>;
use anyhow::Result;
use serde_derive::{Deserialize, Serialize};
use pageserver::ZTimelineId;
use walkeeper::xlog_utils;
//
// This data structure represents the deserialized zenith CLI config
// This data structure represents deserialized zenith config, which should be
// located in ~/.zenith
//
// TODO: should we also support ZENITH_CONF env var?
//
#[derive(Serialize, Deserialize, Clone)]
pub struct LocalEnv {
// Pageserver connection strings
pub pageserver_connstring: String,
// Path to the Repository. Here page server and compute nodes will create and store their data.
pub repo_path: PathBuf,
// Base directory for both pageserver and compute nodes
pub base_data_dir: PathBuf,
// System identifier, from the PostgreSQL control file
pub systemid: u64,
// Path to postgres distribution. It's expected that "bin", "include",
// "lib", "share" from postgres distribution are there. If at some point
@@ -30,44 +38,22 @@ pub struct LocalEnv {
// to four separate paths and match OS-specific installation layout.
pub pg_distrib_dir: PathBuf,
// Path to pageserver binary. Empty for remote pageserver.
pub zenith_distrib_dir: Option<PathBuf>,
pub remotes: Remotes,
// Path to pageserver binary.
pub zenith_distrib_dir: PathBuf,
}
impl LocalEnv {
// postgres installation paths
// postgres installation
pub fn pg_bin_dir(&self) -> PathBuf {
self.pg_distrib_dir.join("bin")
}
pub fn pg_lib_dir(&self) -> PathBuf {
self.pg_distrib_dir.join("lib")
}
pub fn pageserver_bin(&self) -> Result<PathBuf> {
Ok(self
.zenith_distrib_dir
.as_ref()
.ok_or(anyhow!("Can not manage remote pageserver"))?
.join("pageserver"))
}
pub fn pg_data_dirs_path(&self) -> PathBuf {
self.base_data_dir.join("pgdatadirs")
}
pub fn pg_data_dir(&self, name: &str) -> PathBuf {
self.pg_data_dirs_path().join(name)
}
// TODO: move pageserver files into ./pageserver
pub fn pageserver_data_dir(&self) -> PathBuf {
self.base_data_dir.clone()
}
}
fn base_path() -> PathBuf {
fn zenith_repo_dir() -> PathBuf {
// Find repository path
match std::env::var_os("ZENITH_REPO_DIR") {
Some(val) => PathBuf::from(val.to_str().unwrap()),
None => ".zenith".into(),
@@ -77,71 +63,167 @@ fn base_path() -> PathBuf {
//
// Initialize a new Zenith repository
//
pub fn init(remote_pageserver: Option<&str>) -> Result<()> {
pub fn init() -> Result<()> {
// check if config already exists
let base_path = base_path();
if base_path.exists() {
let repo_path = zenith_repo_dir();
if repo_path.exists() {
anyhow::bail!(
"{} already exists. Perhaps already initialized?",
base_path.to_str().unwrap()
repo_path.to_str().unwrap()
);
}
// For now, init can only be run from the crate directory, so check that the current dir is our crate.
// Use the existence of 'pageserver/Cargo.toml' as evidence.
let cargo_path = env::current_dir()?;
if !cargo_path.join("pageserver/Cargo.toml").exists() {
anyhow::bail!(
"Current dirrectory does not look like a zenith repo. \
Please, run 'init' from zenith repo root."
);
}
// ok, now check that expected binaries are present
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
let pg_distrib_dir: PathBuf = {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
postgres_bin.into()
} else {
let cwd = env::current_dir()?;
cwd.join("tmp_install")
}
};
if !pg_distrib_dir.join("bin/postgres").exists() {
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
// check postgres
let pg_distrib_dir = cargo_path.join("tmp_install");
let pg_path = pg_distrib_dir.join("bin/postgres");
if !pg_path.exists() {
anyhow::bail!(
"Can't find postres binary at {}. \
Perhaps './pgbuild.sh' is needed to build it first.",
pg_path.to_str().unwrap()
);
}
fs::create_dir(&base_path)?;
fs::create_dir(base_path.join("pgdatadirs"))?;
// check pageserver
let zenith_distrib_dir = cargo_path.join("target/debug/");
let pageserver_path = zenith_distrib_dir.join("pageserver");
if !pageserver_path.exists() {
anyhow::bail!(
"Can't find pageserver binary at {}. Please build it.",
pageserver_path.to_str().unwrap()
);
}
let conf = if let Some(addr) = remote_pageserver {
// check that addr is parsable
let _uri = Url::parse(addr).map_err(|e| anyhow!("{}: {}", addr, e))?;
LocalEnv {
pageserver_connstring: format!("postgresql://{}/", addr),
pg_distrib_dir,
zenith_distrib_dir: None,
base_data_dir: base_path,
remotes: BTreeMap::default(),
}
} else {
// Find zenith binaries.
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
if !zenith_distrib_dir.join("pageserver").exists() {
anyhow::bail!("Can't find pageserver binary.",);
}
LocalEnv {
pageserver_connstring: "postgresql://127.0.0.1:6400".to_string(),
pg_distrib_dir,
zenith_distrib_dir: Some(zenith_distrib_dir),
base_data_dir: base_path,
remotes: BTreeMap::default(),
}
// ok, we are good to go
let mut conf = LocalEnv {
repo_path: repo_path.clone(),
pg_distrib_dir,
zenith_distrib_dir,
systemid: 0,
};
let toml = toml::to_string_pretty(&conf)?;
fs::write(conf.base_data_dir.join("config"), toml)?;
init_repo(&mut conf)?;
Ok(())
}
// Locate and load config
pub fn load_config() -> Result<LocalEnv> {
let repopath = base_path();
pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
let repopath = &local_env.repo_path;
fs::create_dir(&repopath)
.with_context(|| format!("could not create directory {}", repopath.display()))?;
fs::create_dir(repopath.join("pgdatadirs"))?;
fs::create_dir(repopath.join("timelines"))?;
fs::create_dir(repopath.join("refs"))?;
fs::create_dir(repopath.join("refs").join("branches"))?;
fs::create_dir(repopath.join("refs").join("tags"))?;
println!("created directory structure in {}", repopath.display());
// Create initial timeline
let tli = create_timeline(&local_env, None)?;
let timelinedir = repopath.join("timelines").join(tli.to_string());
println!("created initial timeline {}", timelinedir.display());
// Run initdb
//
// FIXME: we create it temporarily in "tmp" directory, and move it into
// the repository. Use "tempdir()" or something? Or just create it directly
// in the repo?
let initdb_path = local_env.pg_bin_dir().join("initdb");
let _initdb = Command::new(initdb_path)
.args(&["-D", "tmp"])
.arg("--no-instructions")
.env_clear()
.env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap())
.stdout(Stdio::null())
.status()
.with_context(|| "failed to execute initdb")?;
println!("initdb succeeded");
// Read control file to extract the LSN and system id
let controlfile =
postgres_ffi::decode_pg_control(Bytes::from(fs::read("tmp/global/pg_control")?))?;
let systemid = controlfile.system_identifier;
let lsn = controlfile.checkPoint;
let lsnstr = format!("{:016X}", lsn);
// Move the initial WAL file
fs::rename(
"tmp/pg_wal/000000010000000000000001",
timelinedir
.join("wal")
.join("000000010000000000000001.partial"),
)?;
println!("moved initial WAL file");
// Remove pg_wal
fs::remove_dir_all("tmp/pg_wal")?;
println!("removed tmp/pg_wal");
force_crash_recovery(&PathBuf::from("tmp"))?;
println!("updated pg_control");
let target = timelinedir.join("snapshots").join(&lsnstr);
fs::rename("tmp", &target)?;
println!("moved 'tmp' to {}", target.display());
// Create 'main' branch to refer to the initial timeline
let data = tli.to_string();
fs::write(repopath.join("refs").join("branches").join("main"), data)?;
println!("created main branch");
// Also update the system id in the LocalEnv
local_env.systemid = systemid;
// write config
let toml = toml::to_string(&local_env)?;
fs::write(repopath.join("config"), toml)?;
println!(
"new zenith repository was created in {}",
repopath.display()
);
Ok(())
}
// If control file says the cluster was shut down cleanly, modify it, to mark
// it as crashed. That forces crash recovery when you start the cluster.
//
// FIXME:
// We currently do this to the initial snapshot in "zenith init". It would
// be more natural to do this when the snapshot is restored instead, but we
// currently don't have any code to create new snapshots, so it doesn't matter
// Or better yet, use a less hacky way of putting the cluster into recovery.
// Perhaps create a backup label file in the data directory when it's restored.
fn force_crash_recovery(datadir: &Path) -> Result<()> {
// Read in the control file
let controlfilepath = datadir.to_path_buf().join("global").join("pg_control");
let mut controlfile =
postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?;
controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION;
fs::write(
controlfilepath.as_path(),
postgres_ffi::encode_pg_control(controlfile),
)?;
Ok(())
}
// check that config file is present
pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
if !repopath.exists() {
anyhow::bail!(
"Zenith config is not found in {}. You need to run 'zenith init' first",
@@ -149,18 +231,159 @@ pub fn load_config() -> Result<LocalEnv> {
);
}
// TODO: check that it looks like a zenith repository
// load and parse file
let config = fs::read_to_string(repopath.join("config"))?;
toml::from_str(config.as_str()).map_err(|e| e.into())
}
// Save config. We use that to change set of remotes from CLI itself.
pub fn save_config(conf: &LocalEnv) -> Result<()> {
let config_path = base_path().join("config");
let conf_str = toml::to_string_pretty(conf)?;
// local env for tests
pub fn test_env(testname: &str) -> LocalEnv {
fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check");
let repo_path = Path::new(env!("CARGO_MANIFEST_DIR"))
.join("../tmp_check/")
.join(testname);
// Remove remnants of old test repo
let _ = fs::remove_dir_all(&repo_path);
let mut local_env = LocalEnv {
repo_path,
pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
zenith_distrib_dir: cargo_bin_dir(),
systemid: 0,
};
init_repo(&mut local_env).expect("could not initialize zenith repository");
return local_env;
}
// Find the directory where the binaries were put (i.e. target/debug/)
pub fn cargo_bin_dir() -> PathBuf {
let mut pathbuf = std::env::current_exe().unwrap();
pathbuf.pop();
if pathbuf.ends_with("deps") {
pathbuf.pop();
}
return pathbuf;
}
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
pub timelineid: ZTimelineId,
pub lsn: u64,
}
fn create_timeline(local_env: &LocalEnv, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
let repopath = &local_env.repo_path;
// Create initial timeline
let mut tli_buf = [0u8; 16];
rand::thread_rng().fill(&mut tli_buf);
let timelineid = ZTimelineId::from(tli_buf);
let timelinedir = repopath.join("timelines").join(timelineid.to_string());
fs::create_dir(&timelinedir)?;
fs::create_dir(&timelinedir.join("snapshots"))?;
fs::create_dir(&timelinedir.join("wal"))?;
if let Some(ancestor) = ancestor {
let data = format!(
"{}@{:X}/{:X}",
ancestor.timelineid,
ancestor.lsn >> 32,
ancestor.lsn & 0xffffffff
);
fs::write(timelinedir.join("ancestor"), data)?;
}
Ok(timelineid)
}
// Parse an LSN in the format used in filenames
//
// For example: 00000000015D3DD8
//
fn parse_lsn(s: &str) -> std::result::Result<u64, std::num::ParseIntError> {
u64::from_str_radix(s, 16)
}
// Create a new branch in the repository (for the "zenith branch" subcommand)
pub fn create_branch(
local_env: &LocalEnv,
branchname: &str,
startpoint: PointInTime,
) -> Result<()> {
let repopath = &local_env.repo_path;
// create a new timeline for it
let newtli = create_timeline(local_env, Some(startpoint))?;
let newtimelinedir = repopath.join("timelines").join(newtli.to_string());
let data = newtli.to_string();
fs::write(
repopath.join("refs").join("branches").join(branchname),
data,
)?;
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
// TODO: be smarter and avoid the copying...
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?;
let copy_opts = fs_extra::dir::CopyOptions::new();
fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), &copy_opts)?;
let oldtimelinedir = repopath
.join("timelines")
.join(startpoint.timelineid.to_string());
let mut copy_opts = fs_extra::dir::CopyOptions::new();
copy_opts.content_only = true;
fs_extra::dir::copy(
oldtimelinedir.join("wal"),
newtimelinedir.join("wal"),
&copy_opts,
)?;
fs::write(config_path, conf_str)?;
Ok(())
}
// Find the end of valid WAL in a wal directory
pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u64> {
let repopath = &local_env.repo_path;
let waldir = repopath
.join("timelines")
.join(timeline.to_string())
.join("wal");
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true);
return Ok(lsn);
}
// Find the latest snapshot for a timeline
fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> {
let repopath = &local_env.repo_path;
let snapshotsdir = repopath
.join("timelines")
.join(timeline.to_string())
.join("snapshots");
let paths = fs::read_dir(&snapshotsdir)?;
let mut maxsnapshot: u64 = 0;
let mut snapshotdir: Option<PathBuf> = None;
for path in paths {
let path = path?;
let filename = path.file_name().to_str().unwrap().to_owned();
if let Ok(lsn) = parse_lsn(&filename) {
maxsnapshot = std::cmp::max(lsn, maxsnapshot);
snapshotdir = Some(path.path());
}
}
if maxsnapshot == 0 {
// TODO: check ancestor timeline
anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
}
Ok((maxsnapshot, snapshotdir.unwrap()))
}
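For orientation, the directory structure created by init_repo above ('zenith init') looks roughly like this; the timeline ID and snapshot LSN are placeholders:
```sh
# Layout sketch, not a script; IDs and LSNs are placeholders
.zenith/config                                    # TOML-serialized LocalEnv
.zenith/pgdatadirs/                               # compute node data directories
.zenith/refs/branches/main                        # contains the initial timeline ID
.zenith/refs/tags/
.zenith/timelines/<timelineid>/snapshots/<LSN>/   # initdb output, renamed to the checkpoint LSN
.zenith/timelines/<timelineid>/wal/000000010000000000000001.partial
```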

View File

@@ -1,18 +1,137 @@
use std::collections::HashMap;
use std::net::{SocketAddr, TcpStream};
use std::path::PathBuf;
use anyhow::Result;
use std::fs;
use std::io;
use std::net::SocketAddr;
use std::net::TcpStream;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::str::FromStr;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
use anyhow::{anyhow, bail, Result};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres::{Client, NoTls};
use crate::compute::PostgresNode;
use crate::local_env::LocalEnv;
use crate::read_pidfile;
use pageserver::branches::BranchInfo;
use pageserver::ZTimelineId;
//
// Collection of several example deployments useful for tests.
//
// I'm intentionally modelling the storage and compute control planes as separate entities,
// as that is closer to the actual setup.
//
pub struct TestStorageControlPlane {
pub wal_acceptors: Vec<WalAcceptorNode>,
pub pageserver: Arc<PageServerNode>,
pub test_done: AtomicBool,
pub repopath: PathBuf,
}
impl TestStorageControlPlane {
// Peek into the repository, to grab the timeline ID of the given branch
pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId {
let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname);
ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
}
// postgres <-> page_server
//
// Initialize a new repository and configure a page server to run in it
//
pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
let repopath = local_env.repo_path.clone();
let pserver = Arc::new(PageServerNode {
env: local_env.clone(),
kill_on_exit: true,
listen_address: None,
});
pserver.start().unwrap();
TestStorageControlPlane {
wal_acceptors: Vec::new(),
pageserver: pserver,
test_done: AtomicBool::new(false),
repopath: repopath,
}
}
pub fn one_page_server_no_start(local_env: &LocalEnv) -> TestStorageControlPlane {
let repopath = local_env.repo_path.clone();
let pserver = Arc::new(PageServerNode {
env: local_env.clone(),
kill_on_exit: true,
listen_address: None,
});
TestStorageControlPlane {
wal_acceptors: Vec::new(),
pageserver: pserver,
test_done: AtomicBool::new(false),
repopath: repopath,
}
}
// postgres <-> {wal_acceptor1, wal_acceptor2, ...}
pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
let repopath = local_env.repo_path.clone();
let mut cplane = TestStorageControlPlane {
wal_acceptors: Vec::new(),
pageserver: Arc::new(PageServerNode {
env: local_env.clone(),
kill_on_exit: true,
listen_address: None,
}),
test_done: AtomicBool::new(false),
repopath: repopath,
};
cplane.pageserver.start().unwrap();
const WAL_ACCEPTOR_PORT: usize = 54321;
for i in 0..redundancy {
let wal_acceptor = WalAcceptorNode {
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
.parse()
.unwrap(),
data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)),
env: local_env.clone(),
};
wal_acceptor.init();
wal_acceptor.start();
cplane.wal_acceptors.push(wal_acceptor);
}
cplane
}
pub fn stop(&self) {
self.test_done.store(true, Ordering::Relaxed);
}
pub fn get_wal_acceptor_conn_info(&self) -> String {
self.wal_acceptors
.iter()
.map(|wa| wa.listen.to_string())
.collect::<Vec<String>>()
.join(",")
}
pub fn is_running(&self) -> bool {
self.test_done.load(Ordering::Relaxed)
}
}
impl Drop for TestStorageControlPlane {
fn drop(&mut self) {
self.stop();
}
}
//
// Control routines for pageserver.
@@ -20,8 +139,8 @@ use pageserver::branches::BranchInfo;
// Used in CLI and tests.
//
pub struct PageServerNode {
pub kill_on_exit: bool,
pub listen_address: Option<SocketAddr>,
kill_on_exit: bool,
listen_address: Option<SocketAddr>,
pub env: LocalEnv,
}
@@ -41,36 +160,12 @@ impl PageServerNode {
}
}
pub fn init(&self) -> Result<()> {
let mut cmd = Command::new(self.env.pageserver_bin()?);
let status = cmd
.args(&["--init", "-D", self.env.base_data_dir.to_str().unwrap()])
.env_clear()
.env("RUST_BACKTRACE", "1")
.env(
"POSTGRES_DISTRIB_DIR",
self.env.pg_distrib_dir.to_str().unwrap(),
)
.env("ZENITH_REPO_DIR", self.repo_path())
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.status()
.expect("pageserver init failed");
if status.success() {
Ok(())
} else {
Err(anyhow!("pageserver init failed"))
}
}
pub fn repo_path(&self) -> PathBuf {
self.env.pageserver_data_dir()
self.env.repo_path.clone()
}
pub fn pid_file(&self) -> PathBuf {
self.repo_path().join("pageserver.pid")
self.env.repo_path.join("pageserver.pid")
}
pub fn start(&self) -> Result<()> {
@@ -80,27 +175,17 @@ impl PageServerNode {
self.repo_path().display()
);
let mut cmd = Command::new(self.env.pageserver_bin()?);
cmd.args(&[
"-l",
self.address().to_string().as_str(),
"-D",
self.repo_path().to_str().unwrap(),
])
.arg("-d")
.env_clear()
.env("RUST_BACKTRACE", "1")
.env(
"POSTGRES_DISTRIB_DIR",
self.env.pg_distrib_dir.to_str().unwrap(),
)
.env("ZENITH_REPO_DIR", self.repo_path())
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver"));
cmd.args(&["-l", self.address().to_string().as_str()])
.arg("-d")
.env_clear()
.env("RUST_BACKTRACE", "1")
.env("ZENITH_REPO_DIR", self.repo_path())
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
if !cmd.status()?.success() {
bail!(
anyhow::bail!(
"Pageserver failed to start. See '{}' for details.",
self.repo_path().join("pageserver.log").display()
);
@@ -113,35 +198,43 @@ impl PageServerNode {
if client.is_ok() {
break;
} else {
println!("Pageserver not responding yet, retrying ({})...", retries);
println!("page server not responding yet, retrying ({})...", retries);
thread::sleep(Duration::from_secs(1));
}
}
println!("Pageserver started");
Ok(())
}
pub fn stop(&self) -> Result<()> {
let pid = read_pidfile(&self.pid_file())?;
let pid = Pid::from_raw(pid);
if kill(pid, Signal::SIGTERM).is_err() {
bail!("Failed to kill pageserver with pid {}", pid);
let pidfile = self.pid_file();
let pid = read_pidfile(&pidfile)?;
let status = Command::new("kill")
.arg(&pid)
.env_clear()
.status()
.expect("failed to execute kill");
if !status.success() {
anyhow::bail!("Failed to kill pageserver with pid {}", pid);
}
// wait for pageserver stop
// wait for the pageserver to stop
for _ in 0..5 {
let stream = TcpStream::connect(self.address());
thread::sleep(Duration::from_secs(1));
if let Err(_e) = stream {
println!("Pageserver stopped");
return Ok(());
}
println!("Stopping pageserver on {}", self.address());
thread::sleep(Duration::from_secs(1));
}
bail!("Failed to stop pageserver with pid {}", pid);
// ok, we failed to stop pageserver, let's panic
if !status.success() {
anyhow::bail!("Failed to stop pageserver with pid {}", pid);
} else {
return Ok(());
}
}
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
@@ -158,7 +251,9 @@ impl PageServerNode {
client.simple_query(sql).unwrap()
}
pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
pub fn page_server_psql_client(
&self,
) -> std::result::Result<postgres::Client, postgres::Error> {
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address().ip(),
@@ -168,79 +263,6 @@ impl PageServerNode {
);
Client::connect(connstring.as_str(), NoTls)
}
pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {
let mut client = self.page_server_psql_client()?;
let query_result = client.simple_query("branch_list")?;
let branches_json = query_result
.first()
.map(|msg| match msg {
postgres::SimpleQueryMessage::Row(row) => row.get(0),
_ => None,
})
.flatten()
.ok_or_else(|| anyhow!("missing branches"))?;
let res: Vec<BranchInfo> = serde_json::from_str(branches_json)?;
Ok(res)
}
pub fn branch_create(&self, name: &str, startpoint: &str) -> Result<BranchInfo> {
let mut client = self.page_server_psql_client()?;
let query_result =
client.simple_query(format!("branch_create {} {}", name, startpoint).as_str())?;
let branch_json = query_result
.first()
.map(|msg| match msg {
postgres::SimpleQueryMessage::Row(row) => row.get(0),
_ => None,
})
.flatten()
.ok_or_else(|| anyhow!("missing branch"))?;
let res: BranchInfo = serde_json::from_str(branch_json).map_err(|e| {
anyhow!(
"failed to parse branch_create response: {}: {}",
branch_json,
e
)
})?;
Ok(res)
}
// TODO: make this a separate request type and avoid loading all the branches
pub fn branch_get_by_name(&self, name: &str) -> Result<BranchInfo> {
let branch_infos = self.branches_list()?;
let branche_by_name: Result<HashMap<String, BranchInfo>> = branch_infos
.into_iter()
.map(|branch_info| Ok((branch_info.name.clone(), branch_info)))
.collect();
let branche_by_name = branche_by_name?;
let branch = branche_by_name
.get(name)
.ok_or_else(|| anyhow!("Branch {} not found", name))?;
Ok(branch.clone())
}
pub fn system_id_get(&self) -> Result<u64> {
let mut client = self.page_server_psql_client()?;
let query_result = client
.simple_query("identify_system")?
.first()
.map(|msg| match msg {
postgres::SimpleQueryMessage::Row(row) => row.get(0),
_ => None,
})
.flatten()
.ok_or_else(|| anyhow!("failed to get system_id"))?
.parse::<u64>()?;
Ok(query_result)
}
}
impl Drop for PageServerNode {
@@ -250,3 +272,142 @@ impl Drop for PageServerNode {
}
}
}
//
// Control routines for WalAcceptor.
//
// Now used only in test setups.
//
pub struct WalAcceptorNode {
listen: SocketAddr,
data_dir: PathBuf,
env: LocalEnv,
}
impl WalAcceptorNode {
pub fn init(&self) {
if self.data_dir.exists() {
fs::remove_dir_all(self.data_dir.clone()).unwrap();
}
fs::create_dir_all(self.data_dir.clone()).unwrap();
}
pub fn start(&self) {
println!(
"Starting wal_acceptor in {} listening '{}'",
self.data_dir.to_str().unwrap(),
self.listen
);
let status = Command::new(self.env.zenith_distrib_dir.join("wal_acceptor"))
.args(&["-D", self.data_dir.to_str().unwrap()])
.args(&["-l", self.listen.to_string().as_str()])
.args(&["--systemid", &self.env.systemid.to_string()])
// Tell page server it can receive WAL from this WAL safekeeper
// FIXME: If there are multiple safekeepers, they will all inform
// the page server. Only the last "notification" will stay in effect.
// So it's pretty random which safekeeper the page server will connect to
.args(&["--pageserver", "127.0.0.1:64000"])
.arg("-d")
.arg("-n")
.status()
.expect("failed to start wal_acceptor");
if !status.success() {
panic!("wal_acceptor start failed");
}
}
pub fn stop(&self) -> std::result::Result<(), io::Error> {
println!("Stopping wal acceptor on {}", self.listen);
let pidfile = self.data_dir.join("wal_acceptor.pid");
let pid = read_pidfile(&pidfile)?;
// Ignores any failures when running this command
let _status = Command::new("kill")
.arg(pid)
.env_clear()
.status()
.expect("failed to execute kill");
Ok(())
}
}
impl Drop for WalAcceptorNode {
fn drop(&mut self) {
self.stop().unwrap();
}
}
///////////////////////////////////////////////////////////////////////////////
pub struct WalProposerNode {
pub pid: u32,
}
impl WalProposerNode {
pub fn stop(&self) {
let status = Command::new("kill")
.arg(self.pid.to_string())
.env_clear()
.status()
.expect("failed to execute kill");
if !status.success() {
panic!("kill start failed");
}
}
}
impl Drop for WalProposerNode {
fn drop(&mut self) {
self.stop();
}
}
///////////////////////////////////////////////////////////////////////////////
pub fn regress_check(pg: &PostgresNode) {
pg.safe_psql("postgres", "CREATE DATABASE regression");
let regress_run_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("tmp_check/regress");
fs::create_dir_all(regress_run_path.clone()).unwrap();
std::env::set_current_dir(regress_run_path).unwrap();
let regress_build_path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
let regress_src_path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
let _regress_check = Command::new(regress_build_path.join("pg_regress"))
.args(&[
"--bindir=''",
"--use-existing",
format!("--bindir={}", pg.env.pg_bin_dir().to_str().unwrap()).as_str(),
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
format!(
"--schedule={}",
regress_src_path.join("parallel_schedule").to_str().unwrap()
)
.as_str(),
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
])
.env_clear()
.env("LD_LIBRARY_PATH", pg.env.pg_lib_dir().to_str().unwrap())
.env("PGHOST", pg.address.ip().to_string())
.env("PGPORT", pg.address.port().to_string())
.env("PGUSER", pg.whoami())
.status()
.expect("pg_regress failed");
}
/// Read a PID file
///
/// This should contain an unsigned integer, but we return it as a String
/// because our callers only want to pass it back into a subcommand.
fn read_pidfile(pidfile: &Path) -> std::result::Result<String, io::Error> {
fs::read_to_string(pidfile).map_err(|err| {
eprintln!("failed to read pidfile {:?}: {:?}", pidfile, err);
err
})
}

View File

@@ -9,9 +9,8 @@ edition = "2018"
[dependencies]
lazy_static = "1.4.0"
rand = "0.8.3"
anyhow = "1.0"
nix = "0.20"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }

View File

@@ -1,416 +0,0 @@
use std::collections::BTreeMap;
use std::convert::TryInto;
use std::fs::{self, File, OpenOptions};
use std::io::Read;
use std::net::SocketAddr;
use std::path::{Path, PathBuf};
use std::process::{Command, ExitStatus};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use anyhow::{bail, Result};
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use postgres;
use control_plane::compute::PostgresNode;
use control_plane::read_pidfile;
use control_plane::{local_env::LocalEnv, storage::PageServerNode};
// Find the directory where the binaries were put (i.e. target/debug/)
fn cargo_bin_dir() -> PathBuf {
let mut pathbuf = std::env::current_exe().unwrap();
pathbuf.pop();
if pathbuf.ends_with("deps") {
pathbuf.pop();
}
pathbuf
}
// local compute env for tests
pub fn create_test_env(testname: &str) -> LocalEnv {
let base_path = Path::new(env!("CARGO_MANIFEST_DIR"))
.join("../tmp_check/")
.join(testname);
let base_path_str = base_path.to_str().unwrap();
// Remove remnants of old test repo
let _ = fs::remove_dir_all(&base_path);
fs::create_dir_all(&base_path)
.expect(format!("could not create directory for {}", base_path_str).as_str());
let pgdatadirs_path = base_path.join("pgdatadirs");
fs::create_dir(&pgdatadirs_path)
.expect(format!("could not create directory {:?}", pgdatadirs_path).as_str());
LocalEnv {
pageserver_connstring: "postgresql://127.0.0.1:64000".to_string(),
pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
zenith_distrib_dir: Some(cargo_bin_dir()),
base_data_dir: base_path,
remotes: BTreeMap::default(),
}
}
//
// Collection of several example deployments useful for tests.
//
// I'm intentionally modelling storage and compute control planes as separate entities,
// as that is closer to the actual setup.
//
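// For example (a sketch based on the wal_acceptor integration tests in this
// repo; the test name string is illustrative), a test typically sets things
// up like this:
//
//     let local_env = create_test_env("my_test");
//     let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, 3);
//     let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
//     let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();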
pub struct TestStorageControlPlane {
pub wal_acceptors: Vec<WalAcceptorNode>,
pub pageserver: Arc<PageServerNode>,
pub test_done: AtomicBool,
}
impl TestStorageControlPlane {
// postgres <-> page_server
//
// Initialize a new repository and configure a page server to run in it
//
pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
let pserver = Arc::new(PageServerNode {
env: local_env.clone(),
kill_on_exit: true,
listen_address: None,
});
pserver.init().unwrap();
pserver.start().unwrap();
TestStorageControlPlane {
wal_acceptors: Vec::new(),
pageserver: pserver,
test_done: AtomicBool::new(false),
}
}
// postgres <-> {wal_acceptor1, wal_acceptor2, ...}
pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
let mut cplane = TestStorageControlPlane {
wal_acceptors: Vec::new(),
pageserver: Arc::new(PageServerNode {
env: local_env.clone(),
kill_on_exit: true,
listen_address: None,
}),
test_done: AtomicBool::new(false),
// repopath,
};
cplane.pageserver.init().unwrap();
cplane.pageserver.start().unwrap();
let systemid = cplane.pageserver.system_id_get().unwrap();
const WAL_ACCEPTOR_PORT: usize = 54321;
let datadir_base = local_env.base_data_dir.join("safekeepers");
fs::create_dir_all(&datadir_base).unwrap();
for i in 0..redundancy {
let wal_acceptor = WalAcceptorNode {
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
.parse()
.unwrap(),
data_dir: datadir_base.join(format!("wal_acceptor_{}", i)),
systemid,
env: local_env.clone(),
pass_to_pageserver: true,
};
wal_acceptor.init();
wal_acceptor.start();
cplane.wal_acceptors.push(wal_acceptor);
}
cplane
}
pub fn stop(&self) {
for wa in self.wal_acceptors.iter() {
let _ = wa.stop();
}
self.test_done.store(true, Ordering::Relaxed);
}
pub fn get_wal_acceptor_conn_info(&self) -> String {
self.wal_acceptors
.iter()
.map(|wa| wa.listen.to_string())
.collect::<Vec<String>>()
.join(",")
}
pub fn is_running(&self) -> bool {
self.test_done.load(Ordering::Relaxed)
}
}
impl Drop for TestStorageControlPlane {
fn drop(&mut self) {
self.stop();
}
}
///////////////////////////////////////////////////////////////////////////////
//
// PostgresNodeExt
//
///////////////////////////////////////////////////////////////////////////////
///
/// Testing utilities for PostgresNode type
///
pub trait PostgresNodeExt {
fn pg_regress(&self) -> ExitStatus;
fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus;
fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode;
fn open_psql(&self, db: &str) -> postgres::Client;
fn dump_log_file(&self);
fn safe_psql(&self, db: &str, sql: &str) -> Vec<postgres::Row>;
}
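// Example usage (a sketch; mirrors how the tests below drive a node, where
// `timeline` and `wal_acceptors` come from the surrounding test setup):
//
//     let node = compute_cplane.new_test_master_node(timeline);
//     node.start().unwrap();
//     let _proxy = node.start_proxy(&wal_acceptors);
//     node.safe_psql("postgres", "CREATE TABLE t(key int primary key, value text)");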
impl PostgresNodeExt for PostgresNode {
fn pg_regress(&self) -> ExitStatus {
self.safe_psql("postgres", "CREATE DATABASE regression");
let regress_run_path = self.env.base_data_dir.join("regress");
fs::create_dir_all(&regress_run_path).unwrap();
fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
std::env::set_current_dir(regress_run_path).unwrap();
let regress_build_path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
let regress_src_path =
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
let regress_check = Command::new(regress_build_path.join("pg_regress"))
.args(&[
"--bindir=''",
"--use-existing",
format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
format!(
"--schedule={}",
regress_src_path.join("parallel_schedule").to_str().unwrap()
)
.as_str(),
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
])
.env_clear()
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("PGPORT", self.address.port().to_string())
.env("PGUSER", self.whoami())
.env("PGHOST", self.address.ip().to_string())
.status()
.expect("pg_regress failed");
if !regress_check.success() {
if let Ok(mut file) = File::open("regression.diffs") {
let mut buffer = String::new();
file.read_to_string(&mut buffer).unwrap();
println!("--------------- regression.diffs:\n{}", buffer);
}
self.dump_log_file();
}
regress_check
}
fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus {
let port = self.address.port().to_string();
let clients = clients.to_string();
let seconds = seconds.to_string();
let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
.args(&["-i", "-p", port.as_str(), "postgres"])
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.status()
.expect("pgbench -i");
let pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
.args(&[
"-p",
port.as_str(),
"-T",
seconds.as_str(),
"-P",
"1",
"-c",
clients.as_str(),
"-M",
"prepared",
"postgres",
])
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
.status()
.expect("pgbench run");
pg_bench_run
}
fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
match Command::new(proxy_path.as_path())
.args(&["--ztimelineid", &self.timelineid.to_string()])
.args(&["-s", wal_acceptors])
.args(&["-h", &self.address.ip().to_string()])
.args(&["-p", &self.address.port().to_string()])
.arg("-v")
.stderr(
OpenOptions::new()
.create(true)
.append(true)
.open(self.pgdata().join("safekeeper_proxy.log"))
.unwrap(),
)
.spawn()
{
Ok(child) => WalProposerNode { pid: child.id() },
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
}
}
fn dump_log_file(&self) {
if let Ok(mut file) = File::open(self.env.pageserver_data_dir().join("pageserver.log")) {
let mut buffer = String::new();
file.read_to_string(&mut buffer).unwrap();
println!("--------------- pageserver.log:\n{}", buffer);
}
}
fn safe_psql(&self, db: &str, sql: &str) -> Vec<postgres::Row> {
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address.ip(),
self.address.port(),
db,
self.whoami()
);
let mut client = postgres::Client::connect(connstring.as_str(), postgres::NoTls).unwrap();
println!("Running {}", sql);
let result = client.query(sql, &[]);
if result.is_err() {
self.dump_log_file();
}
result.unwrap()
}
fn open_psql(&self, db: &str) -> postgres::Client {
let connstring = format!(
"host={} port={} dbname={} user={}",
self.address.ip(),
self.address.port(),
db,
self.whoami()
);
postgres::Client::connect(connstring.as_str(), postgres::NoTls).unwrap()
}
}
///////////////////////////////////////////////////////////////////////////////
//
// WalAcceptorNode
//
///////////////////////////////////////////////////////////////////////////////
//
// Control routines for WalAcceptor.
//
// Now used only in test setups.
//
pub struct WalAcceptorNode {
listen: SocketAddr,
data_dir: PathBuf,
systemid: u64,
env: LocalEnv,
pass_to_pageserver: bool,
}
impl WalAcceptorNode {
pub fn init(&self) {
if self.data_dir.exists() {
fs::remove_dir_all(self.data_dir.clone()).unwrap();
}
fs::create_dir_all(self.data_dir.clone()).unwrap();
}
pub fn start(&self) {
println!(
"Starting wal_acceptor in {} listening '{}'",
self.data_dir.to_str().unwrap(),
self.listen
);
let ps_arg = if self.pass_to_pageserver {
// Tell page server it can receive WAL from this WAL safekeeper
["--pageserver", "127.0.0.1:64000"].to_vec()
} else {
[].to_vec()
};
let status = Command::new(
self.env
.zenith_distrib_dir
.as_ref()
.unwrap()
.join("wal_acceptor"),
)
.args(&["-D", self.data_dir.to_str().unwrap()])
.args(&["-l", self.listen.to_string().as_str()])
.args(&["--systemid", self.systemid.to_string().as_str()])
.args(&ps_arg)
.arg("-d")
.arg("-n")
.status()
.expect("failed to start wal_acceptor");
if !status.success() {
panic!("wal_acceptor start failed");
}
}
pub fn stop(&self) -> Result<()> {
println!("Stopping wal acceptor on {}", self.listen);
let pidfile = self.data_dir.join("wal_acceptor.pid");
let pid = read_pidfile(&pidfile)?;
let pid = Pid::from_raw(pid);
if kill(pid, Signal::SIGTERM).is_err() {
bail!("Failed to kill wal_acceptor with pid {}", pid);
}
Ok(())
}
}
impl Drop for WalAcceptorNode {
fn drop(&mut self) {
// Ignore errors.
let _ = self.stop();
}
}
///////////////////////////////////////////////////////////////////////////////
//
// WalProposerNode
//
///////////////////////////////////////////////////////////////////////////////
pub struct WalProposerNode {
pub pid: u32,
}
impl WalProposerNode {
pub fn stop(&self) {
// std::process::Child::id() returns u32, we need i32.
let pid: i32 = self.pid.try_into().unwrap();
let pid = Pid::from_raw(pid);
kill(pid, Signal::SIGTERM).expect("failed to execute kill");
}
}
impl Drop for WalProposerNode {
fn drop(&mut self) {
self.stop();
}
}

View File

@@ -0,0 +1,11 @@
// test node resettlement to an empty datadir
// TODO
/*
#[test]
fn test_resettlement() {}
// test seq scan of everything after restart
#[test]
fn test_cold_seqscan() {}
*/

View File

@@ -0,0 +1,8 @@
// TODO
/*
#[test]
fn test_actions() {}
#[test]
fn test_regress() {}
*/

View File

@@ -0,0 +1,132 @@
// mod control_plane;
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env;
use control_plane::local_env::PointInTime;
use control_plane::storage::TestStorageControlPlane;
// XXX: force all redo at the end
// -- restart + seqscan won't read deleted stuff
// -- pageserver api endpoint to check all rels
#[test]
fn test_redo_cases() {
let local_env = local_env::test_env("test_redo_cases");
// Start pageserver that reads WAL directly from that postgres
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
// start postgres
let maintli = storage_cplane.get_branch_timeline("main");
let node = compute_cplane.new_test_node(maintli);
node.start().unwrap();
// check basic work with table
node.safe_psql(
"postgres",
"CREATE TABLE t(key int primary key, value text)",
);
node.safe_psql(
"postgres",
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
);
let count: i64 = node
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()
.unwrap()
.get(0);
println!("sum = {}", count);
assert_eq!(count, 5000050000);
// check 'create table as'
node.safe_psql("postgres", "CREATE TABLE t2 AS SELECT * FROM t");
let count: i64 = node
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()
.unwrap()
.get(0);
println!("sum = {}", count);
assert_eq!(count, 5000050000);
}
// Runs pg_regress on a compute node
#[test]
#[ignore]
fn test_regress() {
let local_env = local_env::test_env("test_regress");
// Start pageserver that reads WAL directly from that postgres
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
// start postgres
let maintli = storage_cplane.get_branch_timeline("main");
let node = compute_cplane.new_test_node(maintli);
node.start().unwrap();
control_plane::storage::regress_check(&node);
}
// Run two postgres instances on one pageserver, on different timelines
#[test]
fn test_pageserver_two_timelines() {
let local_env = local_env::test_env("test_pageserver_two_timelines");
// Start pageserver that reads WAL directly from that postgres
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
let maintli = storage_cplane.get_branch_timeline("main");
// Create new branch at the end of 'main'
let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
local_env::create_branch(
&local_env,
"experimental",
PointInTime {
timelineid: maintli,
lsn: startpoint,
},
)
.unwrap();
let experimentaltli = storage_cplane.get_branch_timeline("experimental");
// Launch postgres instances on both branches
let node1 = compute_cplane.new_test_node(maintli);
let node2 = compute_cplane.new_test_node(experimentaltli);
node1.start().unwrap();
node2.start().unwrap();
// check node1
node1.safe_psql(
"postgres",
"CREATE TABLE t(key int primary key, value text)",
);
node1.safe_psql(
"postgres",
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
);
let count: i64 = node1
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()
.unwrap()
.get(0);
println!("sum = {}", count);
assert_eq!(count, 5000050000);
// check node2
node2.safe_psql(
"postgres",
"CREATE TABLE t(key int primary key, value text)",
);
node2.safe_psql(
"postgres",
"INSERT INTO t SELECT generate_series(100000,200000), 'payload'",
);
let count: i64 = node2
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()
.unwrap()
.get(0);
println!("sum = {}", count);
assert_eq!(count, 15000150000);
}

View File

@@ -1,64 +1,18 @@
// Restart acceptors one by one while the compute node is under load.
use control_plane::compute::ComputeControlPlane;
use control_plane::local_env;
use control_plane::local_env::PointInTime;
use control_plane::storage::TestStorageControlPlane;
use pageserver::ZTimelineId;
use rand::Rng;
use std::sync::Arc;
use std::time::SystemTime;
use std::{thread, time};
use control_plane::compute::{ComputeControlPlane, PostgresNode};
use integration_tests;
use integration_tests::PostgresNodeExt;
use integration_tests::TestStorageControlPlane;
const DOWNTIME: u64 = 2;
fn start_node_with_wal_proposer(
timeline: &str,
compute_cplane: &mut ComputeControlPlane,
wal_acceptors: &String,
) -> Arc<PostgresNode> {
let node = compute_cplane.new_test_master_node(timeline);
let _node = node.append_conf(
"postgresql.conf",
&format!("wal_acceptors='{}'\n", wal_acceptors),
);
node.start().unwrap();
node
}
#[test]
fn test_embedded_wal_proposer() {
let local_env = integration_tests::create_test_env("test_embedded_wal_proposer");
const REDUNDANCY: usize = 3;
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
// start postgres
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
// check basic work with table
node.safe_psql(
"postgres",
"CREATE TABLE t(key int primary key, value text)",
);
node.safe_psql(
"postgres",
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
);
let count: i64 = node
.safe_psql("postgres", "SELECT sum(key) FROM t")
.first()
.unwrap()
.get(0);
println!("sum = {}", count);
assert_eq!(count, 5000050000);
// check wal files equality
}
#[test]
fn test_acceptors_normal_work() {
let local_env = integration_tests::create_test_env("test_acceptors_normal_work");
let local_env = local_env::test_env("test_acceptors_normal_work");
const REDUNDANCY: usize = 3;
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
@@ -66,7 +20,12 @@ fn test_acceptors_normal_work() {
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
// start postgres
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
let maintli = storage_cplane.get_branch_timeline("main");
let node = compute_cplane.new_test_master_node(maintli);
node.start().unwrap();
// start proxy
let _proxy = node.start_proxy(&wal_acceptors);
// check basic work with table
node.safe_psql(
@@ -94,29 +53,39 @@ fn test_many_timelines() {
// Initialize a new repository, and set up WAL safekeepers and page server.
const REDUNDANCY: usize = 3;
const N_TIMELINES: usize = 5;
let local_env = integration_tests::create_test_env("test_many_timelines");
let local_env = local_env::test_env("test_many_timelines");
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
// Create branches
let mut timelines: Vec<String> = Vec::new();
timelines.push("main".to_string());
let mut timelines: Vec<ZTimelineId> = Vec::new();
let maintli = storage_cplane.get_branch_timeline("main"); // main branch
timelines.push(maintli);
let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
for i in 1..N_TIMELINES {
// additional branches
let branchname = format!("experimental{}", i);
storage_cplane
.pageserver
.branch_create(&branchname, "main")
.unwrap();
timelines.push(branchname);
local_env::create_branch(
&local_env,
&branchname,
PointInTime {
timelineid: maintli,
lsn: startpoint,
},
)
.unwrap();
let tli = storage_cplane.get_branch_timeline(&branchname);
timelines.push(tli);
}
// start postgres on each timeline
let mut nodes = Vec::new();
for tli_name in timelines {
let node = start_node_with_wal_proposer(&tli_name, &mut compute_cplane, &wal_acceptors);
for tli in timelines {
let node = compute_cplane.new_test_node(tli);
nodes.push(node.clone());
node.start().unwrap();
node.start_proxy(&wal_acceptors);
}
// create schema
@@ -150,7 +119,7 @@ fn test_many_timelines() {
// Majority is always alive
#[test]
fn test_acceptors_restarts() {
let local_env = integration_tests::create_test_env("test_acceptors_restarts");
let local_env = local_env::test_env("test_acceptors_restarts");
// Start pageserver that reads WAL directly from that postgres
const REDUNDANCY: usize = 3;
@@ -162,8 +131,12 @@ fn test_acceptors_restarts() {
let mut rng = rand::thread_rng();
// start postgres
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
let maintli = storage_cplane.get_branch_timeline("main");
let node = compute_cplane.new_test_master_node(maintli);
node.start().unwrap();
// start proxy
let _proxy = node.start_proxy(&wal_acceptors);
let mut failed_node: Option<usize> = None;
// check basic work with table
@@ -199,7 +172,7 @@ fn test_acceptors_restarts() {
fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
let cp = cplane.clone();
thread::spawn(move || {
thread::sleep(time::Duration::from_secs(DOWNTIME));
thread::sleep(time::Duration::from_secs(1));
cp.wal_acceptors[no].start();
});
}
@@ -209,7 +182,7 @@ fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
// N_CRASHES env var
#[test]
fn test_acceptors_unavailability() {
let local_env = integration_tests::create_test_env("test_acceptors_unavailability");
let local_env = local_env::test_env("test_acceptors_unavailability");
// Start pageserver that reads WAL directly from that postgres
const REDUNDANCY: usize = 2;
@@ -219,7 +192,12 @@ fn test_acceptors_unavailability() {
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
// start postgres
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
let maintli = storage_cplane.get_branch_timeline("main");
let node = compute_cplane.new_test_master_node(maintli);
node.start().unwrap();
// start proxy
let _proxy = node.start_proxy(&wal_acceptors);
// check basic work with table
node.safe_psql(
@@ -230,16 +208,13 @@ fn test_acceptors_unavailability() {
psql.execute("INSERT INTO t values (1, 'payload')", &[])
.unwrap();
// Shut down all wal acceptors
storage_cplane.wal_acceptors[0].stop().unwrap();
let cp = Arc::new(storage_cplane);
start_acceptor(&cp, 0);
let now = SystemTime::now();
psql.execute("INSERT INTO t values (2, 'payload')", &[])
.unwrap();
// Here we check that the query above was hanging
// while wal_acceptor was unavailable
assert!(now.elapsed().unwrap().as_secs() >= DOWNTIME);
assert!(now.elapsed().unwrap().as_secs() > 1);
psql.execute("INSERT INTO t values (3, 'payload')", &[])
.unwrap();
@@ -247,9 +222,7 @@ fn test_acceptors_unavailability() {
start_acceptor(&cp, 1);
psql.execute("INSERT INTO t values (4, 'payload')", &[])
.unwrap();
// Here we check that the query above was hanging
// while wal_acceptor was unavailable
assert!(now.elapsed().unwrap().as_secs() >= 2 * DOWNTIME);
assert!(now.elapsed().unwrap().as_secs() > 2);
psql.execute("INSERT INTO t values (5, 'payload')", &[])
.unwrap();
@@ -260,8 +233,6 @@ fn test_acceptors_unavailability() {
.unwrap()
.get(0);
println!("sum = {}", count);
// Ensure that all inserts succeeded.
// Including ones that were waiting for wal acceptor restart.
assert_eq!(count, 15);
}
@@ -289,7 +260,7 @@ fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
// Race condition test
#[test]
fn test_race_conditions() {
let local_env = integration_tests::create_test_env("test_race_conditions");
let local_env = local_env::test_env("test_race_conditions");
// Start pageserver that reads WAL directly from that postgres
const REDUNDANCY: usize = 3;
@@ -301,7 +272,12 @@ fn test_race_conditions() {
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
// start postgres
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
let maintli = storage_cplane.get_branch_timeline("main");
let node = compute_cplane.new_test_master_node(maintli);
node.start().unwrap();
// start proxy
let _proxy = node.start_proxy(&wal_acceptors);
// check basic work with table
node.safe_psql(

mgmt-console/.gitignore vendored Normal file
View File

@@ -0,0 +1,23 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.js
# testing
/coverage
# production
/build
# misc
.DS_Store
.env.local
.env.development.local
.env.test.local
.env.production.local
npm-debug.log*
yarn-debug.log*
yarn-error.log*

mgmt-console/README Normal file
View File

@@ -0,0 +1,55 @@
Mock implementation of a management console.
See demo-howto.txt for usage.
Building and Installation
-------------------------
To compile Postgres:
sudo apt build-dep postgresql
sudo apt install bison flex libz-dev libssl-dev
sudo apt install ccache
sudo apt install libcurl4-openssl-dev libxml2-dev
For the webapp:
# NOTE: This requires at least version 1.1.0 of python3-flask. That's not
# available in Debian Buster; you need at least Bullseye.
sudo apt install python3 python3-flask python3-pip npm webpack
pip3 install Flask-BasicAuth
pip3 install boto3
Clone, compile, and install the patched version of Postgres:
git clone https://github.com/libzenith/postgres.git
cd postgres
git checkout zenith-experiments
./configure --enable-debug --enable-cassert --with-openssl --prefix=/home/heikki/pgsql-install --with-libxml CC="ccache gcc" CFLAGS="-O0"
make -j4 -s install
Get the webapp:
cd ~
git clone https://github.com/libzenith/zenith-mgmt-console.git
cd zenith-mgmt-console
mkdir pgdatadirs
openssl req -new -x509 -days 365 -nodes -text -out server.crt \
-keyout server.key -subj "/CN=zenith-demo"
For Mock S3 server (unless you want to test against a real cloud service):
sudo apt install python3-tornado
cd ~/zenith-mgmt-console
git clone https://github.com/hlinnaka/ms3.git
Compile & run it:
npm install
webpack # compile React app
BASIC_AUTH_PASSWORD=<password> ./launch-local.sh
You can view the contents of the S3 bucket with a browser:
http://<server>/list_bucket

mgmt-console/app.py Normal file
View File

@@ -0,0 +1,340 @@
from flask import request
from flask_basicauth import BasicAuth
from flask import render_template
from subprocess import PIPE, STDOUT, run, Popen
import html
import os
import re
import shutil
import logging
import time
import boto3
from boto3.session import Session
from botocore.client import Config
from botocore.handlers import set_list_objects_encoding_type_url
from flask import Flask
import waldump
app = Flask(__name__)
app.config['BASIC_AUTH_USERNAME'] = 'zenith'
app.config['BASIC_AUTH_PASSWORD'] = os.getenv('BASIC_AUTH_PASSWORD')
app.config['BASIC_AUTH_FORCE'] = True
basic_auth = BasicAuth(app)
# S3 configuration:
ENDPOINT = os.getenv('S3_ENDPOINT', 'https://localhost:9000')
ACCESS_KEY = os.getenv('S3_ACCESSKEY', 'minioadmin')
SECRET = os.getenv('S3_SECRET', '')
BUCKET = os.getenv('S3_BUCKET', 'foobucket')
print("Using bucket at " + ENDPOINT);
#boto3.set_stream_logger('botocore', logging.DEBUG)
session = Session(aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET,
region_name=os.getenv('S3_REGION', 'auto'))
# needed for google cloud?
session.events.unregister('before-parameter-build.s3.ListObjects',
set_list_objects_encoding_type_url)
s3resource = session.resource('s3',
endpoint_url=ENDPOINT,
verify=False,
config=Config(signature_version='s3v4'))
s3bucket = s3resource.Bucket(BUCKET)
s3_client = boto3.client('s3',
endpoint_url=ENDPOINT,
verify=False,
config=Config(signature_version='s3v4'),
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET)
@app.route("/")
def index():
return render_template("index.html")
@app.route("/api/waldump")
def render_waldump():
return render_template("waldump.html")
@app.route('/api/fetch_wal')
def fetch_wal():
return waldump.fetch_wal(request, s3bucket);
@app.route("/api/server_status")
def server_status():
dirs = os.listdir("pgdatadirs")
dirs.sort()
primary = None
standbys = []
for dirname in dirs:
result = run("pg_ctl status -D pgdatadirs/" + dirname, stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
srv = {
'datadir': dirname,
'status': result.stdout,
'port': None
}
if dirname == 'primary':
primary = srv;
primary['port'] = 5432;
else:
standby_match = re.search('standby_([0-9]+)', dirname)
if standby_match:
srv['port'] = int(standby_match.group(1))
standbys.append(srv);
return {'primary': primary, 'standbys': standbys}
@app.route('/api/list_bucket')
def list_bucket():
response = 'cloud bucket contents:<br>\n'
for file in s3bucket.objects.all():
response = response + html.escape(file.key) + '<br>\n'
return response
def walpos_str(walpos):
return '{:X}/{:X}'.format(walpos >> 32, walpos & 0xFFFFFFFF)
@app.route('/api/bucket_summary')
def bucket_summary():
nonrelimages = []
minwal = int(0)
maxwal = int(0)
minseqwal = int(0)
maxseqwal = int(0)
for file in s3bucket.objects.all():
path = file.key
match = re.search('nonreldata/nonrel_([0-9A-F]+).tar', path)
if match:
walpos = int(match.group(1), 16)
nonrelimages.append(walpos_str(walpos))
match = re.search('nonreldata/nonrel_([0-9A-F]+)-([0-9A-F]+)', path)
if match:
endwal = int(match.group(2), 16)
if endwal > maxwal:
maxwal = endwal
match = re.search('walarchive/([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', path)
if match:
tli = int(match.group(1), 16)
logno = int(match.group(2), 16)
segno = int(match.group(3), 16)
# FIXME: this assumes default 16 MB wal segment size
logsegno = logno * (0x100000000 / (16*1024*1024)) + segno
seqwal = int((logsegno + 1) * (16*1024*1024))
if seqwal > maxseqwal:
maxseqwal = seqwal;
if minseqwal == 0 or seqwal < minseqwal:
minseqwal = seqwal;
return {
'nonrelimages': nonrelimages,
'minwal': walpos_str(minwal),
'maxwal': walpos_str(maxwal),
'minseqwal': walpos_str(minseqwal),
'maxseqwal': walpos_str(maxseqwal)
}
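# Worked example of the arithmetic above (assuming the default 16 MB segment
# size noted in the FIXME): a WAL archive object named
# "walarchive/00000001000000000000000C" gives tli = 0x1, logno = 0x0 and
# segno = 0xC, so logsegno = 0 * 256 + 12 = 12, and the segment ends at
# seqwal = (12 + 1) * 16 MB = 218103808, which walpos_str() renders as "0/D000000".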
def print_cmd_result(cmd_result):
return print_cmd_result_ex(cmd_result.args, cmd_result.returncode, cmd_result.stdout)
def print_cmd_result_ex(cmd, returncode, stdout):
res = ''
res += 'ran command:\n' + str(cmd) + '\n'
res += 'It returned code ' + str(returncode) + '\n'
res += '\n'
res += 'stdout/stderr:\n'
res += stdout
return res
@app.route('/api/init_primary', methods=['GET', 'POST'])
def init_primary():
initdb_result = run("initdb -D pgdatadirs/primary --username=zenith --pwfile=pg-password.txt", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
if initdb_result.returncode != 0:
return print_cmd_result(initdb_result)
# Append archive_mode and archive_command and port to postgresql.conf
f=open("pgdatadirs/primary/postgresql.conf", "a+")
f.write("listen_addresses='*'\n")
f.write("archive_mode=on\n")
f.write("archive_command='zenith_push --archive-wal-path=%p --archive-wal-fname=%f'\n")
f.write("ssl=on\n")
f.close()
f=open("pgdatadirs/primary/pg_hba.conf", "a+")
f.write("# allow SSL connections with password from anywhere\n")
f.write("hostssl all all 0.0.0.0/0 md5\n")
f.write("hostssl all all ::0/0 md5\n")
f.close()
shutil.copyfile("server.crt", "pgdatadirs/primary/server.crt")
shutil.copyfile("server.key", "pgdatadirs/primary/server.key")
os.chmod("pgdatadirs/primary/server.key", 0o0600)
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
start_rc = start_proc.wait()
start_stdout, start_stderr = start_proc.communicate()
responsestr = print_cmd_result(initdb_result) + '\n'
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
return responsestr
@app.route('/api/zenith_push', methods=['GET', 'POST'])
def zenith_push():
# Stop the primary if it's running
stop_result = run(args=["pg_ctl", "stop", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
# Call zenith_push
push_result = run("zenith_push -D pgdatadirs/primary", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
# Restart the primary
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
start_rc = start_proc.wait()
start_stdout, start_stderr = start_proc.communicate()
responsestr = print_cmd_result(stop_result) + '\n'
responsestr += print_cmd_result(push_result) + '\n'
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout) + '\n'
return responsestr
@app.route('/api/create_standby', methods=['GET', 'POST'])
def create_standby():
walpos = request.form.get('walpos')
if not walpos:
return 'no walpos'
dirs = os.listdir("pgdatadirs")
last_port = 5432
for dirname in dirs:
standby_match = re.search('standby_([0-9]+)', dirname)
if standby_match:
port = int(standby_match.group(1))
if port > last_port:
last_port = port
standby_port = last_port + 1
standby_dir = "pgdatadirs/standby_" + str(standby_port)
# Call zenith_restore
restore_result = run(["zenith_restore", "--end=" + walpos, "-D", standby_dir], stdout=PIPE, stderr=STDOUT, encoding='latin1')
responsestr = print_cmd_result(restore_result)
if restore_result.returncode == 0:
# Append hot_standby and port to postgresql.conf
f=open(standby_dir + "/postgresql.conf", "a+")
f.write("hot_standby=on\n")
f.write("port=" + str(standby_port) + "\n")
f.close()
start_proc = Popen(args=["pg_ctl", "start", "-D", standby_dir, "-l", standby_dir + "/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
start_rc = start_proc.wait()
start_stdout, start_stderr = start_proc.communicate()
responsestr += '\n\n' + print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
return responsestr
@app.route('/api/destroy_server', methods=['GET', 'POST'])
def destroy_primary():
datadir = request.form.get('datadir')
# Check that the datadir parameter doesn't contain anything funny.
if not re.match("^[A-Za-z0-9_-]+$", datadir):
raise Exception('invalid datadir: ' + datadir)
# Stop the server if it's running
stop_result = run(args=["pg_ctl", "stop", "-m", "immediate", "-D", "pgdatadirs/" + datadir], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
shutil.rmtree('pgdatadirs/' + datadir, ignore_errors=True)
responsestr = print_cmd_result(stop_result) + '\n'
responsestr += 'Deleted datadir ' + datadir + '.\n'
return responsestr
@app.route('/api/restore_primary', methods=['GET', 'POST'])
def restore_primary():
# Call zenith_restore
restore_result = run(["zenith_restore", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, encoding='latin1')
responsestr = print_cmd_result(restore_result)
# Append restore_command to postgresql.conf, so that it can find the last raw WAL segments
f=open("pgdatadirs/primary/postgresql.conf", "a+")
f.write("listen_addresses='*'\n")
f.write("restore_command='zenith_restore --archive-wal-path=%p --archive-wal-fname=%f'\n")
f.write("ssl=on\n")
f.close()
if restore_result.returncode == 0:
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
start_rc = start_proc.wait()
start_stdout, start_stderr = start_proc.communicate()
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
return responsestr
@app.route('/api/slicedice', methods=['GET', 'POST'])
def run_slicedice():
result = run("zenith_slicedice", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
responsestr = print_cmd_result(result)
return responsestr
@app.route('/api/reset_demo', methods=['POST'])
def reset_all():
result = run("pkill -9 postgres", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
dirs = os.listdir("pgdatadirs")
for dirname in dirs:
shutil.rmtree('pgdatadirs/' + dirname)
for file in s3bucket.objects.all():
s3_client.delete_object(Bucket = BUCKET, Key = file.key)
responsestr = print_cmd_result(result) + '\n'
responsestr += '''
Deleted all Postgres datadirs.
Deleted all files in object storage bucket.
'''
return responsestr
if __name__ == '__main__':
app.run()

View File

@@ -0,0 +1,3 @@
module.exports = {
presets: ["@babel/preset-env", "@babel/preset-react"],
};

View File

@@ -0,0 +1,67 @@
Mock implementation of a management console.
This isn't very different from a "normal" PostgreSQL installation with
a base backup and WAL archive. The main user-visible difference is
that when you create a standby server, we don't restore the whole data
directory, but only the "non-relation" files. Relation files are
restored on demand, the first time they are accessed. That makes the
"create standby" operation very fast, but adds some delay when you
connect and start running queries instead. This is most visible if you
have a large database. (However, see the note below about large databases.)
Note: lots of things are broken/unsafe. Things will fail if a table is
larger than 1 GB, or if there are more than 1000 files in the cloud
bucket.
How to use this demo:
1. If there are any leftovers from previous runs, reset by clicking
the RESET DEMO button. This kills and deletes all Postgres servers,
and empties the cloud storage bucket
2. Create primary server by clicking on the "Init primary" button
3. Push a base image of the primary to cloud storage, by clicking the
"push base image" button. (This takes about 30 seconds, be
patient)
4. Connect to primary with psql, and create a test table with a little data.
psql postgres -p5432 -U zenith -h<host>
create table mytable (i int4);
insert into mytable values (1);
select pg_switch_wal();
The Postgres password is the same as for the management console.
5. Now that there's a new WAL segment in the archive, we can "slice &
dice" it. Click on the "Slice & Dice WAL" button.
6. Perform more updates on the primary, to generate more WAL.
insert into mytable values (2); select pg_switch_wal();
insert into mytable values (3); select pg_switch_wal();
insert into mytable values (4); select pg_switch_wal();
insert into mytable values (5); select pg_switch_wal();
7. Slice & Dice the WAL again.
8. Now you can create read-only standby servers at any point in the
WAL. Type a WAL position in the text box (or use the slider), and
click "Create new standby". The first standby is created at port 5433,
the second at port 5434, and so forth.
9. Connect to the standby with "psql -p 5433". Note that it takes a
few seconds until the connection is established. That's because the
standby has to restore the basic system catalogs, like pg_database and
pg_authid, from the backup. After connecting, you can do "\d" to list
tables; this will also take a few seconds, as more catalog tables are
restored from backup. Subsequent commands will be faster.
Run queries in the standby:
select * from mytable;
The result depends on the LSN that you picked when you created the
server. For example, a standby created at an LSN taken right after
"insert into mytable values (3)" will see only the rows 1, 2 and 3.

mgmt-console/js/app.js Normal file
View File

@@ -0,0 +1,463 @@
import React, { useState, useEffect } from 'react';
import ReactDOM from 'react-dom';
import Loader from "react-loader-spinner";
import { Router, Route, Link, IndexRoute, hashHistory, browserHistory } from 'react-router';
function ServerStatus(props) {
const datadir = props.server.datadir;
const status = props.server.status;
const port = props.server.port;
return (
<div>
<h2>{ datadir == 'primary' ? 'Primary' : datadir }</h2>
status: <div className='status'>{status}</div><br/>
to connect: <span className='shellcommand'>psql -h { window.location.hostname } -p { port } -U zenith postgres</span><br/>
</div>
);
}
function StandbyList(props) {
const bucketSummary = props.bucketSummary;
const standbys = props.standbys;
const maxwalpos = bucketSummary.maxwal ? walpos_to_int(bucketSummary.maxwal) : 0;
const [walposInput, setWalposInput] = useState({ src: 'text', value: '0/0'});
// find earliest base image
const minwalpos = bucketSummary.nonrelimages ? bucketSummary.nonrelimages.reduce((minpos, imgpos_str, index, array) => {
const imgpos = walpos_to_int(imgpos_str);
return (minpos == 0 || imgpos < minpos) ? imgpos : minpos;
}, 0) : 0;
const can_create_standby = minwalpos > 0 && maxwalpos > 0 && maxwalpos >= minwalpos;
var walpos_valid = true;
function create_standby() {
const formdata = new FormData();
formdata.append("walpos", walposStr);
props.startOperation('Creating new standby at ' + walposStr + '...',
fetch("/api/create_standby", { method: 'POST', body: formdata }));
}
function destroy_standby(datadir) {
const formdata = new FormData();
formdata.append("datadir", datadir);
props.startOperation('Destroying ' + datadir + '...',
fetch("/api/destroy_server", { method: 'POST', body: formdata }));
}
const handleSliderChange = (event) => {
setWalposInput({ src: 'slider', value: event.target.value });
}
const handleWalposChange = (event) => {
setWalposInput({ src: 'text', value: event.target.value });
}
var sliderValue;
var walposStr;
if (walposInput.src == 'text')
{
const walpos = walpos_to_int(walposInput.value);
if (walpos >= minwalpos && walpos <= maxwalpos)
walpos_valid = true;
else
walpos_valid = false;
sliderValue = Math.round((walpos - minwalpos) / (maxwalpos - minwalpos) * 100);
walposStr = walposInput.value;
}
else
{
const slider = walposInput.value;
const new_walpos = minwalpos + slider / 100 * (maxwalpos - minwalpos);
console.log('minwalpos: '+ minwalpos);
console.log('maxwalpos: '+ maxwalpos);
walposStr = int_to_walpos(Math.round(new_walpos));
walpos_valid = true;
console.log(walposStr);
}
var standbystatus = ''
if (standbys)
{
standbystatus =
<div>
{
standbys.length > 0 ?
standbys.map((server) =>
<>
<ServerStatus key={ 'status_' + server.datadir} server={server}/>
<button key={ 'destroy_' + server.datadir} onClick={e => destroy_standby(server.datadir)}>Destroy standby</button>
</>
) : "no standby servers"
}
</div>
}
return (
<div>
<h2>Standbys</h2>
<button onClick={create_standby} disabled={!can_create_standby || !walpos_valid}>Create new Standby</button> at LSN
<input type="text" id="walpos_input" value={ walposStr } onChange={handleWalposChange} disabled={!can_create_standby}/>
<input type="range" id="walpos_slider" min="0" max="100" steps="1" value={sliderValue} onChange={handleSliderChange} disabled={!can_create_standby}/>
<br/>
{ standbystatus }
</div>
);
}
function ServerList(props) {
const primary = props.serverStatus ? props.serverStatus.primary : null;
const standbys = props.serverStatus ? props.serverStatus.standbys : [];
const bucketSummary = props.bucketSummary;
var primarystatus = '';
function destroy_primary() {
const formdata = new FormData();
formdata.append("datadir", 'primary');
props.startOperation('Destroying primary...',
fetch("/api/destroy_server", { method: 'POST', body: formdata }));
}
function restore_primary() {
props.startOperation('Restoring primary...',
fetch("/api/restore_primary", { method: 'POST' }));
}
if (primary)
{
primarystatus =
<div>
<ServerStatus server={primary}/>
<button onClick={destroy_primary}>Destroy primary</button>
</div>
}
else
{
primarystatus =
<div>
no primary server<br/>
<button onClick={restore_primary}>Restore primary</button>
</div>
}
return (
<>
{ primarystatus }
<StandbyList standbys={standbys} startOperation={props.startOperation} bucketSummary={props.bucketSummary}/>
<p className="todo">
Should we list the WAL safekeeper nodes here? Or are they part of the Storage? Or not visible to users at all?
</p>
</>
);
}
function BucketSummary(props) {
const bucketSummary = props.bucketSummary;
const startOperation = props.startOperation;
function slicedice() {
startOperation('Slicing sequential WAL to per-relation WAL...',
fetch("/api/slicedice", { method: 'POST' }));
}
if (!bucketSummary.nonrelimages)
{
return <>loading...</>
}
return (
<div>
<div>Base images at following WAL positions:
<ul>
{bucketSummary.nonrelimages.map((img) => (
<li key={img}>{img}</li>
))}
</ul>
</div>
Sliced WAL is available up to { bucketSummary.maxwal }<br/>
Raw WAL is available up to { bucketSummary.maxseqwal }<br/>
<br/>
<button onClick={slicedice}>Slice & Dice WAL</button>
<p className="todo">
Currently, the slicing or "sharding" of the WAL needs to be triggered manually, by clicking the above button.
<br/>
TODO: make it a continuous process that runs in the WAL safekeepers, or in the Page Servers, or as a standalone service.
</p>
</div>
);
}
function ProgressIndicator()
{
return (
<div>
<Loader
type="Puff"
color="#00BFFF"
height={100}
width={100}
/>
</div>
)
}
function walpos_to_int(walpos)
{
const [hi, lo] = walpos.split('/');
// The high half is the upper 32 bits of the LSN (see int_to_walpos below),
// so scale it by 2^32 before adding the low half.
return parseInt(hi, 16) * 0x100000000 + parseInt(lo, 16);
}
function int_to_walpos(x)
{
console.log('converting ' + x);
return (Math.floor((x / 0x100000000)).toString(16) + '/' + (x % 0x100000000).toString(16)).toUpperCase();
}
function OperationStatus(props) {
const lastOperation = props.lastOperation;
const inProgress = props.inProgress;
const operationResult = props.operationResult;
if (lastOperation)
{
return (
<div><h2>Last operation:</h2>
<div>{lastOperation} { (!inProgress && lastOperation) ? 'done!' : '' }</div>
<div className='result'>
{inProgress ? <ProgressIndicator/> : <pre>{operationResult}</pre>}
</div>
</div>
);
}
else
return '';
}
function ActionButtons(props) {
const startOperation = props.startOperation;
const bucketSummary = props.bucketSummary;
function reset_demo() {
startOperation('resetting everything...',
fetch("/api/reset_demo", { method: 'POST' }));
}
function init_primary() {
startOperation('Initializing new primary...',
fetch("/api/init_primary", { method: 'POST' }));
}
function zenith_push() {
startOperation('Pushing new base image...',
fetch("/api/zenith_push", { method: 'POST' }));
}
return (
<div>
<p className="todo">
RESET DEMO deletes everything in the storage bucket, and stops and destroys all servers. This resets the whole demo environment to the initial state.
</p>
<button onClick={reset_demo}>RESET DEMO</button>
<p className="todo">
Init Primary runs initdb to create a new primary server. Click this after Resetting the demo.
</p>
<button onClick={init_primary}>Init primary</button>
<p className="todo">
Push Base Image stops the primary, copies the current state of the primary to the storage bucket as a new base backup, and restarts the primary.
<br/>
TODO: This should be handled by a continuous background process, probably running in the storage nodes. And without having to shut down the cluster, of course.
</p>
<button onClick={zenith_push}>Push base image</button>
</div>
);
}
function Sidenav(props)
{
const toPage = (page) => (event) => {
//event.preventDefault()
props.switchPage(page);
};
return (
<div>
<h3 className="sidenav-item">Menu</h3>
<a href="#servers" onClick={toPage('servers')} className="sidenav-item">Servers</a>
<a href="#storage" onClick={toPage('storage')} className="sidenav-item">Storage</a>
<a href="#snapshots" onClick={toPage('snapshots')} className="sidenav-item">Snapshots</a>
<a href="#demo" onClick={toPage('demo')} className="sidenav-item">Demo</a>
<a href="#import" onClick={toPage('import')} className="sidenav-item">Import / Export</a>
<a href="#jobs" onClick={toPage('jobs')} className="sidenav-item">Jobs</a>
</div>
);
}
function App()
{
const [page, setPage] = useState('servers');
const [serverStatus, setServerStatus] = useState({});
const [bucketSummary, setBucketSummary] = useState({});
const [lastOperation, setLastOperation] = useState('');
const [inProgress, setInProgress] = useState('');
const [operationResult, setOperationResult] = useState('');
useEffect(() => {
reloadStatus();
}, []);
function startOperation(operation, promise)
{
promise.then(result => result.text()).then(resultText => {
operationFinished(resultText);
});
setLastOperation(operation);
setInProgress(true);
setOperationResult('');
}
function operationFinished(result)
{
setInProgress(false);
setOperationResult(result);
reloadStatus();
}
function clearOperation()
{
setLastOperation('')
setInProgress('');
setOperationResult('');
console.log("cleared");
}
function reloadStatus()
{
fetch('/api/server_status').then(res => res.json()).then(data => {
setServerStatus(data);
});
fetch('/api/bucket_summary').then(res => res.json()).then(data => {
setBucketSummary(data);
});
}
const content = () => {
console.log(page);
if (page === 'servers') {
return (
<>
<h1>Server status</h1>
<ServerList startOperation={ startOperation }
serverStatus={ serverStatus }
bucketSummary={ bucketSummary }/>
</>
);
} else if (page === 'storage') {
return (
<>
<h1>Storage bucket status</h1>
<BucketSummary startOperation={ startOperation }
bucketSummary={ bucketSummary }/>
</>
);
} else if (page === 'snapshots') {
return (
<>
<h1>Snapshots</h1>
<p className="todo">
In Zenith, snapshots are just specific points (LSNs) in the WAL history, with a label. A snapshot prevents garbage collecting old data that's still needed to reconstruct the database at that LSN.
</p>
<p className="todo">
TODO:
<ul>
<li>List existing snapshots</li>
<li>Create new snapshot manually, from current state or from a given LSN</li>
<li>Drill into the WAL stream to see what has happened. Provide tools for e.g. finding the point where a table was dropped</li>
<li>Create snapshots automatically based on events in the WAL, for example if you call pg_create_restore_point() in the primary</li>
<li>Launch new reader instance at a snapshot</li>
<li>Export snapshot</li>
<li>Rollback cluster to a snapshot</li>
</ul>
</p>
</>
);
} else if (page === 'demo') {
return (
<>
<h1>Misc actions</h1>
<ActionButtons startOperation={ startOperation }
bucketSummary={ bucketSummary }/>
</>
);
} else if (page === 'import') {
return (
<>
<h1>Import & Export tools</h1>
<p className="TODO">TODO:
<ul>
<li>Initialize database from existing backup (pg_basebackup, WAL-G, pgbackrest)</li>
<li>Initialize from a pg_dump or other SQL script</li>
<li>Launch batch job to import data files from S3</li>
<li>Launch batch job to export database with pg_dump to S3</li>
</ul>
These jobs can be run against reader processing nodes. We can even
spawn a new reader node dedicated to a job, and destroy it when the job is done.
</p>
</>
);
} else if (page === 'jobs') {
return (
<>
<h1>Batch jobs</h1>
<p className="TODO">TODO:
<ul>
<li>List running jobs launched from Import & Export tools</li>
<li>List other batch jobs launched by the user</li>
<li>Launch new batch jobs</li>
</ul>
</p>
</>
);
}
}
function switchPage(page)
{
console.log("topage " + page);
setPage(page)
clearOperation();
};
return (
<div className="row">
<div className="sidenav">
<Sidenav switchPage={switchPage} className="column"/>
</div>
<div className="column">
<div>
{ content() }
</div>
<OperationStatus lastOperation={ lastOperation }
inProgress = { inProgress }
operationResult = { operationResult }/>
</div>
</div>
);
}
ReactDOM.render(<App/>, document.getElementById('reactApp'));

mgmt-console/js/waldump.js Normal file
View File

@@ -0,0 +1,105 @@
import React, { useState, useEffect } from 'react';
import ReactDOM from 'react-dom';
import Loader from "react-loader-spinner";
function walpos_to_int(walpos)
{
const [hi, lo] = walpos.split('/');
// The high half is the upper 32 bits of the LSN, so scale it by 2^32
// before adding the low half.
return parseInt(hi, 16) * 0x100000000 + parseInt(lo, 16);
}
const palette = [
"#003f5c",
"#2f4b7c",
"#665191",
"#a05195",
"#d45087",
"#f95d6a",
"#ff7c43",
"#ffa600"];
function WalRecord(props)
{
const firstwalpos = props.firstwalpos;
const endwalpos = props.endwalpos;
const record = props.record;
const index = props.index;
const xidmap = props.xidmap;
const startpos = walpos_to_int(record.start)
const endpos = walpos_to_int(record.end)
const scale = 1000 / (16*1024*1024)
const startx = (startpos - firstwalpos) * scale;
const endx = (endpos - firstwalpos) * scale;
const xidindex = xidmap[record.xid];
const color = palette[index % palette.length];
const y = 5 + (xidindex) * 20 + (index % 2) * 2;
return (
<line x1={ startx } y1={y} x2={endx} y2={y} stroke={ color } strokeWidth="5">
<title>
start: { record.start } end: { record.end }
</title>
</line>
)
}
function WalFile(props)
{
const walContent = props.walContent;
const firstwalpos = props.firstwalpos;
const xidmap = props.xidmap;
return <svg width="1000" height="200">
{
walContent.records ?
walContent.records.map((record, index) =>
<WalRecord key={record.start} firstwalpos={firstwalpos} record={record} index={index} xidmap={xidmap}/>
) : "no records"
}
</svg>
}
function WalDumpApp()
{
const [walContent, setWalContent] = useState({});
const filename = '00000001000000000000000C';
useEffect(() => {
fetch('/fetch_wal?filename='+filename).then(res => res.json()).then(data => {
setWalContent(data);
});
}, []);
var firstwalpos = 0;
var endwalpos = 0;
var numxids = 0;
var xidmap = {};
if (walContent.records && walContent.records.length > 0)
{
firstwalpos = walpos_to_int(walContent.records[0].start);
endwalpos = firstwalpos + 16*1024*1024;
walContent.records.forEach(rec => {
if (!xidmap[rec.xid])
{
xidmap[rec.xid] = ++numxids;
}
});
}
return (
<>
<h2>{filename}</h2>
<WalFile walContent={walContent} firstwalpos={firstwalpos} endwalpos={endwalpos} xidmap={xidmap}/>
</>
);
}
console.log('hey there');
ReactDOM.render(<WalDumpApp/>, document.getElementById('waldump'));

View File

@@ -0,0 +1,9 @@
#!/bin/bash
#
# NOTE: You must set the following environment variables before running this:
# BASIC_AUTH_PASSWORD - basic http auth password
# S3_ACCESSKEY
# S3_SECRET
S3_ENDPOINT=https://storage.googleapis.com S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql-install/bin:$PATH flask run --host=0.0.0.0

mgmt-console/launch-local.sh Executable file
View File

@@ -0,0 +1,8 @@
#!/bin/bash
#
# NOTE: You should set the BASIC_AUTH_PASSWORD environment variable before calling
# Launch S3 server
(cd ms3 && python3 -m ms3.app --listen-address=localhost) &
FLASK_ENV=development S3_REGION=auto S3_ENDPOINT=http://localhost:9009 S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql.fsmfork/bin:$PATH flask run --host=0.0.0.0

mgmt-console/package-lock.json generated Normal file

File diff suppressed because it is too large

mgmt-console/package.json Normal file
View File

@@ -0,0 +1,27 @@
{
"name": "starter-kit",
"version": "1.1.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"build": "webpack",
"start": "python app.py"
},
"author": "",
"license": "ISC",
"dependencies": {
"react": "^17.0.1",
"react-dom": "^17.0.1",
"react-loader-spinner": "^4.0.0",
"react-router": "^5.2.0"
},
"devDependencies": {
"@babel/core": "^7.13.1",
"@babel/preset-env": "^7.13.5",
"@babel/preset-react": "^7.12.13",
"babel-loader": "^8.2.2",
"webpack": "^5.24.2",
"webpack-cli": "^4.5.0"
}
}

View File

@@ -0,0 +1,58 @@
<head>
<style>
.status {
font-family: monospace;
background-color: lightgrey;
}
.shellcommand {
font-family: monospace;
background-color: lightgrey;
}
.result {
font-family: monospace;
background-color: lightgrey;
padding: 10px;
}
.todo {font-style: italic;}
h1 {color: blue;}
.column {
float: left;
width: 50%;
padding: 10px;
}
/* Clear floats after the columns */
.row:after {
content: "";
display: table;
clear: both;
}
.sidenav {
float: left;
width: 150px;
padding: 10px;
background-color: pink;
}
.sidenav-item {
padding:10px 0px;
border:none;
display:block;
}
</style>
</head>
<body>
<div id="reactApp"></div>
<!-- Attach React components -->
<script type="text/javascript" src="{{ url_for('static', filename='app_bundle.js') }}"></script>
</body>

View File

@@ -0,0 +1,46 @@
<head>
<style>
.status {
font-family: monospace;
background-color: lightgrey;
}
.shellcommand {
font-family: monospace;
background-color: lightgrey;
}
.result {
font-family: monospace;
background-color: lightgrey;
padding: 10px;
}
h1 {color: blue;}
p {color: red;}
* {
box-sizing: border-box;
}
.row {
display: flex;
}
/* Create two columns that sit next to each other */
.column1 {
flex: 30%;
padding: 10px;
}
.column2 {
flex: 70%;
padding: 10px;
}
</style>
</head>
<body>
<div id="waldump"></div>
<!-- Attach React components -->
<script type="text/javascript" src="{{ url_for('static', filename='waldump_bundle.js') }}"></script>
</body>

mgmt-console/waldump.py Normal file
View File

@@ -0,0 +1,25 @@
#
# This file contains work-in-progress code to visualize WAL contents.
#
# This is the API endpoint that calls a 'zenith_wal_to_json' executable,
# which is a hacked version of pg_waldump that prints information about the
# records in JSON format. The code in js/waldump.js displays it.
#
import os
import re
from subprocess import PIPE, STDOUT, run, Popen
def fetch_wal(request, s3bucket):
filename = request.args.get('filename')
if not re.match("^[A-Za-z0-9_]+$", filename):
raise Exception('invalid WAL filename: ' + filename)
# FIXME: this downloads the WAL file to current dir. Use a temp dir? Pipe?
s3bucket.download_file('walarchive/' + filename, filename)
result = run("zenith_wal_to_json " + filename, stdout=PIPE, universal_newlines=True, shell=True)
os.unlink(filename);
return result.stdout
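# The JSON format produced by zenith_wal_to_json is not documented here.
# Judging from js/waldump.js, which reads walContent.records and each
# record's "start", "end" and "xid" fields, the expected shape is roughly
# (illustrative sketch only, not the tool's actual schema):
#
#   {"records": [{"start": "0/C000028", "end": "0/C000060", "xid": 1234}, ...]}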

View File

@@ -0,0 +1,27 @@
var webpack = require('webpack');
module.exports = {
entry: {
app: './js/app.js',
waldump: './js/waldump.js'
},
output: {
filename: "[name]_bundle.js",
path: __dirname + '/static'
},
module: {
rules: [
{
test: /\.js?$/,
exclude: /node_modules/,
use: {
loader: 'babel-loader',
options: {
presets: ['@babel/preset-env']
}
}
}
]
},
plugins: [
]
};

mgmt-console/zenith.py Normal file
View File

@@ -0,0 +1,179 @@
#zenith.py
import click
import testgres
import os
from testgres import PostgresNode
from tabulate import tabulate
zenith_base_dir = '/home/anastasia/zenith/basedir'
@click.group()
def main():
"""Run the Zenith CLI."""
@click.group()
def pg():
"""Db operations
NOTE: 'database' here means one postgresql node
"""
@click.command(name='create')
@click.option('--name', required=True)
@click.option('-s', '--storage-name', help='Name of the storage',
default='zenith-local',
show_default=True)
@click.option('--snapshot', help='init from the snapshot. Snap is a name or URL')
@click.option('--no-start', is_flag=True, help='Do not start created node',
default=False, show_default=True)
def pg_create(name, storage_name, snapshot, no_start):
"""Initialize the database"""
node = PostgresNode()
base_dir = os.path.join(zenith_base_dir, 'pg', name)
node = testgres.get_new_node(name, base_dir=base_dir)
# TODO skip init, instead of that link node with storage or upload it from snapshot
node.init()
if(no_start==False):
node.start()
@click.command(name='start')
@click.option('--name', required=True)
@click.option('--snapshot')
@click.option('--read-only', is_flag=True, help='Start read-only node', show_default=True)
def pg_start(name, snapshot, read_only):
"""Start the database"""
    base_dir = os.path.join(zenith_base_dir, 'pg', name)
    node = testgres.get_new_node(name, base_dir=base_dir)
# TODO pass snapshot as a parameter
node.start()
@click.command(name='stop')
@click.option('--name', required=True)
def pg_stop(name):
"""Stop the database"""
    base_dir = os.path.join(zenith_base_dir, 'pg', name)
    node = testgres.get_new_node(name, base_dir=base_dir)
node.stop()
@click.command(name='destroy')
@click.option('--name', required=True)
def pg_destroy(name):
"""Drop the database"""
    base_dir = os.path.join(zenith_base_dir, 'pg', name)
    node = testgres.get_new_node(name, base_dir=base_dir)
node.cleanup()
@click.command(name='list')
def pg_list():
"""List existing databases"""
dirs = os.listdir(os.path.join(zenith_base_dir, 'pg'))
path={}
status={}
data=[]
for dirname in dirs:
path[dirname] = os.path.join(zenith_base_dir, 'pg', dirname)
fname = os.path.join( path[dirname], 'data/postmaster.pid')
        try:
            with open(fname, 'r') as f:
                status[dirname] = f.readlines()[-1]
        except OSError:
            status[dirname] = 'inactive'
        data.append([dirname, status[dirname], path[dirname]])
print(tabulate(data, headers=['Name', 'Status', 'Path']))
pg.add_command(pg_create)
pg.add_command(pg_destroy)
pg.add_command(pg_start)
pg.add_command(pg_stop)
pg.add_command(pg_list)
@click.group()
def storage():
"""Storage operations"""
@click.command(name='attach')
@click.option('--name')
def storage_attach(name):
"""Attach the storage"""
@click.command(name='detach')
@click.option('--name')
@click.option('--force', is_flag=True, show_default=True)
def storage_detach(name, force):
"""Detach the storage"""
@click.command(name='list')
def storage_list():
"""List existing storages"""
storage.add_command(storage_attach)
storage.add_command(storage_detach)
storage.add_command(storage_list)
@click.group()
def snapshot():
"""Snapshot operations"""
@click.command(name='create')
def snapshot_create():
"""Create new snapshot"""
@click.command(name='destroy')
def snapshot_destroy():
"""Destroy the snapshot"""
@click.command(name='pull')
def snapshot_pull():
"""Pull remote snapshot"""
@click.command(name='push')
def snapshot_push():
"""Push snapshot to remote"""
@click.command(name='import')
def snapshot_import():
"""Convert given format to zenith snapshot"""
@click.command(name='export')
def snapshot_export():
"""Convert zenith snapshot to PostgreSQL compatible format"""
snapshot.add_command(snapshot_create)
snapshot.add_command(snapshot_destroy)
snapshot.add_command(snapshot_pull)
snapshot.add_command(snapshot_push)
snapshot.add_command(snapshot_import)
snapshot.add_command(snapshot_export)
@click.group()
def wal():
"""WAL operations"""
@click.command(name='list')
def wal_list():
    """List WAL files"""
wal.add_command(wal_list)
@click.command()
def console():
"""Open web console"""
main.add_command(pg)
main.add_command(storage)
main.add_command(snapshot)
main.add_command(wal)
main.add_command(console)
if __name__ == '__main__':
main()

pageserver/Cargo.lock generated Normal file

File diff suppressed because it is too large.


@@ -8,6 +8,7 @@ edition = "2018"
[dependencies]
chrono = "0.4.19"
crossbeam-channel = "0.5.0"
rand = "0.8.3"
regex = "1.4.5"
bytes = "1.0.1"
@@ -24,24 +25,18 @@ clap = "2.33.0"
termion = "1.5.6"
tui = "0.14.0"
daemonize = "0.4.1"
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
tokio = { version = "1.3.0", features = ["full"] }
tokio-stream = { version = "0.1.4" }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
rocksdb = "0.16.0"
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
anyhow = "1.0"
crc32c = "0.6.0"
walkdir = "2"
thiserror = "1.0"
hex = "0.4.3"
tar = "0.4.33"
parse_duration = "2.1.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
fs_extra = "1.2.0"
postgres_ffi = { path = "../postgres_ffi" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }

pageserver/build.rs Normal file

@@ -0,0 +1,41 @@
//
// Triggers postgres build if there is no postgres binary present at
// 'REPO_ROOT/tmp_install/bin/postgres'.
//
// There are a lot of disadvantages to this kind of automation; the main
// advantage is the ability to build everything and run integration tests
// in a bare repo just by running 'cargo test'.
//
// We could detect whether this is a debug or release build and run the
// corresponding postgres build, but that seems like overkill for now.
//
// Problem #1 -- the language server in my editor likes to call 'cargo build'
// by itself, so if I delete the tmp_install directory it magically reappears
// after a while. During that compilation, 'cargo build' may also complain
// about "waiting for file lock on build directory".
//
// Problem #2 -- 'cargo build' runs this script only if something in the
// crate has changed.
//
// Generally speaking, postgres is not a build dependency of the pageserver,
// only of the integration tests, so let's not mix the two. I'll leave this
// file in place for a while in case anybody is tempted to do the same thing.
//
// use std::path::Path;
// use std::process::{Command};
fn main() {
// // build postgres if it has not been built yet
// if !Path::new("../tmp_install/bin/postgres").exists() {
// let make_res = Command::new("make")
// .arg("postgres")
// .env_clear()
// .status()
// .expect("failed to execute 'make postgres'");
// if !make_res.success() {
// panic!("postgres build failed");
// }
// }
}


@@ -1,277 +1,20 @@
use crate::ZTimelineId;
use log::*;
use regex::Regex;
use std::fmt;
use std::io::Write;
use std::sync::Arc;
use std::time::SystemTime;
use tar::{Builder, Header};
use tar::Builder;
use walkdir::WalkDir;
use bytes::{BufMut, BytesMut};
use crate::repository::{BufferTag, RelTag, Timeline};
use postgres_ffi::relfile_utils::*;
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
use crate::ZTimelineId;
fn new_tar_header(path: &str, size: u64) -> anyhow::Result<Header> {
let mut header = Header::new_gnu();
header.set_size(size);
header.set_path(path)?;
header.set_mode(0b110000000);
header.set_mtime(
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs(),
);
header.set_cksum();
Ok(header)
}
//
// Generate SLRU segment files from the repository
//
fn add_slru_segments(
ar: &mut Builder<&mut dyn Write>,
timeline: &Arc<dyn Timeline>,
path: &str,
forknum: u8,
lsn: Lsn,
) -> anyhow::Result<()> {
let rel = RelTag {
spcnode: 0,
dbnode: 0,
relnode: 0,
forknum,
};
let (first, last) = timeline.get_range(rel, lsn)?;
const SEG_SIZE: usize =
pg_constants::BLCKSZ as usize * pg_constants::SLRU_PAGES_PER_SEGMENT as usize;
let mut seg_buf = [0u8; SEG_SIZE];
let mut curr_segno: Option<u32> = None;
for page in first..last {
let tag = BufferTag { rel, blknum: page };
let img = timeline.get_page_at_lsn(tag, lsn)?;
// Zero length image indicates truncated segment: just skip it
if img.len() != 0 {
assert!(img.len() == pg_constants::BLCKSZ as usize);
let segno = page / pg_constants::SLRU_PAGES_PER_SEGMENT;
if curr_segno.is_some() && curr_segno.unwrap() != segno {
let segname = format!("{}/{:>04X}", path, curr_segno.unwrap());
let header = new_tar_header(&segname, SEG_SIZE as u64)?;
ar.append(&header, &seg_buf[..])?;
seg_buf = [0u8; SEG_SIZE];
}
curr_segno = Some(segno);
let offs_start = (page % pg_constants::SLRU_PAGES_PER_SEGMENT) as usize
* pg_constants::BLCKSZ as usize;
let offs_end = offs_start + pg_constants::BLCKSZ as usize;
seg_buf[offs_start..offs_end].copy_from_slice(&img);
}
}
if curr_segno.is_some() {
let segname = format!("{}/{:>04X}", path, curr_segno.unwrap());
let header = new_tar_header(&segname, SEG_SIZE as u64)?;
ar.append(&header, &seg_buf[..])?;
}
Ok(())
}
//
// Extract pg_filenode.map files from repository
//
fn add_relmap_files(
ar: &mut Builder<&mut dyn Write>,
timeline: &Arc<dyn Timeline>,
lsn: Lsn,
snappath: &str,
) -> anyhow::Result<()> {
for db in timeline.get_databases(lsn)?.iter() {
let tag = BufferTag {
rel: *db,
blknum: 0,
};
let img = timeline.get_page_at_lsn(tag, lsn)?;
let path = if db.spcnode == pg_constants::GLOBALTABLESPACE_OID {
String::from("global/pg_filenode.map")
} else {
// User defined tablespaces are not supported
assert!(db.spcnode == pg_constants::DEFAULTTABLESPACE_OID);
let src_path = format!("{}/base/1/PG_VERSION", snappath);
let dst_path = format!("base/{}/PG_VERSION", db.dbnode);
ar.append_path_with_name(&src_path, &dst_path)?;
format!("base/{}/pg_filenode.map", db.dbnode)
};
assert!(img.len() == 512);
let header = new_tar_header(&path, img.len() as u64)?;
ar.append(&header, &img[..])?;
}
Ok(())
}
//
// Extract twophase state files
//
fn add_twophase_files(
ar: &mut Builder<&mut dyn Write>,
timeline: &Arc<dyn Timeline>,
lsn: Lsn,
) -> anyhow::Result<()> {
for xid in timeline.get_twophase(lsn)?.iter() {
let tag = BufferTag {
rel: RelTag {
spcnode: 0,
dbnode: 0,
relnode: 0,
forknum: pg_constants::PG_TWOPHASE_FORKNUM,
},
blknum: *xid,
};
let img = timeline.get_page_at_lsn(tag, lsn)?;
let mut buf = BytesMut::new();
buf.extend_from_slice(&img[..]);
let crc = crc32c::crc32c(&img[..]);
buf.put_u32_le(crc);
let path = format!("pg_twophase/{:>08X}", xid);
let header = new_tar_header(&path, buf.len() as u64)?;
ar.append(&header, &buf[..])?;
}
Ok(())
}
//
// Add generated pg_control file
//
fn add_pgcontrol_file(
ar: &mut Builder<&mut dyn Write>,
timeline: &Arc<dyn Timeline>,
lsn: Lsn,
) -> anyhow::Result<()> {
if let Some(checkpoint_bytes) =
timeline.get_page_image(BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM), Lsn(0))?
{
if let Some(pg_control_bytes) = timeline.get_page_image(
BufferTag::fork(pg_constants::PG_CONTROLFILE_FORKNUM),
Lsn(0),
)? {
let mut pg_control = postgres_ffi::decode_pg_control(pg_control_bytes)?;
let mut checkpoint = postgres_ffi::decode_checkpoint(checkpoint_bytes)?;
checkpoint.redo = lsn.0;
checkpoint.nextXid.value += 1;
// TODO: When the primary restarts there are no active transactions, and oldestXid is
// equal to nextXid if there are no prepared transactions.
// Let's ignore prepared transactions for now...
checkpoint.oldestXid = checkpoint.nextXid.value as u32;
pg_control.checkPointCopy = checkpoint;
let pg_control_bytes = postgres_ffi::encode_pg_control(pg_control);
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
ar.append(&header, &pg_control_bytes[..])?;
}
}
Ok(())
}
///
/// Generate tarball with non-relational files from repository
///
pub fn send_tarball_at_lsn(
write: &mut dyn Write,
timelineid: ZTimelineId,
timeline: &Arc<dyn Timeline>,
lsn: Lsn,
snapshot_lsn: Lsn,
) -> anyhow::Result<()> {
let mut ar = Builder::new(write);
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshot_lsn.0);
debug!("sending tarball of snapshot in {}", snappath);
for entry in WalkDir::new(&snappath) {
let entry = entry?;
let fullpath = entry.path();
let relpath = entry.path().strip_prefix(&snappath).unwrap();
if relpath.to_str().unwrap() == "" {
continue;
}
if entry.file_type().is_dir() {
trace!(
"sending dir {} as {}",
fullpath.display(),
relpath.display()
);
ar.append_dir(relpath, fullpath)?;
} else if entry.file_type().is_symlink() {
error!("ignoring symlink in snapshot dir");
} else if entry.file_type().is_file() {
// Shared catalogs are exempt
if relpath.starts_with("global/") {
trace!("sending shared catalog {}", relpath.display());
ar.append_path_with_name(fullpath, relpath)?;
} else if !is_rel_file_path(relpath.to_str().unwrap()) {
if entry.file_name() != "pg_filenode.map"
&& entry.file_name() != "pg_control"
&& !relpath.starts_with("pg_xact/")
&& !relpath.starts_with("pg_multixact/")
{
trace!("sending {}", relpath.display());
ar.append_path_with_name(fullpath, relpath)?;
}
} else {
trace!("not sending {}", relpath.display());
}
} else {
error!("unknown file type: {}", fullpath.display());
}
}
add_slru_segments(
&mut ar,
timeline,
"pg_xact",
pg_constants::PG_XACT_FORKNUM,
lsn,
)?;
add_slru_segments(
&mut ar,
timeline,
"pg_multixact/members",
pg_constants::PG_MXACT_MEMBERS_FORKNUM,
lsn,
)?;
add_slru_segments(
&mut ar,
timeline,
"pg_multixact/offsets",
pg_constants::PG_MXACT_OFFSETS_FORKNUM,
lsn,
)?;
add_relmap_files(&mut ar, timeline, lsn, &snappath)?;
add_twophase_files(&mut ar, timeline, lsn)?;
add_pgcontrol_file(&mut ar, timeline, lsn)?;
ar.finish()?;
debug!("all tarred up!");
Ok(())
}
///
/// Send a tarball containing a snapshot of all non-relation files in the
/// PostgreSQL data directory, at given LSN
///
/// There must be a snapshot at the given LSN in the snapshots directory; we cannot
/// reconstruct the state at an arbitrary LSN at the moment.
///
pub fn send_snapshot_tarball(
write: &mut dyn Write,
timelineid: ZTimelineId,
snapshotlsn: Lsn,
snapshotlsn: u64,
) -> Result<(), std::io::Error> {
let mut ar = Builder::new(write);
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn.0);
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn);
let walpath = format!("timelines/{}/wal", timelineid);
debug!("sending tarball of snapshot in {}", snappath);
@@ -305,14 +48,7 @@ pub fn send_snapshot_tarball(
ar.append_path_with_name(fullpath, relpath)?;
} else {
trace!("not sending {}", relpath.display());
// FIXME: For now, also send all the relation files.
// This really shouldn't be necessary, and kind of
// defeats the point of having a page server in the
// first place. But it is useful at least when
// debugging with the DEBUG_COMPARE_LOCAL option (see
// vendor/postgres/src/backend/storage/smgr/pagestore_smgr.c)
// FIXME: send all files for now
ar.append_path_with_name(fullpath, relpath)?;
}
} else {
@@ -320,11 +56,7 @@ pub fn send_snapshot_tarball(
}
}
// FIXME: Also send all the WAL. The compute node would only need
// the WAL that applies to non-relation files, because the page
// server handles all the relation files. But we don't have a
// mechanism for separating relation and non-relation WAL at the
// moment.
// FIXME: also send all the WAL
for entry in std::fs::read_dir(&walpath)? {
let entry = entry?;
let fullpath = &entry.path();
@@ -334,7 +66,7 @@ pub fn send_snapshot_tarball(
continue;
}
let archive_fname = relpath.to_str().unwrap();
let archive_fname = relpath.to_str().unwrap().clone();
let archive_fname = archive_fname
.strip_suffix(".partial")
.unwrap_or(&archive_fname);
@@ -347,10 +79,78 @@ pub fn send_snapshot_tarball(
Ok(())
}
///
/// Parse a path, relative to the root of PostgreSQL data directory, as
/// a PostgreSQL relation data file.
///
// formats:
// <oid>
// <oid>_<fork name>
// <oid>.<segment number>
// <oid>_<fork name>.<segment number>
#[derive(Debug)]
struct FilePathError {
msg: String,
}
impl FilePathError {
fn new(msg: &str) -> FilePathError {
FilePathError {
msg: msg.to_string(),
}
}
}
impl From<core::num::ParseIntError> for FilePathError {
fn from(e: core::num::ParseIntError) -> Self {
return FilePathError {
msg: format!("invalid filename: {}", e),
};
}
}
impl fmt::Display for FilePathError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "invalid filename")
}
}
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
match forkname {
// "main" is not in filenames, it's implicit if the fork name is not present
None => Ok(0),
Some("fsm") => Ok(1),
Some("vm") => Ok(2),
Some("init") => Ok(3),
Some(_) => Err(FilePathError::new("invalid forkname")),
}
}
fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
let caps = re
.captures(fname)
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
let relnode_str = caps.name("relnode").unwrap().as_str();
let relnode = u32::from_str_radix(relnode_str, 10)?;
let forkname_match = caps.name("forkname");
let forkname = if forkname_match.is_none() {
None
} else {
Some(forkname_match.unwrap().as_str())
};
let forknum = forkname_to_forknum(forkname)?;
let segno_match = caps.name("segno");
let segno = if segno_match.is_none() {
0
} else {
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
};
return Ok((relnode, forknum, segno));
}
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
/*
* Relation data files can be in one of the following directories:
@@ -370,30 +170,33 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
* <oid>.<segment number>
*/
if let Some(fname) = path.strip_prefix("global/") {
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
Ok(())
return Ok(());
} else if let Some(dbpath) = path.strip_prefix("base/") {
let mut s = dbpath.split('/');
let dbnode_str = s.next().ok_or(FilePathError::InvalidFileName)?;
let _dbnode = dbnode_str.parse::<u32>()?;
let fname = s.next().ok_or(FilePathError::InvalidFileName)?;
let mut s = dbpath.split("/");
let dbnode_str = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
let _dbnode = u32::from_str_radix(dbnode_str, 10)?;
let fname = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
if s.next().is_some() {
return Err(FilePathError::InvalidFileName);
return Err(FilePathError::new("invalid relation data file name"));
};
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
Ok(())
return Ok(());
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
// TODO
error!("tablespaces not implemented yet");
Err(FilePathError::InvalidFileName)
return Err(FilePathError::new("tablespaces not supported"));
} else {
Err(FilePathError::InvalidFileName)
return Err(FilePathError::new("invalid relation data file name"));
}
}
fn is_rel_file_path(path: &str) -> bool {
parse_rel_file_path(path).is_ok()
return parse_rel_file_path(path).is_ok();
}
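To make the formats above concrete, here is a hedged, hypothetical test sketch; it assumes parse_filename and is_rel_file_path compile exactly as shown above:

#[cfg(test)]
mod rel_file_path_examples {
    use super::*;

    #[test]
    fn recognizes_relation_data_files() {
        // <oid> and <oid>_<fork name>.<segment number> under base/<dbnode>/
        assert!(is_rel_file_path("base/16384/16385"));
        assert!(is_rel_file_path("base/16384/16385_fsm.1"));
        // shared catalogs live directly under global/
        assert!(is_rel_file_path("global/1262_vm"));
        // non-relation files and unknown directories are rejected
        assert!(!is_rel_file_path("base/16384/pg_filenode.map"));
        assert!(!is_rel_file_path("pg_xact/0000"));

        // parse_filename splits a bare file name into (relnode, forknum, segno);
        // the "main" fork is implicit, fsm = 1, vm = 2, init = 3
        assert_eq!(parse_filename("16385_fsm.2").unwrap(), (16385, 1, 2));
    }
}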


@@ -3,29 +3,31 @@
//
use log::*;
use parse_duration::parse;
use std::fs;
use std::fs::{File, OpenOptions};
use std::io;
use std::path::PathBuf;
use std::process::exit;
use std::thread;
use std::time::Duration;
use std::{env, path::PathBuf};
use std::{
fs::{File, OpenOptions},
net::TcpListener,
};
use anyhow::{Context, Result};
use clap::{App, Arg};
use daemonize::Daemonize;
use slog::{Drain, FnValue};
use slog::Drain;
use pageserver::{branches, page_cache, page_service, tui, PageServerConf};
use pageserver::page_service;
use pageserver::tui;
//use pageserver::walreceiver;
use pageserver::PageServerConf;
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
const DEFAULT_GC_PERIOD_SEC: u64 = 10;
//const DEFAULT_GC_HORIZON: u64 = 1024 * 1024 * 1024;
//const DEFAULT_GC_PERIOD_SEC: u64 = 600;
fn zenith_repo_dir() -> String {
// Find repository path
match std::env::var_os("ZENITH_REPO_DIR") {
Some(val) => String::from(val.to_str().unwrap()),
None => ".zenith".into(),
}
}
fn main() -> Result<()> {
let arg_matches = App::new("Zenith page server")
@@ -51,64 +53,12 @@ fn main() -> Result<()> {
.takes_value(false)
.help("Run in the background"),
)
.arg(
Arg::with_name("init")
.long("init")
.takes_value(false)
.help("Initialize pageserver repo"),
)
.arg(
Arg::with_name("gc_horizon")
.long("gc_horizon")
.takes_value(true)
.help("Distance from current LSN to perform all wal records cleanup"),
)
.arg(
Arg::with_name("gc_period")
.long("gc_period")
.takes_value(true)
.help("Interval between garbage collector iterations"),
)
.arg(
Arg::with_name("workdir")
.short("D")
.long("workdir")
.takes_value(true)
.help("Working directory for the pageserver"),
)
.get_matches();
let workdir = if let Some(workdir_arg) = arg_matches.value_of("workdir") {
PathBuf::from(workdir_arg)
} else if let Some(workdir_arg) = std::env::var_os("ZENITH_REPO_DIR") {
PathBuf::from(workdir_arg.to_str().unwrap())
} else {
PathBuf::from(".zenith")
};
let pg_distrib_dir: PathBuf = {
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
postgres_bin.into()
} else {
let cwd = env::current_dir()?;
cwd.join("tmp_install")
}
};
if !pg_distrib_dir.join("bin/postgres").exists() {
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
}
let mut conf = PageServerConf {
daemonize: false,
interactive: false,
gc_horizon: DEFAULT_GC_HORIZON,
gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
listen_addr: "127.0.0.1:64000".parse().unwrap(),
// we will change the current working directory to the repository below,
// so always set 'workdir' to '.'
workdir: PathBuf::from("."),
pg_distrib_dir,
listen_addr: "127.0.0.1:5430".parse().unwrap(),
};
if arg_matches.is_present("daemonize") {
@@ -128,76 +78,55 @@ fn main() -> Result<()> {
conf.listen_addr = addr.parse()?;
}
if let Some(horizon) = arg_matches.value_of("gc_horizon") {
conf.gc_horizon = horizon.parse()?;
}
if let Some(period) = arg_matches.value_of("gc_period") {
conf.gc_period = parse(period)?;
}
// The configuration is all set up now. Turn it into a 'static
// that can be freely stored in structs and passed across threads
// as a ref.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
// Create repo and exit if init was requested
if arg_matches.is_present("init") {
branches::init_repo(conf, &workdir)?;
return Ok(());
}
// Set CWD to workdir for non-daemon modes
env::set_current_dir(&workdir)?;
start_pageserver(conf)
start_pageserver(&conf)
}
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
let log_filename = "pageserver.log";
// Don't open the same file for output multiple times;
// the different fds could overwrite each other's output.
let log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", &log_filename))?;
fn start_pageserver(conf: &PageServerConf) -> Result<()> {
// Initialize logger
let logger_file = log_file.try_clone().unwrap();
let _scope_guard = init_logging(&conf, logger_file)?;
let _scope_guard = init_logging(&conf)?;
let _log_guard = slog_stdlog::init()?;
// Note: this `info!(...)` macro comes from `log` crate
info!("standard logging redirected to slog");
let tui_thread = if conf.interactive {
let tui_thread: Option<thread::JoinHandle<()>>;
if conf.interactive {
// Initialize the UI
Some(
tui_thread = Some(
thread::Builder::new()
.name("UI thread".into())
.spawn(|| {
let _ = tui::ui_main();
})
.unwrap(),
)
);
//threads.push(tui_thread);
} else {
None
};
// TODO: Check that it looks like a valid repository before going further
tui_thread = None;
}
if conf.daemonize {
info!("daemonizing...");
let repodir = PathBuf::from(zenith_repo_dir());
// There shouldn't be any logging to stdin/stdout. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file.try_clone().unwrap();
let stderr = log_file;
let log_filename = repodir.join("pageserver.log");
let stdout = OpenOptions::new()
.create(true)
.append(true)
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", &log_filename))?;
let stderr = OpenOptions::new()
.create(true)
.append(true)
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", &log_filename))?;
let daemonize = Daemonize::new()
.pid_file("pageserver.pid")
.working_directory(".")
.pid_file(repodir.clone().join("pageserver.pid"))
.working_directory(repodir)
.stdout(stdout)
.stderr(stderr);
@@ -205,63 +134,73 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
Ok(_) => info!("Success, daemonized"),
Err(e) => error!("Error, {}", e),
}
}
// Check that we can bind to address before further initialization
info!("Starting pageserver on {}", conf.listen_addr);
let pageserver_listener = TcpListener::bind(conf.listen_addr)?;
// Initialize page cache, this will spawn walredo_thread
page_cache::init(conf);
// Spawn a thread to listen for connections. It will spawn further threads
// for each connection.
let page_service_thread = thread::Builder::new()
.name("Page Service thread".into())
.spawn(move || page_service::thread_main(conf, pageserver_listener))?;
if let Some(tui_thread) = tui_thread {
// The TUI thread exits when the user asks to Quit.
tui_thread.join().unwrap();
} else {
page_service_thread
.join()
.expect("Page service thread has panicked")?
// change into the repository directory. In daemon mode, Daemonize
// does this for us.
let repodir = zenith_repo_dir();
std::env::set_current_dir(&repodir)?;
info!("Changed current directory to repository in {}", &repodir);
}
let mut threads = Vec::new();
// TODO: Check that it looks like a valid repository before going further
// Create directory for wal-redo datadirs
match fs::create_dir("wal-redo") {
Ok(_) => {}
Err(e) => match e.kind() {
io::ErrorKind::AlreadyExists => {}
_ => {
anyhow::bail!("Failed to create wal-redo data directory: {}", e);
}
},
}
// GetPage@LSN requests are served by another thread. (It uses async I/O,
// but the code in page_service sets up its own thread pool for that)
let conf_copy = conf.clone();
let page_server_thread = thread::Builder::new()
.name("Page Service thread".into())
.spawn(move || {
// thread code
page_service::thread_main(&conf_copy);
})
.unwrap();
threads.push(page_server_thread);
if tui_thread.is_some() {
// The TUI thread exits when the user asks to Quit.
tui_thread.unwrap().join().unwrap();
} else {
// In non-interactive mode, wait forever.
for t in threads {
t.join().unwrap()
}
}
Ok(())
}
fn init_logging(
conf: &PageServerConf,
log_file: File,
) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
if conf.interactive {
Ok(tui::init_logging())
} else if conf.daemonize {
let log = zenith_repo_dir() + "/pageserver.log";
let log_file = File::create(&log).map_err(|err| {
// We failed to initialize logging, so we can't log this message with error!
eprintln!("Could not create log file {:?}: {}", log, err);
err
})?;
let decorator = slog_term::PlainSyncDecorator::new(log_file);
let drain = slog_term::FullFormat::new(decorator).build();
let drain = slog_term::CompactFormat::new(decorator).build();
let drain = slog::Filter::new(drain, |record: &slog::Record| {
if record.level().is_at_least(slog::Level::Info) {
if record.level().is_at_least(slog::Level::Debug) {
return true;
}
false
return false;
});
let drain = std::sync::Mutex::new(drain).fuse();
let logger = slog::Logger::root(
drain,
slog::o!(
"location" =>
FnValue(move |record| {
format!("{}, {}:{}",
record.module(),
record.file(),
record.line()
)
}
)
),
);
let logger = slog::Logger::root(drain, slog::o!());
Ok(slog_scope::set_global_logger(logger))
} else {
let decorator = slog_term::TermDecorator::new().build();
@@ -276,7 +215,7 @@ fn init_logging(
{
return true;
}
false
return false;
})
.fuse();
let logger = slog::Logger::root(drain, slog::o!());


@@ -1,409 +0,0 @@
//
// Branch management code
//
// TODO: move all paths construction to conf impl
//
use anyhow::{anyhow, bail, Context, Result};
use bytes::Bytes;
use fs::File;
use fs_extra;
use postgres_ffi::{pg_constants, xlog_utils};
use rand::Rng;
use serde::{Deserialize, Serialize};
use std::env;
use std::io::{Read, Write};
use std::{
collections::HashMap,
fs, io,
path::{Path, PathBuf},
process::{Command, Stdio},
str::FromStr,
};
use zenith_utils::lsn::Lsn;
use crate::{repository::Repository, PageServerConf, ZTimelineId};
#[derive(Serialize, Deserialize, Clone)]
pub struct BranchInfo {
pub name: String,
pub timeline_id: ZTimelineId,
pub latest_valid_lsn: Option<Lsn>,
pub ancestor_id: Option<String>,
pub ancestor_lsn: Option<String>,
}
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
pub timelineid: ZTimelineId,
pub lsn: Lsn,
}
pub fn init_repo(conf: &PageServerConf, repo_dir: &Path) -> Result<()> {
// top-level dir may exist if we are creating it through CLI
fs::create_dir_all(repo_dir)
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
env::set_current_dir(repo_dir)?;
fs::create_dir(std::path::Path::new("timelines"))?;
fs::create_dir(std::path::Path::new("refs"))?;
fs::create_dir(std::path::Path::new("refs").join("branches"))?;
fs::create_dir(std::path::Path::new("refs").join("tags"))?;
fs::create_dir(std::path::Path::new("wal-redo"))?;
println!("created directory structure in {}", repo_dir.display());
// Create initial timeline
let tli = create_timeline(conf, None)?;
let timelinedir = conf.timeline_path(tli);
println!("created initial timeline {}", tli);
// Run initdb
//
// We create the cluster temporarily in a "tmp" directory inside the repository,
// and move it to the right location from there.
let tmppath = std::path::Path::new("tmp");
print!("running initdb... ");
io::stdout().flush()?;
let initdb_path = conf.pg_bin_dir().join("initdb");
let initdb_output = Command::new(initdb_path)
.args(&["-D", tmppath.to_str().unwrap()])
.arg("--no-instructions")
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
.stdout(Stdio::null())
.output()
.with_context(|| "failed to execute initdb")?;
if !initdb_output.status.success() {
anyhow::bail!("initdb failed");
}
println!("initdb succeeded");
// Read control file to extract the LSN and system id
let controlfile_path = tmppath.join("global").join("pg_control");
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
// let systemid = controlfile.system_identifier;
let lsn = controlfile.checkPoint;
let lsnstr = format!("{:016X}", lsn);
// Move the initial WAL file
fs::rename(
tmppath.join("pg_wal").join("000000010000000000000001"),
timelinedir
.join("wal")
.join("000000010000000000000001.partial"),
)?;
println!("moved initial WAL file");
// Remove pg_wal
fs::remove_dir_all(tmppath.join("pg_wal"))?;
let target = timelinedir.join("snapshots").join(&lsnstr);
fs::rename(tmppath, &target)?;
// Create 'main' branch to refer to the initial timeline
let data = tli.to_string();
fs::write(conf.branch_path("main"), data)?;
println!("created main branch");
println!(
"new zenith repository was created in {}",
repo_dir.display()
);
Ok(())
}
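For orientation, here is a rough, hypothetical sanity check of the layout init_repo is expected to leave behind (not part of the crate; the tli argument stands for the generated timeline ID):

use std::path::Path;

fn repo_layout_looks_sane(repo_dir: &Path, tli: &str) -> bool {
    let timelinedir = repo_dir.join("timelines").join(tli);
    // refs/branches/main holds the initial timeline ID as text
    repo_dir.join("refs").join("branches").join("main").is_file()
        && repo_dir.join("refs").join("tags").is_dir()
        && repo_dir.join("wal-redo").is_dir()
        // the initdb cluster is moved under snapshots/<checkpoint LSN>
        && timelinedir.join("snapshots").is_dir()
        // the initial WAL segment is renamed to *.partial under wal/
        && timelinedir.join("wal").is_dir()
}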
pub(crate) fn get_branches(
conf: &PageServerConf,
repository: &dyn Repository,
) -> Result<Vec<BranchInfo>> {
// Each branch has a corresponding record (a text file containing its
// timeline_id) in refs/branches.
let branches_dir = std::path::Path::new("refs").join("branches");
std::fs::read_dir(&branches_dir)?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res?;
let name = dir_entry.file_name().to_str().unwrap().to_string();
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
let latest_valid_lsn = repository
.get_timeline(timeline_id)
.map(|timeline| timeline.get_last_valid_lsn())
.ok();
let ancestor_path = conf.ancestor_path(timeline_id);
let mut ancestor_id: Option<String> = None;
let mut ancestor_lsn: Option<String> = None;
if ancestor_path.exists() {
let ancestor = std::fs::read_to_string(ancestor_path)?;
let mut strings = ancestor.split('@');
ancestor_id = Some(
strings
.next()
.with_context(|| "wrong branch ancestor point in time format")?
.to_owned(),
);
ancestor_lsn = Some(
strings
.next()
.with_context(|| "wrong branch ancestor point in time format")?
.to_owned(),
);
}
Ok(BranchInfo {
name,
timeline_id,
latest_valid_lsn,
ancestor_id,
ancestor_lsn,
})
})
.collect()
}
pub(crate) fn get_system_id(conf: &PageServerConf) -> Result<u64> {
// let branches = get_branches();
let branches_dir = std::path::Path::new("refs").join("branches");
let branches = std::fs::read_dir(&branches_dir)?
.map(|dir_entry_res| {
let dir_entry = dir_entry_res?;
let name = dir_entry.file_name().to_str().unwrap().to_string();
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
Ok((name, timeline_id))
})
.collect::<Result<HashMap<String, ZTimelineId>>>()?;
let main_tli = branches
.get("main")
.ok_or_else(|| anyhow!("Branch main not found"))?;
let (_, main_snap_dir) = find_latest_snapshot(conf, *main_tli)?;
let controlfile_path = main_snap_dir.join("global").join("pg_control");
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
Ok(controlfile.system_identifier)
}
pub(crate) fn create_branch(
conf: &PageServerConf,
branchname: &str,
startpoint_str: &str,
) -> Result<BranchInfo> {
if conf.branch_path(&branchname).exists() {
anyhow::bail!("branch {} already exists", branchname);
}
let mut startpoint = parse_point_in_time(conf, startpoint_str)?;
if startpoint.lsn == Lsn(0) {
// Find end of WAL on the old timeline
let end_of_wal = find_end_of_wal(conf, startpoint.timelineid)?;
println!("branching at end of WAL: {}", end_of_wal);
startpoint.lsn = end_of_wal;
}
// create a new timeline for it
let newtli = create_timeline(conf, Some(startpoint))?;
let newtimelinedir = conf.timeline_path(newtli);
let data = newtli.to_string();
fs::write(conf.branch_path(&branchname), data)?;
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
// TODO: be smarter and avoid the copying...
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(conf, startpoint.timelineid)?;
let copy_opts = fs_extra::dir::CopyOptions::new();
fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), &copy_opts)?;
let oldtimelinedir = conf.timeline_path(startpoint.timelineid);
copy_wal(
&oldtimelinedir.join("wal"),
&newtimelinedir.join("wal"),
startpoint.lsn,
pg_constants::WAL_SEGMENT_SIZE,
)?;
Ok(BranchInfo {
name: branchname.to_string(),
timeline_id: newtli,
latest_valid_lsn: Some(startpoint.lsn),
ancestor_id: None,
ancestor_lsn: None,
})
}
//
// Parse user-given string that represents a point-in-time.
//
// We support multiple variants:
//
// Raw timeline id in hex, meaning the end of that timeline:
// bc62e7d612d0e6fe8f99a6dd2f281f9d
//
// A specific LSN on a timeline:
// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
//
// Same, with a human-friendly branch name:
// main
// main@2/15D3DD8
//
// Human-friendly tag name:
// mytag
//
//
fn parse_point_in_time(conf: &PageServerConf, s: &str) -> Result<PointInTime> {
let mut strings = s.split('@');
let name = strings.next().unwrap();
let lsn: Option<Lsn>;
if let Some(lsnstr) = strings.next() {
lsn = Some(
Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")?,
);
} else {
lsn = None
}
// Check if it's a tag
if lsn.is_none() {
let tagpath = conf.tag_path(name);
if tagpath.exists() {
let pointstr = fs::read_to_string(tagpath)?;
return parse_point_in_time(conf, &pointstr);
}
}
// Check if it's a branch
// Check if it's branch @ LSN
let branchpath = conf.branch_path(name);
if branchpath.exists() {
let pointstr = fs::read_to_string(branchpath)?;
let mut result = parse_point_in_time(conf, &pointstr)?;
result.lsn = lsn.unwrap_or(Lsn(0));
return Ok(result);
}
// Check if it's a timelineid
// Check if it's timelineid @ LSN
if let Ok(timelineid) = ZTimelineId::from_str(name) {
let tlipath = conf.timeline_path(timelineid);
if tlipath.exists() {
return Ok(PointInTime {
timelineid,
lsn: lsn.unwrap_or(Lsn(0)),
});
}
}
bail!("could not parse point-in-time {}", s);
}
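As a quick illustration of the spellings listed above, here is a minimal sketch of the name/LSN split. The helper is hypothetical and assumes Lsn::from_str accepts the hi/lo hex form used here (e.g. "2/15D3DD8"):

use std::str::FromStr;
use zenith_utils::lsn::Lsn;

// "main"                               -> ("main", None)       resolved via refs/branches/main
// "main@2/15D3DD8"                     -> ("main", Some(lsn))   branch at a specific LSN
// "bc62e7d612d0e6fe8f99a6dd2f281f9d"   -> raw timeline ID, meaning the end of that timeline
fn split_point_in_time(s: &str) -> anyhow::Result<(&str, Option<Lsn>)> {
    let mut parts = s.splitn(2, '@');
    let name = parts.next().unwrap();
    let lsn = match parts.next() {
        Some(lsnstr) => Some(Lsn::from_str(lsnstr)?),
        None => None,
    };
    Ok((name, lsn))
}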
fn create_timeline(conf: &PageServerConf, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
// Create initial timeline
let mut tli_buf = [0u8; 16];
rand::thread_rng().fill(&mut tli_buf);
let timelineid = ZTimelineId::from(tli_buf);
let timelinedir = conf.timeline_path(timelineid);
fs::create_dir(&timelinedir)?;
fs::create_dir(&timelinedir.join("snapshots"))?;
fs::create_dir(&timelinedir.join("wal"))?;
if let Some(ancestor) = ancestor {
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
fs::write(timelinedir.join("ancestor"), data)?;
}
Ok(timelineid)
}
///
/// Copy all WAL segments from one directory to another, up to given LSN.
///
/// If the given LSN is in the middle of a segment, the last segment containing it
/// is written out as .partial, and padded with zeros.
///
fn copy_wal(src_dir: &Path, dst_dir: &Path, upto: Lsn, wal_seg_size: usize) -> Result<()> {
let last_segno = upto.segment_number(wal_seg_size);
let last_segoff = upto.segment_offset(wal_seg_size);
for entry in fs::read_dir(src_dir).unwrap() {
if let Ok(entry) = entry {
let entry_name = entry.file_name();
let fname = entry_name.to_str().unwrap();
// Check if the filename looks like an xlog file, or a .partial file.
if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
continue;
}
let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size as usize);
let copylen;
let mut dst_fname = PathBuf::from(fname);
if segno > last_segno {
// future segment, skip
continue;
} else if segno < last_segno {
copylen = wal_seg_size;
dst_fname.set_extension("");
} else {
copylen = last_segoff;
dst_fname.set_extension("partial");
}
let src_file = File::open(entry.path())?;
let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;
if copylen < wal_seg_size {
std::io::copy(
&mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
&mut dst_file,
)?;
}
}
}
Ok(())
}
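To make the segment arithmetic concrete, here is a small hedged example with the standard 16 MB segment size; it relies on the Lsn helpers used in copy_wal above, and the exact return types are assumed:

use zenith_utils::lsn::Lsn;

fn main() {
    let wal_seg_size: usize = 16 * 1024 * 1024; // 0x0100_0000 bytes per segment

    // A branch point a little way into the second segment.
    let upto = Lsn(0x0100_1234);

    // Segments strictly before this one are copied whole; the segment that
    // contains `upto` is truncated at this offset, renamed to "*.partial"
    // and padded with zeros up to the full segment size.
    assert_eq!(upto.segment_number(wal_seg_size), 1);
    assert_eq!(upto.segment_offset(wal_seg_size), 0x1234);
}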
// Find the end of valid WAL in a wal directory
pub fn find_end_of_wal(conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
let waldir = conf.timeline_path(timeline).join("wal");
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, pg_constants::WAL_SEGMENT_SIZE, true);
Ok(Lsn(lsn))
}
// Find the latest snapshot for a timeline
fn find_latest_snapshot(conf: &PageServerConf, timeline: ZTimelineId) -> Result<(Lsn, PathBuf)> {
let snapshotsdir = conf.snapshots_path(timeline);
let paths = fs::read_dir(&snapshotsdir)?;
let mut maxsnapshot = Lsn(0);
let mut snapshotdir: Option<PathBuf> = None;
for path in paths {
let path = path?;
let filename = path.file_name().to_str().unwrap().to_owned();
if let Ok(lsn) = Lsn::from_hex(&filename) {
maxsnapshot = std::cmp::max(lsn, maxsnapshot);
snapshotdir = Some(path.path());
}
}
if maxsnapshot == Lsn(0) {
// TODO: check ancestor timeline
anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
}
Ok((maxsnapshot, snapshotdir.unwrap()))
}


@@ -1,16 +1,11 @@
use serde::{Deserialize, Serialize};
use std::fmt;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
pub mod basebackup;
pub mod branches;
pub mod page_cache;
pub mod page_service;
pub mod repository;
pub mod pg_constants;
pub mod restore_local_repo;
pub mod tui;
pub mod tui_event;
@@ -24,83 +19,10 @@ pub struct PageServerConf {
pub daemonize: bool,
pub interactive: bool,
pub listen_addr: SocketAddr,
pub gc_horizon: u64,
pub gc_period: Duration,
// Repository directory, relative to current working directory.
// Normally, the page server changes the current working directory
// to the repository, and 'workdir' is always '.'. But we don't do
// that during unit testing, because the current directory is global
// to the process but different unit tests work on different
// repositories.
pub workdir: PathBuf,
pub pg_distrib_dir: PathBuf,
}
impl PageServerConf {
//
// Repository paths, relative to workdir.
//
fn tag_path(&self, name: &str) -> PathBuf {
self.workdir.join("refs").join("tags").join(name)
}
fn branch_path(&self, name: &str) -> PathBuf {
self.workdir.join("refs").join("branches").join(name)
}
fn timeline_path(&self, timelineid: ZTimelineId) -> PathBuf {
self.workdir.join("timelines").join(timelineid.to_string())
}
fn snapshots_path(&self, timelineid: ZTimelineId) -> PathBuf {
self.timeline_path(timelineid).join("snapshots")
}
fn ancestor_path(&self, timelineid: ZTimelineId) -> PathBuf {
self.timeline_path(timelineid).join("ancestor")
}
//
// Postgres distribution paths
//
pub fn pg_bin_dir(&self) -> PathBuf {
self.pg_distrib_dir.join("bin")
}
pub fn pg_lib_dir(&self) -> PathBuf {
self.pg_distrib_dir.join("lib")
}
}
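A small usage sketch of the path helpers above; it would have to live inside the pageserver crate, since most of the helpers are crate-private, and it assumes the normal case where workdir is "." after the server has changed into the repository:

fn show_repo_paths(conf: &PageServerConf, tli: ZTimelineId) {
    println!("{}", conf.branch_path("main").display()); // ./refs/branches/main
    println!("{}", conf.timeline_path(tli).display());  // ./timelines/<tli as hex>
    println!("{}", conf.pg_bin_dir().display());        // <pg_distrib_dir>/bin
}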
/// Zenith Timeline ID is a 128-bit random ID.
///
/// Zenith timeline IDs are different from PostgreSQL timeline
/// IDs. They serve a similar purpose though: they differentiate
/// between different "histories" of the same cluster. However,
/// PostgreSQL timeline IDs are a bit cumbersome, because they are only
/// 32-bits wide, and they must be in ascending order in any given
/// timeline history. Those limitations mean that we cannot generate a
/// new PostgreSQL timeline ID by just generating a random number. And
/// that in turn is problematic for the "pull/push" workflow, where you
/// have a local copy of a zenith repository, and you periodically sync
/// the local changes with a remote server. When you work "detached"
/// from the remote server, you cannot create a PostgreSQL timeline ID
/// that's guaranteed to be different from all existing timelines in
/// the remote server. For example, suppose two people each have a clone of
/// the repository on their laptops and they both create a new branch with
/// a different name. What timeline ID would they assign to their
/// branches? If they pick the same one, and later try to push the
/// branches to the same remote server, they will get mixed up.
///
/// To avoid those issues, Zenith has its own concept of timelines that
/// is separate from PostgreSQL timelines, and doesn't have those
/// limitations. A zenith timeline is identified by a 128-bit ID, which
/// is usually printed out as a hex string.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
// Zenith Timeline ID is a 32-byte random ID.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct ZTimelineId([u8; 16]);
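A hedged sketch of how such an ID is minted (mirroring create_timeline in branches.rs; printing it as 32 hex digits is an assumption about the Display impl):

use rand::Rng;

fn new_timeline_id() -> ZTimelineId {
    // 16 random bytes = 128 bits; ZTimelineId implements From<[u8; 16]>.
    let mut buf = [0u8; 16];
    rand::thread_rng().fill(&mut buf);
    ZTimelineId::from(buf)
}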
impl FromStr for ZTimelineId {


@@ -1,32 +1,724 @@
//! This module acts as a switchboard to access different repositories managed by this
//! page server. Currently, a Page Server can only manage one repository, so there
//! isn't much here. If we implement multi-tenancy, this will probably be changed into
//! a hash map, keyed by the tenant ID.
//
// Page Cache holds all the different page versions and WAL records
//
// The Page Cache is a BTreeMap, keyed by the RelFileNode and block number, and the LSN.
// The BTreeMap is protected by a Mutex, and each cache entry is protected by another
// per-entry mutex.
//
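The lookup idea in a nutshell, as a minimal self-contained sketch with simplified key types (the real cache uses BufferTag and CacheEntry, defined further down):

use std::collections::BTreeMap;
use std::ops::Bound::Included;

// Key page versions by (block number, LSN) and fetch the newest version at or
// below the requested LSN with a backwards range scan.
fn latest_version(
    cache: &BTreeMap<(u32, u64), Vec<u8>>,
    blknum: u32,
    lsn: u64,
) -> Option<&Vec<u8>> {
    let minkey = (blknum, 0);
    let maxkey = (blknum, lsn);
    cache
        .range((Included(&minkey), Included(&maxkey)))
        .next_back()
        .map(|(_key, img)| img)
}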
use crate::repository::rocksdb::RocksRepository;
use crate::repository::Repository;
use crate::walredo::PostgresRedoManager;
use crate::PageServerConf;
use crate::restore_local_repo::restore_timeline;
use crate::ZTimelineId;
use crate::{walredo, PageServerConf};
use anyhow::bail;
use bytes::Bytes;
use core::ops::Bound::Included;
use crossbeam_channel::unbounded;
use crossbeam_channel::{Receiver, Sender};
use lazy_static::lazy_static;
use std::sync::{Arc, Mutex};
use log::*;
use rand::Rng;
use std::collections::{BTreeMap, HashMap};
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::{Arc, Condvar, Mutex};
use std::thread;
use std::time::Duration;
use std::{convert::TryInto, ops::AddAssign};
// Timeout when waiting for the WAL receiver to catch up to an LSN given in a GetPage@LSN call.
static TIMEOUT: Duration = Duration::from_secs(60);
pub struct PageCache {
shared: Mutex<PageCacheShared>,
// Channel for communicating with the WAL redo process here.
pub walredo_sender: Sender<Arc<CacheEntry>>,
pub walredo_receiver: Receiver<Arc<CacheEntry>>,
valid_lsn_condvar: Condvar,
// Counters, for metrics collection.
pub num_entries: AtomicU64,
pub num_page_images: AtomicU64,
pub num_wal_records: AtomicU64,
pub num_getpage_requests: AtomicU64,
// copies of shared.first/last_valid_lsn fields (copied here so
// that they can be read without acquiring the mutex).
pub first_valid_lsn: AtomicU64,
pub last_valid_lsn: AtomicU64,
pub last_record_lsn: AtomicU64,
}
#[derive(Clone)]
pub struct PageCacheStats {
pub num_entries: u64,
pub num_page_images: u64,
pub num_wal_records: u64,
pub num_getpage_requests: u64,
pub first_valid_lsn: u64,
pub last_valid_lsn: u64,
pub last_record_lsn: u64,
}
impl AddAssign for PageCacheStats {
fn add_assign(&mut self, other: Self) {
*self = Self {
num_entries: self.num_entries + other.num_entries,
num_page_images: self.num_page_images + other.num_page_images,
num_wal_records: self.num_wal_records + other.num_wal_records,
num_getpage_requests: self.num_getpage_requests + other.num_getpage_requests,
first_valid_lsn: self.first_valid_lsn + other.first_valid_lsn,
last_valid_lsn: self.last_valid_lsn + other.last_valid_lsn,
last_record_lsn: self.last_record_lsn + other.last_record_lsn,
}
}
}
//
// Shared data structure, holding page cache and related auxiliary information
//
struct PageCacheShared {
// The actual page cache
pagecache: BTreeMap<CacheKey, Arc<CacheEntry>>,
// Relation n_blocks cache
//
// This hashtable should be updated together with the pagecache. Right now it is
// accessed unreasonably often through smgr_nblocks(); it would be better to
// cache it in the postgres smgr and only ask again on restart.
relsize_cache: HashMap<RelTag, u32>,
// What page versions do we hold in the cache? If we get GetPage with
// LSN < first_valid_lsn, that's an error because we (no longer) hold that
// page version. If we get a request > last_valid_lsn, we need to wait until
// we receive all the WAL up to the request.
//
// last_record_lsn points to the end of last processed WAL record.
// It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
// after the end of last record, but not the whole next record yet. In the
// page cache, we care about last_valid_lsn, but if the WAL receiver needs to
// restart the streaming, it needs to restart at the end of last record, so
// we track them separately. last_record_lsn should perhaps be in
// walreceiver.rs instead of here, but it seems convenient to keep all three
// values together.
//
first_valid_lsn: u64,
last_valid_lsn: u64,
last_record_lsn: u64,
}
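In other words, the three LSNs define a window of servable page versions. A minimal, standalone sketch of how a GetPage@LSN request is classified against that window (simplified from the logic in get_page_at_lsn below):

enum GetPageDecision {
    // The requested version is older than first_valid_lsn: it has been pruned.
    TooOld,
    // The requested LSN is inside the window: serve from the cache right away.
    Serve,
    // The requested LSN is ahead of last_valid_lsn: wait for the WAL receiver.
    WaitForWal,
}

fn classify_request(req_lsn: u64, first_valid_lsn: u64, last_valid_lsn: u64) -> GetPageDecision {
    if req_lsn < first_valid_lsn {
        GetPageDecision::TooOld
    } else if req_lsn <= last_valid_lsn {
        GetPageDecision::Serve
    } else {
        GetPageDecision::WaitForWal
    }
}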
lazy_static! {
pub static ref REPOSITORY: Mutex<Option<Arc<dyn Repository + Send + Sync>>> = Mutex::new(None);
pub static ref PAGECACHES: Mutex<HashMap<ZTimelineId, Arc<PageCache>>> =
Mutex::new(HashMap::new());
}
pub fn init(conf: &'static PageServerConf) {
let mut m = REPOSITORY.lock().unwrap();
// Get Page Cache for given timeline. It is assumed to already exist.
pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option<Arc<PageCache>> {
let pcaches = PAGECACHES.lock().unwrap();
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf);
// we have already changed current dir to the repository.
let repo = RocksRepository::new(conf, Arc::new(walredo_mgr));
*m = Some(Arc::new(repo));
match pcaches.get(&timelineid) {
Some(pcache) => Some(pcache.clone()),
None => None,
}
}
pub fn get_repository() -> Arc<dyn Repository + Send + Sync> {
let o = &REPOSITORY.lock().unwrap();
Arc::clone(o.as_ref().unwrap())
pub fn get_or_restore_pagecache(
conf: &PageServerConf,
timelineid: ZTimelineId,
) -> anyhow::Result<Arc<PageCache>> {
let mut pcaches = PAGECACHES.lock().unwrap();
match pcaches.get(&timelineid) {
Some(pcache) => Ok(pcache.clone()),
None => {
let pcache = init_page_cache();
restore_timeline(conf, &pcache, timelineid)?;
let result = Arc::new(pcache);
pcaches.insert(timelineid, result.clone());
// Initialize the WAL redo thread
//
// Currently the join handle is not saved anywhere and we won't try to restart
// the thread if it dies. We may later stop these threads after some inactivity
// period and restart them on demand.
let conf_copy = conf.clone();
let _walredo_thread = thread::Builder::new()
.name("WAL redo thread".into())
.spawn(move || {
walredo::wal_redo_main(&conf_copy, timelineid);
})
.unwrap();
return Ok(result);
}
}
}
fn init_page_cache() -> PageCache {
// Initialize the channel between the page cache and the WAL applicator
let (s, r) = unbounded();
PageCache {
shared: Mutex::new(PageCacheShared {
pagecache: BTreeMap::new(),
relsize_cache: HashMap::new(),
first_valid_lsn: 0,
last_valid_lsn: 0,
last_record_lsn: 0,
}),
valid_lsn_condvar: Condvar::new(),
walredo_sender: s,
walredo_receiver: r,
num_entries: AtomicU64::new(0),
num_page_images: AtomicU64::new(0),
num_wal_records: AtomicU64::new(0),
num_getpage_requests: AtomicU64::new(0),
first_valid_lsn: AtomicU64::new(0),
last_valid_lsn: AtomicU64::new(0),
last_record_lsn: AtomicU64::new(0),
}
}
//
// We store two kinds of entries in the page cache:
//
// 1. Ready-made images of the block
// 2. WAL records, to be applied on top of the "previous" entry
//
// Some WAL records will initialize the page from scratch. For such records,
// the 'will_init' flag is set. They don't need the previous page image before
// applying. The 'will_init' flag is set for records containing a full-page image,
// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
// stored directly in the cache entry in that you still need to run the WAL redo
// routine to generate the page image.
//
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone)]
pub struct CacheKey {
pub tag: BufferTag,
pub lsn: u64,
}
pub struct CacheEntry {
pub key: CacheKey,
pub content: Mutex<CacheEntryContent>,
// Condition variable used by the WAL redo service, to wake up
// the requester.
//
// FIXME: this takes quite a lot of space. Consider using parking_lot::Condvar
// or something else.
pub walredo_condvar: Condvar,
}
pub struct CacheEntryContent {
pub page_image: Option<Bytes>,
pub wal_record: Option<WALRecord>,
pub apply_pending: bool,
}
impl CacheEntry {
fn new(key: CacheKey) -> CacheEntry {
CacheEntry {
key,
content: Mutex::new(CacheEntryContent {
page_image: None,
wal_record: None,
apply_pending: false,
}),
walredo_condvar: Condvar::new(),
}
}
}
#[derive(Eq, PartialEq, Hash, Clone, Copy)]
pub struct RelTag {
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
pub forknum: u8,
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Debug)]
pub struct BufferTag {
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
pub forknum: u8,
pub blknum: u32,
}
#[derive(Clone)]
pub struct WALRecord {
pub lsn: u64, // LSN at the *end* of the record
pub will_init: bool,
pub rec: Bytes,
// Remember the offset of main_data in rec,
// so that we don't have to parse the record again.
// If record has no main_data, this offset equals rec.len().
pub main_data_offset: usize,
}
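Putting the two kinds of entries together: reconstructing a page means scanning versions from newest to oldest until a full page image or a will_init record is found, then replaying the collected records in order. A hedged sketch of that stopping rule, using the types above (the real logic lives in collect_records_for_apply further down):

fn plan_reconstruction(
    versions_newest_first: Vec<CacheEntryContent>,
) -> (Option<Bytes>, Vec<WALRecord>) {
    let mut base_img: Option<Bytes> = None;
    let mut records: Vec<WALRecord> = Vec::new();
    for v in versions_newest_first {
        if let Some(img) = v.page_image {
            // A full image: nothing older is needed.
            base_img = Some(img);
            break;
        } else if let Some(rec) = v.wal_record {
            let will_init = rec.will_init;
            records.push(rec);
            if will_init {
                // This record rebuilds the page from scratch.
                break;
            }
        }
    }
    // Replay oldest first.
    records.reverse();
    (base_img, records)
}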
// Public interface functions
impl PageCache {
//
// GetPage@LSN
//
// Returns an 8k page image
//
pub fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: u64) -> anyhow::Result<Bytes> {
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
let mut lsn = req_lsn;
// When an invalid LSN is requested, it means "don't wait, return the latest version of the page".
// This is necessary for bootstrap.
// TODO: should we use last_valid_lsn here instead of this max value?
if lsn == 0
{
lsn = 0xffff_ffff_ffff_eeee;
}
// Look up cache entry. If it's a page image, return that. If it's a WAL record,
// ask the WAL redo service to reconstruct the page image from the WAL records.
let minkey = CacheKey { tag, lsn: 0 };
let maxkey = CacheKey { tag, lsn };
let entry_rc: Arc<CacheEntry>;
{
let mut shared = self.shared.lock().unwrap();
let mut waited = false;
// There is a race at postgres instance start,
// when we request a page before the walsender has established a connection
// and been able to stream the page. Just don't wait and return what we have.
if req_lsn == 0
{
trace!(
"walsender hasn't started yet. Don't wait. last_valid_lsn {}, requested {}",
shared.last_valid_lsn, lsn);
}
if req_lsn != 0
{
while lsn > shared.last_valid_lsn {
// TODO: Wait for the WAL receiver to catch up
waited = true;
trace!(
"not caught up yet: {}, requested {}",
shared.last_valid_lsn,
lsn
);
let wait_result = self
.valid_lsn_condvar
.wait_timeout(shared, TIMEOUT)
.unwrap();
shared = wait_result.0;
if wait_result.1.timed_out() {
bail!(
"Timed out while waiting for WAL record at LSN {:X}/{:X} to arrive",
lsn >> 32,
lsn & 0xffff_ffff
);
}
}
}
if waited {
trace!("caught up now, continuing");
}
if lsn < shared.first_valid_lsn {
bail!(
"LSN {:X}/{:X} has already been removed",
lsn >> 32,
lsn & 0xffff_ffff
);
}
let pagecache = &shared.pagecache;
let mut entries = pagecache.range((Included(&minkey), Included(&maxkey)));
let entry_opt = entries.next_back();
if entry_opt.is_none() {
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
return Ok(Bytes::from_static(&ZERO_PAGE));
/* return Err("could not find page image")?; */
}
let (_key, entry) = entry_opt.unwrap();
entry_rc = entry.clone();
// Now that we have a reference to the cache entry, drop the lock on the map.
// It's important to do this before waiting on the condition variable below,
// and better to do it as soon as possible to maximize concurrency.
}
// Lock the cache entry and dig the page image out of it.
let page_img: Bytes;
{
let mut entry_content = entry_rc.content.lock().unwrap();
if let Some(img) = &entry_content.page_image {
assert!(!entry_content.apply_pending);
page_img = img.clone();
} else if entry_content.wal_record.is_some() {
//
// If this page needs to be reconstructed by applying some WAL,
// send a request to the WAL redo thread.
//
if !entry_content.apply_pending {
assert!(!entry_content.apply_pending);
entry_content.apply_pending = true;
let s = &self.walredo_sender;
s.send(entry_rc.clone())?;
}
while entry_content.apply_pending {
entry_content = entry_rc.walredo_condvar.wait(entry_content).unwrap();
}
// We should now have a page image. If we don't, it means that WAL redo
// failed to reconstruct it. WAL redo should've logged that error already.
page_img = match &entry_content.page_image {
Some(p) => p.clone(),
None => {
error!(
"could not apply WAL to reconstruct page image for GetPage@LSN request"
);
bail!("could not apply WAL to reconstruct page image");
}
};
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
}
// FIXME: assumes little-endian. Only used for the debugging log though
let page_lsn_hi = u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
let page_lsn_lo = u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
trace!(
"Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}",
page_lsn_hi,
page_lsn_lo,
tag.spcnode,
tag.dbnode,
tag.relnode,
tag.forknum,
tag.blknum
);
return Ok(page_img);
}
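A trivial note on the LSN formatting used in the log messages above: the u64 is split into the familiar PostgreSQL hi/lo notation, for example:

fn format_lsn(lsn: u64) -> String {
    // 0x0000_0000_0170_0028 prints as "0/1700028",
    // 0x0000_0002_0000_1000 prints as "2/1000".
    format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff)
}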
//
// Collect all the WAL records that are needed to reconstruct a page
// image for the given cache entry.
//
// Returns an old page image (if any), and a vector of WAL records to apply
// over it.
//
pub fn collect_records_for_apply(&self, entry: &CacheEntry) -> (Option<Bytes>, Vec<WALRecord>) {
// Scan the BTreeMap backwards, starting from the given entry.
let shared = self.shared.lock().unwrap();
let pagecache = &shared.pagecache;
let minkey = CacheKey {
tag: entry.key.tag,
lsn: 0,
};
let maxkey = CacheKey {
tag: entry.key.tag,
lsn: entry.key.lsn,
};
let entries = pagecache.range((Included(&minkey), Included(&maxkey)));
// the last entry in the range should be the CacheEntry we were given
//let _last_entry = entries.next_back();
//assert!(last_entry == entry);
let mut base_img: Option<Bytes> = None;
let mut records: Vec<WALRecord> = Vec::new();
// Scan backwards, collecting the WAL records, until we hit an
// old page image.
for (_key, e) in entries.rev() {
let e = e.content.lock().unwrap();
if let Some(img) = &e.page_image {
// We have a base image. No need to dig deeper into the list of
// records
base_img = Some(img.clone());
break;
} else if let Some(rec) = &e.wal_record {
records.push(rec.clone());
// If this WAL record initializes the page, no need to dig deeper.
if rec.will_init {
break;
}
} else {
panic!("no base image and no WAL record on cache entry");
}
}
records.reverse();
return (base_img, records);
}
//
// Adds a WAL record to the page cache
//
pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
let lsn = rec.lsn;
let key = CacheKey { tag, lsn };
let entry = CacheEntry::new(key.clone());
entry.content.lock().unwrap().wal_record = Some(rec);
let mut shared = self.shared.lock().unwrap();
let rel_tag = RelTag {
spcnode: tag.spcnode,
dbnode: tag.dbnode,
relnode: tag.relnode,
forknum: tag.forknum,
};
let rel_entry = shared.relsize_cache.entry(rel_tag).or_insert(0);
if tag.blknum >= *rel_entry {
*rel_entry = tag.blknum + 1;
}
//trace!("put_wal_record lsn: {}", lsn);
let oldentry = shared.pagecache.insert(key, Arc::new(entry));
self.num_entries.fetch_add(1, Ordering::Relaxed);
        if oldentry.is_some() {
error!(
"overwriting WAL record with LSN {:X}/{:X} in page cache",
lsn >> 32,
lsn & 0xffffffff
);
}
self.num_wal_records.fetch_add(1, Ordering::Relaxed);
}
//
// Memorize a full image of a page version
//
pub fn put_page_image(&self, tag: BufferTag, lsn: u64, img: Bytes) {
let key = CacheKey { tag, lsn };
let entry = CacheEntry::new(key.clone());
entry.content.lock().unwrap().page_image = Some(img);
let mut shared = self.shared.lock().unwrap();
let pagecache = &mut shared.pagecache;
let oldentry = pagecache.insert(key, Arc::new(entry));
self.num_entries.fetch_add(1, Ordering::Relaxed);
assert!(oldentry.is_none());
//debug!("inserted page image for {}/{}/{}_{} blk {} at {}",
// tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn);
self.num_page_images.fetch_add(1, Ordering::Relaxed);
}
//
pub fn advance_last_valid_lsn(&self, lsn: u64) {
let mut shared = self.shared.lock().unwrap();
// Can't move backwards.
let oldlsn = shared.last_valid_lsn;
if lsn >= oldlsn {
shared.last_valid_lsn = lsn;
self.valid_lsn_condvar.notify_all();
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
} else {
warn!(
"attempted to move last valid LSN backwards (was {:X}/{:X}, new {:X}/{:X})",
oldlsn >> 32,
oldlsn & 0xffffffff,
lsn >> 32,
lsn & 0xffffffff
);
}
}
//
// NOTE: this updates last_valid_lsn as well.
//
pub fn advance_last_record_lsn(&self, lsn: u64) {
let mut shared = self.shared.lock().unwrap();
// Can't move backwards.
assert!(lsn >= shared.last_valid_lsn);
assert!(lsn >= shared.last_record_lsn);
shared.last_valid_lsn = lsn;
shared.last_record_lsn = lsn;
self.valid_lsn_condvar.notify_all();
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
self.last_record_lsn.store(lsn, Ordering::Relaxed);
}
//
pub fn _advance_first_valid_lsn(&self, lsn: u64) {
let mut shared = self.shared.lock().unwrap();
// Can't move backwards.
assert!(lsn >= shared.first_valid_lsn);
// Can't overtake last_valid_lsn (except when we're
        // initializing the system and last_valid_lsn hasn't been set yet).
assert!(shared.last_valid_lsn == 0 || lsn < shared.last_valid_lsn);
shared.first_valid_lsn = lsn;
self.first_valid_lsn.store(lsn, Ordering::Relaxed);
}
pub fn init_valid_lsn(&self, lsn: u64) {
let mut shared = self.shared.lock().unwrap();
assert!(shared.first_valid_lsn == 0);
assert!(shared.last_valid_lsn == 0);
assert!(shared.last_record_lsn == 0);
shared.first_valid_lsn = lsn;
shared.last_valid_lsn = lsn;
shared.last_record_lsn = lsn;
self.first_valid_lsn.store(lsn, Ordering::Relaxed);
self.last_valid_lsn.store(lsn, Ordering::Relaxed);
self.last_record_lsn.store(lsn, Ordering::Relaxed);
}
pub fn get_last_valid_lsn(&self) -> u64 {
let shared = self.shared.lock().unwrap();
return shared.last_record_lsn;
}
//
// Simple test function for the WAL redo code:
//
// 1. Pick a page from the page cache at random.
// 2. Request that page with GetPage@LSN, using Max LSN (i.e. get the latest page version)
//
//
pub fn _test_get_page_at_lsn(&self) {
        // For quick testing of the get_page_at_lsn() function.
//
// Get a random page from the page cache. Apply all its WAL, by requesting
// that page at the highest lsn.
let mut tag: Option<BufferTag> = None;
{
let shared = self.shared.lock().unwrap();
let pagecache = &shared.pagecache;
if pagecache.is_empty() {
info!("page cache is empty");
return;
}
// Find nth entry in the map, where n is picked at random
let n = rand::thread_rng().gen_range(0..pagecache.len());
let mut i = 0;
for (key, _e) in pagecache.iter() {
if i == n {
tag = Some(key.tag);
break;
}
i += 1;
}
}
info!("testing GetPage@LSN for block {}", tag.unwrap().blknum);
match self.get_page_at_lsn(tag.unwrap(), 0xffff_ffff_ffff_eeee) {
Ok(_img) => {
// This prints out the whole page image.
//println!("{:X?}", img);
}
Err(error) => {
error!("GetPage@LSN failed: {}", error);
}
}
}
/// Remember a relation's size in blocks.
///
/// If 'to' is larger than the previously remembered size, the remembered size is increased to 'to'.
/// But if it's smaller, there is no change.
pub fn relsize_inc(&self, rel: &RelTag, to: u32) {
// FIXME: Shouldn't relation size also be tracked with an LSN?
// If a replica is lagging behind, it needs to get the size as it was on
// the replica's current replay LSN.
let mut shared = self.shared.lock().unwrap();
let entry = shared.relsize_cache.entry(*rel).or_insert(0);
if to >= *entry {
*entry = to;
}
}
pub fn relsize_get(&self, rel: &RelTag) -> u32 {
let mut shared = self.shared.lock().unwrap();
let entry = shared.relsize_cache.entry(*rel).or_insert(0);
*entry
}
pub fn relsize_exist(&self, rel: &RelTag) -> bool {
let shared = self.shared.lock().unwrap();
let relsize_cache = &shared.relsize_cache;
relsize_cache.contains_key(rel)
}
pub fn get_stats(&self) -> PageCacheStats {
PageCacheStats {
num_entries: self.num_entries.load(Ordering::Relaxed),
num_page_images: self.num_page_images.load(Ordering::Relaxed),
num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
first_valid_lsn: self.first_valid_lsn.load(Ordering::Relaxed),
last_valid_lsn: self.last_valid_lsn.load(Ordering::Relaxed),
last_record_lsn: self.last_record_lsn.load(Ordering::Relaxed),
}
}
}
pub fn get_stats() -> PageCacheStats {
let pcaches = PAGECACHES.lock().unwrap();
let mut stats = PageCacheStats {
num_entries: 0,
num_page_images: 0,
num_wal_records: 0,
num_getpage_requests: 0,
first_valid_lsn: 0,
last_valid_lsn: 0,
last_record_lsn: 0,
};
pcaches.iter().for_each(|(_sys_id, pcache)| {
stats += pcache.get_stats();
});
stats
}
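// Note (not part of the original file): the `stats += pcache.get_stats()` call
// above relies on PageCacheStats implementing AddAssign, presumably by summing
// the per-cache counters; that impl is not shown in this hunk.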

File diff suppressed because it is too large

View File

@@ -0,0 +1,53 @@
// From pg_tablespace_d.h
//
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
pub const GLOBALTABLESPACE_OID: u32 = 1664;
// Special values for non-rel files' tags
// TODO: maybe use an enum?
pub const PG_CONTROLFILE_FORKNUM: u32 = 42;
pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
pub const PG_XACT_FORKNUM: u32 = 44;
pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;
//
// constants from clog.h
//
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
pub const CLOG_BITS_PER_XACT: u8 = 2;
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
pub const CLOG_ZEROPAGE: u8 = 0x00;
pub const CLOG_TRUNCATE: u8 = 0x10;
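// Illustrative sketch (not part of the original file): how the CLOG constants
// above locate a transaction's 2-bit status inside an 8 KB clog page. The
// function name is a placeholder; the real lookup lives in
// nonrelfile_utils::transaction_id_get_status.
pub fn example_clog_status(xid: u32, clog_page: &[u8]) -> u8 {
    // Which byte of the page holds this xid's status bits.
    let byte_in_page = ((xid % CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_BYTE) as usize;
    // Which 2-bit slot within that byte.
    let bit_shift = (xid % CLOG_XACTS_PER_BYTE) as u8 * CLOG_BITS_PER_XACT;
    (clog_page[byte_in_page] >> bit_shift) & CLOG_XACT_BITMASK
}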
// From xact.h
pub const XLOG_XACT_COMMIT: u8 = 0x00;
pub const XLOG_XACT_ABORT: u8 = 0x20;
/* mask for filtering opcodes out of xl_info */
pub const XLOG_XACT_OPMASK: u8 = 0x70;
/* does this record have a 'xinfo' field or not */
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
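// Illustrative sketch (not part of the original file): classifying a record's
// xl_info byte with the masks above.
pub fn example_is_xact_commit(xl_info: u8) -> bool {
    (xl_info & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT
}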
/*
* The following flags, stored in xinfo, determine which information is
* contained in commit/abort records.
*/
pub const XACT_XINFO_HAS_DBINFO: u32 = 1;
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 2;
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 4;
// From pg_control.h and rmgrlist.h
pub const XLOG_SWITCH: u8 = 0x40;
pub const RM_XLOG_ID: u8 = 0;
pub const RM_XACT_ID: u8 = 1;
pub const RM_CLOG_ID: u8 = 3;
// pub const RM_MULTIXACT_ID:u8 = 6;
// from xlogreader.h
pub const XLR_INFO_MASK: u8 = 0x0F;

View File

@@ -1,532 +0,0 @@
pub mod rocksdb;
use crate::waldecoder::{DecodedWALRecord, Oid, TransactionId, XlCreateDatabase, XlSmgrTruncate};
use crate::ZTimelineId;
use anyhow::Result;
use bytes::{Buf, BufMut, Bytes, BytesMut};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::forknumber_to_name;
use std::fmt;
use std::sync::Arc;
use zenith_utils::lsn::Lsn;
///
/// A repository corresponds to one .zenith directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
pub trait Repository {
/// Get Timeline handle for given zenith timeline ID.
///
/// The Timeline is expected to be already "open", i.e. `get_or_restore_timeline`
    /// should've been called on it earlier.
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
/// Get Timeline handle for given zenith timeline ID.
///
/// Creates a new Timeline object if it's not "open" already.
fn get_or_restore_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
/// Create an empty timeline, without loading any data into it from possible on-disk snapshot.
///
/// For unit tests.
#[cfg(test)]
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
//fn get_stats(&self) -> RepositoryStats;
}
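// Illustrative sketch (not part of the original file): the typical read path
// through these traits, e.g. from the page service. The function and variable
// names are placeholders.
pub fn example_read_page(
    repo: &dyn Repository,
    timelineid: ZTimelineId,
    tag: BufferTag,
    lsn: Lsn,
) -> Result<Bytes> {
    // Open (or restore from snapshot + WAL) the requested timeline...
    let timeline = repo.get_or_restore_timeline(timelineid)?;
    // ...and ask it for the page image as of 'lsn'.
    timeline.get_page_at_lsn(tag, lsn)
}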
pub trait Timeline {
//------------------------------------------------------------------------------
// Public GET functions
//------------------------------------------------------------------------------
/// Look up given page in the cache.
fn get_page_at_lsn(&self, tag: BufferTag, lsn: Lsn) -> Result<Bytes>;
/// Get size of relation
fn get_relsize(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
/// Does relation exist?
fn get_relsize_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
/// Get page image at the particular LSN
fn get_page_image(&self, tag: BufferTag, lsn: Lsn) -> Result<Option<Bytes>>;
//------------------------------------------------------------------------------
// Public PUT functions, to update the repository with new page versions.
//
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put_wal_record(&self, tag: BufferTag, rec: WALRecord);
/// Like put_wal_record, but with ready-made image of the page.
fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes);
/// Truncate relation
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
/// Create a new database from a template database
///
/// In PostgreSQL, CREATE DATABASE works by scanning the data directory and
/// copying all relation files from the template database. This is the equivalent
/// of that.
fn put_create_database(
&self,
lsn: Lsn,
db_id: Oid,
tablespace_id: Oid,
src_db_id: Oid,
src_tablespace_id: Oid,
) -> Result<()>;
///
/// Helper function to parse a WAL record and call the above functions for all the
/// relations/pages that the record affects.
///
fn save_decoded_record(
&self,
decoded: DecodedWALRecord,
recdata: Bytes,
lsn: Lsn,
) -> Result<()> {
// Figure out which blocks the record applies to, and "put" a separate copy
// of the record for each block.
for blk in decoded.blocks.iter() {
let tag = BufferTag {
rel: RelTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
},
blknum: blk.blkno,
};
let rec = WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset as u32,
};
self.put_wal_record(tag, rec);
}
// Handle a few special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&decoded);
if (truncate.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
let rel = RelTag {
spcnode: truncate.rnode.spcnode,
dbnode: truncate.rnode.dbnode,
relnode: truncate.rnode.relnode,
forknum: pg_constants::MAIN_FORKNUM,
};
self.put_truncation(rel, lsn, truncate.blkno)?;
}
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_DBASE_CREATE
{
let createdb = XlCreateDatabase::decode(&decoded);
self.put_create_database(
lsn,
createdb.db_id,
createdb.tablespace_id,
createdb.src_db_id,
createdb.src_tablespace_id,
)?;
}
// Now that this record has been handled, let the repository know that
// it is up-to-date to this LSN
self.advance_last_record_lsn(lsn);
Ok(())
}
    /// Remember that all WAL before the given LSN has been processed.
///
    /// The WAL receiver calls this after the put_* functions, to indicate that
    /// all WAL up to this point has been digested. A GET request for a later
    /// LSN will block until the valid LSN catches up to it.
fn advance_last_valid_lsn(&self, lsn: Lsn);
fn get_last_valid_lsn(&self) -> Lsn;
fn init_valid_lsn(&self, lsn: Lsn);
/// Like `advance_last_valid_lsn`, but this always points to the end of
/// a WAL record, not in the middle of one.
///
/// This must be <= last valid LSN. This is tracked separately from last
/// valid LSN, so that the WAL receiver knows where to restart streaming.
fn advance_last_record_lsn(&self, lsn: Lsn);
fn get_last_record_lsn(&self) -> Lsn;
    /// Get range [begin,end) of stored blocks. Used mostly for SMGR pseudorelations
    /// but can also be applied to normal relations.
fn get_range(&self, rel: RelTag, lsn: Lsn) -> Result<(u32, u32)>;
    /// Get vector of databases (represented using RelTag; only the dbnode and spcnode fields are used)
fn get_databases(&self, lsn: Lsn) -> Result<Vec<RelTag>>;
/// Get vector of prepared twophase transactions
fn get_twophase(&self, lsn: Lsn) -> Result<Vec<TransactionId>>;
}
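// Illustrative sketch (not part of the original file): how a WAL receiver is
// expected to drive a Timeline, per the contract documented above. The function
// name and arguments are placeholders.
pub fn example_ingest(
    timeline: &dyn Timeline,
    records: Vec<(Lsn, DecodedWALRecord, Bytes)>,
    received_up_to: Lsn,
) -> Result<()> {
    for (lsn, decoded, recdata) in records {
        // Puts a per-block copy of each record and bumps last_record_lsn
        // (which implicitly advances last_valid_lsn as well).
        timeline.save_decoded_record(decoded, recdata, lsn)?;
    }
    // All received WAL has been digested, even if it ends mid-record;
    // GetPage@LSN requests up to this point may now be served.
    timeline.advance_last_valid_lsn(received_up_to);
    Ok(())
}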
#[derive(Clone)]
pub struct RepositoryStats {
pub num_entries: Lsn,
pub num_page_images: Lsn,
pub num_wal_records: Lsn,
pub num_getpage_requests: Lsn,
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
pub struct RelTag {
pub forknum: u8,
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
}
impl RelTag {
pub fn pack(&self, buf: &mut BytesMut) {
buf.put_u8(self.forknum);
buf.put_u32(self.spcnode);
buf.put_u32(self.dbnode);
buf.put_u32(self.relnode);
}
pub fn unpack(buf: &mut Bytes) -> RelTag {
RelTag {
forknum: buf.get_u8(),
spcnode: buf.get_u32(),
dbnode: buf.get_u32(),
relnode: buf.get_u32(),
}
}
}
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
///
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
///
impl fmt::Display for RelTag {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(forkname) = forknumber_to_name(self.forknum) {
write!(
f,
"{}/{}/{}_{}",
self.spcnode, self.dbnode, self.relnode, forkname
)
} else {
write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
}
}
}
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
pub struct BufferTag {
pub rel: RelTag,
pub blknum: u32,
}
impl BufferTag {
pub fn fork(forknum: u8) -> BufferTag {
BufferTag {
rel: RelTag {
forknum,
spcnode: 0,
dbnode: 0,
relnode: 0,
},
blknum: 0,
}
}
pub fn pack(&self, buf: &mut BytesMut) {
self.rel.pack(buf);
buf.put_u32(self.blknum);
}
pub fn unpack(buf: &mut Bytes) -> BufferTag {
BufferTag {
rel: RelTag::unpack(buf),
blknum: buf.get_u32(),
}
}
}
#[derive(Debug, Clone)]
pub struct WALRecord {
pub lsn: Lsn, // LSN at the *end* of the record
pub will_init: bool,
pub rec: Bytes,
// Remember the offset of main_data in rec,
// so that we don't have to parse the record again.
    // If the record has no main_data, this offset equals rec.len().
pub main_data_offset: u32,
}
impl WALRecord {
pub fn pack(&self, buf: &mut BytesMut) {
buf.put_u64(self.lsn.0);
buf.put_u8(self.will_init as u8);
buf.put_u32(self.main_data_offset);
buf.put_u32(self.rec.len() as u32);
buf.put_slice(&self.rec[..]);
}
pub fn unpack(buf: &mut Bytes) -> WALRecord {
let lsn = Lsn::from(buf.get_u64());
let will_init = buf.get_u8() != 0;
let main_data_offset = buf.get_u32();
let mut dst = vec![0u8; buf.get_u32() as usize];
buf.copy_to_slice(&mut dst);
WALRecord {
lsn,
will_init,
rec: Bytes::from(dst),
main_data_offset,
}
}
}
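// Illustrative round-trip (not part of the original file): pack() and unpack()
// above are inverses, which the serialized repository format relies on.
#[allow(dead_code)]
fn example_walrecord_roundtrip(rec: &WALRecord) -> WALRecord {
    let mut buf = BytesMut::new();
    rec.pack(&mut buf);
    let mut bytes = buf.freeze();
    WALRecord::unpack(&mut bytes)
}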
///
/// Tests that should work the same with any Repository/Timeline implementation.
///
#[cfg(test)]
mod tests {
use super::*;
use crate::walredo::{WalRedoError, WalRedoManager};
use crate::PageServerConf;
use postgres_ffi::pg_constants;
use std::fs;
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Duration;
/// Arbitrary relation tag, for testing.
const TESTREL_A: RelTag = RelTag {
spcnode: 0,
dbnode: 111,
relnode: 1000,
forknum: 0,
};
/// Convenience function to create a BufferTag for testing.
    /// Helps keep the tests shorter.
#[allow(non_snake_case)]
fn TEST_BUF(blknum: u32) -> BufferTag {
BufferTag {
rel: TESTREL_A,
blknum,
}
}
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(8192, 0);
buf.freeze()
}
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
let conf = PageServerConf {
daemonize: false,
interactive: false,
gc_horizon: 64 * 1024 * 1024,
gc_period: Duration::from_secs(10),
listen_addr: "127.0.0.1:5430".parse().unwrap(),
workdir: repo_dir.into(),
pg_distrib_dir: "".into(),
};
        // Make a static copy of the config. This can never be freed, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let walredo_mgr = TestRedoManager {};
let repo = rocksdb::RocksRepository::new(conf, Arc::new(walredo_mgr));
Ok(Box::new(repo))
}
/// Test get_relsize() and truncation.
///
/// FIXME: The RocksRepository implementation returns wrong relation size, if
/// you make a request with an old LSN. It seems to ignore the requested LSN
/// and always return result as of latest LSN. For such cases, the expected
/// results below match the current RocksRepository behavior, so that the test
/// passes, and the actually correct answers are in comments like
/// "// CORRECT: <correct answer>"
#[test]
fn test_relsize() -> Result<()> {
// get_timeline() with non-existent timeline id should fail
//repo.get_timeline("11223344556677881122334455667788");
// Create timeline to work on
let repo = get_test_repo("test_relsize")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;
tline.init_valid_lsn(Lsn(1));
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"));
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"));
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"));
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"));
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"));
tline.advance_last_valid_lsn(Lsn(5));
        // rocksdb implementation erroneously returns 'true' here
assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(1))?, true); // CORRECT: false
// likewise, it returns wrong size here
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(1))?, 3); // CORRECT: 0 (or error?)
assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(2))?, true);
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(2))?, 3); // CORRECT: 1
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 3);
// Check page contents at each LSN
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(0), Lsn(2))?,
TEST_IMG("foo blk 0 at 2")
);
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(0), Lsn(3))?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(1), Lsn(4))?,
TEST_IMG("foo blk 1 at 4")
);
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(0), Lsn(5))?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(1), Lsn(5))?,
TEST_IMG("foo blk 1 at 4")
);
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
TEST_IMG("foo blk 2 at 5")
);
// Truncate last block
tline.put_truncation(TESTREL_A, Lsn(6), 2)?;
tline.advance_last_valid_lsn(Lsn(6));
// Check reported size and contents after truncation
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(6))?, 2);
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(0), Lsn(6))?,
TEST_IMG("foo blk 0 at 3")
);
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(1), Lsn(6))?,
TEST_IMG("foo blk 1 at 4")
);
// should still see the truncated block with older LSN
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 2); // CORRECT: 3
assert_eq!(
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
TEST_IMG("foo blk 2 at 5")
);
Ok(())
}
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
/// split into multiple 1 GB segments in Postgres.
///
    /// This isn't very interesting with the RocksDB implementation, as we don't pay
/// any attention to Postgres segment boundaries there.
#[test]
fn test_large_rel() -> Result<()> {
let repo = get_test_repo("test_large_rel")?;
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
let tline = repo.create_empty_timeline(timelineid)?;
tline.init_valid_lsn(Lsn(1));
let mut lsn = 0;
for i in 0..pg_constants::RELSEG_SIZE + 1 {
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
lsn += 1;
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img);
}
tline.advance_last_valid_lsn(Lsn(lsn));
assert_eq!(
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE + 1
);
// Truncate one block
lsn += 1;
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
tline.advance_last_valid_lsn(Lsn(lsn));
assert_eq!(
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE
);
// Truncate another block
lsn += 1;
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
tline.advance_last_valid_lsn(Lsn(lsn));
assert_eq!(
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
pg_constants::RELSEG_SIZE - 1
);
Ok(())
}
// Mock WAL redo manager that doesn't do much
struct TestRedoManager {}
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
tag: BufferTag,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for rel {} blk {} to get to {}, with {} and {} records",
tag.rel,
tag.blknum,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
}
}

View File

@@ -1,978 +0,0 @@
//
// A Repository holds all the different page versions and WAL records
//
// This implementation uses RocksDB to store WAL wal records and
// full page images, keyed by the RelFileNode, blocknumber, and the
// LSN.
use crate::repository::{BufferTag, RelTag, Repository, Timeline, WALRecord};
use crate::restore_local_repo::restore_timeline;
use crate::waldecoder::{Oid, TransactionId};
use crate::walredo::WalRedoManager;
use crate::PageServerConf;
use crate::ZTimelineId;
// use crate::PageServerConf;
// use crate::branches;
use anyhow::{bail, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use log::*;
use postgres_ffi::nonrelfile_utils::transaction_id_get_status;
use postgres_ffi::*;
use std::cmp::min;
use std::collections::HashMap;
use std::convert::TryInto;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;
use std::sync::{Arc, Mutex};
use std::thread;
use std::time::{Duration, Instant};
use zenith_utils::lsn::{AtomicLsn, Lsn};
use zenith_utils::seqwait::SeqWait;
// Timeout when waiting for the WAL receiver to catch up to an LSN given in a GetPage@LSN call.
static TIMEOUT: Duration = Duration::from_secs(60);
pub struct RocksRepository {
conf: &'static PageServerConf,
timelines: Mutex<HashMap<ZTimelineId, Arc<RocksTimeline>>>,
walredo_mgr: Arc<dyn WalRedoManager>,
}
pub struct RocksTimeline {
// RocksDB handle
db: rocksdb::DB,
// WAL redo manager
walredo_mgr: Arc<dyn WalRedoManager>,
// What page versions do we hold in the cache? If we get a request > last_valid_lsn,
// we need to wait until we receive all the WAL up to the request. The SeqWait
// provides functions for that. TODO: If we get a request for an old LSN, such that
// the versions have already been garbage collected away, we should throw an error,
// but we don't track that currently.
//
// last_record_lsn points to the end of last processed WAL record.
// It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
// after the end of last record, but not the whole next record yet. In the
// page cache, we care about last_valid_lsn, but if the WAL receiver needs to
// restart the streaming, it needs to restart at the end of last record, so
// we track them separately. last_record_lsn should perhaps be in
// walreceiver.rs instead of here, but it seems convenient to keep all three
// values together.
//
last_valid_lsn: SeqWait<Lsn>,
last_record_lsn: AtomicLsn,
// Counters, for metrics collection.
pub num_entries: AtomicU64,
pub num_page_images: AtomicU64,
pub num_wal_records: AtomicU64,
pub num_getpage_requests: AtomicU64,
}
//
// We store two kinds of entries in the repository:
//
// 1. Ready-made images of the block
// 2. WAL records, to be applied on top of the "previous" entry
//
// Some WAL records will initialize the page from scratch. For such records,
// the 'will_init' flag is set. They don't need the previous page image before
// applying. The 'will_init' flag is set for records containing a full-page image,
// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
// stored directly in the cache entry in that you still need to run the WAL redo
// routine to generate the page image.
//
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
struct CacheKey {
pub tag: BufferTag,
pub lsn: Lsn,
}
impl CacheKey {
fn pack(&self, buf: &mut BytesMut) {
self.tag.pack(buf);
buf.put_u64(self.lsn.0);
}
fn unpack(buf: &mut Bytes) -> CacheKey {
CacheKey {
tag: BufferTag::unpack(buf),
lsn: Lsn::from(buf.get_u64()),
}
}
fn from_slice(slice: &[u8]) -> Self {
let mut buf = Bytes::copy_from_slice(slice);
Self::unpack(&mut buf)
}
fn to_bytes(&self) -> BytesMut {
let mut buf = BytesMut::new();
self.pack(&mut buf);
buf
}
}
enum CacheEntryContent {
PageImage(Bytes),
WALRecord(WALRecord),
Truncation,
}
// The serialized representation of a CacheEntryContent begins with a
// single byte that indicates what kind of entry it is. There is also
// an UNUSED_VERSION_FLAG that is not represented in the CacheEntryContent
// at all; to read it, you must peek at the first byte of the serialized
// representation.
const CONTENT_PAGE_IMAGE: u8 = 1u8;
const CONTENT_WAL_RECORD: u8 = 2u8;
const CONTENT_TRUNCATION: u8 = 3u8;
const CONTENT_KIND_MASK: u8 = 3u8; // bitmask that covers the above
const UNUSED_VERSION_FLAG: u8 = 4u8;
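// Illustrative layout (not part of the original file, derived from pack() below)
// of a serialized PageImage entry:
//   byte 0:    CONTENT_PAGE_IMAGE, possibly OR'ed with UNUSED_VERSION_FLAG
//   bytes 1-2: image length as a u16
//   bytes 3..: the page image itself
// The GC loop and the RocksDB compaction filter test and set UNUSED_VERSION_FLAG
// directly on byte 0 of the stored value.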
impl CacheEntryContent {
pub fn pack(&self, buf: &mut BytesMut) {
match self {
CacheEntryContent::PageImage(image) => {
buf.put_u8(CONTENT_PAGE_IMAGE);
buf.put_u16(image.len() as u16);
buf.put_slice(&image[..]);
}
CacheEntryContent::WALRecord(rec) => {
buf.put_u8(CONTENT_WAL_RECORD);
rec.pack(buf);
}
CacheEntryContent::Truncation => {
buf.put_u8(CONTENT_TRUNCATION);
}
}
}
pub fn unpack(buf: &mut Bytes) -> CacheEntryContent {
let kind = buf.get_u8() & CONTENT_KIND_MASK;
match kind {
CONTENT_PAGE_IMAGE => {
let len = buf.get_u16() as usize;
let mut dst = vec![0u8; len];
buf.copy_to_slice(&mut dst);
CacheEntryContent::PageImage(Bytes::from(dst))
}
CONTENT_WAL_RECORD => CacheEntryContent::WALRecord(WALRecord::unpack(buf)),
CONTENT_TRUNCATION => CacheEntryContent::Truncation,
_ => unreachable!(),
}
}
fn from_slice(slice: &[u8]) -> Self {
let mut buf = Bytes::copy_from_slice(slice);
Self::unpack(&mut buf)
}
fn to_bytes(&self) -> BytesMut {
let mut buf = BytesMut::new();
self.pack(&mut buf);
buf
}
}
impl RocksRepository {
pub fn new(
conf: &'static PageServerConf,
walredo_mgr: Arc<dyn WalRedoManager>,
) -> RocksRepository {
RocksRepository {
            conf,
timelines: Mutex::new(HashMap::new()),
walredo_mgr,
}
}
}
// Get handle to a given timeline. It is assumed to already exist.
impl Repository for RocksRepository {
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
let timelines = self.timelines.lock().unwrap();
match timelines.get(&timelineid) {
Some(timeline) => Ok(timeline.clone()),
None => bail!("timeline not found"),
}
}
fn get_or_restore_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
let mut timelines = self.timelines.lock().unwrap();
match timelines.get(&timelineid) {
Some(timeline) => Ok(timeline.clone()),
None => {
let timeline = RocksTimeline::new(self.conf, timelineid, self.walredo_mgr.clone());
restore_timeline(self.conf, &timeline, timelineid)?;
let timeline_rc = Arc::new(timeline);
timelines.insert(timelineid, timeline_rc.clone());
if self.conf.gc_horizon != 0 {
let timeline_rc_copy = timeline_rc.clone();
let conf = self.conf;
let _gc_thread = thread::Builder::new()
.name("Garbage collection thread".into())
.spawn(move || {
// FIXME
timeline_rc_copy.do_gc(conf).expect("GC thread died");
})
.unwrap();
}
Ok(timeline_rc)
}
}
}
#[cfg(test)]
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
let mut timelines = self.timelines.lock().unwrap();
let timeline = RocksTimeline::new(&self.conf, timelineid, self.walredo_mgr.clone());
let timeline_rc = Arc::new(timeline);
let r = timelines.insert(timelineid, timeline_rc.clone());
assert!(r.is_none());
// don't start the garbage collector for unit tests, either.
Ok(timeline_rc)
}
}
impl RocksTimeline {
fn open_rocksdb(conf: &PageServerConf, timelineid: ZTimelineId) -> rocksdb::DB {
let path = conf.timeline_path(timelineid);
let mut opts = rocksdb::Options::default();
opts.create_if_missing(true);
opts.set_use_fsync(true);
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
opts.set_compaction_filter("ttl", move |_level: u32, _key: &[u8], val: &[u8]| {
if (val[0] & UNUSED_VERSION_FLAG) != 0 {
rocksdb::compaction_filter::Decision::Remove
} else {
rocksdb::compaction_filter::Decision::Keep
}
});
rocksdb::DB::open(&opts, &path).unwrap()
}
fn new(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
walredo_mgr: Arc<dyn WalRedoManager>,
) -> RocksTimeline {
RocksTimeline {
db: RocksTimeline::open_rocksdb(conf, timelineid),
walredo_mgr,
last_valid_lsn: SeqWait::new(Lsn(0)),
last_record_lsn: AtomicLsn::new(0),
num_entries: AtomicU64::new(0),
num_page_images: AtomicU64::new(0),
num_wal_records: AtomicU64::new(0),
num_getpage_requests: AtomicU64::new(0),
}
}
}
impl RocksTimeline {
///
/// Collect all the WAL records that are needed to reconstruct a page
/// image for the given cache entry.
///
/// Returns an old page image (if any), and a vector of WAL records to apply
/// over it.
///
fn collect_records_for_apply(
&self,
tag: BufferTag,
lsn: Lsn,
) -> (Option<Bytes>, Vec<WALRecord>) {
let key = CacheKey { tag, lsn };
let mut base_img: Option<Bytes> = None;
let mut records: Vec<WALRecord> = Vec::new();
let mut iter = self.db.raw_iterator();
iter.seek_for_prev(key.to_bytes());
// Scan backwards, collecting the WAL records, until we hit an
// old page image.
while iter.valid() {
let key = CacheKey::from_slice(iter.key().unwrap());
if key.tag != tag {
break;
}
let content = CacheEntryContent::from_slice(iter.value().unwrap());
if let CacheEntryContent::PageImage(img) = content {
// We have a base image. No need to dig deeper into the list of
// records
base_img = Some(img);
break;
} else if let CacheEntryContent::WALRecord(rec) = content {
records.push(rec.clone());
// If this WAL record initializes the page, no need to dig deeper.
if rec.will_init {
break;
}
} else {
panic!("no base image and no WAL record on cache entry");
}
iter.prev();
}
records.reverse();
(base_img, records)
}
// Internal functions
//
// Internal function to get relation size at given LSN.
//
// The caller must ensure that WAL has been received up to 'lsn'.
//
fn relsize_get_nowait(&self, rel: RelTag, lsn: Lsn) -> Result<u32> {
assert!(lsn <= self.last_valid_lsn.load());
let mut key = CacheKey {
tag: BufferTag {
rel,
blknum: u32::MAX,
},
lsn,
};
let mut iter = self.db.raw_iterator();
loop {
iter.seek_for_prev(key.to_bytes());
if iter.valid() {
let thiskey = CacheKey::from_slice(iter.key().unwrap());
if thiskey.tag.rel == rel {
let content = CacheEntryContent::from_slice(iter.value().unwrap());
if let CacheEntryContent::Truncation = content {
if thiskey.tag.blknum > 0 {
key.tag.blknum = thiskey.tag.blknum - 1;
continue;
}
break;
}
let relsize = thiskey.tag.blknum + 1;
debug!("Size of relation {} at {} is {}", rel, lsn, relsize);
return Ok(relsize);
}
}
break;
}
debug!("Size of relation {} at {} is zero", rel, lsn);
Ok(0)
}
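    // Illustrative summary (not part of the original file) of the GC pass below:
    // walk the whole key space backwards, one (rel, block) version chain at a
    // time; materialize the newest version (and the most recent version at or
    // before the GC horizon) as full page images via WAL redo; then flag every
    // older version with UNUSED_VERSION_FLAG so the compaction filter can drop it.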
fn do_gc(&self, conf: &'static PageServerConf) -> Result<Bytes> {
loop {
thread::sleep(conf.gc_period);
let last_lsn = self.get_last_valid_lsn();
// checked_sub() returns None on overflow.
if let Some(horizon) = last_lsn.checked_sub(conf.gc_horizon) {
let mut maxkey = CacheKey {
tag: BufferTag {
rel: RelTag {
spcnode: u32::MAX,
dbnode: u32::MAX,
relnode: u32::MAX,
forknum: u8::MAX,
},
blknum: u32::MAX,
},
lsn: Lsn::MAX,
};
let now = Instant::now();
let mut reconstructed = 0u64;
let mut truncated = 0u64;
let mut inspected = 0u64;
let mut deleted = 0u64;
loop {
let mut iter = self.db.raw_iterator();
iter.seek_for_prev(maxkey.to_bytes());
if iter.valid() {
let key = CacheKey::from_slice(iter.key().unwrap());
let v = iter.value().unwrap();
inspected += 1;
// Construct boundaries for old records cleanup
maxkey.tag = key.tag;
let last_lsn = key.lsn;
maxkey.lsn = min(horizon, last_lsn); // do not remove last version
let mut minkey = maxkey.clone();
minkey.lsn = Lsn(0); // first version
// Special handling of delete of PREPARE WAL record
if last_lsn < horizon
&& key.tag.rel.forknum == pg_constants::PG_TWOPHASE_FORKNUM
{
if (v[0] & UNUSED_VERSION_FLAG) == 0 {
let mut v = v.to_owned();
v[0] |= UNUSED_VERSION_FLAG;
self.db.put(key.to_bytes(), &v[..])?;
deleted += 1;
}
maxkey = minkey;
continue;
}
// reconstruct most recent page version
if (v[0] & CONTENT_KIND_MASK) == CONTENT_WAL_RECORD {
// force reconstruction of most recent page version
let (base_img, records) =
self.collect_records_for_apply(key.tag, key.lsn);
trace!(
"Reconstruct most recent page {} blk {} at {} from {} records",
key.tag.rel,
key.tag.blknum,
key.lsn,
records.len()
);
let new_img = self
.walredo_mgr
.request_redo(key.tag, key.lsn, base_img, records)?;
self.put_page_image(key.tag, key.lsn, new_img.clone());
reconstructed += 1;
}
iter.seek_for_prev(maxkey.to_bytes());
if iter.valid() {
// do not remove last version
if last_lsn > horizon {
// locate most recent record before horizon
let key = CacheKey::from_slice(iter.key().unwrap());
if key.tag == maxkey.tag {
let v = iter.value().unwrap();
if (v[0] & CONTENT_KIND_MASK) == CONTENT_WAL_RECORD {
let (base_img, records) =
self.collect_records_for_apply(key.tag, key.lsn);
trace!("Reconstruct horizon page {} blk {} at {} from {} records",
key.tag.rel, key.tag.blknum, key.lsn, records.len());
let new_img = self
.walredo_mgr
.request_redo(key.tag, key.lsn, base_img, records)?;
self.put_page_image(key.tag, key.lsn, new_img.clone());
truncated += 1;
} else {
trace!(
"Keeping horizon page {} blk {} at {}",
key.tag.rel,
key.tag.blknum,
key.lsn
);
}
}
} else {
trace!(
"Last page {} blk {} at {}, horizon {}",
key.tag.rel,
key.tag.blknum,
key.lsn,
horizon
);
}
// remove records prior to horizon
loop {
iter.prev();
if !iter.valid() {
break;
}
let key = CacheKey::from_slice(iter.key().unwrap());
if key.tag != maxkey.tag {
break;
}
let v = iter.value().unwrap();
if (v[0] & UNUSED_VERSION_FLAG) == 0 {
let mut v = v.to_owned();
v[0] |= UNUSED_VERSION_FLAG;
self.db.put(key.to_bytes(), &v[..])?;
deleted += 1;
trace!(
"deleted: {} blk {} at {}",
key.tag.rel,
key.tag.blknum,
key.lsn
);
} else {
break;
}
}
}
maxkey = minkey;
} else {
break;
}
}
info!("Garbage collection completed in {:?}:\n{} version chains inspected, {} pages reconstructed, {} version histories truncated, {} versions deleted",
now.elapsed(), inspected, reconstructed, truncated, deleted);
}
}
}
//
// Wait until WAL has been received up to the given LSN.
//
fn wait_lsn(&self, mut lsn: Lsn) -> Result<Lsn> {
// When invalid LSN is requested, it means "don't wait, return latest version of the page"
// This is necessary for bootstrap.
if lsn == Lsn(0) {
let last_valid_lsn = self.last_valid_lsn.load();
trace!(
"walreceiver doesn't work yet last_valid_lsn {}, requested {}",
last_valid_lsn,
lsn
);
lsn = last_valid_lsn;
}
//trace!("Start waiting for LSN {}, valid LSN is {}", lsn, self.last_valid_lsn.load());
self.last_valid_lsn
.wait_for_timeout(lsn, TIMEOUT)
.with_context(|| {
format!(
"Timed out while waiting for WAL record at LSN {} to arrive",
lsn
)
})?;
//trace!("Stop waiting for LSN {}, valid LSN is {}", lsn, self.last_valid_lsn.load());
Ok(lsn)
}
}
impl Timeline for RocksTimeline {
// Public GET interface functions
///
/// GetPage@LSN
///
/// Returns an 8k page image
///
fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: Lsn) -> Result<Bytes> {
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
let lsn = self.wait_lsn(req_lsn)?;
// Look up cache entry. If it's a page image, return that. If it's a WAL record,
// ask the WAL redo service to reconstruct the page image from the WAL records.
let key = CacheKey { tag, lsn };
let mut iter = self.db.raw_iterator();
iter.seek_for_prev(key.to_bytes());
if iter.valid() {
let key = CacheKey::from_slice(iter.key().unwrap());
if key.tag == tag {
let content = CacheEntryContent::from_slice(iter.value().unwrap());
let page_img: Bytes;
if let CacheEntryContent::PageImage(img) = content {
page_img = img;
} else if let CacheEntryContent::WALRecord(_rec) = content {
// Request the WAL redo manager to apply the WAL records for us.
let (base_img, records) = self.collect_records_for_apply(tag, lsn);
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
self.put_page_image(tag, lsn, page_img.clone());
} else {
// No base image, and no WAL record. Huh?
bail!("no page image or WAL record for requested page");
}
// FIXME: assumes little-endian. Only used for the debugging log though
let page_lsn_hi =
u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
let page_lsn_lo =
u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
debug!(
"Returning page with LSN {:X}/{:X} for {} blk {}",
page_lsn_hi, page_lsn_lo, tag.rel, tag.blknum
);
return Ok(page_img);
}
}
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
debug!(
"Page {} blk {} at {}({}) not found",
tag.rel, tag.blknum, req_lsn, lsn
);
Ok(Bytes::from_static(&ZERO_PAGE))
/* return Err("could not find page image")?; */
}
///
/// Get size of relation at given LSN.
///
fn get_relsize(&self, rel: RelTag, lsn: Lsn) -> Result<u32> {
let lsn = self.wait_lsn(lsn)?;
self.relsize_get_nowait(rel, lsn)
}
/// Get vector of prepared twophase transactions
fn get_twophase(&self, lsn: Lsn) -> Result<Vec<TransactionId>> {
let key = CacheKey {
// minimal key
tag: BufferTag {
rel: RelTag {
forknum: pg_constants::PG_TWOPHASE_FORKNUM,
spcnode: 0,
dbnode: 0,
relnode: 0,
},
blknum: 0,
},
lsn: Lsn(0),
};
let mut gxacts = Vec::new();
let mut iter = self.db.raw_iterator();
iter.seek(key.to_bytes());
while iter.valid() {
let key = CacheKey::from_slice(iter.key().unwrap());
if key.tag.rel.forknum != pg_constants::PG_TWOPHASE_FORKNUM {
break; // we are done with this fork
}
if key.lsn <= lsn {
let xid = key.tag.blknum;
let tag = BufferTag {
rel: RelTag {
forknum: pg_constants::PG_XACT_FORKNUM,
spcnode: 0,
dbnode: 0,
relnode: 0,
},
blknum: xid / pg_constants::CLOG_XACTS_PER_PAGE,
};
let clog_page = self.get_page_at_lsn(tag, lsn)?;
let status = transaction_id_get_status(xid, &clog_page[..]);
if status == pg_constants::TRANSACTION_STATUS_IN_PROGRESS {
gxacts.push(xid);
}
}
iter.next();
}
return Ok(gxacts);
}
    /// Get databases. This function is used to locate pg_filenode.map files
fn get_databases(&self, lsn: Lsn) -> Result<Vec<RelTag>> {
let key = CacheKey {
// minimal key
tag: BufferTag {
rel: RelTag {
forknum: pg_constants::PG_FILENODEMAP_FORKNUM,
spcnode: 0,
dbnode: 0,
relnode: 0,
},
blknum: 0,
},
lsn: Lsn(0),
};
let mut dbs = Vec::new();
let mut iter = self.db.raw_iterator();
iter.seek(key.to_bytes());
let mut prev_tag = key.tag.rel;
while iter.valid() {
let key = CacheKey::from_slice(iter.key().unwrap());
if key.tag.rel.forknum != pg_constants::PG_FILENODEMAP_FORKNUM {
break; // we are done with this fork
}
if key.tag.rel != prev_tag && key.lsn <= lsn {
prev_tag = key.tag.rel;
dbs.push(prev_tag); // collect unique tags
}
iter.next();
}
return Ok(dbs);
}
/// Get range [begin,end) of stored blocks. Used mostly for SMGR pseudorelations
/// but can be also applied to normal relations.
fn get_range(&self, rel: RelTag, lsn: Lsn) -> Result<(u32, u32)> {
let _lsn = self.wait_lsn(lsn)?;
let mut key = CacheKey {
// minimal key to start with
tag: BufferTag { rel, blknum: 0 },
lsn: Lsn(0),
};
let mut iter = self.db.raw_iterator();
iter.seek(key.to_bytes()); // locate first entry
if iter.valid() {
let thiskey = CacheKey::from_slice(iter.key().unwrap());
let tag = thiskey.tag;
if tag.rel == rel {
                // still traversing this relation
let first_blknum = tag.blknum;
key.tag.blknum = u32::MAX; // maximal key
let mut iter = self.db.raw_iterator();
            iter.seek_for_prev(key.to_bytes()); // locate last entry
if iter.valid() {
let thiskey = CacheKey::from_slice(iter.key().unwrap());
let last_blknum = thiskey.tag.blknum;
return Ok((first_blknum, last_blknum + 1)); // upper boundary is exclusive
}
}
}
Ok((0, 0)) // empty range
}
///
/// Does relation exist at given LSN?
///
    /// FIXME: this actually returns true if the relation exists at *any* LSN
fn get_relsize_exists(&self, rel: RelTag, req_lsn: Lsn) -> Result<bool> {
let lsn = self.wait_lsn(req_lsn)?;
let key = CacheKey {
tag: BufferTag {
rel,
blknum: u32::MAX,
},
lsn,
};
let mut iter = self.db.raw_iterator();
iter.seek_for_prev(key.to_bytes());
if iter.valid() {
let key = CacheKey::from_slice(iter.key().unwrap());
if key.tag.rel == rel {
debug!("Relation {} exists at {}", rel, lsn);
return Ok(true);
}
}
debug!("Relation {} doesn't exist at {}", rel, lsn);
Ok(false)
}
// Other public functions, for updating the repository.
// These are used by the WAL receiver and WAL redo.
///
/// Adds a WAL record to the repository
///
fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
let lsn = rec.lsn;
let key = CacheKey { tag, lsn };
let content = CacheEntryContent::WALRecord(rec);
let _res = self.db.put(key.to_bytes(), content.to_bytes());
trace!(
"put_wal_record rel {} blk {} at {}",
tag.rel,
tag.blknum,
lsn
);
self.num_entries.fetch_add(1, Ordering::Relaxed);
self.num_wal_records.fetch_add(1, Ordering::Relaxed);
}
///
    /// Adds a relation-wide WAL record (like truncate) to the repository,
    /// associating it with all pages starting at the specified block number
///
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()> {
// What was the size of the relation before this record?
let last_lsn = self.last_valid_lsn.load();
let old_rel_size = self.relsize_get_nowait(rel, last_lsn)?;
let content = CacheEntryContent::Truncation;
// set new relation size
trace!("Truncate relation {} to {} blocks at {}", rel, nblocks, lsn);
for blknum in nblocks..old_rel_size {
let key = CacheKey {
tag: BufferTag { rel, blknum },
lsn,
};
trace!("put_wal_record lsn: {}", key.lsn);
let _res = self.db.put(key.to_bytes(), content.to_bytes());
}
let n = (old_rel_size - nblocks) as u64;
self.num_entries.fetch_add(n, Ordering::Relaxed);
self.num_wal_records.fetch_add(n, Ordering::Relaxed);
Ok(())
}
///
/// Get page image at particular LSN
///
fn get_page_image(&self, tag: BufferTag, lsn: Lsn) -> Result<Option<Bytes>> {
let key = CacheKey { tag, lsn };
if let Some(bytes) = self.db.get(key.to_bytes())? {
let content = CacheEntryContent::from_slice(&bytes);
if let CacheEntryContent::PageImage(img) = content {
return Ok(Some(img));
}
}
return Ok(None);
}
///
/// Memorize a full image of a page version
///
fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes) {
let img_len = img.len();
let key = CacheKey { tag, lsn };
let content = CacheEntryContent::PageImage(img);
let mut val_buf = content.to_bytes();
// Zero size of page image indicates that page can be removed
if img_len == 0 {
if (val_buf[0] & UNUSED_VERSION_FLAG) != 0 {
// records already marked for deletion
return;
} else {
// delete truncated multixact page
val_buf[0] |= UNUSED_VERSION_FLAG;
}
}
trace!("put_wal_record lsn: {}", key.lsn);
let _res = self.db.put(key.to_bytes(), content.to_bytes());
trace!(
"put_page_image rel {} blk {} at {}",
tag.rel,
tag.blknum,
lsn
);
self.num_page_images.fetch_add(1, Ordering::Relaxed);
}
fn put_create_database(
&self,
lsn: Lsn,
db_id: Oid,
tablespace_id: Oid,
src_db_id: Oid,
src_tablespace_id: Oid,
) -> Result<()> {
let mut n = 0;
for forknum in &[
pg_constants::MAIN_FORKNUM,
pg_constants::FSM_FORKNUM,
pg_constants::VISIBILITYMAP_FORKNUM,
pg_constants::INIT_FORKNUM,
pg_constants::PG_FILENODEMAP_FORKNUM,
] {
let key = CacheKey {
tag: BufferTag {
rel: RelTag {
spcnode: src_tablespace_id,
dbnode: src_db_id,
relnode: 0,
forknum: *forknum,
},
blknum: 0,
},
lsn: Lsn(0),
};
let mut iter = self.db.raw_iterator();
iter.seek(key.to_bytes());
while iter.valid() {
let mut key = CacheKey::from_slice(iter.key().unwrap());
if key.tag.rel.spcnode != src_tablespace_id || key.tag.rel.dbnode != src_db_id {
break;
}
key.tag.rel.spcnode = tablespace_id;
key.tag.rel.dbnode = db_id;
key.lsn = lsn;
let v = iter.value().unwrap();
self.db.put(key.to_bytes(), v)?;
n += 1;
iter.next();
}
}
info!(
"Create database {}/{}, copy {} entries",
tablespace_id, db_id, n
);
Ok(())
}
/// Remember that WAL has been received and added to the timeline up to the given LSN
fn advance_last_valid_lsn(&self, lsn: Lsn) {
let lsn = Lsn((lsn.0 + 7) & !7); // align position on 8 bytes
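        // (Illustrative note, not in the original:) e.g. an LSN ending in 0x5 is
        // rounded up to the next multiple of 8, while an already-aligned LSN is
        // left unchanged, presumably because WAL record start positions are
        // MAXALIGN'ed (8 bytes on 64-bit platforms).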
let old = self.last_valid_lsn.advance(lsn);
// Can't move backwards.
if lsn < old {
warn!(
"attempted to move last valid LSN backwards (was {}, new {})",
old, lsn
);
}
}
///
/// Remember the (end of) last valid WAL record remembered for the timeline.
///
/// NOTE: this updates last_valid_lsn as well.
///
fn advance_last_record_lsn(&self, lsn: Lsn) {
let lsn = Lsn((lsn.0 + 7) & !7); // align position on 8 bytes
// Can't move backwards.
let old = self.last_record_lsn.fetch_max(lsn);
assert!(old <= lsn);
// Also advance last_valid_lsn
let old = self.last_valid_lsn.advance(lsn);
// Can't move backwards.
if lsn < old {
warn!(
"attempted to move last record LSN backwards (was {}, new {})",
old, lsn
);
}
}
fn get_last_record_lsn(&self) -> Lsn {
self.last_record_lsn.load()
}
fn init_valid_lsn(&self, lsn: Lsn) {
let old = self.last_valid_lsn.advance(lsn);
assert!(old == Lsn(0));
let old = self.last_record_lsn.fetch_max(lsn);
assert!(old == Lsn(0));
}
fn get_last_valid_lsn(&self) -> Lsn {
self.last_valid_lsn.load()
}
//
// Get statistics to be displayed in the user interface.
//
// FIXME
/*
fn get_stats(&self) -> TimelineStats {
TimelineStats {
num_entries: self.num_entries.load(Ordering::Relaxed),
num_page_images: self.num_page_images.load(Ordering::Relaxed),
num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
}
}
*/
}

View File

@@ -11,7 +11,11 @@
//
use log::*;
use regex::Regex;
use std::fmt;
use std::cmp::max;
use std::error::Error;
use std::fs;
use std::fs::File;
use std::io::Read;
@@ -22,24 +26,28 @@ use std::path::{Path, PathBuf};
use anyhow::Result;
use bytes::Bytes;
use crate::repository::{BufferTag, RelTag, Timeline};
use crate::waldecoder::{decode_wal_record, Oid, WalStreamDecoder};
use crate::page_cache;
use crate::page_cache::BufferTag;
use crate::page_cache::PageCache;
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
use crate::PageServerConf;
use crate::ZTimelineId;
use postgres_ffi::relfile_utils::*;
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use zenith_utils::lsn::Lsn;
///
/// Load all WAL and all relation data pages from local disk into the repository.
///
// From pg_tablespace_d.h
//
// FIXME: we'll probably need these elsewhere too, move to some common location
const DEFAULTTABLESPACE_OID: u32 = 1663;
const GLOBALTABLESPACE_OID: u32 = 1664;
//
// Load it all into the page cache.
//
pub fn restore_timeline(
conf: &PageServerConf,
timeline: &dyn Timeline,
timelineid: ZTimelineId,
pcache: &PageCache,
timeline: ZTimelineId,
) -> Result<()> {
let timelinepath = PathBuf::from("timelines").join(timelineid.to_string());
let timelinepath = PathBuf::from("timelines").join(timeline.to_string());
if !timelinepath.exists() {
anyhow::bail!("timeline {} does not exist in the page server's repository");
@@ -47,55 +55,48 @@ pub fn restore_timeline(
// Scan .zenith/timelines/<timeline>/snapshots
let snapshotspath = PathBuf::from("timelines")
.join(timelineid.to_string())
.join(timeline.to_string())
.join("snapshots");
let mut last_snapshot_lsn: Lsn = Lsn(0);
let mut last_snapshot_lsn: u64 = 0;
for direntry in fs::read_dir(&snapshotspath).unwrap() {
let direntry = direntry?;
let filename = direntry.file_name();
let lsn = Lsn::from_filename(&filename)?;
let filename = direntry.file_name().to_str().unwrap().to_owned();
let lsn = u64::from_str_radix(&filename, 16)?;
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
// FIXME: pass filename as Path instead of str?
let filename_str = filename.into_string().unwrap();
restore_snapshot(conf, timeline, timelineid, &filename_str)?;
info!("restored snapshot at {:?}", filename_str);
restore_snapshot(conf, pcache, timeline, &filename)?;
info!("restored snapshot at {}", filename);
}
if last_snapshot_lsn == Lsn(0) {
if last_snapshot_lsn == 0 {
error!(
"could not find valid snapshot in {}",
snapshotspath.display()
);
// TODO return error?
}
timeline.init_valid_lsn(last_snapshot_lsn);
pcache.init_valid_lsn(last_snapshot_lsn);
restore_wal(timeline, timelineid, last_snapshot_lsn)?;
restore_wal(conf, pcache, timeline, last_snapshot_lsn)?;
Ok(())
}
///
/// Find latest snapshot in a timeline's 'snapshots' directory
///
pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<u64> {
let snapshotspath = format!("timelines/{}/snapshots", timeline);
let mut last_snapshot_lsn = Lsn(0);
let mut last_snapshot_lsn = 0;
for direntry in fs::read_dir(&snapshotspath).unwrap() {
let filename = direntry.unwrap().file_name();
let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned();
if let Ok(lsn) = Lsn::from_filename(&filename) {
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
} else {
error!("unrecognized file in snapshots directory: {:?}", filename);
}
let lsn = u64::from_str_radix(&filename, 16)?;
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
}
if last_snapshot_lsn == Lsn(0) {
if last_snapshot_lsn == 0 {
error!("could not find valid snapshot in {}", &snapshotspath);
// TODO return error?
}
@@ -104,12 +105,12 @@ pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Re
fn restore_snapshot(
conf: &PageServerConf,
timeline: &dyn Timeline,
timelineid: ZTimelineId,
pcache: &PageCache,
timeline: ZTimelineId,
snapshot: &str,
) -> Result<()> {
let snapshotpath = PathBuf::from("timelines")
.join(timelineid.to_string())
.join(timeline.to_string())
.join("snapshots")
.join(snapshot);
@@ -120,34 +121,16 @@ fn restore_snapshot(
None => continue,
// These special files appear in the snapshot, but are not needed by the page server
Some("pg_control") => restore_nonrel_file(
conf,
timeline,
timelineid,
"0",
0,
0,
pg_constants::PG_CONTROLFILE_FORKNUM,
0,
&direntry.path(),
)?,
Some("pg_filenode.map") => restore_nonrel_file(
conf,
timeline,
timelineid,
snapshot,
pg_constants::GLOBALTABLESPACE_OID,
0,
pg_constants::PG_FILENODEMAP_FORKNUM,
0,
&direntry.path(),
)?,
Some("pg_control") => continue,
Some("pg_filenode.map") => continue,
// Load any relation files into the page server
_ => restore_relfile(
conf,
pcache,
timeline,
snapshot,
pg_constants::GLOBALTABLESPACE_OID,
GLOBALTABLESPACE_OID,
0,
&direntry.path(),
)?,
@@ -159,7 +142,7 @@ fn restore_snapshot(
for direntry in fs::read_dir(snapshotpath.join("base"))? {
let direntry = direntry?;
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
let dboid = u32::from_str_radix(direntry.file_name().to_str().unwrap(), 10)?;
for direntry in fs::read_dir(direntry.path())? {
let direntry = direntry?;
@@ -168,118 +151,65 @@ fn restore_snapshot(
// These special files appear in the snapshot, but are not needed by the page server
Some("PG_VERSION") => continue,
Some("pg_filenode.map") => restore_nonrel_file(
conf,
timeline,
timelineid,
snapshot,
pg_constants::DEFAULTTABLESPACE_OID,
dboid,
pg_constants::PG_FILENODEMAP_FORKNUM,
0,
&direntry.path(),
)?,
Some("pg_filenode.map") => continue,
// Load any relation files into the page server
_ => restore_relfile(
conf,
pcache,
timeline,
snapshot,
pg_constants::DEFAULTTABLESPACE_OID,
DEFAULTTABLESPACE_OID,
dboid,
&direntry.path(),
)?,
}
}
}
for entry in fs::read_dir(snapshotpath.join("pg_xact"))? {
let entry = entry?;
restore_slru_file(
conf,
timeline,
timelineid,
snapshot,
pg_constants::PG_XACT_FORKNUM,
&entry.path(),
)?;
}
for entry in fs::read_dir(snapshotpath.join("pg_multixact").join("members"))? {
let entry = entry?;
restore_slru_file(
conf,
timeline,
timelineid,
snapshot,
pg_constants::PG_MXACT_MEMBERS_FORKNUM,
&entry.path(),
)?;
}
for entry in fs::read_dir(snapshotpath.join("pg_multixact").join("offsets"))? {
let entry = entry?;
restore_slru_file(
conf,
timeline,
timelineid,
snapshot,
pg_constants::PG_MXACT_OFFSETS_FORKNUM,
&entry.path(),
)?;
}
for entry in fs::read_dir(snapshotpath.join("pg_twophase"))? {
let entry = entry?;
let xid = u32::from_str_radix(&entry.path().to_str().unwrap(), 16)?;
restore_nonrel_file(
conf,
timeline,
timelineid,
snapshot,
0,
0,
pg_constants::PG_TWOPHASE_FORKNUM,
xid,
&entry.path(),
)?;
}
// TODO: Scan pg_tblspc
Ok(())
}
fn restore_relfile(
timeline: &dyn Timeline,
_conf: &PageServerConf,
pcache: &PageCache,
_timeline: ZTimelineId,
snapshot: &str,
spcoid: Oid,
dboid: Oid,
spcoid: u32,
dboid: u32,
path: &Path,
) -> Result<()> {
let lsn = Lsn::from_hex(snapshot)?;
let lsn = u64::from_str_radix(snapshot, 16)?;
// Does it look like a relation file?
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
if let Err(e) = p {
if p.is_err() {
let e = p.unwrap_err();
warn!("unrecognized file in snapshot: {:?} ({})", path, e);
return Err(e.into());
return Err(e)?;
}
let (relnode, forknum, segno) = p.unwrap();
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
// FIXME: use constants (BLCKSZ)
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192);
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
let tag = BufferTag {
rel: RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode,
forknum,
},
blknum,
let tag = page_cache::BufferTag {
spcnode: spcoid,
dbnode: dboid,
relnode: relnode,
forknum: forknum as u8,
blknum: blknum,
};
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
/*
if oldest_lsn == 0 || p.lsn < oldest_lsn {
oldest_lsn = p.lsn;
@@ -303,138 +233,51 @@ fn restore_relfile(
blknum += 1;
}
Ok(())
}
fn restore_nonrel_file(
_conf: &PageServerConf,
timeline: &dyn Timeline,
_timelineid: ZTimelineId,
snapshot: &str,
spcoid: Oid,
dboid: Oid,
forknum: u8,
blknum: u32,
path: &Path,
) -> Result<()> {
let lsn = Lsn::from_hex(snapshot)?;
// Does it look like a relation file?
let mut file = File::open(path)?;
let mut buffer = Vec::new();
// read the whole file
file.read_to_end(&mut buffer)?;
let tag = BufferTag {
rel: RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode: 0,
forknum,
},
blknum,
let tag = page_cache::RelTag {
spcnode: spcoid,
dbnode: dboid,
relnode: relnode,
forknum: forknum as u8,
};
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buffer[..]));
Ok(())
}
fn restore_slru_file(
_conf: &PageServerConf,
timeline: &dyn Timeline,
_timelineid: ZTimelineId,
snapshot: &str,
forknum: u8,
path: &Path,
) -> Result<()> {
let lsn = Lsn::from_hex(snapshot)?;
// Does it look like a relation file?
let mut file = File::open(path)?;
let mut buf: [u8; 8192] = [0u8; 8192];
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
loop {
let r = file.read_exact(&mut buf);
match r {
Ok(_) => {
let tag = BufferTag {
rel: RelTag {
spcnode: 0,
dbnode: 0,
relnode: 0,
forknum,
},
blknum,
};
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
/*
if oldest_lsn == 0 || p.lsn < oldest_lsn {
oldest_lsn = p.lsn;
}
*/
}
// TODO: UnexpectedEof is expected
Err(e) => match e.kind() {
std::io::ErrorKind::UnexpectedEof => {
// reached EOF. That's expected.
// FIXME: maybe check that we read the full length of the file?
break;
}
_ => {
error!("error reading file: {:?} ({})", path, e);
break;
}
},
};
blknum += 1;
}
pcache.relsize_inc(&tag, blknum);
Ok(())
}
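Similarly, SLRU segment file names are the segment number in hex, and each segment holds pg_constants::SLRU_PAGES_PER_SEGMENT pages (32 in stock Postgres). A sketch of the filename-to-starting-block mapping used above (hypothetical values, not part of this diff):
// Sketch only: first block number covered by an SLRU segment file,
// assuming the usual SLRU_PAGES_PER_SEGMENT of 32.
fn slru_start_blknum(fname: &str) -> Result<u32, std::num::ParseIntError> {
    let segno = u32::from_str_radix(fname, 16)?;
    Ok(segno * 32)
}
// slru_start_blknum("000A") == Ok(320)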
// Scan WAL on a timeline, starting from given LSN, and load all the records
// Scan WAL on a timeline, starting from given LSN, and load all the records
// into the page cache.
fn restore_wal(timeline: &dyn Timeline, timelineid: ZTimelineId, startpoint: Lsn) -> Result<()> {
let walpath = format!("timelines/{}/wal", timelineid);
fn restore_wal(
_conf: &PageServerConf,
pcache: &PageCache,
timeline: ZTimelineId,
startpoint: u64,
) -> Result<()> {
let walpath = format!("timelines/{}/wal", timeline);
let mut waldecoder = WalStreamDecoder::new(startpoint);
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
let mut last_lsn = Lsn(0);
let mut checkpoint = CheckPoint::new(startpoint.0, 1);
let checkpoint_tag = BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM);
let pg_control_tag = BufferTag::fork(pg_constants::PG_CONTROLFILE_FORKNUM);
if let Some(pg_control_bytes) = timeline.get_page_image(pg_control_tag, Lsn(0))? {
let pg_control = decode_pg_control(pg_control_bytes)?;
checkpoint = pg_control.checkPointCopy.clone();
} else {
error!("No control file is found in reposistory");
}
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
let mut segno = XLByteToSeg(startpoint, 16 * 1024 * 1024);
let mut offset = XLogSegmentOffset(startpoint, 16 * 1024 * 1024);
let mut last_lsn = 0;
loop {
// FIXME: assume postgresql tli 1 for now
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
let filename = XLogFileName(1, segno, 16 * 1024 * 1024);
let mut path = walpath.clone() + "/" + &filename;
// The segment might exist only as a .partial file
if !PathBuf::from(&path).exists() {
path += ".partial";
path = path + ".partial";
}
// Slurp the WAL file
let open_result = File::open(&path);
if let Err(e) = &open_result {
if let Err(e) = open_result {
if e.kind() == std::io::ErrorKind::NotFound {
break;
}
return Err(e)?;
}
let mut file = open_result?;
let mut file = open_result.unwrap();
if offset > 0 {
file.seek(SeekFrom::Start(offset as u64))?;
@@ -442,7 +285,7 @@ fn restore_wal(timeline: &dyn Timeline, timelineid: ZTimelineId, startpoint: Lsn
let mut buf = Vec::new();
let nread = file.read_to_end(&mut buf)?;
if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
if nread != 16 * 1024 * 1024 - offset as usize {
// Maybe allow this for .partial files?
error!("read only {} bytes from WAL file", nread);
}
@@ -454,13 +297,35 @@ fn restore_wal(timeline: &dyn Timeline, timelineid: ZTimelineId, startpoint: Lsn
if rec.is_err() {
// Assume that an error means we've reached the end of
// a partial WAL record. So that's ok.
trace!("WAL decoder error {:?}", rec);
waldecoder.set_position(Lsn((segno + 1) * pg_constants::WAL_SEGMENT_SIZE as u64));
break;
}
if let Some((lsn, recdata)) = rec.unwrap() {
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
timeline.save_decoded_record(decoded, recdata, lsn)?;
let decoded = decode_wal_record(recdata.clone());
// Put the WAL record to the page cache. We make a separate copy of
// it for every block it modifies. (The actual WAL record is kept in
// a Bytes, which uses a reference counter for the underlying buffer,
// so having multiple copies of it doesn't cost that much)
for blk in decoded.blocks.iter() {
let tag = BufferTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
blknum: blk.blkno,
};
let rec = page_cache::WALRecord {
lsn: lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset,
};
pcache.put_wal_record(tag, rec);
}
// Now that this record has been handled, let the page cache know that
// it is up-to-date to this LSN
pcache.advance_last_valid_lsn(lsn);
last_lsn = lsn;
} else {
break;
@@ -468,16 +333,157 @@ fn restore_wal(timeline: &dyn Timeline, timelineid: ZTimelineId, startpoint: Lsn
nrecords += 1;
}
info!(
"restored {} records from WAL file {} at {}",
nrecords, filename, last_lsn
);
info!("restored {} records from WAL file {}", nrecords, filename);
segno += 1;
offset = 0;
}
info!("reached end of WAL at {}", last_lsn);
let checkpoint_bytes = encode_checkpoint(checkpoint);
timeline.put_page_image(checkpoint_tag, Lsn(0), checkpoint_bytes);
info!(
"reached end of WAL at {:X}/{:X}",
last_lsn >> 32,
last_lsn & 0xffffffff
);
Ok(())
}
// FIXME: copied from xlog_utils.rs
pub const XLOG_FNAME_LEN: usize = 24;
pub type XLogRecPtr = u64;
pub type XLogSegNo = u64;
pub type TimeLineID = u32;
#[allow(non_snake_case)]
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
}
#[allow(non_snake_case)]
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
return xlogptr / wal_segsz_bytes as u64;
}
#[allow(non_snake_case)]
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
return format!(
"{:>08X}{:>08X}{:>08X}",
tli,
logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
);
}
#[allow(non_snake_case)]
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
}
#[allow(non_snake_case)]
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
}
#[allow(non_snake_case)]
pub fn IsXLogFileName(fname: &str) -> bool {
return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
}
#[allow(non_snake_case)]
pub fn IsPartialXLogFileName(fname: &str) -> bool {
if let Some(basefname) = fname.strip_suffix(".partial") {
IsXLogFileName(basefname)
} else {
false
}
}
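As a worked example of the WAL-name helpers above (a sketch assuming the standard 16 MB segment size, not part of this diff):
// Sketch only: map an LSN to its segment number, in-segment offset and file name.
fn example_wal_file_name() {
    let lsn: XLogRecPtr = 0x1_6FD0_2010;
    let seg_size = 16 * 1024 * 1024;
    let segno = XLByteToSeg(lsn, seg_size); // 0x16F
    let offset = XLogSegmentOffset(lsn, seg_size); // 0x00D0_2010
    // timeline 1, xlogid 0x16F / 0x100 = 1, segment-within-id 0x16F % 0x100 = 0x6F
    let fname = XLogFileName(1, segno, seg_size);
    assert_eq!(fname, "00000001000000010000006F");
    assert_eq!(offset, 0x00D0_2010);
    assert!(IsXLogFileName(&fname));
}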
#[derive(Debug, Clone)]
struct FilePathError {
msg: String,
}
impl Error for FilePathError {
fn description(&self) -> &str {
&self.msg
}
}
impl FilePathError {
fn new(msg: &str) -> FilePathError {
FilePathError {
msg: msg.to_string(),
}
}
}
impl From<core::num::ParseIntError> for FilePathError {
fn from(e: core::num::ParseIntError) -> Self {
return FilePathError {
msg: format!("invalid filename: {}", e),
};
}
}
impl fmt::Display for FilePathError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "invalid filename")
}
}
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
match forkname {
// "main" is not in filenames, it's implicit if the fork name is not present
None => Ok(0),
Some("fsm") => Ok(1),
Some("vm") => Ok(2),
Some("init") => Ok(3),
Some(_) => Err(FilePathError::new("invalid forkname")),
}
}
#[derive(Debug)]
struct ParsedBaseImageFileName {
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
pub forknum: u32,
pub segno: u32,
pub lsn: u64,
}
// formats:
// <oid>
// <oid>_<fork name>
// <oid>.<segment number>
// <oid>_<fork name>.<segment number>
fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
let caps = re
.captures(fname)
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
let relnode_str = caps.name("relnode").unwrap().as_str();
let relnode = u32::from_str_radix(relnode_str, 10)?;
let forkname_match = caps.name("forkname");
let forkname = if forkname_match.is_none() {
None
} else {
Some(forkname_match.unwrap().as_str())
};
let forknum = forkname_to_forknum(forkname)?;
let segno_match = caps.name("segno");
let segno = if segno_match.is_none() {
0
} else {
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
};
return Ok((relnode, forknum, segno));
}
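A brief usage sketch of the four formats listed above, with a hypothetical relation OID of 16384 (not part of this diff):
// Sketch only: expected (relnode, forknum, segno) for each accepted format.
fn example_parse_relfilename() -> Result<(), FilePathError> {
    assert_eq!(parse_relfilename("16384")?, (16384, 0, 0)); // main fork
    assert_eq!(parse_relfilename("16384_fsm")?, (16384, 1, 0)); // free space map
    assert_eq!(parse_relfilename("16384.2")?, (16384, 0, 2)); // main fork, segment 2
    assert_eq!(parse_relfilename("16384_vm.1")?, (16384, 2, 1)); // visibility map, segment 1
    Ok(())
}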

View File

@@ -23,8 +23,6 @@ use tokio::runtime;
use futures::future;
use crate::{page_cache, PageServerConf};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::*;
struct Storage {
region: Region,
@@ -40,9 +38,12 @@ pub fn restore_main(conf: &PageServerConf) {
let result = restore_chunk(conf).await;
match result {
Ok(_) => {}
Ok(_) => {
return;
}
Err(err) => {
error!("S3 error: {}", err);
return;
}
}
});
@@ -129,12 +130,56 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
Ok(())
}
// From pg_tablespace_d.h
//
// FIXME: we'll probably need these elsewhere too, move to some common location
const DEFAULTTABLESPACE_OID: u32 = 1663;
const GLOBALTABLESPACE_OID: u32 = 1664;
#[derive(Debug)]
struct FilePathError {
msg: String,
}
impl FilePathError {
fn new(msg: &str) -> FilePathError {
FilePathError {
msg: msg.to_string(),
}
}
}
impl From<core::num::ParseIntError> for FilePathError {
fn from(e: core::num::ParseIntError) -> Self {
return FilePathError {
msg: format!("invalid filename: {}", e),
};
}
}
impl fmt::Display for FilePathError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "invalid filename")
}
}
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
match forkname {
// "main" is not in filenames, it's implicit if the fork name is not present
None => Ok(0),
Some("fsm") => Ok(1),
Some("vm") => Ok(2),
Some("init") => Ok(3),
Some(_) => Err(FilePathError::new("invalid forkname")),
}
}
#[derive(Debug)]
struct ParsedBaseImageFileName {
pub spcnode: u32,
pub dbnode: u32,
pub relnode: u32,
pub forknum: u8,
pub forknum: u32,
pub segno: u32,
pub lsn: u64,
@@ -146,7 +191,7 @@ struct ParsedBaseImageFileName {
// <oid>.<segment number>
// <oid>_<fork name>.<segment number>
fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();
let caps = re
@@ -154,23 +199,28 @@ fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
let relnode_str = caps.name("relnode").unwrap().as_str();
let relnode: u32 = relnode_str.parse()?;
let relnode = u32::from_str_radix(relnode_str, 10)?;
let forkname = caps.name("forkname").map(|f| f.as_str());
let forkname_match = caps.name("forkname");
let forkname = if forkname_match.is_none() {
None
} else {
Some(forkname_match.unwrap().as_str())
};
let forknum = forkname_to_forknum(forkname)?;
let segno_match = caps.name("segno");
let segno = if segno_match.is_none() {
0
} else {
segno_match.unwrap().as_str().parse::<u32>()?
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
};
let lsn_hi: u64 = caps.name("lsnhi").unwrap().as_str().parse()?;
let lsn_lo: u64 = caps.name("lsnlo").unwrap().as_str().parse()?;
let lsn_hi = u64::from_str_radix(caps.name("lsnhi").unwrap().as_str(), 16)?;
let lsn_lo = u64::from_str_radix(caps.name("lsnlo").unwrap().as_str(), 16)?;
let lsn = lsn_hi << 32 | lsn_lo;
Ok((relnode, forknum, segno, lsn))
return Ok((relnode, forknum, segno, lsn));
}
fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathError> {
@@ -194,20 +244,20 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
if let Some(fname) = path.strip_prefix("global/") {
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
Ok(ParsedBaseImageFileName {
spcnode: pg_constants::GLOBALTABLESPACE_OID,
return Ok(ParsedBaseImageFileName {
spcnode: GLOBALTABLESPACE_OID,
dbnode: 0,
relnode,
forknum,
segno,
lsn,
})
});
} else if let Some(dbpath) = path.strip_prefix("base/") {
let mut s = dbpath.split("/");
let dbnode_str = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
let dbnode: u32 = dbnode_str.parse()?;
let dbnode = u32::from_str_radix(dbnode_str, 10)?;
let fname = s
.next()
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
@@ -217,19 +267,19 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
Ok(ParsedBaseImageFileName {
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
return Ok(ParsedBaseImageFileName {
spcnode: DEFAULTTABLESPACE_OID,
dbnode,
relnode,
forknum,
segno,
lsn,
})
});
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
// TODO
Err(FilePathError::new("tablespaces not supported"))
return Err(FilePathError::new("tablespaces not supported"));
} else {
Err(FilePathError::new("invalid relation data file name"))
return Err(FilePathError::new("invalid relation data file name"));
}
}
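To make the path handling concrete, a minimal sketch (hypothetical OIDs, not part of this diff) of what parse_rel_file_path returns for a file under base/:
// Sketch only: database OID 13008 and relation OID 16384 are hypothetical.
fn example_parse_rel_file_path() -> Result<(), FilePathError> {
    let parsed = parse_rel_file_path("base/13008/16384_fsm_0000000116FD2030")?;
    assert_eq!(parsed.spcnode, 1663); // DEFAULTTABLESPACE_OID
    assert_eq!(parsed.dbnode, 13008);
    assert_eq!(parsed.relnode, 16384);
    assert_eq!(parsed.forknum, 1); // fsm
    assert_eq!(parsed.segno, 0);
    assert_eq!(parsed.lsn, 0x1_16FD_2030);
    Ok(())
}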
@@ -252,18 +302,17 @@ async fn slurp_base_file(
let mut bytes = BytesMut::from(data.as_slice()).freeze();
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
// FIXME: use constants (BLCKSZ)
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
let pcache = page_cache::get_pagecache(conf, sys_id);
while bytes.remaining() >= 8192 {
let tag = page_cache::BufferTag {
rel: page_cache::RelTag {
spcnode: parsed.spcnode,
dbnode: parsed.dbnode,
relnode: parsed.relnode,
forknum: parsed.forknum,
},
spcnode: parsed.spcnode,
dbnode: parsed.dbnode,
relnode: parsed.relnode,
forknum: parsed.forknum as u8,
blknum,
};

View File

@@ -31,7 +31,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
{
return true;
}
false
return false;
})
.fuse();
@@ -41,7 +41,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
{
return true;
}
false
return false;
})
.fuse();
@@ -52,7 +52,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
{
return true;
}
false
return false;
})
.fuse();
@@ -65,7 +65,7 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
{
return true;
}
false
return false;
})
.fuse();
@@ -84,11 +84,11 @@ pub fn init_logging() -> slog_scope::GlobalLoggerGuard {
return true;
}
false
return false;
})
.fuse();
let logger = slog::Logger::root(drain, slog::o!());
slog_scope::set_global_logger(logger)
return slog_scope::set_global_logger(logger);
}
pub fn ui_main() -> Result<(), Box<dyn Error>> {
@@ -171,11 +171,6 @@ pub fn ui_main() -> Result<(), Box<dyn Error>> {
})?;
// If the user presses 'q', quit.
// silence clippy's suggestion to rewrite this as an if-statement. Match
// makes more sense as soon as we get another command than 'q'.
#[allow(clippy::single_match)]
#[allow(clippy::collapsible_match)]
if let Event::Input(key) = events.next()? {
match key {
Key::Char('q') => {
@@ -234,7 +229,7 @@ impl<'a> Widget for LogWidget<'a> {
// Render a widget to show some metrics
struct MetricsWidget {}
fn _get_metric_u64(title: &str, value: u64) -> Spans {
fn get_metric_u64(title: &str, value: u64) -> Spans {
Spans::from(vec![
Span::styled(format!("{:<20}", title), Style::default()),
Span::raw(": "),
@@ -245,9 +240,7 @@ fn _get_metric_u64(title: &str, value: u64) -> Spans {
])
}
// This is not used since LSNs were removed from page cache stats.
// Maybe it will be used in the future?
fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
fn get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
Spans::from(vec![
Span::styled(format!("{:<20}", title), Style::default()),
Span::raw(": "),
@@ -255,6 +248,13 @@ fn _get_metric_str<'a>(title: &str, value: &'a str) -> Spans<'a> {
])
}
// FIXME: We really should define a datatype for LSNs, with Display trait and
// helper functions. There's one in tokio-postgres, but I don't think we want
// to rely on that.
fn format_lsn(lsn: u64) -> String {
return format!("{:X}/{:X}", lsn >> 32, lsn & 0xffff_ffff);
}
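The FIXME above asks for a dedicated LSN datatype; a minimal sketch of such a newtype with a Display impl in the same {hi:X}/{lo:X} convention (the other side of this diff gets this from zenith_utils::lsn::Lsn):
// Sketch only: an LSN newtype of the kind the FIXME above refers to.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Lsn(pub u64);

impl std::fmt::Display for Lsn {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:X}/{:X}", self.0 >> 32, self.0 & 0xffff_ffff)
    }
}
// format!("{}", Lsn(0x1_16FD_2030)) == "1/16FD2030"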
impl tui::widgets::Widget for MetricsWidget {
fn render(self, area: Rect, buf: &mut Buffer) {
let block = Block::default()
@@ -265,24 +265,17 @@ impl tui::widgets::Widget for MetricsWidget {
block.render(area, buf);
#[allow(unused_mut)]
let mut lines: Vec<Spans> = Vec::new();
// FIXME
//let page_cache_stats = crate::page_cache::get_stats();
// This is not used since LSNs were removed from page cache stats.
// Maybe it will be used in the future?
/*
let page_cache_stats = crate::page_cache::get_stats();
let lsnrange = format!(
"{} - {}",
page_cache_stats.first_valid_lsn, page_cache_stats.last_valid_lsn
format_lsn(page_cache_stats.first_valid_lsn),
format_lsn(page_cache_stats.last_valid_lsn)
);
let last_valid_recordlsn_str = page_cache_stats.last_record_lsn.to_string();
let last_valid_recordlsn_str = format_lsn(page_cache_stats.last_record_lsn);
lines.push(get_metric_str("Valid LSN range", &lsnrange));
lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str));
*/
/*
lines.push(get_metric_u64(
"# of cache entries",
page_cache_stats.num_entries,
@@ -299,7 +292,7 @@ impl tui::widgets::Widget for MetricsWidget {
"# of GetPage@LSN calls",
page_cache_stats.num_getpage_requests,
));
*/
let text = Text::from(lines);
Paragraph::new(text).render(inner_area, buf);

View File

@@ -76,8 +76,8 @@ impl Events {
};
Events {
rx,
input_handle,
ignore_exit_key,
input_handle,
tick_handle,
}
}

View File

@@ -51,7 +51,7 @@ impl Drain for TuiLogger {
events.pop_back();
}
Ok(())
return Ok(());
}
}

File diff suppressed because it is too large

View File

@@ -7,18 +7,13 @@
//!
use crate::page_cache;
use crate::repository::*;
use crate::waldecoder::*;
use crate::page_cache::BufferTag;
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
use crate::PageServerConf;
use crate::ZTimelineId;
use anyhow::Error;
use lazy_static::lazy_static;
use log::*;
use postgres::fallible_iterator::FallibleIterator;
use postgres::replication::ReplicationIter;
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
use postgres_ffi::xlog_utils::*;
use postgres_ffi::*;
use postgres_protocol::message::backend::ReplicationMessage;
use postgres_types::PgLsn;
use std::collections::HashMap;
@@ -29,9 +24,11 @@ use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Mutex;
use std::thread;
use std::thread::sleep;
use std::time::{Duration, SystemTime};
use zenith_utils::lsn::Lsn;
use tokio::runtime;
use tokio::time::{sleep, Duration};
use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
use tokio_stream::StreamExt;
//
// We keep one WAL Receiver active per timeline.
@@ -47,7 +44,7 @@ lazy_static! {
// Launch a new WAL receiver, or tell one that's running about change in connection string
pub fn launch_wal_receiver(
conf: &'static PageServerConf,
conf: &PageServerConf,
timelineid: ZTimelineId,
wal_producer_connstr: &str,
) {
@@ -64,10 +61,11 @@ pub fn launch_wal_receiver(
receivers.insert(timelineid, receiver);
// Also launch a new thread to handle this connection
let conf_copy = conf.clone();
let _walreceiver_thread = thread::Builder::new()
.name("WAL receiver thread".into())
.spawn(move || {
thread_main(conf, timelineid);
thread_main(&conf_copy, timelineid);
})
.unwrap();
}
@@ -88,124 +86,164 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
//
// This is the entry point for the WAL receiver thread.
//
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId) {
fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
info!(
"WAL receiver thread started for timeline : '{}'",
timelineid
);
//
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
// and start streaming WAL from it. If the connection is lost, keep retrying.
//
loop {
// Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
let runtime = runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr);
runtime.block_on(async {
loop {
// Look up the current WAL producer address
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
if let Err(e) = res {
info!(
"WAL streaming connection failed ({}), retrying in 1 second",
e
);
sleep(Duration::from_secs(1));
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr).await;
if let Err(e) = res {
info!(
"WAL streaming connection failed ({}), retrying in 1 second",
e
);
sleep(Duration::from_secs(1)).await;
}
}
}
});
}
fn walreceiver_main(
_conf: &PageServerConf,
async fn walreceiver_main(
conf: &PageServerConf,
timelineid: ZTimelineId,
wal_producer_connstr: &str,
) -> Result<(), Error> {
// Connect to the database in replication mode.
info!("connecting to {:?}", wal_producer_connstr);
let connect_cfg = format!("{} replication=true", wal_producer_connstr);
let mut rclient = Client::connect(&connect_cfg, NoTls)?;
let (rclient, connection) = tokio_postgres::connect(&connect_cfg, NoTls).await?;
info!("connected!");
let identify = identify_system(&mut rclient)?;
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
tokio::spawn(async move {
if let Err(e) = connection.await {
error!("connection error: {}", e);
}
});
let identify = identify_system(&rclient).await?;
info!("{:?}", identify);
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
let end_of_wal = u64::from(identify.xlogpos);
let mut caught_up = false;
let repository = page_cache::get_repository();
let timeline = repository.get_timeline(timelineid).unwrap();
let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
//
// Start streaming the WAL, from where we left off previously.
//
// If we had previously received WAL up to some point in the middle of a WAL record, we
// better start from the end of last full WAL record, not in the middle of one. Hence,
// use 'last_record_lsn' rather than 'last_valid_lsn' here.
let last_rec_lsn = timeline.get_last_record_lsn();
let mut startpoint = last_rec_lsn;
if startpoint == Lsn(0) {
error!("No previous WAL position");
}
startpoint = Lsn::max(
startpoint,
Lsn(end_of_wal.0 & !(pg_constants::WAL_SEGMENT_SIZE as u64 - 1)),
);
// There might be some padding after the last full record, skip it.
//
// FIXME: It probably would be better to always start streaming from the beginning
// of the page, or the segment, so that we could check the page/segment headers
// too. Just for the sake of paranoia.
startpoint += startpoint.calc_padding(8u32);
debug!(
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
last_rec_lsn, startpoint, timelineid, end_of_wal
);
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
let copy_stream = rclient.copy_both_simple(&query)?;
let mut physical_stream = ReplicationIter::new(copy_stream);
let mut waldecoder = WalStreamDecoder::new(startpoint);
let mut checkpoint = CheckPoint::new(startpoint.0, identify.timeline);
let checkpoint_tag = BufferTag::fork(pg_constants::PG_CHECKPOINT_FORKNUM);
if let Some(checkpoint_bytes) = timeline.get_page_image(checkpoint_tag, Lsn(0))? {
checkpoint = decode_checkpoint(checkpoint_bytes)?;
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
let mut startpoint = pcache.get_last_valid_lsn();
let last_valid_lsn = pcache.get_last_valid_lsn();
if startpoint == 0 {
// If we start here with identify.xlogpos we will have a race condition with
// postgres startup: an insert into postgres may request a page that was modified
// with an lsn smaller than identify.xlogpos.
//
// The current procedure for starting postgres will be changed anyway, to something
// like having an 'initdb' method on the pageserver (or importing some shared empty
// database snapshot), so for now just start at the beginning of the first segment,
// which seems to be a valid record.
pcache.init_valid_lsn(0x_1_000_000_u64);
startpoint = 0x_1_000_000_u64;
} else {
error!("No checkpoint record was found in reposistory");
// There might be some padding after the last full record, skip it.
//
// FIXME: It probably would be better to always start streaming from the beginning
// of the page, or the segment, so that we could check the page/segment headers
// too. Just for the sake of paranoia.
if startpoint % 8 != 0 {
startpoint += 8 - (startpoint % 8);
}
}
while let Some(replication_message) = physical_stream.next()? {
match replication_message {
debug!(
"last_valid_lsn {:X}/{:X} starting replication from {:X}/{:X} for timeline {}, server is at {:X}/{:X}...",
(last_valid_lsn >> 32),
(last_valid_lsn & 0xffffffff),
(startpoint >> 32),
(startpoint & 0xffffffff),
timelineid,
(end_of_wal >> 32),
(end_of_wal & 0xffffffff)
);
let startpoint = PgLsn::from(startpoint);
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
let copy_stream = rclient.copy_both_simple::<bytes::Bytes>(&query).await?;
let physical_stream = ReplicationStream::new(copy_stream);
tokio::pin!(physical_stream);
let mut waldecoder = WalStreamDecoder::new(u64::from(startpoint));
while let Some(replication_message) = physical_stream.next().await {
match replication_message? {
ReplicationMessage::XLogData(xlog_data) => {
// Pass the WAL data to the decoder, and see if we can decode
// more records as a result.
let data = xlog_data.data();
let startlsn = Lsn::from(xlog_data.wal_start());
let startlsn = xlog_data.wal_start();
let endlsn = startlsn + data.len() as u64;
write_wal_file(startlsn, timelineid, pg_constants::WAL_SEGMENT_SIZE, data)?;
write_wal_file(
startlsn,
timelineid,
16 * 1024 * 1024, // FIXME
data,
)?;
trace!("received XLogData between {} and {}", startlsn, endlsn);
trace!(
"received XLogData between {:X}/{:X} and {:X}/{:X}",
(startlsn >> 32),
(startlsn & 0xffffffff),
(endlsn >> 32),
(endlsn & 0xffffffff)
);
waldecoder.feed_bytes(data);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let old_checkpoint_bytes = encode_checkpoint(checkpoint);
let decoded = decode_wal_record(&mut checkpoint, recdata.clone());
timeline.save_decoded_record(decoded, recdata, lsn)?;
loop {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let decoded = decode_wal_record(recdata.clone());
// Put the WAL record to the page cache. We make a separate copy of
// it for every block it modifies. (The actual WAL record is kept in
// a Bytes, which uses a reference counter for the underlying buffer,
// so having multiple copies of it doesn't cost that much)
for blk in decoded.blocks.iter() {
let tag = BufferTag {
spcnode: blk.rnode_spcnode,
dbnode: blk.rnode_dbnode,
relnode: blk.rnode_relnode,
forknum: blk.forknum as u8,
blknum: blk.blkno,
};
let new_checkpoint_bytes = encode_checkpoint(checkpoint);
if new_checkpoint_bytes != old_checkpoint_bytes {
timeline.put_page_image(checkpoint_tag, Lsn(0), new_checkpoint_bytes);
let rec = page_cache::WALRecord {
lsn,
will_init: blk.will_init || blk.apply_image,
rec: recdata.clone(),
main_data_offset: decoded.main_data_offset,
};
pcache.put_wal_record(tag, rec);
}
// Now that this record has been handled, let the page cache know that
// it is up-to-date to this LSN
pcache.advance_last_record_lsn(lsn);
} else {
break;
}
// Now that this record has been handled, let the page cache know that
// it is up-to-date to this LSN
timeline.advance_last_record_lsn(lsn);
}
// Update the last_valid LSN value in the page cache one more time. We updated
@@ -214,10 +252,14 @@ fn walreceiver_main(
// better reflect that, because GetPage@LSN requests might also point in the
// middle of a record, if the request LSN was taken from the server's current
// flush ptr.
timeline.advance_last_valid_lsn(endlsn);
pcache.advance_last_valid_lsn(endlsn);
if !caught_up && endlsn >= end_of_wal {
info!("caught up at LSN {}", endlsn);
info!(
"caught up at LSN {:X}/{:X}",
(endlsn >> 32),
(endlsn & 0xffffffff)
);
caught_up = true;
}
}
@@ -228,28 +270,30 @@ fn walreceiver_main(
let reply_requested: bool = keepalive.reply() != 0;
trace!(
"received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})",
"received PrimaryKeepAlive(wal_end: {}, timestamp: {} reply: {})",
wal_end,
timestamp,
reply_requested,
);
if reply_requested {
// TODO: More thought should go into what values are sent here.
let last_lsn = PgLsn::from(u64::from(timeline.get_last_valid_lsn()));
let last_lsn = PgLsn::from(pcache.get_last_valid_lsn());
let write_lsn = last_lsn;
let flush_lsn = last_lsn;
let apply_lsn = PgLsn::from(0);
let ts = SystemTime::now();
let apply_lsn = PgLsn::INVALID;
let ts = PgTimestamp::now()?;
const NO_REPLY: u8 = 0u8;
physical_stream
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
.as_mut()
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)
.await?;
}
}
_ => (),
}
}
Ok(())
return Ok(());
}
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
@@ -272,9 +316,9 @@ pub struct IdentifySystem {
pub struct IdentifyError;
/// Run the postgres `IDENTIFY_SYSTEM` command
pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
pub async fn identify_system(client: &tokio_postgres::Client) -> Result<IdentifySystem, Error> {
let query_str = "IDENTIFY_SYSTEM";
let response = client.simple_query(query_str)?;
let response = client.simple_query(query_str).await?;
// get(N) from row, then parse it as some destination type.
fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
@@ -295,12 +339,68 @@ pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
dbname: get_parse(first_row, 3).ok(),
})
} else {
Err(IdentifyError.into())
Err(IdentifyError)?
}
}
pub const XLOG_FNAME_LEN: usize = 24;
pub const XLOG_BLCKSZ: usize = 8192;
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
pub const XLOG_PAGE_MAGIC: u16 = 0xD109;
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
pub type XLogRecPtr = u64;
pub type TimeLineID = u32;
pub type TimestampTz = u64;
pub type XLogSegNo = u64;
#[allow(non_snake_case)]
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
return (xlogptr as u32) & (wal_segsz_bytes as u32 - 1);
}
#[allow(non_snake_case)]
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
return (0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo;
}
#[allow(non_snake_case)]
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
return xlogptr / wal_segsz_bytes as u64;
}
#[allow(non_snake_case)]
pub fn XLogSegNoOffsetToRecPtr(
segno: XLogSegNo,
offset: u32,
wal_segsz_bytes: usize,
) -> XLogRecPtr {
return segno * (wal_segsz_bytes as u64) + (offset as u64);
}
#[allow(non_snake_case)]
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
return format!(
"{:>08X}{:>08X}{:>08X}",
tli,
logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
);
}
#[allow(non_snake_case)]
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
let seg = u32::from_str_radix(&fname[16..24], 16).unwrap() as XLogSegNo;
return (log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli);
}
fn write_wal_file(
startpos: Lsn,
startpos: XLogRecPtr,
timeline: ZTimelineId,
wal_seg_size: usize,
buf: &[u8],
@@ -309,12 +409,12 @@ fn write_wal_file(
let mut bytes_written: usize = 0;
let mut partial;
let mut start_pos = startpos;
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
const ZERO_BLOCK: &'static [u8] = &[0u8; XLOG_BLCKSZ];
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
/* Extract WAL location for this block */
let mut xlogoff = start_pos.segment_offset(wal_seg_size);
let mut xlogoff = XLogSegmentOffset(start_pos, wal_seg_size) as usize;
while bytes_left != 0 {
let bytes_to_write;
@@ -330,7 +430,7 @@ fn write_wal_file(
}
/* Open file */
let segno = start_pos.segment_number(wal_seg_size);
let segno = XLByteToSeg(start_pos, wal_seg_size);
let wal_file_name = XLogFileName(
1, // FIXME: always use Postgres timeline 1
segno,
@@ -382,7 +482,7 @@ fn write_wal_file(
xlogoff += bytes_to_write;
/* Did we reach the end of a WAL segment? */
if start_pos.segment_offset(wal_seg_size) == 0 {
if XLogSegmentOffset(start_pos, wal_seg_size) == 0 {
xlogoff = 0;
if partial {
fs::rename(&wal_file_partial_path, &wal_file_path)?;

View File

@@ -1,522 +1,255 @@
//!
//! WAL redo
//!
//! We rely on Postgres to perform WAL redo for us. We launch a
//! postgres process in special "wal redo" mode that's similar to
//! single-user mode. We then pass the previous page image, if any,
//! and all the WAL records we want to apply, to the postgres
//! process. Then we get the page image back. Communication with the
//! postgres process happens via stdin/stdout
//!
//! See src/backend/tcop/zenith_wal_redo.c for the other side of
//! this communication.
//!
//! TODO: Even though the postgres code runs in a separate process,
//! it's not a secure sandbox.
//!
use byteorder::{ByteOrder, LittleEndian};
use bytes::{Buf, BufMut, Bytes, BytesMut};
//
// WAL redo
//
// We rely on Postgres to perform WAL redo for us. We launch a
// postgres process in special "wal redo" mode that's similar to
// single-user mode. We then pass the previous page image, if any,
// and all the WAL records we want to apply, to the postgres
// process. Then we get the page image back. Communication with the
// postgres process happens via stdin/stdout
//
// See src/backend/tcop/zenith_wal_redo.c for the other side of
// this communication.
//
// TODO: Even though the postgres code runs in a separate process,
// it's not a secure sandbox.
//
use log::*;
use std::assert;
use std::cell::RefCell;
use std::fs;
use std::fs::OpenOptions;
use std::io::prelude::*;
use std::io::Error;
use std::path::{Path, PathBuf};
use std::process::Stdio;
use std::sync::mpsc;
use std::sync::Mutex;
use std::sync::Arc;
use std::time::Duration;
use std::time::Instant;
use tokio::io::AsyncBufReadExt;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::process::{ChildStdin, ChildStdout, Command};
use tokio::process::{Child, ChildStdin, ChildStdout, Command};
use tokio::runtime::Runtime;
use tokio::time::timeout;
use zenith_utils::lsn::Lsn;
use crate::repository::BufferTag;
use crate::repository::WALRecord;
use crate::waldecoder::{MultiXactId, XlMultiXactCreate};
use crate::PageServerConf;
use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
use postgres_ffi::pg_constants;
use postgres_ffi::xlog_utils::XLogRecord;
use bytes::{Buf, BufMut, Bytes, BytesMut};
///
/// WAL Redo Manager is responsible for replaying WAL records.
///
/// Callers use the WAL redo manager through this abstract interface,
/// which makes it easy to mock it in tests.
pub trait WalRedoManager: Send + Sync {
/// Apply some WAL records.
///
/// The caller passes an old page image, and WAL records that should be
/// applied over it. The return value is a new page image, after applying
/// the reords.
fn request_redo(
&self,
tag: BufferTag,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError>;
}
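Since the trait above exists partly so it can be mocked in tests, here is a minimal sketch of a no-op test double (not from the repository); it ignores the records and echoes the base image, or a zero page if none was given:
// Sketch only: a trivial WalRedoManager for tests.
struct DummyRedoManager;

impl WalRedoManager for DummyRedoManager {
    fn request_redo(
        &self,
        _tag: BufferTag,
        _lsn: Lsn,
        base_img: Option<Bytes>,
        _records: Vec<WALRecord>,
    ) -> Result<Bytes, WalRedoError> {
        Ok(base_img.unwrap_or_else(|| Bytes::from(vec![0u8; 8192])))
    }
}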
use crate::page_cache;
use crate::page_cache::CacheEntry;
use crate::page_cache::WALRecord;
use crate::ZTimelineId;
use crate::{page_cache::BufferTag, pg_constants, PageServerConf};
static TIMEOUT: Duration = Duration::from_secs(20);
///
/// The implementation consists of two parts: PostgresRedoManager, and
/// PostgresRedoManagerInternal. PostgresRedoManager is the public struct
/// that can be used to send redo requests to the manager.
/// PostgresRedoManagerInternal is used by the manager thread itself.
///
pub struct PostgresRedoManager {
request_tx: Mutex<mpsc::Sender<WalRedoRequest>>,
}
//
// Main entry point for the WAL applicator thread.
//
pub fn wal_redo_main(conf: &PageServerConf, timelineid: ZTimelineId) {
info!("WAL redo thread started {}", timelineid);
struct PostgresRedoManagerInternal {
conf: &'static PageServerConf,
// We block on waiting for requests on the walredo request channel, but
// use async I/O to communicate with the child process. Initialize the
// runtime for the async part.
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
request_rx: mpsc::Receiver<WalRedoRequest>,
}
let pcache = page_cache::get_pagecache(conf, timelineid).unwrap();
#[derive(Debug)]
struct WalRedoRequest {
tag: BufferTag,
lsn: Lsn,
// Loop forever, handling requests as they come.
let walredo_channel_receiver = &pcache.walredo_receiver;
loop {
let mut process: WalRedoProcess;
let datadir = format!("wal-redo/{}", timelineid);
base_img: Option<Bytes>,
records: Vec<WALRecord>,
response_channel: mpsc::Sender<Result<Bytes, WalRedoError>>,
}
/// An error happened in WAL redo
#[derive(Debug, thiserror::Error)]
pub enum WalRedoError {
#[error(transparent)]
IoError(#[from] std::io::Error),
}
///
/// Public interface of WAL redo manager
///
impl PostgresRedoManager {
///
/// Create a new PostgresRedoManager.
///
/// This launches a new thread to handle the requests.
pub fn new(conf: &'static PageServerConf) -> PostgresRedoManager {
let (tx, rx) = mpsc::channel();
//
// Launch the WAL redo thread
//
// Get mutable references to the values that we need to pass to the
// thread.
let request_rx = rx;
// Currently, the join handle is not saved anywhere and we
// won't try to restart the thread if it dies.
let _walredo_thread = std::thread::Builder::new()
.name("WAL redo thread".into())
.spawn(move || {
let mut internal = PostgresRedoManagerInternal { conf, request_rx };
internal.wal_redo_main();
})
.unwrap();
PostgresRedoManager {
request_tx: Mutex::new(tx),
info!("launching WAL redo postgres process {}", timelineid);
{
let _guard = runtime.enter();
process = WalRedoProcess::launch(&datadir, &runtime).unwrap();
}
}
}
impl WalRedoManager for PostgresRedoManager {
///
/// Request the WAL redo manager to apply some WAL records
///
/// The WAL redo is handled by a separate thread, so this just sends a request
/// to the thread and waits for response.
///
fn request_redo(
&self,
tag: BufferTag,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<WALRecord>,
) -> Result<Bytes, WalRedoError> {
// Create a channel where to receive the response
let (tx, rx) = mpsc::channel::<Result<Bytes, WalRedoError>>();
// Pretty arbitrarily, reuse the same Postgres process for 100 requests.
// After that, kill it and start a new one. This is mostly to avoid
// using up all shared buffers in Postgres's shared buffer cache; we don't
// want to write any pages to disk in the WAL redo process.
for _i in 1..100 {
let request = walredo_channel_receiver.recv().unwrap();
let request = WalRedoRequest {
tag,
lsn,
base_img,
records,
response_channel: tx,
};
self.request_tx
.lock()
.unwrap()
.send(request)
.expect("could not send WAL redo request");
rx.recv()
.expect("could not receive response to WAL redo request")
}
}
fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
return ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize;
}
fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
return (xid as u16) % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP
* pg_constants::MXACT_MEMBER_BITS_PER_XACT;
}
/* Location (byte offset within page) of TransactionId of given member */
fn mx_offset_to_member_offset(xid: MultiXactId) -> usize {
return mx_offset_to_flags_offset(xid)
+ (pg_constants::MULTIXACT_FLAGBYTES_PER_GROUP
+ (xid as u16 % pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP) * 4)
as usize;
}
///
/// WAL redo thread
///
impl PostgresRedoManagerInternal {
//
// Main entry point for the WAL applicator thread.
//
fn wal_redo_main(&mut self) {
info!("WAL redo thread started");
// We block on waiting for requests on the walredo request channel, but
// use async I/O to communicate with the child process. Initialize the
// runtime for the async part.
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.unwrap();
let process: PostgresRedoProcess;
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
// just create one with constant name. That fails if you try to launch more than
// one WAL redo manager concurrently.
let datadir = self.conf.workdir.join("wal-redo-datadir");
info!("launching WAL redo postgres process");
process = runtime
.block_on(PostgresRedoProcess::launch(&datadir))
.unwrap();
// Loop forever, handling requests as they come.
loop {
let request = self
.request_rx
.recv()
.expect("WAL redo request channel was closed");
let result = runtime.block_on(self.handle_apply_request(&process, &request));
let result_ok = result.is_ok();
// Send the result to the requester
let _ = request.response_channel.send(result);
if !result_ok {
error!("wal-redo-postgres failed to apply request {:?}", request);
let result = handle_apply_request(&pcache, &process, &runtime, request);
if result.is_err() {
// On error, kill the process.
break;
}
}
info!("killing WAL redo postgres process");
let _ = runtime.block_on(process.stdin.get_mut().shutdown());
let mut child = process.child;
drop(process.stdin);
let _ = runtime.block_on(child.wait());
}
}
fn transaction_id_set_status_bit(
xl_info: u8,
xl_rmid: u8,
xl_xid: u32,
record: WALRecord,
page: &mut BytesMut,
) {
let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
let mut status = 0;
if info == pg_constants::XLOG_XACT_COMMIT {
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
} else if info == pg_constants::XLOG_XACT_ABORT {
status = pg_constants::TRANSACTION_STATUS_ABORTED;
} else {
trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {:X}/{:X} main_data_offset {}, rec.len {}",
status,
record.lsn >> 32,
record.lsn & 0xffffffff,
record.main_data_offset, record.rec.len());
return;
}
///
/// Process one request for WAL redo.
///
async fn handle_apply_request(
&self,
process: &PostgresRedoProcess,
request: &WalRedoRequest,
) -> Result<Bytes, WalRedoError> {
let tag = request.tag;
let lsn = request.lsn;
let base_img = request.base_img.clone();
let records = &request.records;
trace!("handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort) lsn {:X}/{:X} main_data_offset {}, rec.len {}",
status,
record.lsn >> 32,
record.lsn & 0xffffffff,
record.main_data_offset, record.rec.len());
let nrecords = records.len();
let byteno: usize = ((xl_rmid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let start = Instant::now();
let byteptr = &mut page[byteno..byteno + 1];
let bshift: u8 = ((xl_xid % pg_constants::CLOG_XACTS_PER_BYTE)
* pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
let apply_result: Result<Bytes, Error>;
if tag.rel.forknum > pg_constants::INIT_FORKNUM {
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let mut page = BytesMut::new();
if let Some(fpi) = base_img {
page.extend_from_slice(&fpi[..]);
} else {
page.extend_from_slice(&ZERO_PAGE);
}
for record in records {
let mut buf = record.rec.clone();
let mut curval = byteptr[0];
curval = (curval >> bshift) & pg_constants::CLOG_XACT_BITMASK;
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let xlogrec = XLogRecord::from_bytes(&mut buf);
let mut byteval = [0];
byteval[0] = curval;
byteval[0] &= !(((1 << pg_constants::CLOG_BITS_PER_XACT as u8) - 1) << bshift);
byteval[0] |= status << bshift;
//move to main data
// TODO probably, we should store some records in our special format
// to avoid this weird parsing on replay
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
if buf.remaining() > skip {
buf.advance(skip);
}
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::CLOG_ZEROPAGE {
page.copy_from_slice(&ZERO_PAGE);
}
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
let mut status = 0;
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
if info == pg_constants::XLOG_XACT_COMMIT {
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
}
//handle subtrans
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag.blknum == blkno {
status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
transaction_id_set_status(subxact, status, &mut page);
}
}
}
if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
//TODO handle this too?
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
let xid = buf.get_u32_le();
transaction_id_set_status(xid, status, &mut page);
}
} else if info == pg_constants::XLOG_XACT_ABORT || info == pg_constants::XLOG_XACT_ABORT_PREPARED {
status = pg_constants::TRANSACTION_STATUS_ABORTED;
if info == pg_constants::XLOG_XACT_ABORT {
transaction_id_set_status(xlogrec.xl_xid, status, &mut page);
}
//handle subtrans
let _xact_time = buf.get_i64_le();
let mut xinfo = 0;
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
xinfo = buf.get_u32_le();
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
let _dbid = buf.get_u32_le();
let _tsid = buf.get_u32_le();
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
let nsubxacts = buf.get_i32_le();
for _i in 0..nsubxacts {
let subxact = buf.get_u32_le();
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
// only update xids on the requested page
if tag.blknum == blkno {
status = pg_constants::TRANSACTION_STATUS_ABORTED;
transaction_id_set_status(subxact, status, &mut page);
}
}
}
if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {
let spcnode = buf.get_u32_le();
let dbnode = buf.get_u32_le();
let relnode = buf.get_u32_le();
//TODO handle this too?
trace!(
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
spcnode,
dbnode,
relnode
);
}
}
if xinfo & pg_constants::XACT_XINFO_HAS_INVALS != 0 {
let nmsgs = buf.get_i32_le();
for _i in 0..nmsgs {
let sizeof_shared_invalidation_message = 0;
buf.advance(sizeof_shared_invalidation_message);
}
}
assert!((xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE) != 0);
let xid = buf.get_u32_le();
transaction_id_set_status(xid, status, &mut page);
}
} else if info == pg_constants::XLOG_XACT_PREPARE {
info!("Apply prepare {} record", xlogrec.xl_xid);
page.clear();
page.extend_from_slice(&buf[..]);
} else {
error!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
status,
record.lsn,
record.main_data_offset, record.rec.len());
}
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
page.copy_from_slice(&ZERO_PAGE);
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
page.copy_from_slice(&ZERO_PAGE);
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
if tag.rel.forknum == pg_constants::PG_MXACT_OFFSETS_FORKNUM {
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
* 4) as usize;
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
} else {
assert!(tag.rel.forknum == pg_constants::PG_MXACT_MEMBERS_FORKNUM);
for i in 0..xlrec.nmembers {
let blkno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
if blkno == tag.blknum {
// update only target block
let offset = xlrec.moff + i;
let memberoff = mx_offset_to_member_offset(offset);
let flagsoff = mx_offset_to_flags_offset(offset);
let bshift = mx_offset_to_flags_bitshift(offset);
let mut flagsval =
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
flagsval &=
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
<< bshift);
flagsval |= xlrec.members[i as usize].status << bshift;
LittleEndian::write_u32(
&mut page[flagsoff..flagsoff + 4],
flagsval,
);
LittleEndian::write_u32(
&mut page[memberoff..memberoff + 4],
xlrec.members[i as usize].xid,
);
}
}
}
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
// empty page image indicates that this SLRU page is truncated and can be removed by GC
page.clear();
} else {
assert!(false);
}
} else if xlogrec.xl_rmid == pg_constants::RM_RELMAP_ID {
page.clear();
page.extend_from_slice(&buf[12..]); // skip xl_relmap_update
assert!(page.len() == 512); // size of pg_filenode.map
byteptr.copy_from_slice(&byteval);
trace!(
"xl_xid {} byteno {} curval {} byteval {}",
xl_xid,
byteno,
curval,
byteval[0]
);
}
fn handle_apply_request(
pcache: &page_cache::PageCache,
process: &WalRedoProcess,
runtime: &Runtime,
entry_rc: Arc<CacheEntry>,
) -> Result<(), Error> {
let tag = entry_rc.key.tag;
let lsn = entry_rc.key.lsn;
let (base_img, records) = pcache.collect_records_for_apply(entry_rc.as_ref());
let mut entry = entry_rc.content.lock().unwrap();
entry.apply_pending = false;
let nrecords = records.len();
let start = Instant::now();
let apply_result: Result<Bytes, Error>;
if tag.forknum == pg_constants::PG_XACT_FORKNUM as u8 {
//TODO use base image if any
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
let zero_page_bytes: &[u8] = &ZERO_PAGE;
let mut page = BytesMut::from(zero_page_bytes);
for record in records {
let mut buf = record.rec.clone();
// 1. Parse XLogRecord struct
// FIXME: refactor to avoid code duplication.
let _xl_tot_len = buf.get_u32_le();
let xl_xid = buf.get_u32_le();
let _xl_prev = buf.get_u64_le();
let xl_info = buf.get_u8();
let xl_rmid = buf.get_u8();
buf.advance(2); // 2 bytes of padding
let _xl_crc = buf.get_u32_le();
if xl_rmid == pg_constants::RM_CLOG_ID {
let info = xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::CLOG_ZEROPAGE {
page.clone_from_slice(zero_page_bytes);
trace!("handle_apply_request for RM_CLOG_ID-CLOG_ZEROPAGE lsn {:X}/{:X} main_data_offset {}, rec.len {}",
record.lsn >> 32,
record.lsn & 0xffffffff,
record.main_data_offset, record.rec.len());
}
} else if xl_rmid == pg_constants::RM_XACT_ID {
transaction_id_set_status_bit(xl_info, xl_rmid, xl_xid, record, &mut page);
}
apply_result = Ok::<Bytes, Error>(page.freeze());
} else {
apply_result = process.apply_wal_records(tag, base_img, records).await;
}
let duration = start.elapsed();
let result: Result<Bytes, WalRedoError>;
trace!(
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
nrecords,
duration.as_millis(),
lsn
);
if let Err(e) = apply_result {
error!("could not apply WAL records: {}", e);
result = Err(WalRedoError::IoError(e));
} else {
let img = apply_result.unwrap();
result = Ok(img);
}
// The caller is responsible for sending the response
result
apply_result = Ok::<Bytes, Error>(page.freeze());
} else {
apply_result = process.apply_wal_records(runtime, tag, base_img, records);
}
let duration = start.elapsed();
let result;
debug!(
"applied {} WAL records in {} ms to reconstruct page image at LSN {:X}/{:X}",
nrecords,
duration.as_millis(),
lsn >> 32,
lsn & 0xffff_ffff
);
if let Err(e) = apply_result {
error!("could not apply WAL records: {}", e);
result = Err(e);
} else {
entry.page_image = Some(apply_result.unwrap());
pcache
.num_page_images
.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
result = Ok(());
}
// Wake up the requester, whether the operation succeeded or not.
entry_rc.walredo_condvar.notify_all();
return result;
}
struct PostgresRedoProcess {
struct WalRedoProcess {
child: Child,
stdin: RefCell<ChildStdin>,
stdout: RefCell<ChildStdout>,
}
impl PostgresRedoProcess {
impl WalRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
// Tests that run the pageserver binary set the proper PG_BIN_DIR
// and PG_LIB_DIR so that WalRedo starts the right postgres. We may later
// switch to setting the same things in the pageserver config file.
async fn launch(datadir: &Path) -> Result<PostgresRedoProcess, Error> {
// Create empty data directory for wal-redo postgres, deleting old one first.
if datadir.exists() {
info!("directory {:?} exists, removing", &datadir);
if let Err(e) = fs::remove_dir_all(&datadir) {
error!("could not remove old wal-redo-datadir: {:?}", e);
}
}
info!("running initdb in {:?}", datadir.display());
let initdb = Command::new("initdb")
.args(&["-D", datadir.to_str().unwrap()])
.arg("-N")
.output()
.await
fn launch(datadir: &str, runtime: &Runtime) -> Result<WalRedoProcess, Error> {
// Create an empty data directory for wal-redo postgres, deleting the old one first.
fs::remove_dir_all(datadir).ok();
let initdb = runtime
.block_on(
Command::new("initdb")
.args(&["-D", datadir])
.arg("-N")
.output(),
)
.expect("failed to execute initdb");
if !initdb.status.success() {
@@ -525,16 +258,8 @@ impl PostgresRedoProcess {
std::str::from_utf8(&initdb.stdout).unwrap(),
std::str::from_utf8(&initdb.stderr).unwrap()
);
} else {
// Limit shared cache for wal-redo postgres
let mut config = OpenOptions::new()
.append(true)
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
config.write_all(b"shared_buffers=128kB\n")?;
config.write_all(b"fsync=off\n")?;
config.write_all(b"shared_preload_libraries=zenith\n")?;
config.write_all(b"zenith.wal_redo=on\n")?;
}
// Start postgres itself
let mut child = Command::new("postgres")
.arg("--wal-redo")
@@ -545,10 +270,7 @@ impl PostgresRedoProcess {
.spawn()
.expect("postgres --wal-redo command failed to start");
info!(
"launched WAL redo postgres process on {:?}",
datadir.display()
);
info!("launched WAL redo postgres process on {}", datadir);
let stdin = child.stdin.take().expect("failed to open child's stdin");
let stderr = child.stderr.take().expect("failed to open child's stderr");
@@ -568,14 +290,15 @@ impl PostgresRedoProcess {
if res.unwrap() == 0 {
break;
}
error!("wal-redo-postgres: {}", line.trim());
debug!("wal-redo-postgres: {}", line.trim());
line.clear();
}
Ok::<(), Error>(())
};
tokio::spawn(f_stderr);
Ok(PostgresRedoProcess {
Ok(WalRedoProcess {
child,
stdin: RefCell::new(stdin),
stdout: RefCell::new(stdout),
})
@@ -585,132 +308,146 @@ impl PostgresRedoProcess {
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
async fn apply_wal_records(
fn apply_wal_records(
&self,
runtime: &Runtime,
tag: BufferTag,
base_img: Option<Bytes>,
records: &[WALRecord],
) -> Result<Bytes, std::io::Error> {
records: Vec<WALRecord>,
) -> Result<Bytes, Error> {
let mut stdin = self.stdin.borrow_mut();
let mut stdout = self.stdout.borrow_mut();
// We do three things simultaneously: send the old base image and WAL records to
// the child process's stdin, read the result from child's stdout, and forward any logging
// information that the child writes to its stderr to the page server's log.
//
// 'f_stdin' handles writing the base image and WAL records to the child process.
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
// tokio runtime in the 'launch' function already, forwards the logging.
let f_stdin = async {
// Send base image, if any. (If the record initializes the page, previous page
// version is not needed.)
timeout(
TIMEOUT,
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
)
.await??;
if base_img.is_some() {
return runtime.block_on(async {
//
// This async block sends all the commands to the process.
//
// For reasons I don't understand, this needs to be a "move" block;
// otherwise the stdin pipe doesn't get closed, despite the shutdown()
// call.
//
let f_stdin = async {
// Send base image, if any. (If the record initializes the page, previous page
// version is not needed.)
timeout(
TIMEOUT,
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
)
.await??;
}
if base_img.is_some() {
timeout(
TIMEOUT,
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
)
.await??;
}
// Send WAL records.
for rec in records.iter() {
let r = rec.clone();
// Send WAL records.
for rec in records.iter() {
let r = rec.clone();
stdin
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;
stdin
.write_all(&build_apply_record_msg(r.lsn, r.rec))
.await?;
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
// r.lsn >> 32, r.lsn & 0xffff_ffff);
}
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
// r.lsn >> 32, r.lsn & 0xffff_ffff);
}
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
// Send GetPage command to get the result back
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
timeout(TIMEOUT, stdin.flush()).await??;
//debug!("sent GetPage for {}", tag.blknum);
Ok::<(), Error>(())
};
// Send GetPage command to get the result back
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
timeout(TIMEOUT, stdin.flush()).await??;
//debug!("sent GetPage for {}", tag.blknum);
Ok::<(), Error>(())
};
// Read back new page image
let f_stdout = async {
let mut buf = [0u8; 8192];
// Read back new page image
let f_stdout = async {
let mut buf = [0u8; 8192];
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
//debug!("got response for {}", tag.blknum);
Ok::<[u8; 8192], Error>(buf)
};
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
//debug!("got response for {}", tag.blknum);
Ok::<[u8; 8192], Error>(buf)
};
let res = tokio::try_join!(f_stdout, f_stdin)?;
// Kill the process. This closes its stdin, which should signal the process
// to terminate. TODO: SIGKILL if needed
//child.wait();
let buf = res.0;
let res = futures::try_join!(f_stdout, f_stdin)?;
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
let buf = res.0;
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
});
}
}
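To make the stdin/stdout choreography described in the comments above easier to follow, here is a minimal Python sketch of the same pattern: write the request and read the reply concurrently over a child's pipes, under a timeout. It is illustrative only; it uses `cat` as a stand-in child instead of the real `postgres --wal-redo` process, and the 20-second timeout is an arbitrary example value.
```
import asyncio

async def round_trip(request: bytes, timeout: float = 20.0) -> bytes:
    # Spawn a stand-in child ('cat' simply echoes its input back).
    proc = await asyncio.create_subprocess_exec(
        'cat',
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
    )

    async def send():
        # Analogue of f_stdin: push the whole request, then close the pipe.
        proc.stdin.write(request)
        await proc.stdin.drain()
        proc.stdin.close()

    async def recv():
        # Analogue of f_stdout: read back exactly one page-sized reply.
        return await proc.stdout.readexactly(len(request))

    # Like tokio::try_join!, run both directions concurrently under a timeout.
    reply, _ = await asyncio.wait_for(asyncio.gather(recv(), send()), timeout)
    await proc.wait()
    return reply

if __name__ == '__main__':
    page = asyncio.run(round_trip(b'\x00' * 8192))
    assert len(page) == 8192
```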
// Functions for constructing messages to send to the postgres WAL redo
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
// explanation of the protocol.
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
let len = 4 + 1 + 4 * 4;
let len = 4 + 5 * 4;
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'B');
buf.put_u8('B' as u8);
buf.put_u32(len as u32);
tag.pack(&mut buf);
buf.put_u32(tag.spcnode);
buf.put_u32(tag.dbnode);
buf.put_u32(tag.relnode);
buf.put_u32(tag.forknum as u32);
buf.put_u32(tag.blknum);
assert!(buf.len() == 1 + len);
buf.freeze()
return buf.freeze();
}
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
assert!(base_img.len() == 8192);
let len = 4 + 1 + 4 * 4 + base_img.len();
let len = 4 + 5 * 4 + base_img.len();
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'P');
buf.put_u8('P' as u8);
buf.put_u32(len as u32);
tag.pack(&mut buf);
buf.put_u32(tag.spcnode);
buf.put_u32(tag.dbnode);
buf.put_u32(tag.relnode);
buf.put_u32(tag.forknum as u32);
buf.put_u32(tag.blknum);
buf.put(base_img);
assert!(buf.len() == 1 + len);
buf.freeze()
return buf.freeze();
}
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
fn build_apply_record_msg(endlsn: u64, rec: Bytes) -> Bytes {
let len = 4 + 8 + rec.len();
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'A');
buf.put_u8('A' as u8);
buf.put_u32(len as u32);
buf.put_u64(endlsn.0);
buf.put_u64(endlsn);
buf.put(rec);
assert!(buf.len() == 1 + len);
buf.freeze()
return buf.freeze();
}
fn build_get_page_msg(tag: BufferTag) -> Bytes {
let len = 4 + 1 + 4 * 4;
let len = 4 + 5 * 4;
let mut buf = BytesMut::with_capacity(1 + len);
buf.put_u8(b'G');
buf.put_u8('G' as u8);
buf.put_u32(len as u32);
tag.pack(&mut buf);
buf.put_u32(tag.spcnode);
buf.put_u32(tag.dbnode);
buf.put_u32(tag.relnode);
buf.put_u32(tag.forknum as u32);
buf.put_u32(tag.blknum);
assert!(buf.len() == 1 + len);
buf.freeze()
return buf.freeze();
}
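As a rough cross-check of the framing these helpers produce (an illustrative sketch only, not part of this patch), the same 'B' message can be assembled in Python; `put_u8`/`put_u32` in the `bytes` crate write big-endian, and the spcnode/dbnode/relnode/forknum/blknum values below are arbitrary examples.
```
import struct

def begin_redo_for_block_msg(spcnode, dbnode, relnode, forknum, blknum):
    # Layout mirrors build_begin_redo_for_block_msg above:
    # 1-byte tag 'B', then a big-endian u32 length that covers the length
    # field itself plus the five u32 buffer-tag fields.
    body = struct.pack('>IIIII', spcnode, dbnode, relnode, forknum, blknum)
    length = 4 + len(body)
    return b'B' + struct.pack('>I', length) + body

msg = begin_redo_for_block_msg(1663, 13010, 16384, 0, 0)
assert len(msg) == 1 + 4 + 5 * 4   # same check as the Rust assert
```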

33
pgbuild.sh Executable file
View File

@@ -0,0 +1,33 @@
#!/bin/sh
#
# The purpose of this script is to build and install postgres in a local directory
# so that zenith integration tests can find pg binaries and support files.
#
# ./pgbuild.sh does the following:
#
# 1) runs an out-of-source build of postgres in the REPO_ROOT/tmp_install/build directory (I'm reusing
# the tmp_install path here since it is already present in .gitignore)
#
# 2) installs postgres to REPO_ROOT/tmp_install/
#
# Halt immediately if any command fails
set -e
REPO_ROOT=$(dirname "$0")
REPO_ROOT="`( cd \"$REPO_ROOT\" && pwd )`"
# configure
echo "Configuring postgres build"
mkdir -p $REPO_ROOT/tmp_install/build
cd $REPO_ROOT/tmp_install/build
../../vendor/postgres/configure CFLAGS='-O0' --enable-debug --enable-cassert \
--enable-depend --with-libxml --prefix=/ > configure.log
# compile
echo "Compiling postgres"
make -j8 -s
export DESTDIR=$REPO_ROOT/tmp_install
echo "Installing postgres to $DESTDIR"
make install -s

View File

@@ -9,16 +9,11 @@ edition = "2018"
[dependencies]
chrono = "0.4.19"
rand = "0.8.3"
regex = "1.4.5"
bytes = "1.0.1"
byteorder = "1.4.3"
anyhow = "1.0"
crc32c = "0.6.0"
hex = "0.4.3"
lazy_static = "1.4"
log = "0.4.14"
thiserror = "1.0"
workspace_hack = { path = "../workspace_hack" }
[build-dependencies]
bindgen = "0.57"
bindgen = "0.53.1"

View File

@@ -1,3 +0,0 @@
This module contains utility functions for interacting with PostgreSQL
file formats.

View File

@@ -18,15 +18,13 @@ fn main() {
// included header files changed.
.parse_callbacks(Box::new(bindgen::CargoCallbacks))
.whitelist_type("ControlFileData")
.whitelist_type("CheckPoint")
.whitelist_type("FullTransactionId")
.whitelist_var("PG_CONTROL_FILE_SIZE")
.whitelist_var("PG_CONTROLFILEDATA_OFFSETOF_CRC")
.whitelist_type("DBState")
// Path to the server include dir. It is in tmp_install/include/server, if you did
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
// and used DESTDIR to move it into tmp_install, then it's in
// tmp_install/include/postgres/server
// tmp_install/include/postgres/server (that's how the pgbuild.sh script does it).
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
// but this will do for now.
.clang_arg("-I../tmp_install/include/server")

View File

@@ -3,16 +3,10 @@
#![allow(non_snake_case)]
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
pub mod nonrelfile_utils;
pub mod pg_constants;
pub mod relfile_utils;
pub mod xlog_utils;
use bytes::{Buf, Bytes, BytesMut};
// sizeof(ControlFileData)
const SIZEOF_CONTROLDATA: usize = std::mem::size_of::<ControlFileData>();
const SIZEOF_CHECKPOINT: usize = std::mem::size_of::<CheckPoint>();
const OFFSETOF_CRC: usize = PG_CONTROLFILEDATA_OFFSETOF_CRC as usize;
impl ControlFileData {
@@ -24,13 +18,13 @@ impl ControlFileData {
controlfile =
unsafe { std::mem::transmute::<[u8; SIZEOF_CONTROLDATA], ControlFileData>(b) };
controlfile
return controlfile;
}
}
pub fn decode_pg_control(mut buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
pub fn decode_pg_control(buf: Bytes) -> Result<ControlFileData, anyhow::Error> {
let mut b: [u8; SIZEOF_CONTROLDATA] = [0u8; SIZEOF_CONTROLDATA];
buf.copy_to_slice(&mut b);
buf.clone().copy_to_slice(&mut b);
let controlfile: ControlFileData;
@@ -69,44 +63,5 @@ pub fn encode_pg_control(controlfile: ControlFileData) -> Bytes {
// Fill the rest of the control file with zeros.
buf.resize(PG_CONTROL_FILE_SIZE as usize, 0);
buf.into()
}
pub fn encode_checkpoint(checkpoint: CheckPoint) -> Bytes {
let b: [u8; SIZEOF_CHECKPOINT];
b = unsafe { std::mem::transmute::<CheckPoint, [u8; SIZEOF_CHECKPOINT]>(checkpoint) };
return Bytes::copy_from_slice(&b[..]);
}
pub fn decode_checkpoint(mut buf: Bytes) -> Result<CheckPoint, anyhow::Error> {
let mut b = [0u8; SIZEOF_CHECKPOINT];
buf.copy_to_slice(&mut b);
let checkpoint: CheckPoint;
checkpoint = unsafe { std::mem::transmute::<[u8; SIZEOF_CHECKPOINT], CheckPoint>(b) };
Ok(checkpoint)
}
impl CheckPoint {
pub fn new(lsn: u64, timeline: u32) -> CheckPoint {
CheckPoint {
redo: lsn,
ThisTimeLineID: timeline,
PrevTimeLineID: timeline,
fullPageWrites: true, // TODO: get actual value of full_page_writes
nextXid: FullTransactionId {
value: pg_constants::FIRST_NORMAL_TRANSACTION_ID as u64,
}, // TODO: handle epoch?
nextOid: pg_constants::FIRST_BOOTSTRAP_OBJECT_ID,
nextMulti: 1,
nextMultiOffset: 0,
oldestXid: pg_constants::FIRST_NORMAL_TRANSACTION_ID,
oldestXidDB: 0,
oldestMulti: 1,
oldestMultiDB: 0,
time: 0,
oldestCommitTsXid: 0,
newestCommitTsXid: 0,
oldestActiveXid: pg_constants::INVALID_TRANSACTION_ID,
}
}
return buf.into();
}

View File

@@ -1,32 +0,0 @@
//!
//! Common utilities for dealing with PostgreSQL non-relation files.
//!
use crate::pg_constants;
use bytes::BytesMut;
use log::*;
pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
trace!(
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
status
);
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let bshift: u8 =
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
page[byteno] =
(page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
}
pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
let bshift: u8 =
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
return ((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8;
}

View File

@@ -1,178 +0,0 @@
//!
//! Misc constants, copied from PostgreSQL headers.
//!
//
// From pg_tablespace_d.h
//
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
pub const GLOBALTABLESPACE_OID: u32 = 1664;
//
// Fork numbers, from relpath.h
//
pub const MAIN_FORKNUM: u8 = 0;
pub const FSM_FORKNUM: u8 = 1;
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
pub const INIT_FORKNUM: u8 = 3;
// Special values for non-rel files' tags (Zenith-specific)
// Special values for non-rel files' tags
pub const PG_CONTROLFILE_FORKNUM: u8 = 42;
pub const PG_FILENODEMAP_FORKNUM: u8 = 43;
pub const PG_XACT_FORKNUM: u8 = 44;
pub const PG_MXACT_OFFSETS_FORKNUM: u8 = 45;
pub const PG_MXACT_MEMBERS_FORKNUM: u8 = 46;
pub const PG_TWOPHASE_FORKNUM: u8 = 47;
pub const PG_CHECKPOINT_FORKNUM: u8 = 48;
// From storage_xlog.h
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
// --with-segsize=SEGSIZE, but assume the defaults for now.
pub const BLCKSZ: u16 = 8192;
pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
//
// constants from clog.h
//
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
pub const CLOG_XACTS_PER_PAGE: u32 = BLCKSZ as u32 * CLOG_XACTS_PER_BYTE;
pub const CLOG_BITS_PER_XACT: u8 = 2;
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
//
// Constants from visibilitymap.h
//
pub const SIZE_OF_PAGE_HEADER: u16 = 24;
pub const BITS_PER_HEAPBLOCK: u16 = 2;
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
pub const TRANSACTION_STATUS_IN_PROGRESS: u8 = 0x00;
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
pub const CLOG_ZEROPAGE: u8 = 0x00;
pub const CLOG_TRUNCATE: u8 = 0x10;
// From xact.h
pub const XLOG_XACT_COMMIT: u8 = 0x00;
pub const XLOG_XACT_PREPARE: u8 = 0x10;
pub const XLOG_XACT_ABORT: u8 = 0x20;
pub const XLOG_XACT_COMMIT_PREPARED: u8 = 0x30;
pub const XLOG_XACT_ABORT_PREPARED: u8 = 0x40;
// From slru.h
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
/* mask for filtering opcodes out of xl_info */
pub const XLOG_XACT_OPMASK: u8 = 0x70;
/* does this record have a 'xinfo' field or not */
pub const XLOG_XACT_HAS_INFO: u8 = 0x80;
/*
* The following flags, stored in xinfo, determine which information is
* contained in commit/abort records.
*/
pub const XACT_XINFO_HAS_DBINFO: u32 = 1u32 << 0;
pub const XACT_XINFO_HAS_SUBXACTS: u32 = 1u32 << 1;
pub const XACT_XINFO_HAS_RELFILENODES: u32 = 1u32 << 2;
pub const XACT_XINFO_HAS_INVALS: u32 = 1u32 << 3;
pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
// pub const XACT_XINFO_HAS_ORIGIN: u32 = 1u32 << 5;
// pub const XACT_XINFO_HAS_AE_LOCKS: u32 = 1u32 << 6;
// pub const XACT_XINFO_HAS_GID: u32 = 1u32 << 7;
// From pg_control.h and rmgrlist.h
pub const XLOG_NEXTOID: u8 = 0x30;
pub const XLOG_SWITCH: u8 = 0x40;
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
// From multixact.h
pub const XLOG_MULTIXACT_ZERO_OFF_PAGE: u8 = 0x00;
pub const XLOG_MULTIXACT_ZERO_MEM_PAGE: u8 = 0x10;
pub const XLOG_MULTIXACT_CREATE_ID: u8 = 0x20;
pub const XLOG_MULTIXACT_TRUNCATE_ID: u8 = 0x30;
pub const MULTIXACT_OFFSETS_PER_PAGE: u16 = BLCKSZ / 4;
pub const MXACT_MEMBER_BITS_PER_XACT: u16 = 8;
pub const MXACT_MEMBER_FLAGS_PER_BYTE: u16 = 1;
pub const MULTIXACT_FLAGBYTES_PER_GROUP: u16 = 4;
pub const MULTIXACT_MEMBERS_PER_MEMBERGROUP: u16 =
MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE;
/* size in bytes of a complete group */
pub const MULTIXACT_MEMBERGROUP_SIZE: u16 =
4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP;
pub const MULTIXACT_MEMBERGROUPS_PER_PAGE: u16 = BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE;
pub const MULTIXACT_MEMBERS_PER_PAGE: u16 =
MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP;
// From heapam_xlog.h
pub const XLOG_HEAP_INSERT: u8 = 0x00;
pub const XLOG_HEAP_DELETE: u8 = 0x10;
pub const XLOG_HEAP_UPDATE: u8 = 0x20;
pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
pub const RM_XLOG_ID: u8 = 0;
pub const RM_XACT_ID: u8 = 1;
pub const RM_SMGR_ID: u8 = 2;
pub const RM_CLOG_ID: u8 = 3;
pub const RM_DBASE_ID: u8 = 4;
pub const RM_TBLSPC_ID: u8 = 5;
pub const RM_MULTIXACT_ID: u8 = 6;
pub const RM_RELMAP_ID: u8 = 7;
pub const RM_STANDBY_ID: u8 = 8;
pub const RM_HEAP2_ID: u8 = 9;
pub const RM_HEAP_ID: u8 = 10;
// from xlogreader.h
pub const XLR_INFO_MASK: u8 = 0x0F;
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
// from dbcommands_xlog.h
pub const XLOG_DBASE_CREATE: u8 = 0x00;
pub const XLOG_DBASE_DROP: u8 = 0x10;
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
pub const SIZEOF_XLOGRECORD: u32 = 24;
//
// from xlogrecord.h
//
pub const XLR_MAX_BLOCK_ID: u8 = 32;
pub const XLR_BLOCK_ID_DATA_SHORT: u8 = 255;
pub const XLR_BLOCK_ID_DATA_LONG: u8 = 254;
pub const XLR_BLOCK_ID_ORIGIN: u8 = 253;
pub const XLR_BLOCK_ID_TOPLEVEL_XID: u8 = 252;
pub const BKPBLOCK_FORK_MASK: u8 = 0x0F;
pub const _BKPBLOCK_FLAG_MASK: u8 = 0xF0;
pub const BKPBLOCK_HAS_IMAGE: u8 = 0x10; /* block data is an XLogRecordBlockImage */
pub const BKPBLOCK_HAS_DATA: u8 = 0x20;
pub const BKPBLOCK_WILL_INIT: u8 = 0x40; /* redo will re-init the page */
pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous */
/* Information stored in bimg_info */
pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
/* From transam.h */
pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
pub const INVALID_TRANSACTION_ID: u32 = 0;
pub const FIRST_BOOTSTRAP_OBJECT_ID: u32 = 12000;
pub const FIRST_NORMAL_OBJECT_ID: u32 = 16384;
/* FIXME: pageserver should request wal_seg_size from compute node */
pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;

View File

@@ -1,151 +0,0 @@
//!
//! Common utilities for dealing with PostgreSQL relation files.
//!
use crate::pg_constants;
use lazy_static::lazy_static;
use regex::Regex;
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
pub enum FilePathError {
#[error("invalid relation fork name")]
InvalidForkName,
#[error("invalid relation data file name")]
InvalidFileName,
}
impl From<core::num::ParseIntError> for FilePathError {
fn from(_e: core::num::ParseIntError) -> Self {
FilePathError::InvalidFileName
}
}
/// Convert Postgres relation file's fork suffix to fork number.
pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
match forkname {
// "main" is not in filenames, it's implicit if the fork name is not present
None => Ok(pg_constants::MAIN_FORKNUM),
Some("fsm") => Ok(pg_constants::FSM_FORKNUM),
Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM),
Some("init") => Ok(pg_constants::INIT_FORKNUM),
Some(_) => Err(FilePathError::InvalidForkName),
}
}
/// Convert Postgres fork number to the right suffix of the relation data file.
pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
match forknum {
pg_constants::MAIN_FORKNUM => None,
pg_constants::FSM_FORKNUM => Some("fsm"),
pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"),
pg_constants::INIT_FORKNUM => Some("init"),
// These should not appear in WAL records, but we use them internally,
// and need to be prepared to print them out in log messages and such
pg_constants::PG_CONTROLFILE_FORKNUM => Some("controlfile"),
pg_constants::PG_FILENODEMAP_FORKNUM => Some("filenodemap"),
pg_constants::PG_XACT_FORKNUM => Some("xact"),
pg_constants::PG_MXACT_OFFSETS_FORKNUM => Some("mxact_offsets"),
pg_constants::PG_MXACT_MEMBERS_FORKNUM => Some("mxact_members"),
pg_constants::PG_TWOPHASE_FORKNUM => Some("twophase"),
_ => Some("UNKNOWN FORKNUM"),
}
}
///
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
///
/// Formats:
/// <oid>
/// <oid>_<fork name>
/// <oid>.<segment number>
/// <oid>_<fork name>.<segment number>
///
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
///
pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
lazy_static! {
static ref RELFILE_RE: Regex =
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
}
let caps = RELFILE_RE
.captures(fname)
.ok_or(FilePathError::InvalidFileName)?;
let relnode_str = caps.name("relnode").unwrap().as_str();
let relnode = relnode_str.parse::<u32>()?;
let forkname = caps.name("forkname").map(|f| f.as_str());
let forknum = forkname_to_number(forkname)?;
let segno_match = caps.name("segno");
let segno = if segno_match.is_none() {
0
} else {
segno_match.unwrap().as_str().parse::<u32>()?
};
Ok((relnode, forknum, segno))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_valid_relfilenames() {
assert_eq!(parse_relfilename("1234"), Ok((1234, 0, 0)));
assert_eq!(parse_relfilename("1234_fsm"), Ok((1234, 1, 0)));
assert_eq!(parse_relfilename("1234_vm"), Ok((1234, 2, 0)));
assert_eq!(parse_relfilename("1234_init"), Ok((1234, 3, 0)));
assert_eq!(parse_relfilename("1234.12"), Ok((1234, 0, 12)));
assert_eq!(parse_relfilename("1234_fsm.12"), Ok((1234, 1, 12)));
assert_eq!(parse_relfilename("1234_vm.12"), Ok((1234, 2, 12)));
assert_eq!(parse_relfilename("1234_init.12"), Ok((1234, 3, 12)));
// relfilenode is unsigned, so it can go up to 2^32-1
assert_eq!(parse_relfilename("3147483648"), Ok((3147483648, 0, 0)));
}
#[test]
fn test_parse_invalid_relfilenames() {
assert_eq!(
parse_relfilename("foo"),
Err(FilePathError::InvalidFileName)
);
assert_eq!(
parse_relfilename("1.2.3"),
Err(FilePathError::InvalidFileName)
);
assert_eq!(
parse_relfilename("1234_invalid"),
Err(FilePathError::InvalidForkName)
);
assert_eq!(
parse_relfilename("1234_"),
Err(FilePathError::InvalidFileName)
);
// too large for u32
assert_eq!(
parse_relfilename("12345678901"),
Err(FilePathError::InvalidFileName)
);
assert_eq!(
parse_relfilename("-1234"),
Err(FilePathError::InvalidFileName)
);
}
#[test]
fn test_parse_weird_relfilenames() {
// we accept 0 for the relfilenode, but PostgreSQL should never do that.
assert_eq!(parse_relfilename("0"), Ok((0, 0, 0)));
// PostgreSQL has a limit of 2^32-2 blocks in a table. With 8k block size and
// 1 GB segments, the max segment number is 32767. But we accept larger values
// currently.
assert_eq!(parse_relfilename("1.123456"), Ok((1, 0, 123456)));
}
}

View File

@@ -1,92 +0,0 @@
## Zenith test runner
This directory contains integration tests.
Prerequisites:
- Python 3.6 or later
- Python packages: pytest, psycopg2
- pytest 6.0 is required.
- __NOTE: `apt install` on Debian/Ubuntu won't work.__
They ship a much older version of pytest (and sometimes rename it to
`pytest-3`).
- Install using something like this:
- `pip3 install pytest psycopg2` (Debian or Ubuntu)
- Zenith and Postgres binaries
- See the root README.md for build directions
- Tests can be run from the git tree; or see the environment variables
below to run from other directories.
- The zenith git repo, including the postgres submodule
(for some tests, e.g. pg_regress)
### Test Organization
The tests are divided into a few batches, such that each batch takes roughly
the same amount of time. The batches can be run in parallel, to minimize total
runtime. Currently, there are only two batches:
- test_batch_pg_regress: Runs PostgreSQL regression tests
- test_others: All other tests
### Running the tests
Because pytest will search all subdirectories for tests, it's easiest to
run the tests from within the `test_runner` directory.
Test state (postgres data, pageserver state, and log files) will
be stored under a directory `test_output`.
You can run all the tests with:
`pytest`
If you want to run all the tests in a particular file:
`pytest test_pgbench.py`
If you want to run all tests that have the string "bench" in their names:
`pytest -k bench`
Useful environment variables:
`ZENITH_BIN`: The directory where zenith binaries can be found.
`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
`TEST_OUTPUT`: Set the directory where test state and test output files
should go.
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
Let stdout and stderr go to the terminal instead of capturing them:
`pytest -s ...`
(Note many tests capture subprocess outputs separately, so this may not
show much.)
Exit after the first test failure:
`pytest -x ...`
(there are many more pytest options; run `pytest -h` to see them.)
### Building new tests
The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
Essentially, this means that each time a fixture name appears as an input parameter, the function with that name is run and its result is passed to the test function.
So this code:
```
def test_something(zenith_cli, pg_bin):
pass
```
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
Fixtures can't be imported using the normal python syntax. Instead, use this:
```
pytest_plugins = ("fixtures.something")
```
That will make all the fixtures in the `fixtures/something.py` file available.
Anything that's likely to be used in multiple tests should be built into a fixture.
Note that fixtures can clean up after themselves if they use the `yield` syntax.
Cleanup will happen even if the test fails (raises an unhandled exception).
Python destructors, e.g. `__del__()`, aren't recommended for cleanup.
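For example, a fixture that cleans up after itself could look like this (a minimal sketch; the `scratch_dir` fixture is made up for illustration and does not exist in `fixtures/`):
```
import os
import shutil
import tempfile

import pytest


@pytest.fixture
def scratch_dir():
    # Everything before the yield is setup; everything after it is cleanup.
    path = tempfile.mkdtemp(prefix='zenith_test_')
    yield path
    # Runs even if the test raised, so no stray directories are left behind.
    shutil.rmtree(path, ignore_errors=True)


def test_uses_scratch_dir(scratch_dir):
    assert os.path.isdir(scratch_dir)
```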

View File

@@ -1,67 +0,0 @@
import pytest
import getpass
import psycopg2
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Create a couple of branches off the main branch, at a historical point in time.
#
def test_branch_behind(zenith_cli, pageserver, postgres, pg_bin):
# Create a branch for us, off the 'empty' branch
zenith_cli.run(["branch", "test_branch_behind", "empty"]);
pgmain = postgres.create_start('test_branch_behind')
print("postgres is running on 'test_branch_behind' branch")
main_pg_conn = psycopg2.connect(pgmain.connstr());
main_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
main_cur = main_pg_conn.cursor()
# Create table, and insert the first 100 rows
main_cur.execute('CREATE TABLE foo (t text)');
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g");
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
lsn_a = main_cur.fetchone()[0]
print('LSN after 100 rows: ' + lsn_a)
# Insert some more rows. (This generates enough WAL to fill a few segments.)
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
lsn_b = main_cur.fetchone()[0]
print('LSN after 100100 rows: ' + lsn_b)
# Branch at the point where only 100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@"+lsn_a]);
# Insert many more rows. This generates enough WAL to fill a few segments.
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
lsn_c = main_cur.fetchone()[0]
print('LSN after 200100 rows: ' + lsn_c)
# Branch at the point where 100100 rows were inserted
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@"+lsn_b]);
pg_hundred = postgres.create_start("test_branch_behind_hundred")
pg_more = postgres.create_start("test_branch_behind_more")
# On the 'hundred' branch, we should see only 100 rows
hundred_pg_conn = psycopg2.connect(pg_hundred.connstr())
hundred_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
hundred_cur = hundred_pg_conn.cursor()
hundred_cur.execute('SELECT count(*) FROM foo');
assert(hundred_cur.fetchone()[0] == 100);
# On the 'more' branch, we should see 100100 rows
more_pg_conn = psycopg2.connect(pg_more.connstr())
more_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
more_cur = more_pg_conn.cursor()
more_cur.execute('SELECT count(*) FROM foo');
assert(more_cur.fetchone()[0] == 100100);
# All the rows are visible on the main branch
main_cur.execute('SELECT count(*) FROM foo');
assert(main_cur.fetchone()[0] == 200100);

View File

@@ -1,30 +0,0 @@
import pytest
import os
import getpass
import psycopg2
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test starting Postgres with custom options
#
def test_config(zenith_cli, pageserver, postgres, pg_bin):
# Create a branch for us
zenith_cli.run(["branch", "test_config", "empty"]);
# change config
pg = postgres.create_start('test_config', ['log_min_messages=debug1'])
print('postgres is running on test_config branch')
pg_conn = psycopg2.connect(pg.connstr())
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur = pg_conn.cursor()
# Check that the config change was applied
cur.execute('SELECT name, setting from pg_settings WHERE source!=%s and source!=%s', ("default","override",))
for record in cur:
if record[0] == 'log_min_messages':
assert(record[1] == 'debug1')
pg_conn.close()

View File

@@ -1,37 +0,0 @@
import pytest
import getpass
import psycopg2
pytest_plugins = ("fixtures.zenith_fixtures")
#
# Test CREATE DATABASE when there have been relmapper changes
#
def test_createdb(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_createdb", "empty"]);
pg = postgres.create_start('test_createdb')
print("postgres is running on 'test_createdb' branch")
conn = psycopg2.connect(pg.connstr());
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
# Cause a 'relmapper' change in the original branch
cur.execute('VACUUM FULL pg_class');
cur.execute('CREATE DATABASE foodb');
cur.execute('SELECT pg_current_wal_insert_lsn()');
lsn = cur.fetchone()[0]
conn.close();
# Create a branch
zenith_cli.run(["branch", "test_createdb2", "test_createdb@"+lsn]);
pg2 = postgres.create_start('test_createdb2')
# Test that you can connect to the new database on both branches
conn = psycopg2.connect(pg.connstr('foodb'));
conn2 = psycopg2.connect(pg2.connstr('foodb'));

View File

@@ -1,54 +0,0 @@
import pytest
import psycopg2
import getpass
import json
pytest_plugins = ("fixtures.zenith_fixtures")
def test_status(pageserver):
pg_conn = psycopg2.connect(pageserver.connstr())
pg_conn.autocommit = True
cur = pg_conn.cursor()
cur.execute('status;')
assert cur.fetchone() == ('hello world',)
pg_conn.close()
def test_branch_list(pageserver, zenith_cli):
# Create a branch for us
zenith_cli.run(["branch", "test_branch_list_main", "empty"]);
page_server_conn = psycopg2.connect(pageserver.connstr())
page_server_conn.autocommit = True
page_server_cur = page_server_conn.cursor()
page_server_cur.execute('branch_list;')
branches = json.loads(page_server_cur.fetchone()[0])
# Filter out branches created by other tests
branches = [x for x in branches if x['name'].startswith('test_branch_list')]
assert len(branches) == 1
assert branches[0]['name'] == 'test_branch_list_main'
assert 'timeline_id' in branches[0]
assert 'latest_valid_lsn' in branches[0]
assert 'ancestor_id' in branches[0]
assert 'ancestor_lsn' in branches[0]
# Create another branch, and start Postgres on it
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
page_server_cur.execute('branch_list;')
new_branches = json.loads(page_server_cur.fetchone()[0])
# Filter out branches created by other tests
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
assert len(new_branches) == 2
new_branches.sort(key=lambda k: k['name'])
assert new_branches[0]['name'] == 'test_branch_list_experimental'
assert new_branches[0]['timeline_id'] != branches[0]['timeline_id']
# TODO: do the LSNs have to match here?
assert new_branches[1] == branches[0]
page_server_conn.close()

View File

@@ -1,17 +0,0 @@
import pytest
pytest_plugins = ("fixtures.zenith_fixtures")
def test_pgbench(pageserver, postgres, pg_bin, zenith_cli):
# Create a branch for us
zenith_cli.run(["branch", "test_pgbench", "empty"]);
pg = postgres.create_start('test_pgbench')
print("postgres is running on 'test_pgbench' branch")
connstr = pg.connstr();
pg_bin.run_capture(['pgbench', '-i', connstr])
pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr])

View File

@@ -1,58 +0,0 @@
#
# Test branching, when a transaction is in prepared state
#
import pytest
import getpass
import psycopg2
pytest_plugins = ("fixtures.zenith_fixtures")
def test_twophase(zenith_cli, pageserver, postgres, pg_bin):
zenith_cli.run(["branch", "test_twophase", "empty"]);
pg = postgres.create_start('test_twophase', ['max_prepared_transactions=5'])
print("postgres is running on 'test_twophase' branch")
conn = psycopg2.connect(pg.connstr());
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
cur.execute('CREATE TABLE foo (t text)');
# Prepare a transaction that will insert a row
cur.execute('BEGIN');
cur.execute("INSERT INTO foo VALUES ('one')");
cur.execute("PREPARE TRANSACTION 'insert_one'");
# Prepare another transaction that will insert a row
cur.execute('BEGIN');
cur.execute("INSERT INTO foo VALUES ('two')");
cur.execute("PREPARE TRANSACTION 'insert_two'");
cur.execute('BEGIN');
cur.execute("INSERT INTO foo VALUES ('three')");
cur.execute("PREPARE TRANSACTION 'insert_three'");
cur.execute("COMMIT PREPARED 'insert_three'");
cur.execute('SELECT pg_current_wal_insert_lsn()');
lsn = cur.fetchone()[0]
# Create a branch with the transaction in prepared state
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase@"+lsn]);
pg2 = postgres.create_start('test_twophase_prepared', ['max_prepared_transactions=5'])
conn2 = psycopg2.connect(pg2.connstr());
conn2.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur2 = conn2.cursor()
# On the new branch, commit one of the prepared transactions, abort the other one.
cur2.execute("COMMIT PREPARED 'insert_one'");
cur2.execute("ROLLBACK PREPARED 'insert_two'");
cur2.execute('SELECT * FROM foo');
assert(cur2.fetchall() == [('one',),('three',)]);
# Neither insert is visible on the original branch, the transactions are still
# in prepared state there.
cur.execute('SELECT * FROM foo');
assert(cur.fetchall() == [('three',)]);

View File

@@ -1,49 +0,0 @@
import pytest
import psycopg2
import json
pytest_plugins = ("fixtures.zenith_fixtures")
def helper_compare_branch_list(page_server_cur, zenith_cli):
"""
Compare branches list returned by CLI and directly via API.
Filters out branches created by other tests.
"""
page_server_cur.execute('branch_list;')
branches_api = sorted(map(lambda b: b['name'], json.loads(page_server_cur.fetchone()[0])))
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]
res = zenith_cli.run(["branch"]);
assert(res.stderr == '')
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
assert(branches_api == branches_cli)
def test_cli_branch_list(pageserver, zenith_cli):
page_server_conn = psycopg2.connect(pageserver.connstr())
page_server_conn.autocommit = True
page_server_cur = page_server_conn.cursor()
# Initial sanity check
helper_compare_branch_list(page_server_cur, zenith_cli)
# Create a branch for us
res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"]);
assert(res.stderr == '')
helper_compare_branch_list(page_server_cur, zenith_cli)
# Create a nested branch
res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"]);
assert(res.stderr == '')
helper_compare_branch_list(page_server_cur, zenith_cli)
# Check that all new branches are visible via CLI
res = zenith_cli.run(["branch"]);
assert(res.stderr == '')
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
assert('test_cli_branch_list_main' in branches_cli)
assert('test_cli_branch_list_nested' in branches_cli)

View File

@@ -1,61 +0,0 @@
import pytest
from fixtures.utils import mkdir_if_needed
import getpass
import os
import psycopg2
pytest_plugins = ("fixtures.zenith_fixtures")
# FIXME: put host + port in a fixture
HOST = 'localhost'
PORT = 55432
def test_pg_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):
# Create a branch for us
zenith_cli.run(["branch", "test_pg_regress", "empty"]);
# Connect to postgres and create a database called "regression".
pg = postgres.create_start('test_pg_regress')
pg_conn = psycopg2.connect(pg.connstr())
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur = pg_conn.cursor()
cur.execute('CREATE DATABASE regression')
pg_conn.close()
# Create some local directories for pg_regress to run in.
runpath = os.path.join(test_output_dir, 'regress')
mkdir_if_needed(runpath)
mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
# Compute all the file locations that pg_regress will need.
build_path = os.path.join(
pg_distrib_dir, 'build/src/test/regress')
src_path = os.path.join(
base_dir, 'vendor/postgres/src/test/regress')
bindir = os.path.join(pg_distrib_dir, 'bin')
schedule = os.path.join(src_path, 'parallel_schedule')
pg_regress = os.path.join(build_path, 'pg_regress')
pg_regress_command = [
pg_regress,
'--bindir=""',
'--use-existing',
'--bindir={}'.format(bindir),
'--dlpath={}'.format(build_path),
'--schedule={}'.format(schedule),
'--inputdir={}'.format(src_path),
]
env = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
}
# Run the command.
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)

View File

@@ -1,62 +0,0 @@
import pytest
from fixtures.utils import mkdir_if_needed
import getpass
import os
import psycopg2
pytest_plugins = ("fixtures.zenith_fixtures")
# FIXME: put host + port in a fixture
HOST = 'localhost'
PORT = 55432
def test_zenith_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):
# Create a branch for us
zenith_cli.run(["branch", "test_zenith_regress", "empty"]);
# Connect to postgres and create a database called "regression".
pg = postgres.create_start('test_zenith_regress')
pg_conn = psycopg2.connect(pg.connstr())
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur = pg_conn.cursor()
cur.execute('CREATE DATABASE regression')
pg_conn.close()
# Create some local directories for pg_regress to run in.
runpath = os.path.join(test_output_dir, 'regress')
mkdir_if_needed(runpath)
mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
# Compute all the file locations that pg_regress will need.
# This test runs zenith-specific tests
build_path = os.path.join(
pg_distrib_dir, 'build/src/test/regress')
src_path = os.path.join(
base_dir, 'test_runner/zenith_regress')
bindir = os.path.join(pg_distrib_dir, 'bin')
schedule = os.path.join(src_path, 'parallel_schedule')
pg_regress = os.path.join(build_path, 'pg_regress')
pg_regress_command = [
pg_regress,
'--use-existing',
'--bindir={}'.format(bindir),
'--dlpath={}'.format(build_path),
'--schedule={}'.format(schedule),
'--inputdir={}'.format(src_path),
]
print(pg_regress_command)
env = {
'PGPORT': str(pg.port),
'PGUSER': pg.username,
'PGHOST': pg.host,
}
# Run the command.
# We don't capture the output. It's not too chatty, and it always
# logs the exact same data to `regression.out` anyway.
with capsys.disabled():
pg_bin.run(pg_regress_command, env=env, cwd=runpath)

View File

@@ -1 +0,0 @@
pytest_plugins = ("fixtures.zenith_fixtures")

View File

@@ -1,53 +0,0 @@
import os
import subprocess
def get_self_dir():
""" Get the path to the directory where this script lives. """
return os.path.dirname(os.path.abspath(__file__))
def mkdir_if_needed(path):
""" Create a directory if it doesn't already exist
Note this won't try to create intermediate directories.
"""
if os.path.exists(path):
assert os.path.isdir(path)
return
os.mkdir(path)
def subprocess_capture(capture_dir, cmd, **kwargs):
""" Run a process and capture its output
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
where "cmd" is the name of the program and NNN is an incrementing
counter.
If those files already exist, we will overwrite them.
"""
assert type(cmd) is list
base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
basepath = os.path.join(capture_dir, base)
stdout_filename = basepath + '.stdout'
stderr_filename = basepath + '.stderr'
with open(stdout_filename, 'w') as stdout_f:
with open(stderr_filename, 'w') as stderr_f:
print('(capturing output to "{}.stdout")'.format(base))
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
_global_counter = 0
def global_counter():
""" A really dumb global counter.
This is useful for giving output files a unique number, so if we run the
same command multiple times we can keep their output separate.
"""
global _global_counter
_global_counter += 1
return _global_counter

View File

@@ -1,357 +0,0 @@
import getpass
import os
import psycopg2
import pytest
import shutil
import subprocess
import sys
from .utils import (get_self_dir, mkdir_if_needed,
subprocess_capture, global_counter)
"""
This file contains pytest fixtures. A fixture is a test resource that can be
summoned by placing its name in the test's arguments.
A fixture is created with the decorator @zenfixture, which is a wrapper around
the standard pytest.fixture with some extra behavior.
There are several environment variables that can control the running of tests:
ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.
To use fixtures in a test file, add this line of code:
pytest_plugins = ("fixtures.zenith_fixtures")
Don't import functions from this file, or pytest will emit warnings. Instead
put directly-importable functions into utils.py or another separate file.
"""
DEFAULT_OUTPUT_DIR = 'test_output'
DEFAULT_POSTGRES_DIR = 'tmp_install'
def determine_scope(fixture_name, config):
return 'session'
def zenfixture(func):
""" This is a python decorator for fixtures with a flexible scope.
By default every test function will set up and tear down a new
database. In pytest, this is called "function" scope for fixtures.
If the environment variable TEST_SHARED_FIXTURES is set, then all
tests will share the same database. State, logs, etc. will be
stored in a directory called "shared".
"""
if os.environ.get('TEST_SHARED_FIXTURES') is None:
scope = 'function'
else:
scope = 'session'
return pytest.fixture(func, scope=scope)
@pytest.fixture(autouse=True, scope='session')
def safety_check():
""" Ensure that no unwanted daemons are running before we start testing. """
# does not use -c as it is not supported on macOS
cmd = ['pgrep', 'pageserver|postgres|wal_acceptor']
result = subprocess.run(cmd, stdout=subprocess.DEVNULL)
if result.returncode == 0:
# returncode of 0 means it found something.
# This is bad; we don't want any of those processes polluting the
# result of the test.
raise Exception('found interfering processes running')
class ZenithCli:
""" An object representing the CLI binary named "zenith".
We also store an environment that will tell the CLI to operate
on a particular ZENITH_REPO_DIR.
"""
def __init__(self, binpath, repo_dir, pg_distrib_dir):
assert os.path.isdir(binpath)
self.binpath = binpath
self.bin_zenith = os.path.join(binpath, 'zenith')
self.env = os.environ.copy()
self.env['ZENITH_REPO_DIR'] = repo_dir
self.env['POSTGRES_DISTRIB_DIR'] = pg_distrib_dir
def run(self, arguments):
""" Run "zenith" with the specified arguments.
arguments must be in list form, e.g. ['pg', 'create']
Return both stdout and stderr, which can be accessed as
result = zenith_cli.run(...)
assert(result.stderr == "")
print(result.stdout)
"""
assert type(arguments) == list
args = [self.bin_zenith] + arguments
print('Running command "{}"'.format(' '.join(args)))
return subprocess.run(args, env=self.env, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@zenfixture
def zenith_cli(zenith_binpath, repo_dir, pg_distrib_dir):
return ZenithCli(zenith_binpath, repo_dir, pg_distrib_dir)
class ZenithPageserver:
""" An object representing a running pageserver. """
def __init__(self, zenith_cli):
self.zenith_cli = zenith_cli
self.running = False
# Initialize the repository, i.e. run "zenith init"
def init(self):
self.zenith_cli.run(['init'])
# Start the page server
def start(self):
self.zenith_cli.run(['start'])
self.running = True
# Stop the page server
def stop(self):
self.zenith_cli.run(['stop'])
self.running = False
# The page server speaks the Postgres FE/BE protocol, so you can connect
# to it with any Postgres client, and run special commands. This function
# returns a libpq connection string for connecting to it.
def connstr(self):
username = getpass.getuser()
conn_str = 'host={} port={} dbname=postgres user={}'.format(
'localhost', 64000, username)
return conn_str
# The 'pageserver' fixture provides a Page Server that's up and running.
#
# If TEST_SHARED_FIXTURES is set, the Page Server instance is shared by all
# the tests. To avoid clashing with other tests, don't use the 'main' branch in
# the tests directly. Instead, create a branch off the 'empty' branch and use
# that.
#
# By convention, the test branches are named after the tests. For example,
# test called 'test_foo' would create and use branches with the 'test_foo' prefix.
@zenfixture
def pageserver(zenith_cli):
ps = ZenithPageserver(zenith_cli)
ps.init()
ps.start()
# For convenience in tests, create a branch from the freshly-initialized cluster.
zenith_cli.run(["branch", "empty", "main"]);
yield ps
# After the yield comes any cleanup code we need.
print('Starting pageserver cleanup')
ps.stop()
class Postgres:
""" An object representing a running postgres daemon. """
def __init__(self, zenith_cli, repo_dir, instance_num):
self.zenith_cli = zenith_cli
self.instance_num = instance_num
self.running = False
self.username = getpass.getuser()
self.host = 'localhost'
self.port = 55431 + instance_num
self.repo_dir = repo_dir
self.branch = None
# path to conf is <repo_dir>/pgdatadirs/<branch_name>/postgresql.conf
def create_start(self, branch, config_lines=None):
""" create the pg data directory, and start the server """
self.zenith_cli.run(['pg', 'create', branch])
self.branch = branch
if config_lines is None:
config_lines = []
self.config(config_lines)
self.zenith_cli.run(['pg', 'start', branch])
self.running = True
return
# 'lines' should be a list of valid postgresql.conf lines
def config(self, lines):
filename = 'pgdatadirs/{}/postgresql.conf'.format(self.branch)
config_name = os.path.join(self.repo_dir, filename)
with open(config_name, 'a') as conf:
for line in lines:
conf.write(line)
conf.write('\n')
def stop(self):
if self.running:
self.zenith_cli.run(['pg', 'stop', self.branch])
# Return a libpq connection string to connect to the Postgres instance
def connstr(self, dbname='postgres'):
conn_str = 'host={} port={} dbname={} user={}'.format(
self.host, self.port, dbname, self.username)
return conn_str
class PostgresFactory:
""" An object representing multiple running postgres daemons. """
def __init__(self, zenith_cli, repo_dir):
self.zenith_cli = zenith_cli
self.host = 'localhost'
self.repo_dir = repo_dir
self.num_instances = 0
self.instances = []
def create_start(self, branch="main", config_lines=None):
pg = Postgres(self.zenith_cli, self.repo_dir, self.num_instances + 1)
self.num_instances += 1
self.instances.append(pg)
pg.create_start(branch, config_lines)
return pg
def stop_all(self):
for pg in self.instances:
pg.stop()
@zenfixture
def postgres(zenith_cli, repo_dir):
pgfactory = PostgresFactory(zenith_cli, repo_dir)
yield pgfactory
# After the yield comes any cleanup code we need.
print('Starting postgres cleanup')
pgfactory.stop_all()
class PgBin:
""" A helper class for executing postgres binaries """
def __init__(self, log_dir, pg_distrib_dir):
self.log_dir = log_dir
self.pg_install_path = pg_distrib_dir
self.pg_bin_path = os.path.join(self.pg_install_path, 'bin')
self.env = os.environ.copy()
self.env['LD_LIBRARY_PATH'] = os.path.join(self.pg_install_path, 'lib')
def _fixpath(self, command):
if '/' not in command[0]:
command[0] = os.path.join(self.pg_bin_path, command[0])
def _build_env(self, env_add):
if env_add is None:
return self.env
env = self.env.copy()
env.update(env_add)
return env
def run(self, command, env=None, cwd=None):
""" Run one of the postgres binaries.
The command should be in list form, e.g. ['pgbench', '-p', '55432']
All the necessary environment variables will be set.
If the first argument (the command name) doesn't include a path (no '/'
characters present), then it will be edited to include the correct path.
If you want stdout/stderr captured to files, use `run_capture` instead.
"""
self._fixpath(command)
print('Running command "{}"'.format(' '.join(command)))
env = self._build_env(env)
subprocess.run(command, env=env, cwd=cwd, check=True)
def run_capture(self, command, env=None, cwd=None):
""" Run one of the postgres binaries, with stderr and stdout redirected to a file.
This is just like `run`, but for chatty programs.
"""
self._fixpath(command)
print('Running command "{}"'.format(' '.join(command)))
env = self._build_env(env)
subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True)
@zenfixture
def pg_bin(test_output_dir, pg_distrib_dir):
return PgBin(test_output_dir, pg_distrib_dir)
@zenfixture
def base_dir():
""" find the base directory (currently this is the git root) """
base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..'))
print('base_dir is', base_dir)
return base_dir
@zenfixture
def top_output_dir(base_dir):
""" Compute the top-level directory for all tests. """
env_test_output = os.environ.get('TEST_OUTPUT')
if env_test_output is not None:
output_dir = env_test_output
else:
output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
mkdir_if_needed(output_dir)
return output_dir
@zenfixture
def test_output_dir(request, top_output_dir):
""" Compute the working directory for an individual test. """
if os.environ.get('TEST_SHARED_FIXTURES') is None:
# one directory per test
test_name = request.node.name
else:
# We're running shared fixtures. Share a single directory.
test_name = 'shared'
test_output_dir = os.path.join(top_output_dir, test_name)
print('test_output_dir is', test_output_dir)
shutil.rmtree(test_output_dir, ignore_errors=True)
mkdir_if_needed(test_output_dir)
return test_output_dir
@zenfixture
def repo_dir(request, test_output_dir):
""" Compute the test repo_dir
"repo_dir" is the place where all of the pageserver files will go.
It doesn't have anything to do with the git repo.
"""
repo_dir = os.path.join(test_output_dir, 'repo')
return repo_dir
@zenfixture
def zenith_binpath(base_dir):
""" find the zenith binaries """
env_zenith_bin = os.environ.get('ZENITH_BIN')
if env_zenith_bin:
zenith_dir = env_zenith_bin
else:
zenith_dir = os.path.join(base_dir, 'target/debug')
if not os.path.exists(os.path.join(zenith_dir, 'pageserver')):
raise Exception('zenith binaries not found at "{}"'.format(zenith_dir))
return zenith_dir
@zenfixture
def pg_distrib_dir(base_dir):
""" find the postgress install """
env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR')
if env_postgres_bin:
pg_dir = env_postgres_bin
else:
pg_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
print('postgres dir is', pg_dir)
if not os.path.exists(os.path.join(pg_dir, 'bin/postgres')):
raise Exception('postgres not found at "{}"'.format(pg_dir))
return pg_dir

View File

@@ -1,2 +0,0 @@
[pytest]
minversion = 6.0

View File

@@ -1,33 +0,0 @@
import pytest
import os
pytest_plugins = ("fixtures.zenith_fixtures")
"""
Use this test to see what happens when tests fail.
We should be able to clean up after ourselves, including stopping any
postgres or pageserver processes.
Set the environment variable RUN_BROKEN to see this test run (and fail,
and hopefully not leave any server processes behind).
"""
run_broken = pytest.mark.skipif(
os.environ.get('RUN_BROKEN') == None,
reason="only used for testing the fixtures"
)
@run_broken
def test_broken(zenith_cli, pageserver, postgres, pg_bin):
# Create a branch for us
zenith_cli.run(["branch", "test_broken", "empty"]);
pg = postgres.create_start("test_broken")
print('postgres is running')
print('THIS NEXT COMMAND WILL FAIL:')
pg_bin.run('pgbench -i_am_a_broken_test'.split())

View File

@@ -1,11 +0,0 @@
# Local binaries
/pg_regress
# Generated subdirectories
/tmp_check/
/results/
/log/
# Note: regression.* are only left behind on a failure; that's why they're not ignored
#/regression.diffs
#/regression.out

View File

@@ -1,11 +0,0 @@
To add a new SQL test
- add the SQL script to run to zenith_regress/sql/testname.sql
- add the expected output to zenith_regress/expected/testname.out
- add testname to both parallel_schedule and serial_schedule files*
That's it.
For more complex tests, see the PostgreSQL regression tests. These work basically the same way.
*This was changed recently in PostgreSQL upstream: there is no longer a separate serial_schedule.
Someday we'll catch up with that change.

View File

@@ -1,9 +0,0 @@
/constraints.out
/copy.out
/create_function_1.out
/create_function_2.out
/largeobject.out
/largeobject_1.out
/misc.out
/security_label.out
/tablespace.out

View File

@@ -1,34 +0,0 @@
BEGIN;
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
CREATE TABLE cursor (a int);
INSERT INTO cursor VALUES (1);
DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
UPDATE cursor SET a = 2;
FETCH ALL FROM c1;
a
---
(0 rows)
COMMIT;
DROP TABLE cursor;
create table to_be_evicted(x bigint);
begin;
insert into to_be_evicted values (1);
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
select sum(x) from to_be_evicted;
sum
-------------
25937424601
(1 row)
end;
drop table to_be_evicted;

View File

@@ -1,15 +0,0 @@
create or replace procedure do_commits() as $$
declare
xid xid8;
i integer;
begin
for i in 1..1000000 loop
xid = txid_current();
commit;
if (pg_xact_status(xid) <> 'committed') then
raise exception 'CLOG corruption';
end if;
end loop;
end;
$$ language plpgsql;
call do_commits();

View File

@@ -1,19 +0,0 @@
--
-- Test that when a relation is truncated by VACUUM, the next smgrnblocks()
-- query to get the relation's size returns the new size.
-- (This isn't related to the TRUNCATE command, which works differently,
-- by creating a new relation file)
--
CREATE TABLE truncatetest (i int);
INSERT INTO truncatetest SELECT g FROM generate_series(1, 10000) g;
-- Remove all the rows, and run VACUUM to remove the dead tuples and
-- truncate the physical relation to 0 blocks.
DELETE FROM truncatetest;
VACUUM truncatetest;
-- Check that a SeqScan sees correct relation size (which is now 0)
SELECT * FROM truncatetest;
i
---
(0 rows)
DROP TABLE truncatetest;

View File

@@ -1,304 +0,0 @@
create table foo(a int primary key, b int, c int);
insert into foo values (generate_series(1,10000), generate_series(1,10000), generate_series(1,10000));
create index concurrently on foo(b);
create index concurrently on foo(c);
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
vacuum full foo;
\d foo
Table "public.foo"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------
a | integer | | not null |
b | integer | | |
c | integer | | |
Indexes:
"foo_pkey" PRIMARY KEY, btree (a)
"foo_b_idx" btree (b)
"foo_c_idx" btree (c)
drop table foo;

View File

@@ -1,11 +0,0 @@
# ----------
# src/test/regress/parallel_schedule
#
# By convention, we put no more than twenty tests in any one parallel group;
# this limits the number of connections needed to run the tests.
# ----------
test: zenith-cid
test: zenith-rel-truncate
test: zenith-clog
test: zenith-vacuum-full

View File

@@ -1,6 +0,0 @@
# src/test/regress/serial_schedule
# This should probably be in an order similar to parallel_schedule.
test: zenith-cid
test: zenith-rel-truncate
test: zenith-clog
test: zenith-vacuum-full

View File

@@ -1,8 +0,0 @@
/constraints.sql
/copy.sql
/create_function_1.sql
/create_function_2.sql
/largeobject.sql
/misc.sql
/security_label.sql
/tablespace.sql

View File

@@ -1,26 +0,0 @@
BEGIN;
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
CREATE TABLE cursor (a int);
INSERT INTO cursor VALUES (1);
DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
UPDATE cursor SET a = 2;
FETCH ALL FROM c1;
COMMIT;
DROP TABLE cursor;
create table to_be_evicted(x bigint);
begin;
insert into to_be_evicted values (1);
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
insert into to_be_evicted select x*10 from to_be_evicted;
select sum(x) from to_be_evicted;
end;
drop table to_be_evicted;

View File

@@ -1,16 +0,0 @@
create or replace procedure do_commits() as $$
declare
xid xid8;
i integer;
begin
for i in 1..1000000 loop
xid = txid_current();
commit;
if (pg_xact_status(xid) <> 'committed') then
raise exception 'CLOG corruption';
end if;
end loop;
end;
$$ language plpgsql;
call do_commits();

View File

@@ -1,18 +0,0 @@
--
-- Test that when a relation is truncated by VACUUM, the next smgrnblocks()
-- query to get the relation's size returns the new size.
-- (This isn't related to the TRUNCATE command, which works differently,
-- by creating a new relation file)
--
CREATE TABLE truncatetest (i int);
INSERT INTO truncatetest SELECT g FROM generate_series(1, 10000) g;
-- Remove all the rows, and run VACUUM to remove the dead tuples and
-- truncate the physical relation to 0 blocks.
DELETE FROM truncatetest;
VACUUM truncatetest;
-- Check that a SeqScan sees correct relation size (which is now 0)
SELECT * FROM truncatetest;
DROP TABLE truncatetest;

View File

@@ -1,51 +0,0 @@
create table foo(a int primary key, b int, c int);
insert into foo values (generate_series(1,10000), generate_series(1,10000), generate_series(1,10000));
create index concurrently on foo(b);
create index concurrently on foo(c);
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
vacuum full foo;
\d foo
drop table foo;

walkeeper/Cargo.lock generated Normal file

File diff suppressed because it is too large

View File

@@ -20,20 +20,13 @@ slog = "2.7.0"
log = "0.4.14"
clap = "2.33.0"
daemonize = "0.4.1"
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
tokio = { version = "1.3.0", features = ["full"] }
tokio-stream = { version = "0.1.4" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
anyhow = "1.0"
crc32c = "0.6.0"
parse_duration = "2.1.1"
walkdir = "2"
serde = { version = "1.0", features = ["derive"] }
hex = "0.4.3"
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
pageserver = { path = "../pageserver" }
postgres_ffi = { path = "../postgres_ffi" }
workspace_hack = { path = "../workspace_hack" }
zenith_utils = { path = "../zenith_utils" }

View File

@@ -1,282 +0,0 @@
# Proxy-safekeeper communication consensus protocol
## General requirements and architecture
There is a single stateless master and several safekeepers; the number of safekeepers is determined by the desired redundancy level.
To minimize the number of changes in the Postgres core, we use standard streaming replication from the master (through the WAL sender).
This replication stream is initiated by `safekeeper_proxy`, which receives data from the master and broadcasts it to the safekeepers.
To provide durability we use synchronous replication at the master (the response to a commit statement is sent to the client
only when acknowledged by the WAL receiver). `safekeeper_proxy` sends this acknowledgment only when the LSN of the commit record is confirmed by a quorum of safekeepers.
`safekeeper_proxy` tries to establish connections with the safekeepers.
At any moment each safekeeper can serve exactly one proxy, but it can still accept new connections.
Any safekeeper can be used as a WAL server producing a replication stream, so both `Pagers` and `Replicas`
(read-only computation nodes) can connect to a safekeeper to receive the WAL stream. A safekeeper streams WAL until
it reaches min(`commitLSN`, `flushLSN`); then replication is suspended until new data arrives from the master.
## Handshake
The goal of the handshake is to collect a quorum (so that recovery can be performed)
and to avoid split-brain caused by the simultaneous presence of an old and a new master.
The handshake consists of the following steps:
1. Broadcast information about the server to all safekeepers (WAL segment size, system_id, ...).
2. Receive responses with information about the safekeepers.
3. Once a quorum of handshake responses has been received, propose a new `NodeId(max(term)+1, server.uuid)`
to all of them.
4. On receiving the proposed nodeId, a safekeeper compares it with its locally stored nodeId; if the proposal is greater or equal,
it accepts the proposed nodeId and persists this choice in its local control file (a sketch of this comparison follows the list).
5. If a quorum of safekeepers approves the proposed nodeId, the server assumes the handshake has completed successfully and switches to the recovery stage.
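A minimal sketch of the acceptance rule from step 4, assuming a `NodeId` ordered lexicographically by `(term, uuid)`; the type and the `persist_control_file` helper are illustrative and not taken from the repository:
```rust
// Hedged sketch of step 4: accept a proposed nodeId iff it is >= the stored one,
// and persist the decision before acknowledging it. Names are illustrative only.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
struct NodeId {
    term: u64,
    uuid: u128, // stands in for the proposing server's UUID
}

struct SafekeeperState {
    node_id: NodeId,
}

impl SafekeeperState {
    fn handle_proposal(&mut self, proposed: NodeId) -> bool {
        if proposed >= self.node_id {
            self.node_id = proposed;
            persist_control_file(&self.node_id); // must be durable in the real protocol
            true // accepted
        } else {
            false // rejected: a newer master has already been seen
        }
    }
}

fn persist_control_file(_node_id: &NodeId) {
    // placeholder for fsync'ing the safekeeper control file
}
```
Persisting the accepted nodeId before replying is what lets the promise survive a safekeeper restart.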
## Recovery
The proxy computes max(`restartLSN`) and max(`flushLSN`) over a quorum of attached safekeepers.
`RestartLSN` is the position in the WAL that is known to be delivered to all safekeepers.
In other words, `restartLSN` can also be considered a cut-off horizon: all preceding WAL segments can be removed.
`FlushLSN` is the position flushed by a safekeeper to its local persistent storage.
If max(`restartLSN`) != max(`flushLSN`), recovery has to be performed.
The proxy creates a replication channel with the most advanced safekeeper (the one with the largest `flushLSN`).
Then it downloads all WAL messages in the range max(`restartLSN`)..max(`flushLSN`).
Messages are inserted into an L1-list ordered by LSN. We then locate the position of each safekeeper in this list according
to its `flushLSN`. Safekeepers that are not yet connected (outside the quorum) start from the beginning of the list
(corresponding to `restartLSN`).
We need to choose max(`flushLSN`) because the voting quorum may differ from the quorum that committed the last message.
Since we do not know whether the records up to max(`flushLSN`) were committed by a quorum or not, we have to consider them committed
to avoid losing committed data.
The calculated max(`flushLSN`) is called the `VCL` (Volume Complete LSN). Since it is chosen among a quorum, there may be some other, offline safekeeper
whose WAL extends beyond the `VCL`. Once it comes back online, we need to overwrite its WAL beyond the `VCL`. To support this, each safekeeper maintains
an `epoch` number. `Epoch` plays almost the same role as `term`, but the algorithm for bumping the `epoch` is different.
The `VCL` and the new epoch are received by a safekeeper from the proxy during voting.
However, a safekeeper does not switch to the new epoch immediately after voting.
Instead, it waits until a record with LSN > max(`flushLSN`, `VCL`) is received.
This means that we first restore all records from the old generation and only then switch to the new generation.
When the proxy calculates max(`flushLSN`), it first compares the `epoch`; so effectively we compare (`epoch`, `flushLSN`) pairs.
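As a small illustration of that comparison (the names here are illustrative, not the repository's actual types), the most advanced safekeeper and the VCL can be picked like this:
```rust
// Hedged sketch: pick the recovery source by the maximal (epoch, flush_lsn)
// pair among the voting quorum; its flush_lsn becomes the VCL.
type Lsn = u64;

#[derive(Clone, Copy, Debug)]
struct SafekeeperVote {
    epoch: u64,
    flush_lsn: Lsn,
}

/// Returns the index of the most advanced safekeeper and the VCL.
fn choose_recovery_source(quorum: &[SafekeeperVote]) -> Option<(usize, Lsn)> {
    quorum
        .iter()
        .enumerate()
        .max_by_key(|(_, sk)| (sk.epoch, sk.flush_lsn))
        .map(|(idx, sk)| (idx, sk.flush_lsn))
}
```
Recovery is needed whenever max(`restartLSN`) is behind the VCL; the proxy then streams that range from the chosen safekeeper.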
Let's look at some examples. Suppose we have three safekeepers S1, S2, S3. Si(N) means that the i-th safekeeper has epoch=N,
and Ri(x) denotes a WAL record for resource x with LSN=i. Assume we have the following state:
```
S1(1): R1(a)
S2(1): R1(a),R2(b)
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
```
The proxy chooses quorum (S1,S2); their VCL is 2. We download S2's WAL to the proxy and schedule its write to S1.
After receiving a new record R3(e) from the master, the picture can be:
```
S1(2): R1(a),R2(b),R3(e)
S2(2): R1(a),R2(b),R3(e)
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
```
Now, if the server crashes or restarts, we perform a new round of voting, and it
doesn't matter which quorum we choose: (S1,S2), (S2,S3), ...
In any case VCL=3, because S3 has a smaller epoch.
R3(c) will be overwritten with R3(e):
```
S1(3): R1(a),R2(b),R3(e)
S2(3): R1(a),R2(b),R3(e)
S3(1): R1(a),R2(b),R3(e),R4(d)
```
The epoch of S3 will be adjusted once it overwrites R4:
```
S1(3): R1(a),R2(b),R3(e),R4(f)
S2(3): R1(a),R2(b),R3(e),R4(f)
S3(3): R1(a),R2(b),R3(e),R4(f)
```
A crash can also happen before the epoch has been bumped. Let's return to the initial state:
```
S1(1): R1(a)
S2(1): R1(a),R2(b)
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
```
Assume that we start recovery:
```
S1(1): R1(a),R2(b)
S2(1): R1(a),R2(b)
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
```
and then a crash happens. During voting we choose a quorum that includes S3, say (S1,S3).
Both members now belong to the same epoch and S3 is the most advanced of them,
so VCL is set to 4 and we recover S1 and S2 from S3:
```
S1(1): R1(a),R2(b),R3(c),R4(d)
S2(1): R1(a),R2(b),R3(c),R4(d)
S3(1): R1(a),R2(b),R3(c),R4(d)
```
## Main loop
Once recovery is complete, the proxy switches to its normal processing loop: it receives the WAL stream from the master and appends WAL
messages to the list, while at the same time pushing messages to the safekeepers. Each safekeeper is associated
with some element of the message list, and once it acknowledges receipt of that message, its position is moved forward.
Each queue element contains an acknowledgment mask whose bits correspond to the safekeepers.
Once all safekeepers have acknowledged a message (by setting their corresponding bits),
the element can be removed from the queue and `restartLSN` is advanced.
The proxy maintains `restartLSN` and `commitLSN` based on the responses received from the safekeepers.
`RestartLSN` equals the LSN of the head message in the list. `CommitLSN` is the `flushLSN[nSafekeepers - quorum]` element
of the safekeepers' `flushLSN`s sorted in ascending order, so that at least a quorum of safekeepers has flushed it. `CommitLSN` and `restartLSN` are included in the requests
sent from the proxy to the safekeepers and stored in each safekeeper's control file.
To avoid the overhead of extra fsyncs, this control file is not fsynced on each request; it is flushed
periodically, which means that the `restartLSN`/`commitLSN` stored by a safekeeper may lag slightly behind.
This is not critical, because it can only cause redundant processing of some WAL records,
and `flushLSN` is recalculated after a node restart by scanning the local WAL files.
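For illustration, a minimal sketch of that `commitLSN` rule, assuming the flush LSNs are sorted in ascending order (the function name is made up; it mirrors the `get_commit_lsn` pseudocode below):
```rust
// Hedged sketch of the commitLSN rule: take the (n - quorum)-th element of the
// ascending-sorted flush LSNs, so at least `quorum` safekeepers have flushed it.
type Lsn = u64;

fn commit_lsn(flush_lsns: &[Lsn], quorum: usize) -> Option<Lsn> {
    if quorum == 0 || flush_lsns.len() < quorum {
        return None; // not enough acknowledgments yet
    }
    let mut sorted = flush_lsns.to_vec();
    sorted.sort_unstable();
    Some(sorted[sorted.len() - quorum])
}

fn main() {
    // Three safekeepers, quorum of two: commitLSN is the second-largest flush LSN.
    assert_eq!(commit_lsn(&[10, 30, 20], 2), Some(20));
}
```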
## Fault tolerance
Once `safekeeper_proxy` loses its connection to a safekeeper, it tries to re-establish that connection using the same nodeId.
If `safekeeper_proxy` loses its connection to the master, it is terminated. Right now the safekeeper is a standalone process
that can be launched on any node, but it can also be spawned as a background worker of the master, so that it is automatically
restarted when the Postgres instance restarts.
A restart of `safekeeper_proxy` initiates a new round of voting and a switch to a new epoch.
## Limitations
Right now the message queue is kept in main memory and is not spilled to disk,
which can cause memory overflow when some safekeepers lag behind.
It is assumed that if a safekeeper loses its local data, the data is recovered using some external mechanism.
## Glossary
* `CommitLSN`: position in the WAL confirmed by a quorum of safekeepers.
* `RestartLSN`: position in the WAL confirmed by all safekeepers.
* `FlushLSN`: position up to which the WAL has been persisted to disk by a safekeeper.
* `NodeID`: the pair (term, UUID).
* `Pager`: Zenith component restoring pages from the WAL stream.
* `Replica`: read-only computation node.
* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
## Algorithm
```python
process SafekeeperProxy(safekeepers,server,curr_epoch,restart_lsn=0,message_queue={},feedbacks={})
function do_recovery(epoch,restart_lsn,VCL)
leader = i:safekeepers[i].state.epoch=epoch and safekeepers[i].state.flushLsn=VCL
wal_stream = safekeepers[leader].start_replication(restart_lsn,VCL)
do
message = wal_stream.read()
message_queue.append(message)
while message.startPos < VCL
for i in 1..safekeepers.size()
for message in message_queue
if message.endLsn < safekeepers[i].state.flushLsn
message.delivered += i
else
send_message(i, message)
break
end function
function send_message(i,msg)
msg.restartLsn = restart_lsn
msg.commitLsn = get_commit_lsn()
safekeepers[i].send(msg, response_handler)
end function
function do_broadcast(message)
for i in 1..safekeepers.size()
if not safekeepers[i].sending()
send_message(i, message)
end function
function get_commit_lsn()
sorted_feedbacks = feedbacks.sort()
return sorted_feedbacks[safekeepers.size() - quorum]
end function
function response_handler(i,message,response)
feedbacks[i] = if response.epoch=curr_epoch then response.flushLsn else VCL
server.write(get_commit_lsn())
message.delivered += i
next_message = message_queue.next(message)
if next_message
send_message(i, next_message)
while message_queue.head.delivered.size() = safekeepers.size()
if restart_lsn < message_queue.head.beginLsn
restart_lsn = message_queue.head.endLsn
message_queue.pop_head()
end function
server_info = server.read()
safekeepers.write(server_info)
safekeepers.state = safekeepers.read()
next_term = max(safekeepers.state.nodeId.term)+1
restart_lsn = max(safekeepers.state.restartLsn)
epoch,VCL = max(safekeepers.state.epoch,safekeepers.state.flushLsn)
curr_epoch = epoch + 1
proposal = Proposal(NodeId(next_term,server.id),curr_epoch,VCL)
safekeepers.send(proposal)
responses = safekeepers.read()
if any responses.is_rejected()
exit()
for i in 1..safekeepers.size()
feedbacks[i].flushLsn = if epoch=safekeepers[i].state.epoch then safekeepers[i].state.flushLsn else restart_lsn
if restart_lsn != VCL
do_recovery(epoch,restart_lsn,VCL)
wal_stream = server.start_replication(VCL)
for ever
message = wal_stream.read()
message_queue.append(message)
do_broadcast(message)
end process
process safekeeper(gateway,state)
function handshake()
proxy = gateway.accept()
server_info = proxy.read()
proxy.write(state)
proposal = proxy.read()
if proposal.nodeId < state.nodeId
proxy.write(rejected)
return null
else
state.nodeId = proposal.nodeId
state.proposed_epoch = proposal.epoch
state.VCL = proposal.VCL
write_control_file(state)
proxy.write(accepted)
return proxy
end function
state = read_control_file()
state.flushLsn = locate_end_of_wal()
for ever
proxy = handshake()
if not proxy
continue
for ever
req = proxy.read()
if req.nodeId != state.nodeId
break
save_wal_file(req.data)
state.restartLsn = req.restartLsn
if state.epoch < state.proposed_epoch and req.endPos > max(state.flushLsn,state.VCL)
state.epoch = state.proposed_epoch
if req.endPos > state.flushLsn
state.flushLsn = req.endPos
save_control_file(state)
resp = Response(state.epoch,req.endPos)
proxy.write(resp)
notify_wal_sender(Min(req.commitLsn,req.endPos))
end process
```

View File

@@ -1,19 +1,19 @@
//
// Main entry point for the wal_acceptor executable
//
use anyhow::{Context, Result};
use clap::{App, Arg};
use daemonize::Daemonize;
use log::*;
use parse_duration::parse;
use slog::Drain;
use std::io;
use std::path::{Path, PathBuf};
use std::path::Path;
use std::path::PathBuf;
use std::thread;
use std::time::Duration;
use std::{fs::File, fs::OpenOptions};
use walkeeper::s3_offload;
use anyhow::Result;
use clap::{App, Arg};
use slog::Drain;
use walkeeper::wal_service;
use walkeeper::WalAcceptorConf;
@@ -45,13 +45,8 @@ fn main() -> Result<()> {
Arg::with_name("pageserver")
.short("p")
.long("pageserver")
.takes_value(true),
)
.arg(
Arg::with_name("ttl")
.long("ttl")
.takes_value(true)
.help("interval for keeping WAL as walkeeper node, after which them will be uploaded to S3 and removed locally"),
.help("address ip:port of pageserver with which wal_acceptor should establish connection"),
)
.arg(
Arg::with_name("daemonize")
@@ -74,12 +69,11 @@ fn main() -> Result<()> {
let mut conf = WalAcceptorConf {
data_dir: PathBuf::from("./"),
systemid,
systemid: systemid,
daemonize: false,
no_sync: false,
pageserver_addr: None,
listen_addr: "127.0.0.1:5454".parse()?,
ttl: None,
};
if let Some(dir) = arg_matches.value_of("datadir") {
@@ -105,26 +99,12 @@ fn main() -> Result<()> {
conf.pageserver_addr = Some(addr.parse().unwrap());
}
if let Some(ttl) = arg_matches.value_of("ttl") {
conf.ttl = Some::<Duration>(parse(ttl)?);
}
start_wal_acceptor(conf)
}
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
let log_filename = conf.data_dir.join("wal_acceptor.log");
// Don't open the same file for output multiple times;
// the different fds could overwrite each other's output.
let log_file = OpenOptions::new()
.create(true)
.append(true)
.open(&log_filename)
.with_context(|| format!("failed to open {:?}", &log_filename))?;
// Initialize logger
let logger_file = log_file.try_clone().unwrap();
let _scope_guard = init_logging(&conf, logger_file)?;
let _scope_guard = init_logging(&conf)?;
let _log_guard = slog_stdlog::init().unwrap();
// Note: this `info!(...)` macro comes from `log` crate
info!("standard logging redirected to slog");
@@ -134,8 +114,16 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
// that we will see any accidental manual fprintf's or backtraces.
let stdout = log_file.try_clone().unwrap();
let stderr = log_file;
let stdout = OpenOptions::new()
.create(true)
.append(true)
.open("wal_acceptor.log")
.unwrap();
let stderr = OpenOptions::new()
.create(true)
.append(true)
.open("wal_acceptor.log")
.unwrap();
let daemonize = Daemonize::new()
.pid_file("wal_acceptor.pid")
@@ -150,27 +138,11 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
}
let mut threads = Vec::new();
if conf.ttl.is_some() {
let s3_conf = conf.clone();
let s3_offload_thread = thread::Builder::new()
.name("S3 offload thread".into())
.spawn(|| {
// thread code
s3_offload::thread_main(s3_conf);
})
.unwrap();
threads.push(s3_offload_thread);
}
let wal_acceptor_thread = thread::Builder::new()
.name("WAL acceptor thread".into())
.spawn(|| {
// thread code
let thread_result = wal_service::thread_main(conf);
if let Err(e) = thread_result {
info!("wal_service thread terminated: {}", e);
}
wal_service::thread_main(conf);
})
.unwrap();
threads.push(wal_acceptor_thread);
@@ -181,11 +153,14 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
Ok(())
}
fn init_logging(
conf: &WalAcceptorConf,
log_file: File,
) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
fn init_logging(conf: &WalAcceptorConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
if conf.daemonize {
let log = conf.data_dir.join("wal_acceptor.log");
let log_file = File::create(&log).map_err(|err| {
// We failed to initialize logging, so we can't log this message with error!
eprintln!("Could not create log file {:?}: {}", log, err);
err
})?;
let decorator = slog_term::PlainSyncDecorator::new(log_file);
let drain = slog_term::CompactFormat::new(decorator).build();
let drain = std::sync::Mutex::new(drain).fuse();

View File

@@ -1,15 +1,10 @@
//
use std::net::SocketAddr;
use std::path::PathBuf;
use std::time::Duration;
pub mod pq_protocol;
pub mod receive_wal;
pub mod replication;
pub mod s3_offload;
pub mod send_wal;
pub mod timeline;
mod pq_protocol;
pub mod wal_service;
pub mod xlog_utils;
use crate::pq_protocol::SystemId;
@@ -21,5 +16,4 @@ pub struct WalAcceptorConf {
pub no_sync: bool,
pub listen_addr: SocketAddr,
pub pageserver_addr: Option<SocketAddr>,
pub ttl: Option<Duration>,
}

View File

@@ -1,15 +1,17 @@
use byteorder::{BigEndian, ReadBytesExt};
use bytes::{BufMut, Bytes, BytesMut};
use byteorder::{BigEndian, ByteOrder};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use pageserver::ZTimelineId;
use std::io::{self, Read};
use std::io;
use std::str;
use std::str::FromStr;
pub type Oid = u32;
pub type SystemId = u64;
pub type Result<T> = std::result::Result<T, io::Error>;
#[derive(Debug)]
pub enum FeMessage {
StartupMessage(FeStartupMessage),
Query(FeQueryMessage),
Terminate,
CopyData(FeCopyData),
@@ -38,7 +40,6 @@ pub struct FeStartupMessage {
pub version: u32,
pub kind: StartupRequestCode,
pub timelineid: ZTimelineId,
pub appname: Option<String>,
}
#[derive(Debug)]
@@ -50,22 +51,28 @@ pub enum StartupRequestCode {
}
impl FeStartupMessage {
pub fn read_from(reader: &mut impl Read) -> io::Result<Self> {
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678;
const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679;
const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680;
let len = reader.read_u32::<BigEndian>()? as usize;
if buf.len() < 4 {
return Ok(None);
}
let len = BigEndian::read_u32(&buf[0..4]) as usize;
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"FeStartupMessage: invalid message length",
"invalid message length",
));
}
if buf.len() < len {
return Ok(None);
}
let version = reader.read_u32::<BigEndian>()?;
let version = BigEndian::read_u32(&buf[4..8]);
let kind = match version {
CANCEL_REQUEST_CODE => StartupRequestCode::Cancel,
@@ -74,25 +81,20 @@ impl FeStartupMessage {
_ => StartupRequestCode::Normal,
};
let params_len = len - 8;
let mut params_bytes = vec![0u8; params_len];
reader.read_exact(params_bytes.as_mut())?;
let params_bytes = &buf[8..len];
let params_str = str::from_utf8(&params_bytes).unwrap();
let params = params_str.split('\0');
let mut options = false;
let mut timelineid: Option<ZTimelineId> = None;
let mut appname: Option<String> = None;
for p in params {
if p == "options" {
options = true;
} else if options {
for opt in p.split(' ') {
if let Some(ztimelineid_str) = opt.strip_prefix("ztimelineid=") {
if opt.starts_with("ztimelineid=") {
// FIXME: rethrow parsing error, don't unwrap
timelineid = Some(ZTimelineId::from_str(ztimelineid_str).unwrap());
} else if let Some(val) = opt.strip_prefix("application_name=") {
appname = Some(val.to_string());
timelineid = Some(ZTimelineId::from_str(&opt[12..]).unwrap());
break;
}
}
break;
@@ -105,12 +107,12 @@ impl FeStartupMessage {
));
}
Ok(FeStartupMessage {
buf.advance(len as usize);
Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
version,
kind,
appname,
timelineid: timelineid.unwrap(),
})
})))
}
}
@@ -196,28 +198,44 @@ impl<'a> BeMessage<'a> {
}
impl FeMessage {
pub fn read_from(reader: &mut impl Read) -> io::Result<FeMessage> {
let tag = reader.read_u8()?;
let len = reader.read_u32::<BigEndian>()?;
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
if buf.len() < 5 {
let to_read = 5 - buf.len();
buf.reserve(to_read);
return Ok(None);
}
let tag = buf[0];
let len = BigEndian::read_u32(&buf[1..5]);
if len < 4 {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"FeMessage: invalid message length",
"invalid message length: parsing u32",
));
}
let body_len = (len - 4) as usize;
let mut body = vec![0u8; body_len];
reader.read_exact(&mut body)?;
let total_len = len as usize + 1;
if buf.len() < total_len {
let to_read = total_len - buf.len();
buf.reserve(to_read);
return Ok(None);
}
let mut body = buf.split_to(total_len);
body.advance(5);
match tag {
b'Q' => Ok(FeMessage::Query(FeQueryMessage { body: body.into() })),
b'd' => Ok(FeMessage::CopyData(FeCopyData { body: body.into() })),
b'X' => Ok(FeMessage::Terminate),
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage {
body: body.freeze(),
}))),
b'd' => Ok(Some(FeMessage::CopyData(FeCopyData {
body: body.freeze(),
}))),
b'X' => Ok(Some(FeMessage::Terminate)),
tag => Err(io::Error::new(
io::ErrorKind::InvalidInput,
format!("unknown message tag: {},'{:?}'", tag, body),
format!("unknown message tag: {},'{:?}'", tag, buf),
)),
}
}
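For context on how the rewritten `parse` functions above are meant to be driven: they take a `BytesMut`, return `Ok(None)` while the buffer does not yet hold a complete message, and consume the message bytes once it does. Below is a hedged, illustrative read loop built on that contract; `drive_parser` is not a function from the repository.
```rust
// Hedged sketch of the buffer-driven parsing contract introduced in this diff:
// the parser returns Ok(None) when the buffer lacks a full message, and the
// caller reads more bytes and retries.
use bytes::BytesMut;
use std::io::{self, Read};

fn drive_parser<R, M>(
    reader: &mut R,
    mut parse: impl FnMut(&mut BytesMut) -> io::Result<Option<M>>,
    mut on_message: impl FnMut(M),
) -> io::Result<()>
where
    R: Read,
{
    let mut buf = BytesMut::with_capacity(8 * 1024);
    let mut tmp = [0u8; 4096];
    loop {
        // Drain every complete message currently buffered.
        while let Some(msg) = parse(&mut buf)? {
            on_message(msg);
        }
        // Incomplete message: read more bytes from the stream.
        let n = reader.read(&mut tmp)?;
        if n == 0 {
            return Ok(()); // peer closed the connection
        }
        buf.extend_from_slice(&tmp[..n]);
    }
}
```
With the types from this diff, a call would look roughly like `drive_parser(&mut stream, FeMessage::parse, |m| println!("{:?}", m))` for any `std::io::Read` source.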

Some files were not shown because too many files have changed in this diff.