mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-22 15:41:15 +00:00
Compare commits
192 Commits
embedded_w
...
drop_trunc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e95c2066b0 | ||
|
|
f2e57e1348 | ||
|
|
736d387f7b | ||
|
|
e6926d1794 | ||
|
|
a29fa4b8be | ||
|
|
1ccf82f932 | ||
|
|
b1a424dfa9 | ||
|
|
263acef1cc | ||
|
|
7c73afc1af | ||
|
|
e8f0a9bb80 | ||
|
|
6f9175ca2d | ||
|
|
69fa10ff86 | ||
|
|
d5fe515363 | ||
|
|
6a9c036ac1 | ||
|
|
6f9a582973 | ||
|
|
a0e23e6f3f | ||
|
|
84508d4f68 | ||
|
|
fb230dcf32 | ||
|
|
4aabc9a682 | ||
|
|
0fe81b2993 | ||
|
|
eb1f1a347d | ||
|
|
064aa44a06 | ||
|
|
d6ee61b5cf | ||
|
|
4b78a16b82 | ||
|
|
c093ee5e4b | ||
|
|
7685372cae | ||
|
|
ce54133ec4 | ||
|
|
610e14a7fc | ||
|
|
35a1c3d521 | ||
|
|
22b7e74c83 | ||
|
|
d95e1da742 | ||
|
|
40d047c146 | ||
|
|
42f3dd47d2 | ||
|
|
c2b2ab974c | ||
|
|
6ad6e5bd84 | ||
|
|
d534aeb9e1 | ||
|
|
d45839879c | ||
|
|
1f6ca23db6 | ||
|
|
2127a65e27 | ||
|
|
ecf2d181c4 | ||
|
|
c1bfa32771 | ||
|
|
8465738aa5 | ||
|
|
87d7ce816d | ||
|
|
f38c2e620e | ||
|
|
86056abd0e | ||
|
|
2bf2dd1d88 | ||
|
|
874d82fd4c | ||
|
|
3645133700 | ||
|
|
20b6279beb | ||
|
|
06f96f9600 | ||
|
|
b5f60f3874 | ||
|
|
0ec56cd21f | ||
|
|
600e1a0080 | ||
|
|
9c94a34ae7 | ||
|
|
9c0ac251df | ||
|
|
872ed24408 | ||
|
|
2f25d17e11 | ||
|
|
8faa6fa392 | ||
|
|
4d5a41301d | ||
|
|
4c35b22626 | ||
|
|
9fe3b73e13 | ||
|
|
e0146304e6 | ||
|
|
fbb04c592a | ||
|
|
8f43d7637c | ||
|
|
cf30303d8f | ||
|
|
1ec157653e | ||
|
|
858ca3a4ce | ||
|
|
d744ddee7c | ||
|
|
3296b7d770 | ||
|
|
2148ae78ab | ||
|
|
78dcf2207e | ||
|
|
74b78608d9 | ||
|
|
a11558b84f | ||
|
|
513696a485 | ||
|
|
cedc2eb5c2 | ||
|
|
e3e593f571 | ||
|
|
c12e393e74 | ||
|
|
d59cb2ca7a | ||
|
|
58f34a8d76 | ||
|
|
31462f4b71 | ||
|
|
538f903861 | ||
|
|
e6a7241c3a | ||
|
|
709b778904 | ||
|
|
aa8debf4e8 | ||
|
|
1912546e52 | ||
|
|
a6178c135f | ||
|
|
2ff16da6af | ||
|
|
21ea70c8f5 | ||
|
|
2b2d24433a | ||
|
|
66bced0f36 | ||
|
|
9ba7bc2695 | ||
|
|
8624bddc79 | ||
|
|
45b1495f37 | ||
|
|
23be5021f8 | ||
|
|
f954d5c501 | ||
|
|
ab2f0ad1a8 | ||
|
|
52fbcbde0a | ||
|
|
e602807476 | ||
|
|
398d522d88 | ||
|
|
746f667311 | ||
|
|
53ea6702bd | ||
|
|
952424b78c | ||
|
|
d737c40eec | ||
|
|
532918e13d | ||
|
|
b266c28345 | ||
|
|
04dc698d4b | ||
|
|
6b11b4250e | ||
|
|
15d1c1f8bf | ||
|
|
9ece1e863d | ||
|
|
2870150365 | ||
|
|
7b281900f9 | ||
|
|
97992226d3 | ||
|
|
270356ec38 | ||
|
|
c2db828481 | ||
|
|
71e93faed7 | ||
|
|
54d52e07db | ||
|
|
4dccdb33ab | ||
|
|
38c4b6f02f | ||
|
|
6ff3f1b9fd | ||
|
|
4c5e23d014 | ||
|
|
99d80aba52 | ||
|
|
2f2dff4c8d | ||
|
|
22e7fcbf2d | ||
|
|
372617a4f5 | ||
|
|
49d1921a28 | ||
|
|
d8e509d29e | ||
|
|
d5bfe84d9e | ||
|
|
8fff26ad49 | ||
|
|
5f4e32f505 | ||
|
|
fb71c85a79 | ||
|
|
ff76226a35 | ||
|
|
6e748147b6 | ||
|
|
e5df42feef | ||
|
|
73647e5715 | ||
|
|
95db33f3f9 | ||
|
|
bace19ffbe | ||
|
|
60d66267a9 | ||
|
|
294320e6a8 | ||
|
|
28b4d9abb3 | ||
|
|
8d8bc304c1 | ||
|
|
4788248e11 | ||
|
|
0cbb3798da | ||
|
|
36c12247b9 | ||
|
|
1767208563 | ||
|
|
d25656797c | ||
|
|
6c825dcbaa | ||
|
|
4b46693c81 | ||
|
|
8952066ecb | ||
|
|
d26b76fe7c | ||
|
|
df5a55c445 | ||
|
|
e5e5c3e067 | ||
|
|
b7575582b8 | ||
|
|
77fd24b950 | ||
|
|
61af9bb889 | ||
|
|
a68f60415b | ||
|
|
e7ca580922 | ||
|
|
33d126ecbe | ||
|
|
15db0d1d6f | ||
|
|
29f122009a | ||
|
|
bf0a0cb55d | ||
|
|
0fe5abadf5 | ||
|
|
1591f058c6 | ||
|
|
efa4ecaa7c | ||
|
|
8e57c2e413 | ||
|
|
4dd63821bd | ||
|
|
eeec1a3dcb | ||
|
|
b484b896b6 | ||
|
|
e5413be5fa | ||
|
|
b9c0d22045 | ||
|
|
2e0d45d092 | ||
|
|
86932c20eb | ||
|
|
f5b45a172c | ||
|
|
e6a0987182 | ||
|
|
aa64391265 | ||
|
|
aac913f9dc | ||
|
|
4e2e5bb4e6 | ||
|
|
3e15a5c325 | ||
|
|
ce646ea845 | ||
|
|
effcabb590 | ||
|
|
a08dfb1c2c | ||
|
|
a3818dee58 | ||
|
|
219cbe2d9c | ||
|
|
129f85f652 | ||
|
|
790f1b05c6 | ||
|
|
37cd662ab2 | ||
|
|
277a4d4582 | ||
|
|
1cdeba9db7 | ||
|
|
7d104e5660 | ||
|
|
49530145d8 | ||
|
|
da96965897 | ||
|
|
3762b53986 | ||
|
|
9ad99152b8 |
267
.circleci/config.yml
Normal file
267
.circleci/config.yml
Normal file
@@ -0,0 +1,267 @@
|
||||
version: 2.1
|
||||
|
||||
orbs:
|
||||
python: circleci/python@1.4.0
|
||||
|
||||
executors:
|
||||
zenith-build-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
- image: cimg/rust:1.51.0
|
||||
|
||||
jobs:
|
||||
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: zenith-build-executor
|
||||
steps:
|
||||
# Checkout the git repo (circleci doesn't have a flag to enable submodules here)
|
||||
- checkout
|
||||
|
||||
# Grab the postgres git revision to build a cache key.
|
||||
# Note this works even though the submodule hasn't been checkout out yet.
|
||||
- run:
|
||||
name: Get postgres cache key
|
||||
command: |
|
||||
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
|
||||
|
||||
- restore_cache:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
# FIXME We could cache our own docker container, instead of installing packages every time.
|
||||
- run:
|
||||
name: apt install dependencies
|
||||
command: |
|
||||
if [ ! -e tmp_install/bin/postgres ]; then
|
||||
sudo apt update
|
||||
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libxml2-dev libcurl4-openssl-dev
|
||||
fi
|
||||
|
||||
# Build postgres if the restore_cache didn't find a build.
|
||||
# `make` can't figure out whether the cache is valid, since
|
||||
# it only compares file timestamps.
|
||||
- run:
|
||||
name: build postgres
|
||||
command: |
|
||||
if [ ! -e tmp_install/bin/postgres ]; then
|
||||
# "depth 1" saves some time by not cloning the whole repo
|
||||
git submodule update --init --depth 1
|
||||
make postgres
|
||||
fi
|
||||
|
||||
- save_cache:
|
||||
name: Save postgres cache
|
||||
key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
# A job to build zenith rust code
|
||||
build-zenith:
|
||||
executor: zenith-build-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
enum: ["debug", "release"]
|
||||
steps:
|
||||
- run:
|
||||
name: apt install dependencies
|
||||
command: |
|
||||
sudo apt update
|
||||
sudo apt install libssl-dev clang
|
||||
|
||||
# Checkout the git repo (without submodules)
|
||||
- checkout
|
||||
|
||||
# Grab the postgres git revision to build a cache key.
|
||||
# Note this works even though the submodule hasn't been checkout out yet.
|
||||
- run:
|
||||
name: Get postgres cache key
|
||||
command: |
|
||||
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
|
||||
|
||||
- restore_cache:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
keys:
|
||||
# Require an exact match. While an out of date cache might speed up the build,
|
||||
# there's no way to clean out old packages, so the cache grows every time something
|
||||
# changes.
|
||||
- v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
|
||||
# Build the rust code, including test binaries
|
||||
- run:
|
||||
name: Rust build << parameters.build_type >>
|
||||
command: |
|
||||
export CARGO_INCREMENTAL=0
|
||||
BUILD_TYPE="<< parameters.build_type >>"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
echo "Build in debug mode"
|
||||
cargo build --bins --tests
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
echo "Build in release mode"
|
||||
cargo build --release --bins --tests
|
||||
fi
|
||||
|
||||
- save_cache:
|
||||
name: Save rust cache
|
||||
key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
paths:
|
||||
- ~/.cargo/registry
|
||||
- ~/.cargo/git
|
||||
- target
|
||||
|
||||
# Run rust unit tests
|
||||
# FIXME: remove -p zenith_utils once integration tests are moved to python
|
||||
- run: cargo test -p zenith_utils
|
||||
|
||||
# Install the rust binaries, for use by test jobs
|
||||
# `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
|
||||
# FIXME: this is a really silly way to install; maybe we should just output
|
||||
# a tarball as an artifact? Or a .deb package?
|
||||
- run:
|
||||
name: cargo install
|
||||
command: |
|
||||
export CARGO_INCREMENTAL=0
|
||||
BUILD_TYPE="<< parameters.build_type >>"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
echo "Install debug mode"
|
||||
CARGO_FLAGS="--debug"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
echo "Install release mode"
|
||||
# The default is release mode; there is no --release flag.
|
||||
CARGO_FLAGS=""
|
||||
fi
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith
|
||||
|
||||
# Install the postgres binaries, for use by test jobs
|
||||
# FIXME: this is a silly way to do "install"; maybe just output a standard
|
||||
# postgres package, whatever the favored form is (tarball? .deb package?)
|
||||
# Note that pg_regress needs some build artifacts that probably aren't
|
||||
# in the usual package...?
|
||||
- run:
|
||||
name: postgres install
|
||||
command: |
|
||||
cp -a tmp_install /tmp/zenith/pg_install
|
||||
|
||||
# Save the rust output binaries for other jobs in this workflow.
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
run-pytest:
|
||||
#description: "Run pytest"
|
||||
executor: python/default
|
||||
parameters:
|
||||
# pytest args to specify the tests to run.
|
||||
#
|
||||
# This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory,
|
||||
# or '-k foobar' to run tests containing string 'foobar'. See pytest man page
|
||||
# section SPECIFYING TESTS / SELECTING TESTS for details.
|
||||
#
|
||||
# Select the type of Rust build. Must be "release" or "debug".
|
||||
build_type:
|
||||
type: string
|
||||
default: "debug"
|
||||
# This parameter is required, to prevent the mistake of running all tests in one job.
|
||||
test_selection:
|
||||
type: string
|
||||
default: ""
|
||||
# Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
|
||||
extra_params:
|
||||
type: string
|
||||
default: ""
|
||||
needs_postgres_source:
|
||||
type: boolean
|
||||
default: false
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
- checkout
|
||||
- when:
|
||||
condition: << parameters.needs_postgres_source >>
|
||||
steps:
|
||||
- run: git submodule update --init --depth 1
|
||||
- run: pip install pytest psycopg2
|
||||
- run:
|
||||
name: Run pytest
|
||||
working_directory: test_runner
|
||||
environment:
|
||||
- ZENITH_BIN: /tmp/zenith/bin
|
||||
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
|
||||
- TEST_OUTPUT: /tmp/test_output
|
||||
command: |
|
||||
TEST_SELECTION="<< parameters.test_selection >>"
|
||||
EXTRA_PARAMS="<< parameters.extra_params >>"
|
||||
if [ -z "$TEST_SELECTION" ]; then
|
||||
echo "test_selection must be set"
|
||||
exit 1
|
||||
fi
|
||||
# Run the tests.
|
||||
#
|
||||
# The junit.xml file allows CircleCI to display more fine-grained test information
|
||||
# in its "Tests" tab in the results page.
|
||||
pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short $TEST_SELECTION $EXTRA_PARAMS
|
||||
- run:
|
||||
# CircleCI artifacts are preserved one file at a time, so skipping
|
||||
# this step isn't a good idea. If you want to extract the
|
||||
# pageserver state, perhaps a tarball would be a better idea.
|
||||
name: Delete pageserver data
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
for DIR in /tmp/test_output/*; do
|
||||
mv $DIR/repo/pageserver.log $DIR/ || true # ignore errors
|
||||
for PGDIR in $DIR/repo/pgdatadirs/pg?; do
|
||||
echo "PGDIR: $PGDIR"
|
||||
NEW_LOG="${PGDIR##*/}_log"
|
||||
mv $PGDIR/log "$DIR/$NEW_LOG" || true # ignore errors
|
||||
done
|
||||
echo "rm $DIR/repo"
|
||||
rm -rf $DIR/repo
|
||||
done
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
# The store_test_results step tells CircleCI where to find the junit.xml file.
|
||||
- store_test_results:
|
||||
path: /tmp/test_output
|
||||
|
||||
workflows:
|
||||
build_and_test:
|
||||
jobs:
|
||||
- build-postgres
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
requires:
|
||||
- build-postgres
|
||||
- run-pytest:
|
||||
name: pg_regress tests << matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
test_selection: batch_pg_regress
|
||||
needs_postgres_source: true
|
||||
requires:
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: other tests << matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
test_selection: batch_others
|
||||
requires:
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
5
.gitignore
vendored
5
.gitignore
vendored
@@ -2,5 +2,8 @@
|
||||
/tmp_check
|
||||
/tmp_install
|
||||
/tmp_check_cli
|
||||
__pycache__/
|
||||
test_output/
|
||||
.vscode
|
||||
.zenith
|
||||
/.zenith
|
||||
/integration_tests/.zenith
|
||||
|
||||
643
Cargo.lock
generated
643
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -7,4 +7,5 @@ members = [
|
||||
"control_plane",
|
||||
"postgres_ffi",
|
||||
"zenith_utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
6
Makefile
6
Makefile
@@ -32,10 +32,14 @@ postgres-headers: postgres-configure
|
||||
+@echo "Installing PostgreSQL headers"
|
||||
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
|
||||
|
||||
# Compile and install PostgreSQL
|
||||
|
||||
# Compile and install PostgreSQL and contrib/zenith
|
||||
postgres: postgres-configure
|
||||
+@echo "Compiling PostgreSQL"
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
|
||||
+@echo "Compiling contrib/zenith"
|
||||
(cd vendor/postgres/contrib/zenith && \
|
||||
$(MAKE) PG_CONFIG=$(abspath tmp_install)/bin/pg_config install USE_PGXS=1)
|
||||
|
||||
postgres-clean:
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
|
||||
|
||||
111
README.md
111
README.md
@@ -4,49 +4,96 @@ Zenith substitutes PostgreSQL storage layer and redistributes data across a clus
|
||||
|
||||
## Running local installation
|
||||
|
||||
1. Build zenith and patched postgres
|
||||
1. Install build dependencies and other useful packages
|
||||
|
||||
On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||
```text
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison \
|
||||
libxml2-dev libcurl4-openssl-dev libssl-dev clang
|
||||
```
|
||||
|
||||
[Rust] 1.48 or later is also required.
|
||||
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
|
||||
To run the integration tests (not required to use the code), install
|
||||
Python (3.6 or higher), and install python3 packages with `pip` (called `pip3` on some systems):
|
||||
```
|
||||
pip install pytest psycopg2
|
||||
```
|
||||
|
||||
2. Build zenith and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/libzenith/zenith.git
|
||||
cd zenith
|
||||
make
|
||||
make -j5
|
||||
```
|
||||
|
||||
2. Start pageserver and postggres on top of it (should be called from repo root):
|
||||
3. Start pageserver and postgres on top of it (should be called from repo root):
|
||||
```sh
|
||||
# Create ~/.zenith with proper paths to binaries and data
|
||||
# Create repository in .zenith with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
>./target/debug/zenith init
|
||||
> ./target/debug/zenith init
|
||||
<...>
|
||||
new zenith repository was created in .zenith
|
||||
|
||||
# start pageserver
|
||||
> ./target/debug/zenith pageserver start
|
||||
Starting pageserver at '127.0.0.1:64000'
|
||||
> ./target/debug/zenith start
|
||||
Starting pageserver at '127.0.0.1:64000' in .zenith
|
||||
Pageserver started
|
||||
|
||||
# create and configure postgres data dir
|
||||
> ./target/debug/zenith pg create
|
||||
Creating new postgres: path=/Users/user/code/zenith/tmp_check_cli/compute/pg1 port=55432
|
||||
Database initialized
|
||||
# start postgres on top on the pageserver
|
||||
> ./target/debug/zenith pg start main
|
||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
|
||||
waiting for server to start.... done
|
||||
|
||||
# start it
|
||||
> ./target/debug/zenith pg start pg1
|
||||
|
||||
# look up status and connection info
|
||||
> ./target/debug/zenith pg list
|
||||
NODE ADDRESS STATUS
|
||||
pg1 127.0.0.1:55432 running
|
||||
# check list of running postgres instances
|
||||
> ./target/debug/zenith pg list
|
||||
BRANCH ADDRESS LSN STATUS
|
||||
main 127.0.0.1:55432 0/1609610 running
|
||||
```
|
||||
|
||||
3. Now it is possible to connect to postgres and run some queries:
|
||||
```
|
||||
4. Now it is possible to connect to postgres and run some queries:
|
||||
```text
|
||||
> psql -p55432 -h 127.0.0.1 postgres
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
CREATE TABLE
|
||||
postgres=# insert into t values(1,1);
|
||||
INSERT 0 1
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
```
|
||||
|
||||
5. And create branches and run postgres on them:
|
||||
```sh
|
||||
# create branch named migration_check
|
||||
> ./target/debug/zenith branch migration_check main
|
||||
Created branch 'migration_check' at 0/1609610
|
||||
|
||||
# check branches tree
|
||||
> ./target/debug/zenith branch
|
||||
main
|
||||
┗━ @0/1609610: migration_check
|
||||
|
||||
# start postgres on that branch
|
||||
> ./target/debug/zenith pg start migration_check
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
|
||||
waiting for server to start.... done
|
||||
|
||||
# this new postgres instance will have all the data from 'main' postgres,
|
||||
# but all modifications would not affect data in original postgres
|
||||
> psql -p55433 -h 127.0.0.1 postgres
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
|
||||
postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
```
|
||||
|
||||
## Running tests
|
||||
@@ -54,38 +101,34 @@ postgres=# select * from t;
|
||||
```sh
|
||||
git clone --recursive https://github.com/libzenith/zenith.git
|
||||
make # builds also postgres and installs it to ./tmp_install
|
||||
cargo test -- --test-threads=1
|
||||
cd test_runner
|
||||
pytest
|
||||
```
|
||||
|
||||
## Source tree layout
|
||||
|
||||
/walkeeper:
|
||||
`/walkeeper`:
|
||||
|
||||
WAL safekeeper. Written in Rust.
|
||||
|
||||
/pageserver:
|
||||
`/pageserver`:
|
||||
|
||||
Page Server. Written in Rust.
|
||||
|
||||
Depends on the modified 'postgres' binary for WAL redo.
|
||||
|
||||
/integration_tests:
|
||||
|
||||
Tests with different combinations of a Postgres compute node, WAL safekeeper and Page Server.
|
||||
|
||||
/mgmt-console:
|
||||
|
||||
Web UI to launch (modified) Postgres servers, using S3 as the backing store. Written in Python.
|
||||
This is somewhat outdated, as it doesn't use the WAL safekeeper or Page Servers.
|
||||
|
||||
/vendor/postgres:
|
||||
`/vendor/postgres`:
|
||||
|
||||
PostgreSQL source tree, with the modifications needed for Zenith.
|
||||
|
||||
/vendor/postgres/src/bin/safekeeper:
|
||||
`/vendor/postgres/src/bin/safekeeper`:
|
||||
|
||||
Extension (safekeeper_proxy) that runs in the compute node, and connects to the WAL safekeepers
|
||||
and streams the WAL
|
||||
|
||||
`/test_runner`:
|
||||
|
||||
Integration tests, written in Python using the `pytest` framework.
|
||||
|
||||
|
||||
[Rust]: https://www.rust-lang.org/learn/get-started
|
||||
|
||||
@@ -9,19 +9,22 @@ edition = "2018"
|
||||
[dependencies]
|
||||
rand = "0.8.3"
|
||||
tar = "0.4.33"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
|
||||
serde = ""
|
||||
serde_derive = ""
|
||||
toml = ""
|
||||
lazy_static = ""
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
toml = "0.5"
|
||||
lazy_static = "1.4"
|
||||
regex = "1"
|
||||
anyhow = "1.0"
|
||||
hex = "0.4.3"
|
||||
# hex = "0.4.3"
|
||||
bytes = "1.0.1"
|
||||
fs_extra = "1.2.0"
|
||||
# fs_extra = "1.2.0"
|
||||
nix = "0.20"
|
||||
# thiserror = "1"
|
||||
url = "2.2.2"
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
@@ -1,23 +1,24 @@
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Read, Write};
|
||||
use std::io::Write;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, ExitStatus};
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{collections::BTreeMap, path::PathBuf};
|
||||
use std::{
|
||||
fs::{self, OpenOptions},
|
||||
io::Read,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::storage::{PageServerNode, WalProposerNode};
|
||||
use pageserver::{zenith_repo_dir, ZTimelineId};
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
use crate::storage::PageServerNode;
|
||||
|
||||
//
|
||||
// ComputeControlPlane
|
||||
@@ -36,7 +37,7 @@ impl ComputeControlPlane {
|
||||
// it is running on default port. Change that when pageserver will have config.
|
||||
let pageserver = Arc::new(PageServerNode::from_env(&env));
|
||||
|
||||
let pgdatadirspath = env.repo_path.join("pgdatadirs");
|
||||
let pgdatadirspath = &env.pg_data_dirs_path();
|
||||
let nodes: Result<BTreeMap<_, _>> = fs::read_dir(&pgdatadirspath)
|
||||
.with_context(|| format!("failed to list {}", pgdatadirspath.display()))?
|
||||
.into_iter()
|
||||
@@ -79,11 +80,10 @@ impl ComputeControlPlane {
|
||||
&mut self,
|
||||
is_test: bool,
|
||||
timelineid: ZTimelineId,
|
||||
name: &str,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
let node_id = self.nodes.len() as u32 + 1;
|
||||
|
||||
let node = Arc::new(PostgresNode {
|
||||
name: format!("pg{}", node_id),
|
||||
name: name.to_owned(),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), self.get_port()),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
@@ -97,47 +97,66 @@ impl ComputeControlPlane {
|
||||
Ok(node)
|
||||
}
|
||||
|
||||
pub fn new_test_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
|
||||
let node = self.new_from_page_server(true, timelineid);
|
||||
assert!(node.is_ok());
|
||||
pub fn new_test_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
.branch_get_by_name(branch_name)
|
||||
.expect("failed to get timeline_id")
|
||||
.timeline_id;
|
||||
|
||||
let node = self.new_from_page_server(true, timeline_id, branch_name);
|
||||
let node = node.unwrap();
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
"shared_preload_libraries = zenith\n\
|
||||
zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
node.connstr()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_test_master_node(&mut self, timelineid: ZTimelineId) -> Arc<PostgresNode> {
|
||||
let node = self.new_from_page_server(true, timelineid).unwrap();
|
||||
pub fn new_test_master_node(&mut self, branch_name: &str) -> Arc<PostgresNode> {
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
.branch_get_by_name(branch_name)
|
||||
.expect("failed to get timeline_id")
|
||||
.timeline_id;
|
||||
|
||||
let node = self
|
||||
.new_from_page_server(true, timeline_id, branch_name)
|
||||
.unwrap();
|
||||
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
"synchronous_standby_names = 'safekeeper_proxy'\n",
|
||||
);
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
node
|
||||
}
|
||||
|
||||
pub fn new_node(&mut self, timelineid: ZTimelineId) -> Result<Arc<PostgresNode>> {
|
||||
let node = self.new_from_page_server(false, timelineid).unwrap();
|
||||
pub fn new_node(&mut self, branch_name: &str) -> Result<Arc<PostgresNode>> {
|
||||
let timeline_id = self.pageserver.branch_get_by_name(branch_name)?.timeline_id;
|
||||
|
||||
let node = self.new_from_page_server(false, timeline_id, branch_name)?;
|
||||
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
format!(
|
||||
"callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
"shared_preload_libraries = zenith\n\
|
||||
zenith.callmemaybe_connstring = '{}'\n", // FIXME escaping
|
||||
node.connstr()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
)?;
|
||||
|
||||
Ok(node)
|
||||
}
|
||||
@@ -151,7 +170,7 @@ pub struct PostgresNode {
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
timelineid: ZTimelineId,
|
||||
pub timelineid: ZTimelineId,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
@@ -169,6 +188,8 @@ impl PostgresNode {
|
||||
|
||||
lazy_static! {
|
||||
static ref CONF_PORT_RE: Regex = Regex::new(r"(?m)^\s*port\s*=\s*(\d+)\s*$").unwrap();
|
||||
static ref CONF_TIMELINE_RE: Regex =
|
||||
Regex::new(r"(?m)^\s*zenith.zenith_timeline\s*=\s*'(\w+)'\s*$").unwrap();
|
||||
}
|
||||
|
||||
// parse data directory name
|
||||
@@ -184,6 +205,7 @@ impl PostgresNode {
|
||||
)
|
||||
})?;
|
||||
|
||||
// parse port
|
||||
let err_msg = format!(
|
||||
"failed to find port definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
@@ -199,11 +221,21 @@ impl PostgresNode {
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// FIXME: What timeline is this server on? Would have to parse the postgresql.conf
|
||||
// file for that, too. It's currently not needed for anything, but it would be
|
||||
// nice to list the timeline in "zenith pg list"
|
||||
let timelineid_buf = [0u8; 16];
|
||||
let timelineid = ZTimelineId::from(timelineid_buf);
|
||||
// parse timeline
|
||||
let err_msg = format!(
|
||||
"failed to find timeline definition in config file {}",
|
||||
cfg_path.to_str().unwrap()
|
||||
);
|
||||
let timelineid: ZTimelineId = CONF_TIMELINE_RE
|
||||
.captures(config.as_str())
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 1"))?
|
||||
.iter()
|
||||
.last()
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 2"))?
|
||||
.ok_or_else(|| anyhow::Error::msg(err_msg.clone() + " 3"))?
|
||||
.as_str()
|
||||
.parse()
|
||||
.with_context(|| err_msg)?;
|
||||
|
||||
// ok now
|
||||
Ok(PostgresNode {
|
||||
@@ -277,22 +309,22 @@ impl PostgresNode {
|
||||
max_replication_slots = 10\n\
|
||||
hot_standby = on\n\
|
||||
shared_buffers = 1MB\n\
|
||||
fsync = off\n\
|
||||
fsync = off\n\
|
||||
max_connections = 100\n\
|
||||
wal_sender_timeout = 0\n\
|
||||
wal_sender_timeout = 0\n\
|
||||
wal_level = replica\n\
|
||||
listen_addresses = '{address}'\n\
|
||||
port = {port}\n",
|
||||
address = self.address.ip(),
|
||||
port = self.address.port()
|
||||
),
|
||||
);
|
||||
)?;
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeepr or
|
||||
// page server yet. But this will do for now.
|
||||
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n");
|
||||
self.append_conf("postgresql.conf", "wal_keep_size='10TB'\n")?;
|
||||
|
||||
// Connect it to the page server.
|
||||
|
||||
@@ -300,19 +332,20 @@ impl PostgresNode {
|
||||
self.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!(
|
||||
"page_server_connstring = 'host={} port={}'\n\
|
||||
zenith_timeline='{}'\n",
|
||||
"shared_preload_libraries = zenith \n\
|
||||
zenith.page_server_connstring = 'host={} port={}'\n\
|
||||
zenith.zenith_timeline='{}'\n",
|
||||
self.pageserver.address().ip(),
|
||||
self.pageserver.address().port(),
|
||||
self.timelineid
|
||||
),
|
||||
);
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pgdata(&self) -> PathBuf {
|
||||
self.env.repo_path.join("pgdatadirs").join(&self.name)
|
||||
pub fn pgdata(&self) -> PathBuf {
|
||||
self.env.pg_data_dir(&self.name)
|
||||
}
|
||||
|
||||
pub fn status(&self) -> &str {
|
||||
@@ -328,13 +361,12 @@ impl PostgresNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_conf(&self, config: &str, opts: &str) {
|
||||
pub fn append_conf(&self, config: &str, opts: &str) -> Result<()> {
|
||||
OpenOptions::new()
|
||||
.append(true)
|
||||
.open(self.pgdata().join(config).to_str().unwrap())
|
||||
.unwrap()
|
||||
.write_all(opts.as_bytes())
|
||||
.unwrap();
|
||||
.open(self.pgdata().join(config).to_str().unwrap())?
|
||||
.write_all(opts.as_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str]) -> Result<()> {
|
||||
@@ -373,8 +405,16 @@ impl PostgresNode {
|
||||
self.pg_ctl(&["restart"])
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"])
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"])?;
|
||||
if destroy {
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
self.pgdata().to_str().unwrap()
|
||||
);
|
||||
fs::remove_dir_all(&self.pgdata())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn connstr(&self) -> String {
|
||||
@@ -398,152 +438,6 @@ impl PostgresNode {
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
fn dump_log_file(&self) {
|
||||
if let Ok(mut file) = File::open(self.env.repo_path.join("pageserver.log")) {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- pageserver.log:\n{}", buffer);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn safe_psql(&self, db: &str, sql: &str) -> Vec<tokio_postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = Client::connect(connstring.as_str(), NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
let result = client.query(sql, &[]);
|
||||
if result.is_err() {
|
||||
self.dump_log_file();
|
||||
}
|
||||
result.unwrap()
|
||||
}
|
||||
|
||||
pub fn open_psql(&self, db: &str) -> Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls).unwrap()
|
||||
}
|
||||
|
||||
pub fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
|
||||
let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["--ztimelineid", &self.timelineid.to_string()])
|
||||
.args(&["-s", wal_acceptors])
|
||||
.args(&["-h", &self.address.ip().to_string()])
|
||||
.args(&["-p", &self.address.port().to_string()])
|
||||
.arg("-v")
|
||||
.stderr(
|
||||
OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(self.pgdata().join("safekeeper_proxy.log"))
|
||||
.unwrap(),
|
||||
)
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pg_regress(&self) -> ExitStatus {
|
||||
self.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
let data_dir = zenith_repo_dir();
|
||||
let regress_run_path = data_dir.join("regress");
|
||||
fs::create_dir_all(®ress_run_path).unwrap();
|
||||
fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGPORT", self.address.port().to_string())
|
||||
.env("PGUSER", self.whoami())
|
||||
.env("PGHOST", self.address.ip().to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
if !regress_check.success() {
|
||||
if let Ok(mut file) = File::open("regression.diffs") {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- regression.diffs:\n{}", buffer);
|
||||
}
|
||||
self.dump_log_file();
|
||||
if let Ok(mut file) = File::open(
|
||||
self.env
|
||||
.repo_path
|
||||
.join("pgdatadirs")
|
||||
.join("pg1")
|
||||
.join("log"),
|
||||
) {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- pgdatadirs/pg1/log:\n{}", buffer);
|
||||
}
|
||||
}
|
||||
regress_check
|
||||
}
|
||||
|
||||
pub fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus {
|
||||
let port = self.address.port().to_string();
|
||||
let clients = clients.to_string();
|
||||
let seconds = seconds.to_string();
|
||||
let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&["-i", "-p", port.as_str(), "postgres"])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench -i");
|
||||
let pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&[
|
||||
"-p",
|
||||
port.as_str(),
|
||||
"-T",
|
||||
seconds.as_str(),
|
||||
"-P",
|
||||
"1",
|
||||
"-c",
|
||||
clients.as_str(),
|
||||
"-M",
|
||||
"prepared",
|
||||
"postgres",
|
||||
])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench run");
|
||||
pg_bench_run
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresNode {
|
||||
@@ -552,7 +446,7 @@ impl Drop for PostgresNode {
|
||||
// and checking it here. But let just clean datadirs on start.
|
||||
fn drop(&mut self) {
|
||||
if self.is_test {
|
||||
let _ = self.stop();
|
||||
let _ = self.stop(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,26 @@
|
||||
// Intended to be used in integration tests and in CLI tools for
|
||||
// local installations.
|
||||
//
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
pub mod compute;
|
||||
pub mod local_env;
|
||||
pub mod storage;
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// We expect a file that contains a single integer.
|
||||
/// We return an i32 for compatibility with libc and nix.
|
||||
pub fn read_pidfile(pidfile: &Path) -> Result<i32> {
|
||||
let pid_str = fs::read_to_string(pidfile)
|
||||
.with_context(|| format!("failed to read pidfile {:?}", pidfile))?;
|
||||
let pid: i32 = pid_str
|
||||
.parse()
|
||||
.map_err(|_| anyhow!("failed to parse pidfile {:?}", pidfile))?;
|
||||
if pid < 1 {
|
||||
bail!("pidfile {:?} contained bad value '{}'", pidfile, pid);
|
||||
}
|
||||
Ok(pid)
|
||||
}
|
||||
|
||||
@@ -4,34 +4,25 @@
|
||||
// Now it also provides init method which acts like a stub for proper installation
|
||||
// script which will use local paths.
|
||||
//
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use rand::Rng;
|
||||
use std::env;
|
||||
use anyhow::{anyhow, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::path::PathBuf;
|
||||
use std::{collections::BTreeMap, env};
|
||||
use url::Url;
|
||||
|
||||
use anyhow::Result;
|
||||
use serde_derive::{Deserialize, Serialize};
|
||||
|
||||
use pageserver::zenith_repo_dir;
|
||||
use pageserver::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils;
|
||||
pub type Remotes = BTreeMap<String, String>;
|
||||
|
||||
//
|
||||
// This data structure represents deserialized zenith config, which should be
|
||||
// located in ~/.zenith
|
||||
//
|
||||
// TODO: should we also support ZENITH_CONF env var?
|
||||
// This data structures represent deserialized zenith CLI config
|
||||
//
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct LocalEnv {
|
||||
// Path to the Repository. Here page server and compute nodes will create and store their data.
|
||||
pub repo_path: PathBuf,
|
||||
// Pageserver connection strings
|
||||
pub pageserver_connstring: String,
|
||||
|
||||
// System identifier, from the PostgreSQL control file
|
||||
pub systemid: u64,
|
||||
// Base directory for both pageserver and compute nodes
|
||||
pub base_data_dir: PathBuf,
|
||||
|
||||
// Path to postgres distribution. It's expected that "bin", "include",
|
||||
// "lib", "share" from postgres distribution are there. If at some point
|
||||
@@ -39,191 +30,118 @@ pub struct LocalEnv {
|
||||
// to four separate paths and match OS-specific installation layout.
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Path to pageserver binary.
|
||||
pub zenith_distrib_dir: PathBuf,
|
||||
// Path to pageserver binary. Empty for remote pageserver.
|
||||
pub zenith_distrib_dir: Option<PathBuf>,
|
||||
|
||||
pub remotes: Remotes,
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
// postgres installation
|
||||
// postgres installation paths
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> Result<PathBuf> {
|
||||
Ok(self
|
||||
.zenith_distrib_dir
|
||||
.as_ref()
|
||||
.ok_or_else(|| anyhow!("Can not manage remote pageserver"))?
|
||||
.join("pageserver"))
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
self.base_data_dir.join("pgdatadirs")
|
||||
}
|
||||
|
||||
pub fn pg_data_dir(&self, name: &str) -> PathBuf {
|
||||
self.pg_data_dirs_path().join(name)
|
||||
}
|
||||
|
||||
// TODO: move pageserver files into ./pageserver
|
||||
pub fn pageserver_data_dir(&self) -> PathBuf {
|
||||
self.base_data_dir.clone()
|
||||
}
|
||||
}
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init() -> Result<()> {
|
||||
pub fn init(remote_pageserver: Option<&str>) -> Result<()> {
|
||||
// check if config already exists
|
||||
let repo_path = zenith_repo_dir();
|
||||
if repo_path.exists() {
|
||||
let base_path = base_path();
|
||||
if base_path.exists() {
|
||||
anyhow::bail!(
|
||||
"{} already exists. Perhaps already initialized?",
|
||||
repo_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// Now we can run init only from crate directory, so check that current dir is our crate.
|
||||
// Use 'pageserver/Cargo.toml' existence as evidendce.
|
||||
let cargo_path = env::current_dir()?;
|
||||
if !cargo_path.join("pageserver/Cargo.toml").exists() {
|
||||
anyhow::bail!(
|
||||
"Current directory does not look like a zenith repo. \
|
||||
Please, run 'init' from zenith repo root."
|
||||
base_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// ok, now check that expected binaries are present
|
||||
|
||||
// check postgres
|
||||
let pg_distrib_dir = cargo_path.join("tmp_install");
|
||||
let pg_path = pg_distrib_dir.join("bin/postgres");
|
||||
if !pg_path.exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find postres binary at {}. \
|
||||
Perhaps 'make postgres' is needed to build it first.",
|
||||
pg_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// check pageserver
|
||||
let zenith_distrib_dir = cargo_path.join("target/debug/");
|
||||
let pageserver_path = zenith_distrib_dir.join("pageserver");
|
||||
if !pageserver_path.exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find pageserver binary at {}. Please build it.",
|
||||
pageserver_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// ok, we are good to go
|
||||
let mut conf = LocalEnv {
|
||||
repo_path,
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir,
|
||||
systemid: 0,
|
||||
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
|
||||
let pg_distrib_dir: PathBuf = {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
postgres_bin.into()
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
cwd.join("tmp_install")
|
||||
}
|
||||
};
|
||||
init_repo(&mut conf)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_repo(local_env: &mut LocalEnv) -> Result<()> {
|
||||
let repopath = &local_env.repo_path;
|
||||
fs::create_dir(&repopath)
|
||||
.with_context(|| format!("could not create directory {}", repopath.display()))?;
|
||||
fs::create_dir(repopath.join("pgdatadirs"))?;
|
||||
fs::create_dir(repopath.join("timelines"))?;
|
||||
fs::create_dir(repopath.join("refs"))?;
|
||||
fs::create_dir(repopath.join("refs").join("branches"))?;
|
||||
fs::create_dir(repopath.join("refs").join("tags"))?;
|
||||
println!("created directory structure in {}", repopath.display());
|
||||
|
||||
// Create initial timeline
|
||||
let tli = create_timeline(&local_env, None)?;
|
||||
let timelinedir = repopath.join("timelines").join(tli.to_string());
|
||||
println!("created initial timeline {}", timelinedir.display());
|
||||
|
||||
// Run initdb
|
||||
//
|
||||
// We create the cluster temporarily in a "tmp" directory inside the repository,
|
||||
// and move it to the right location from there.
|
||||
let tmppath = repopath.join("tmp");
|
||||
|
||||
let initdb_path = local_env.pg_bin_dir().join("initdb");
|
||||
let initdb = Command::new(initdb_path)
|
||||
.args(&["-D", tmppath.to_str().unwrap()])
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", local_env.pg_lib_dir().to_str().unwrap())
|
||||
.env(
|
||||
"DYLD_LIBRARY_PATH",
|
||||
local_env.pg_lib_dir().to_str().unwrap(),
|
||||
)
|
||||
.stdout(Stdio::null())
|
||||
.status()
|
||||
.with_context(|| "failed to execute initdb")?;
|
||||
if !initdb.success() {
|
||||
anyhow::bail!("initdb failed");
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
println!("initdb succeeded");
|
||||
|
||||
// Read control file to extract the LSN and system id
|
||||
let controlfile_path = tmppath.join("global").join("pg_control");
|
||||
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
|
||||
let systemid = controlfile.system_identifier;
|
||||
let lsn = controlfile.checkPoint;
|
||||
let lsnstr = format!("{:016X}", lsn);
|
||||
fs::create_dir(&base_path)?;
|
||||
fs::create_dir(base_path.join("pgdatadirs"))?;
|
||||
|
||||
// Move the initial WAL file
|
||||
fs::rename(
|
||||
tmppath.join("pg_wal").join("000000010000000000000001"),
|
||||
timelinedir
|
||||
.join("wal")
|
||||
.join("000000010000000000000001.partial"),
|
||||
)?;
|
||||
println!("moved initial WAL file");
|
||||
let conf = if let Some(addr) = remote_pageserver {
|
||||
// check that addr is parsable
|
||||
let _uri = Url::parse(addr).map_err(|e| anyhow!("{}: {}", addr, e))?;
|
||||
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all(tmppath.join("pg_wal"))?;
|
||||
LocalEnv {
|
||||
pageserver_connstring: format!("postgresql://{}/", addr),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir: None,
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
}
|
||||
} else {
|
||||
// Find zenith binaries.
|
||||
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
if !zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.",);
|
||||
}
|
||||
|
||||
force_crash_recovery(&tmppath)?;
|
||||
println!("updated pg_control");
|
||||
LocalEnv {
|
||||
pageserver_connstring: "postgresql://127.0.0.1:6400".to_string(),
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir: Some(zenith_distrib_dir),
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
}
|
||||
};
|
||||
|
||||
let target = timelinedir.join("snapshots").join(&lsnstr);
|
||||
fs::rename(tmppath, &target)?;
|
||||
println!("moved 'tmp' to {}", target.display());
|
||||
|
||||
// Create 'main' branch to refer to the initial timeline
|
||||
let data = tli.to_string();
|
||||
fs::write(repopath.join("refs").join("branches").join("main"), data)?;
|
||||
println!("created main branch");
|
||||
|
||||
// Also update the system id in the LocalEnv
|
||||
local_env.systemid = systemid;
|
||||
|
||||
// write config
|
||||
let toml = toml::to_string(&local_env)?;
|
||||
fs::write(repopath.join("config"), toml)?;
|
||||
|
||||
println!(
|
||||
"new zenith repository was created in {}",
|
||||
repopath.display()
|
||||
);
|
||||
let toml = toml::to_string_pretty(&conf)?;
|
||||
fs::write(conf.base_data_dir.join("config"), toml)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// If control file says the cluster was shut down cleanly, modify it, to mark
|
||||
// it as crashed. That forces crash recovery when you start the cluster.
|
||||
//
|
||||
// FIXME:
|
||||
// We currently do this to the initial snapshot in "zenith init". It would
|
||||
// be more natural to do this when the snapshot is restored instead, but we
|
||||
// currently don't have any code to create new snapshots, so it doesn't matter
|
||||
// Or better yet, use a less hacky way of putting the cluster into recovery.
|
||||
// Perhaps create a backup label file in the data directory when it's restored.
|
||||
fn force_crash_recovery(datadir: &Path) -> Result<()> {
|
||||
// Read in the control file
|
||||
let controlfilepath = datadir.to_path_buf().join("global").join("pg_control");
|
||||
let mut controlfile =
|
||||
postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?;
|
||||
// Locate and load config
|
||||
pub fn load_config() -> Result<LocalEnv> {
|
||||
let repopath = base_path();
|
||||
|
||||
controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION;
|
||||
|
||||
fs::write(
|
||||
controlfilepath.as_path(),
|
||||
postgres_ffi::encode_pg_control(controlfile),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// check that config file is present
|
||||
pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
@@ -231,159 +149,18 @@ pub fn load_config(repopath: &Path) -> Result<LocalEnv> {
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: check that it looks like a zenith repository
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
toml::from_str(config.as_str()).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
// local env for tests
|
||||
pub fn test_env(testname: &str) -> LocalEnv {
|
||||
fs::create_dir_all("../tmp_check").expect("could not create directory ../tmp_check");
|
||||
|
||||
let repo_path = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_check/")
|
||||
.join(testname);
|
||||
|
||||
// Remove remnants of old test repo
|
||||
let _ = fs::remove_dir_all(&repo_path);
|
||||
|
||||
let mut local_env = LocalEnv {
|
||||
repo_path,
|
||||
pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
|
||||
zenith_distrib_dir: cargo_bin_dir(),
|
||||
systemid: 0,
|
||||
};
|
||||
init_repo(&mut local_env).expect("could not initialize zenith repository");
|
||||
local_env
|
||||
}
|
||||
|
||||
// Find the directory where the binaries were put (i.e. target/debug/)
|
||||
pub fn cargo_bin_dir() -> PathBuf {
|
||||
let mut pathbuf = std::env::current_exe().unwrap();
|
||||
|
||||
pathbuf.pop();
|
||||
if pathbuf.ends_with("deps") {
|
||||
pathbuf.pop();
|
||||
}
|
||||
|
||||
pathbuf
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PointInTime {
|
||||
pub timelineid: ZTimelineId,
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
fn create_timeline(local_env: &LocalEnv, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
// Create initial timeline
|
||||
let mut tli_buf = [0u8; 16];
|
||||
rand::thread_rng().fill(&mut tli_buf);
|
||||
let timelineid = ZTimelineId::from(tli_buf);
|
||||
|
||||
let timelinedir = repopath.join("timelines").join(timelineid.to_string());
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("snapshots"))?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!(
|
||||
"{}@{:X}/{:X}",
|
||||
ancestor.timelineid,
|
||||
ancestor.lsn >> 32,
|
||||
ancestor.lsn & 0xffffffff
|
||||
);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
// Parse an LSN in the format used in filenames
|
||||
//
|
||||
// For example: 00000000015D3DD8
|
||||
//
|
||||
fn parse_lsn(s: &str) -> std::result::Result<u64, std::num::ParseIntError> {
|
||||
u64::from_str_radix(s, 16)
|
||||
}
|
||||
|
||||
// Create a new branch in the repository (for the "zenith branch" subcommand)
|
||||
pub fn create_branch(
|
||||
local_env: &LocalEnv,
|
||||
branchname: &str,
|
||||
startpoint: PointInTime,
|
||||
) -> Result<()> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
// create a new timeline for it
|
||||
let newtli = create_timeline(local_env, Some(startpoint))?;
|
||||
let newtimelinedir = repopath.join("timelines").join(newtli.to_string());
|
||||
|
||||
let data = newtli.to_string();
|
||||
fs::write(
|
||||
repopath.join("refs").join("branches").join(branchname),
|
||||
data,
|
||||
)?;
|
||||
|
||||
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
|
||||
// TODO: be smarter and avoid the copying...
|
||||
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(local_env, startpoint.timelineid)?;
|
||||
let copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), ©_opts)?;
|
||||
|
||||
let oldtimelinedir = repopath
|
||||
.join("timelines")
|
||||
.join(startpoint.timelineid.to_string());
|
||||
let mut copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
copy_opts.content_only = true;
|
||||
fs_extra::dir::copy(
|
||||
oldtimelinedir.join("wal"),
|
||||
newtimelinedir.join("wal"),
|
||||
©_opts,
|
||||
)?;
|
||||
// Save config. We use that to change set of remotes from CLI itself.
|
||||
pub fn save_config(conf: &LocalEnv) -> Result<()> {
|
||||
let config_path = base_path().join("config");
|
||||
let conf_str = toml::to_string_pretty(conf)?;
|
||||
|
||||
fs::write(config_path, conf_str)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Find the end of valid WAL in a wal directory
|
||||
pub fn find_end_of_wal(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<u64> {
|
||||
let repopath = &local_env.repo_path;
|
||||
let waldir = repopath
|
||||
.join("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("wal");
|
||||
|
||||
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, 16 * 1024 * 1024, true);
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
// Find the latest snapshot for a timeline
|
||||
fn find_latest_snapshot(local_env: &LocalEnv, timeline: ZTimelineId) -> Result<(u64, PathBuf)> {
|
||||
let repopath = &local_env.repo_path;
|
||||
|
||||
let snapshotsdir = repopath
|
||||
.join("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots");
|
||||
let paths = fs::read_dir(&snapshotsdir)?;
|
||||
let mut maxsnapshot: u64 = 0;
|
||||
let mut snapshotdir: Option<PathBuf> = None;
|
||||
for path in paths {
|
||||
let path = path?;
|
||||
let filename = path.file_name().to_str().unwrap().to_owned();
|
||||
if let Ok(lsn) = parse_lsn(&filename) {
|
||||
maxsnapshot = std::cmp::max(lsn, maxsnapshot);
|
||||
snapshotdir = Some(path.path());
|
||||
}
|
||||
}
|
||||
if maxsnapshot == 0 {
|
||||
// TODO: check ancestor timeline
|
||||
anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
|
||||
}
|
||||
|
||||
Ok((maxsnapshot, snapshotdir.unwrap()))
|
||||
}
|
||||
|
||||
@@ -1,122 +1,18 @@
|
||||
use anyhow::Result;
|
||||
use std::fs;
|
||||
use std::io;
|
||||
use std::net::SocketAddr;
|
||||
use std::net::TcpStream;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::collections::HashMap;
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
//
|
||||
// Collection of several example deployments useful for tests.
|
||||
//
|
||||
// I'm intendedly modelling storage and compute control planes as a separate entities
|
||||
// as it is closer to the actual setup.
|
||||
//
|
||||
pub struct TestStorageControlPlane {
|
||||
pub wal_acceptors: Vec<WalAcceptorNode>,
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
pub test_done: AtomicBool,
|
||||
pub repopath: PathBuf,
|
||||
}
|
||||
|
||||
impl TestStorageControlPlane {
|
||||
// Peek into the repository, to grab the timeline ID of given branch
|
||||
pub fn get_branch_timeline(&self, branchname: &str) -> ZTimelineId {
|
||||
let branchpath = self.repopath.join("refs/branches/".to_owned() + branchname);
|
||||
|
||||
ZTimelineId::from_str(&(fs::read_to_string(&branchpath).unwrap())).unwrap()
|
||||
}
|
||||
|
||||
// postgres <-> page_server
|
||||
//
|
||||
// Initialize a new repository and configure a page server to run in it
|
||||
//
|
||||
pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let pserver = Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
});
|
||||
pserver.start().unwrap();
|
||||
|
||||
TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath,
|
||||
}
|
||||
}
|
||||
|
||||
// postgres <-> {wal_acceptor1, wal_acceptor2, ...}
|
||||
pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
|
||||
let repopath = local_env.repo_path.clone();
|
||||
|
||||
let mut cplane = TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
}),
|
||||
test_done: AtomicBool::new(false),
|
||||
repopath,
|
||||
};
|
||||
cplane.pageserver.start().unwrap();
|
||||
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: local_env.repo_path.join(format!("wal_acceptor_{}", i)),
|
||||
env: local_env.clone(),
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
let _ = wa.stop();
|
||||
}
|
||||
self.test_done.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn is_running(&self) -> bool {
|
||||
self.test_done.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TestStorageControlPlane {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
use crate::read_pidfile;
|
||||
use pageserver::branches::BranchInfo;
|
||||
|
||||
//
|
||||
// Control routines for pageserver.
|
||||
@@ -124,8 +20,8 @@ impl Drop for TestStorageControlPlane {
|
||||
// Used in CLI and tests.
|
||||
//
|
||||
pub struct PageServerNode {
|
||||
kill_on_exit: bool,
|
||||
listen_address: Option<SocketAddr>,
|
||||
pub kill_on_exit: bool,
|
||||
pub listen_address: Option<SocketAddr>,
|
||||
pub env: LocalEnv,
|
||||
}
|
||||
|
||||
@@ -145,12 +41,36 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn init(&self) -> Result<()> {
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let status = cmd
|
||||
.args(&["--init", "-D", self.env.base_data_dir.to_str().unwrap()])
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.env(
|
||||
"POSTGRES_DISTRIB_DIR",
|
||||
self.env.pg_distrib_dir.to_str().unwrap(),
|
||||
)
|
||||
.env("ZENITH_REPO_DIR", self.repo_path())
|
||||
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pageserver init failed");
|
||||
|
||||
if status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!("pageserver init failed"))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
self.env.repo_path.clone()
|
||||
self.env.pageserver_data_dir()
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
self.env.repo_path.join("pageserver.pid")
|
||||
self.repo_path().join("pageserver.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Result<()> {
|
||||
@@ -160,18 +80,27 @@ impl PageServerNode {
|
||||
self.repo_path().display()
|
||||
);
|
||||
|
||||
let mut cmd = Command::new(self.env.zenith_distrib_dir.join("pageserver"));
|
||||
cmd.args(&["-l", self.address().to_string().as_str()])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.env("ZENITH_REPO_DIR", self.repo_path())
|
||||
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(&[
|
||||
"-l",
|
||||
self.address().to_string().as_str(),
|
||||
"-D",
|
||||
self.repo_path().to_str().unwrap(),
|
||||
])
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.env(
|
||||
"POSTGRES_DISTRIB_DIR",
|
||||
self.env.pg_distrib_dir.to_str().unwrap(),
|
||||
)
|
||||
.env("ZENITH_REPO_DIR", self.repo_path())
|
||||
.env("PATH", self.env.pg_bin_dir().to_str().unwrap()) // needs postres-wal-redo binary
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
anyhow::bail!(
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
);
|
||||
@@ -184,43 +113,35 @@ impl PageServerNode {
|
||||
if client.is_ok() {
|
||||
break;
|
||||
} else {
|
||||
println!("page server not responding yet, retrying ({})...", retries);
|
||||
println!("Pageserver not responding yet, retrying ({})...", retries);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
|
||||
println!("Pageserver started");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
let pidfile = self.pid_file();
|
||||
let pid = read_pidfile(&pidfile)?;
|
||||
|
||||
let status = Command::new("kill")
|
||||
.arg(&pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to kill pageserver with pid {}", pid);
|
||||
let pid = read_pidfile(&self.pid_file())?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
|
||||
// await for pageserver stop
|
||||
// wait for pageserver stop
|
||||
for _ in 0..5 {
|
||||
let stream = TcpStream::connect(self.address());
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
if let Err(_e) = stream {
|
||||
println!("Pageserver stopped");
|
||||
return Ok(());
|
||||
}
|
||||
println!("Stopping pageserver on {}", self.address());
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
// ok, we failed to stop pageserver, let's panic
|
||||
if !status.success() {
|
||||
anyhow::bail!("Failed to stop pageserver with pid {}", pid);
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
}
|
||||
|
||||
pub fn page_server_psql(&self, sql: &str) -> Vec<postgres::SimpleQueryMessage> {
|
||||
@@ -237,9 +158,7 @@ impl PageServerNode {
|
||||
client.simple_query(sql).unwrap()
|
||||
}
|
||||
|
||||
pub fn page_server_psql_client(
|
||||
&self,
|
||||
) -> std::result::Result<postgres::Client, postgres::Error> {
|
||||
pub fn page_server_psql_client(&self) -> Result<postgres::Client, postgres::Error> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address().ip(),
|
||||
@@ -249,6 +168,79 @@ impl PageServerNode {
|
||||
);
|
||||
Client::connect(connstring.as_str(), NoTls)
|
||||
}
|
||||
|
||||
pub fn branches_list(&self) -> Result<Vec<BranchInfo>> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let query_result = client.simple_query("branch_list")?;
|
||||
let branches_json = query_result
|
||||
.first()
|
||||
.map(|msg| match msg {
|
||||
postgres::SimpleQueryMessage::Row(row) => row.get(0),
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.ok_or_else(|| anyhow!("missing branches"))?;
|
||||
|
||||
let res: Vec<BranchInfo> = serde_json::from_str(branches_json)?;
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn branch_create(&self, name: &str, startpoint: &str) -> Result<BranchInfo> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let query_result =
|
||||
client.simple_query(format!("branch_create {} {}", name, startpoint).as_str())?;
|
||||
|
||||
let branch_json = query_result
|
||||
.first()
|
||||
.map(|msg| match msg {
|
||||
postgres::SimpleQueryMessage::Row(row) => row.get(0),
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.ok_or_else(|| anyhow!("missing branch"))?;
|
||||
|
||||
let res: BranchInfo = serde_json::from_str(branch_json).map_err(|e| {
|
||||
anyhow!(
|
||||
"failed to parse branch_create response: {}: {}",
|
||||
branch_json,
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
// TODO: make this a separate request type and avoid loading all the branches
|
||||
pub fn branch_get_by_name(&self, name: &str) -> Result<BranchInfo> {
|
||||
let branch_infos = self.branches_list()?;
|
||||
let branche_by_name: Result<HashMap<String, BranchInfo>> = branch_infos
|
||||
.into_iter()
|
||||
.map(|branch_info| Ok((branch_info.name.clone(), branch_info)))
|
||||
.collect();
|
||||
let branche_by_name = branche_by_name?;
|
||||
|
||||
let branch = branche_by_name
|
||||
.get(name)
|
||||
.ok_or_else(|| anyhow!("Branch {} not found", name))?;
|
||||
|
||||
Ok(branch.clone())
|
||||
}
|
||||
|
||||
pub fn system_id_get(&self) -> Result<u64> {
|
||||
let mut client = self.page_server_psql_client()?;
|
||||
let query_result = client
|
||||
.simple_query("identify_system")?
|
||||
.first()
|
||||
.map(|msg| match msg {
|
||||
postgres::SimpleQueryMessage::Row(row) => row.get(0),
|
||||
_ => None,
|
||||
})
|
||||
.flatten()
|
||||
.ok_or_else(|| anyhow!("failed to get system_id"))?
|
||||
.parse::<u64>()?;
|
||||
|
||||
Ok(query_result)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
@@ -258,106 +250,3 @@ impl Drop for PageServerNode {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Control routines for WalAcceptor.
|
||||
//
|
||||
// Now used only in test setups.
|
||||
//
|
||||
pub struct WalAcceptorNode {
|
||||
listen: SocketAddr,
|
||||
data_dir: PathBuf,
|
||||
env: LocalEnv,
|
||||
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let status = Command::new(self.env.zenith_distrib_dir.join("wal_acceptor"))
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.args(&["--systemid", &self.env.systemid.to_string()])
|
||||
// Tell page server it can receive WAL from this WAL safekeeper
|
||||
// FIXME: If there are multiple safekeepers, they will all inform
|
||||
// the page server. Only the last "notification" will stay in effect.
|
||||
// So it's pretty random which safekeeper the page server will connect to
|
||||
.args(&["--pageserver", "127.0.0.1:64000"])
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> std::result::Result<(), io::Error> {
|
||||
println!("Stopping wal acceptor on {}", self.listen);
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
let pid = read_pidfile(&pidfile)?;
|
||||
// Ignores any failures when running this command
|
||||
let _status = Command::new("kill")
|
||||
.arg(pid)
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct WalProposerNode {
|
||||
pub pid: u32,
|
||||
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
let status = Command::new("kill")
|
||||
.arg(self.pid.to_string())
|
||||
.env_clear()
|
||||
.status()
|
||||
.expect("failed to execute kill");
|
||||
|
||||
if !status.success() {
|
||||
panic!("kill start failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
/// Read a PID file
|
||||
///
|
||||
/// This should contain an unsigned integer, but we return it as a String
|
||||
/// because our callers only want to pass it back into a subcommand.
|
||||
fn read_pidfile(pidfile: &Path) -> std::result::Result<String, io::Error> {
|
||||
fs::read_to_string(pidfile).map_err(|err| {
|
||||
eprintln!("failed to read pidfile {:?}: {:?}", pidfile, err);
|
||||
err
|
||||
})
|
||||
}
|
||||
|
||||
@@ -9,8 +9,9 @@ edition = "2018"
|
||||
[dependencies]
|
||||
lazy_static = "1.4.0"
|
||||
rand = "0.8.3"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
anyhow = "1.0"
|
||||
nix = "0.20"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
|
||||
414
integration_tests/src/lib.rs
Normal file
414
integration_tests/src/lib.rs
Normal file
@@ -0,0 +1,414 @@
|
||||
use anyhow::{bail, Result};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use std::collections::BTreeMap;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::Read;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, ExitStatus};
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use control_plane::compute::PostgresNode;
|
||||
use control_plane::read_pidfile;
|
||||
use control_plane::{local_env::LocalEnv, storage::PageServerNode};
|
||||
|
||||
// Find the directory where the binaries were put (i.e. target/debug/)
|
||||
fn cargo_bin_dir() -> PathBuf {
|
||||
let mut pathbuf = std::env::current_exe().unwrap();
|
||||
|
||||
pathbuf.pop();
|
||||
if pathbuf.ends_with("deps") {
|
||||
pathbuf.pop();
|
||||
}
|
||||
|
||||
pathbuf
|
||||
}
|
||||
|
||||
// local compute env for tests
|
||||
pub fn create_test_env(testname: &str) -> LocalEnv {
|
||||
let base_path = Path::new(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("../tmp_check/")
|
||||
.join(testname);
|
||||
|
||||
let base_path_str = base_path.to_str().unwrap();
|
||||
|
||||
// Remove remnants of old test repo
|
||||
let _ = fs::remove_dir_all(&base_path);
|
||||
|
||||
fs::create_dir_all(&base_path)
|
||||
.unwrap_or_else(|_| panic!("could not create directory for {}", base_path_str));
|
||||
|
||||
let pgdatadirs_path = base_path.join("pgdatadirs");
|
||||
fs::create_dir(&pgdatadirs_path)
|
||||
.unwrap_or_else(|_| panic!("could not create directory {:?}", pgdatadirs_path));
|
||||
|
||||
LocalEnv {
|
||||
pageserver_connstring: "postgresql://127.0.0.1:64000".to_string(),
|
||||
pg_distrib_dir: Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install"),
|
||||
zenith_distrib_dir: Some(cargo_bin_dir()),
|
||||
base_data_dir: base_path,
|
||||
remotes: BTreeMap::default(),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Collection of several example deployments useful for tests.
|
||||
//
|
||||
// I'm intendedly modelling storage and compute control planes as a separate entities
|
||||
// as it is closer to the actual setup.
|
||||
//
|
||||
pub struct TestStorageControlPlane {
|
||||
pub wal_acceptors: Vec<WalAcceptorNode>,
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
pub test_done: AtomicBool,
|
||||
}
|
||||
|
||||
impl TestStorageControlPlane {
|
||||
// postgres <-> page_server
|
||||
//
|
||||
// Initialize a new repository and configure a page server to run in it
|
||||
//
|
||||
pub fn one_page_server(local_env: &LocalEnv) -> TestStorageControlPlane {
|
||||
let pserver = Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
});
|
||||
pserver.init().unwrap();
|
||||
pserver.start().unwrap();
|
||||
|
||||
TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: pserver,
|
||||
test_done: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
// postgres <-> {wal_acceptor1, wal_acceptor2, ...}
|
||||
pub fn fault_tolerant(local_env: &LocalEnv, redundancy: usize) -> TestStorageControlPlane {
|
||||
let mut cplane = TestStorageControlPlane {
|
||||
wal_acceptors: Vec::new(),
|
||||
pageserver: Arc::new(PageServerNode {
|
||||
env: local_env.clone(),
|
||||
kill_on_exit: true,
|
||||
listen_address: None,
|
||||
}),
|
||||
test_done: AtomicBool::new(false),
|
||||
// repopath,
|
||||
};
|
||||
cplane.pageserver.init().unwrap();
|
||||
cplane.pageserver.start().unwrap();
|
||||
|
||||
let systemid = cplane.pageserver.system_id_get().unwrap();
|
||||
|
||||
const WAL_ACCEPTOR_PORT: usize = 54321;
|
||||
|
||||
let datadir_base = local_env.base_data_dir.join("safekeepers");
|
||||
fs::create_dir_all(&datadir_base).unwrap();
|
||||
|
||||
for i in 0..redundancy {
|
||||
let wal_acceptor = WalAcceptorNode {
|
||||
listen: format!("127.0.0.1:{}", WAL_ACCEPTOR_PORT + i)
|
||||
.parse()
|
||||
.unwrap(),
|
||||
data_dir: datadir_base.join(format!("wal_acceptor_{}", i)),
|
||||
systemid,
|
||||
env: local_env.clone(),
|
||||
pass_to_pageserver: true,
|
||||
};
|
||||
wal_acceptor.init();
|
||||
wal_acceptor.start();
|
||||
cplane.wal_acceptors.push(wal_acceptor);
|
||||
}
|
||||
cplane
|
||||
}
|
||||
|
||||
pub fn stop(&self) {
|
||||
for wa in self.wal_acceptors.iter() {
|
||||
let _ = wa.stop();
|
||||
}
|
||||
self.test_done.store(true, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn get_wal_acceptor_conn_info(&self) -> String {
|
||||
self.wal_acceptors
|
||||
.iter()
|
||||
.map(|wa| wa.listen.to_string())
|
||||
.collect::<Vec<String>>()
|
||||
.join(",")
|
||||
}
|
||||
|
||||
pub fn is_running(&self) -> bool {
|
||||
self.test_done.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for TestStorageControlPlane {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// PostgresNodeExt
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
///
|
||||
/// Testing utilities for PostgresNode type
|
||||
///
|
||||
pub trait PostgresNodeExt {
|
||||
fn pg_regress(&self) -> ExitStatus;
|
||||
fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus;
|
||||
fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode;
|
||||
fn open_psql(&self, db: &str) -> postgres::Client;
|
||||
fn dump_log_file(&self);
|
||||
fn safe_psql(&self, db: &str, sql: &str) -> Vec<postgres::Row>;
|
||||
}
|
||||
|
||||
impl PostgresNodeExt for PostgresNode {
|
||||
fn pg_regress(&self) -> ExitStatus {
|
||||
self.safe_psql("postgres", "CREATE DATABASE regression");
|
||||
|
||||
let regress_run_path = self.env.base_data_dir.join("regress");
|
||||
fs::create_dir_all(®ress_run_path).unwrap();
|
||||
fs::create_dir_all(regress_run_path.join("testtablespace")).unwrap();
|
||||
std::env::set_current_dir(regress_run_path).unwrap();
|
||||
|
||||
let regress_build_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../tmp_install/build/src/test/regress");
|
||||
let regress_src_path =
|
||||
Path::new(env!("CARGO_MANIFEST_DIR")).join("../vendor/postgres/src/test/regress");
|
||||
|
||||
let regress_check = Command::new(regress_build_path.join("pg_regress"))
|
||||
.args(&[
|
||||
"--bindir=''",
|
||||
"--use-existing",
|
||||
format!("--bindir={}", self.env.pg_bin_dir().to_str().unwrap()).as_str(),
|
||||
format!("--dlpath={}", regress_build_path.to_str().unwrap()).as_str(),
|
||||
format!(
|
||||
"--schedule={}",
|
||||
regress_src_path.join("parallel_schedule").to_str().unwrap()
|
||||
)
|
||||
.as_str(),
|
||||
format!("--inputdir={}", regress_src_path.to_str().unwrap()).as_str(),
|
||||
])
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("PGPORT", self.address.port().to_string())
|
||||
.env("PGUSER", self.whoami())
|
||||
.env("PGHOST", self.address.ip().to_string())
|
||||
.status()
|
||||
.expect("pg_regress failed");
|
||||
if !regress_check.success() {
|
||||
if let Ok(mut file) = File::open("regression.diffs") {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- regression.diffs:\n{}", buffer);
|
||||
}
|
||||
self.dump_log_file();
|
||||
}
|
||||
regress_check
|
||||
}
|
||||
|
||||
fn pg_bench(&self, clients: u32, seconds: u32) -> ExitStatus {
|
||||
let port = self.address.port().to_string();
|
||||
let clients = clients.to_string();
|
||||
let seconds = seconds.to_string();
|
||||
let _pg_bench_init = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&["-i", "-p", port.as_str(), "postgres"])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench -i");
|
||||
let pg_bench_run = Command::new(self.env.pg_bin_dir().join("pgbench"))
|
||||
.args(&[
|
||||
"-p",
|
||||
port.as_str(),
|
||||
"-T",
|
||||
seconds.as_str(),
|
||||
"-P",
|
||||
"1",
|
||||
"-c",
|
||||
clients.as_str(),
|
||||
"-M",
|
||||
"prepared",
|
||||
"postgres",
|
||||
])
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.status()
|
||||
.expect("pgbench run");
|
||||
pg_bench_run
|
||||
}
|
||||
|
||||
fn start_proxy(&self, wal_acceptors: &str) -> WalProposerNode {
|
||||
let proxy_path = self.env.pg_bin_dir().join("safekeeper_proxy");
|
||||
match Command::new(proxy_path.as_path())
|
||||
.args(&["--ztimelineid", &self.timelineid.to_string()])
|
||||
.args(&["-s", wal_acceptors])
|
||||
.args(&["-h", &self.address.ip().to_string()])
|
||||
.args(&["-p", &self.address.port().to_string()])
|
||||
.arg("-v")
|
||||
.stderr(
|
||||
OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(self.pgdata().join("safekeeper_proxy.log"))
|
||||
.unwrap(),
|
||||
)
|
||||
.spawn()
|
||||
{
|
||||
Ok(child) => WalProposerNode { pid: child.id() },
|
||||
Err(e) => panic!("Failed to launch {:?}: {}", proxy_path, e),
|
||||
}
|
||||
}
|
||||
|
||||
fn dump_log_file(&self) {
|
||||
if let Ok(mut file) = File::open(self.env.pageserver_data_dir().join("pageserver.log")) {
|
||||
let mut buffer = String::new();
|
||||
file.read_to_string(&mut buffer).unwrap();
|
||||
println!("--------------- pageserver.log:\n{}", buffer);
|
||||
}
|
||||
}
|
||||
|
||||
fn safe_psql(&self, db: &str, sql: &str) -> Vec<postgres::Row> {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
let mut client = postgres::Client::connect(connstring.as_str(), postgres::NoTls).unwrap();
|
||||
|
||||
println!("Running {}", sql);
|
||||
let result = client.query(sql, &[]);
|
||||
if result.is_err() {
|
||||
self.dump_log_file();
|
||||
}
|
||||
result.unwrap()
|
||||
}
|
||||
|
||||
fn open_psql(&self, db: &str) -> postgres::Client {
|
||||
let connstring = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
self.address.ip(),
|
||||
self.address.port(),
|
||||
db,
|
||||
self.whoami()
|
||||
);
|
||||
postgres::Client::connect(connstring.as_str(), postgres::NoTls).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// WalAcceptorNode
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Control routines for WalAcceptor.
|
||||
//
|
||||
// Now used only in test setups.
|
||||
//
|
||||
pub struct WalAcceptorNode {
|
||||
listen: SocketAddr,
|
||||
data_dir: PathBuf,
|
||||
systemid: u64,
|
||||
env: LocalEnv,
|
||||
pass_to_pageserver: bool,
|
||||
}
|
||||
|
||||
impl WalAcceptorNode {
|
||||
pub fn init(&self) {
|
||||
if self.data_dir.exists() {
|
||||
fs::remove_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
fs::create_dir_all(self.data_dir.clone()).unwrap();
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
println!(
|
||||
"Starting wal_acceptor in {} listening '{}'",
|
||||
self.data_dir.to_str().unwrap(),
|
||||
self.listen
|
||||
);
|
||||
|
||||
let ps_arg = if self.pass_to_pageserver {
|
||||
// Tell page server it can receive WAL from this WAL safekeeper
|
||||
["--pageserver", "127.0.0.1:64000"].to_vec()
|
||||
} else {
|
||||
[].to_vec()
|
||||
};
|
||||
|
||||
let status = Command::new(
|
||||
self.env
|
||||
.zenith_distrib_dir
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.join("wal_acceptor"),
|
||||
)
|
||||
.args(&["-D", self.data_dir.to_str().unwrap()])
|
||||
.args(&["-l", self.listen.to_string().as_str()])
|
||||
.args(&["--systemid", self.systemid.to_string().as_str()])
|
||||
.args(&ps_arg)
|
||||
.arg("-d")
|
||||
.arg("-n")
|
||||
.status()
|
||||
.expect("failed to start wal_acceptor");
|
||||
|
||||
if !status.success() {
|
||||
panic!("wal_acceptor start failed");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self) -> Result<()> {
|
||||
println!("Stopping wal acceptor on {}", self.listen);
|
||||
let pidfile = self.data_dir.join("wal_acceptor.pid");
|
||||
let pid = read_pidfile(&pidfile)?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to kill wal_acceptor with pid {}", pid);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalAcceptorNode {
|
||||
fn drop(&mut self) {
|
||||
// Ignore errors.
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// WalProposerNode
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
pub struct WalProposerNode {
|
||||
pub pid: u32,
|
||||
}
|
||||
|
||||
impl WalProposerNode {
|
||||
pub fn stop(&self) {
|
||||
// std::process::Child::id() returns u32, we need i32.
|
||||
let pid: i32 = self.pid.try_into().unwrap();
|
||||
let pid = Pid::from_raw(pid);
|
||||
kill(pid, Signal::SIGTERM).expect("failed to execute kill");
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalProposerNode {
|
||||
fn drop(&mut self) {
|
||||
self.stop();
|
||||
}
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
// test node resettlement to an empty datadir
|
||||
|
||||
// TODO
|
||||
/*
|
||||
#[test]
|
||||
fn test_resettlement() {}
|
||||
|
||||
// test seq scan of everythin after restart
|
||||
#[test]
|
||||
fn test_cold_seqscan() {}
|
||||
*/
|
||||
@@ -1,8 +0,0 @@
|
||||
// TODO
|
||||
/*
|
||||
#[test]
|
||||
fn test_actions() {}
|
||||
|
||||
#[test]
|
||||
fn test_regress() {}
|
||||
*/
|
||||
@@ -1,150 +0,0 @@
|
||||
// mod control_plane;
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::PointInTime;
|
||||
use control_plane::storage::TestStorageControlPlane;
|
||||
|
||||
// XXX: force all redo at the end
|
||||
// -- restart + seqscan won't read deleted stuff
|
||||
// -- pageserver api endpoint to check all rels
|
||||
#[test]
|
||||
fn test_redo_cases() {
|
||||
let local_env = local_env::test_env("test_redo_cases");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
|
||||
// check 'create table as'
|
||||
node.safe_psql("postgres", "CREATE TABLE t2 AS SELECT * FROM t");
|
||||
let count: i64 = node
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
}
|
||||
|
||||
// Runs pg_regress on a compute node
|
||||
#[test]
|
||||
fn test_regress() {
|
||||
let local_env = local_env::test_env("test_regress");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
|
||||
let status = node.pg_regress();
|
||||
assert!(status.success());
|
||||
}
|
||||
|
||||
// Runs pg_bench on a compute node
|
||||
#[test]
|
||||
fn pgbench() {
|
||||
let local_env = local_env::test_env("pgbench");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_node(maintli);
|
||||
node.start().unwrap();
|
||||
|
||||
let status = node.pg_bench(10, 100);
|
||||
assert!(status.success());
|
||||
}
|
||||
|
||||
// Run two postgres instances on one pageserver, on different timelines
|
||||
#[test]
|
||||
fn test_pageserver_two_timelines() {
|
||||
let local_env = local_env::test_env("test_pageserver_two_timelines");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
let storage_cplane = TestStorageControlPlane::one_page_server(&local_env);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
|
||||
// Create new branch at the end of 'main'
|
||||
let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
|
||||
local_env::create_branch(
|
||||
&local_env,
|
||||
"experimental",
|
||||
PointInTime {
|
||||
timelineid: maintli,
|
||||
lsn: startpoint,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
let experimentaltli = storage_cplane.get_branch_timeline("experimental");
|
||||
|
||||
// Launch postgres instances on both branches
|
||||
let node1 = compute_cplane.new_test_node(maintli);
|
||||
let node2 = compute_cplane.new_test_node(experimentaltli);
|
||||
node1.start().unwrap();
|
||||
node2.start().unwrap();
|
||||
|
||||
// check node1
|
||||
node1.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node1.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
);
|
||||
let count: i64 = node1
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 5000050000);
|
||||
|
||||
// check node2
|
||||
node2.safe_psql(
|
||||
"postgres",
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
);
|
||||
node2.safe_psql(
|
||||
"postgres",
|
||||
"INSERT INTO t SELECT generate_series(100000,200000), 'payload'",
|
||||
);
|
||||
let count: i64 = node2
|
||||
.safe_psql("postgres", "SELECT sum(key) FROM t")
|
||||
.first()
|
||||
.unwrap()
|
||||
.get(0);
|
||||
println!("sum = {}", count);
|
||||
assert_eq!(count, 15000150000);
|
||||
}
|
||||
@@ -1,21 +1,32 @@
|
||||
// Restart acceptors one by one while compute is under the load.
|
||||
use control_plane::compute::ComputeControlPlane;
|
||||
use control_plane::local_env;
|
||||
use control_plane::local_env::PointInTime;
|
||||
use control_plane::storage::TestStorageControlPlane;
|
||||
use pageserver::ZTimelineId;
|
||||
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use std::{thread, time};
|
||||
|
||||
use control_plane::compute::{ComputeControlPlane, PostgresNode};
|
||||
|
||||
use integration_tests::PostgresNodeExt;
|
||||
use integration_tests::TestStorageControlPlane;
|
||||
|
||||
const DOWNTIME: u64 = 2;
|
||||
|
||||
fn start_node_with_wal_proposer(
|
||||
timeline: &str,
|
||||
compute_cplane: &mut ComputeControlPlane,
|
||||
wal_acceptors: &str,
|
||||
) -> Arc<PostgresNode> {
|
||||
let node = compute_cplane.new_test_master_node(timeline);
|
||||
let _node = node.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!("wal_acceptors='{}'\n", wal_acceptors),
|
||||
);
|
||||
node.start().unwrap();
|
||||
node
|
||||
}
|
||||
|
||||
#[test]
|
||||
//#[ignore]
|
||||
fn test_embedded_wal_proposer() {
|
||||
let local_env = local_env::test_env("test_embedded_wal_proposer");
|
||||
let local_env = integration_tests::create_test_env("test_embedded_wal_proposer");
|
||||
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
@@ -23,13 +34,7 @@ fn test_embedded_wal_proposer() {
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.append_conf(
|
||||
"postgresql.conf",
|
||||
&format!("wal_acceptors='{}'\n", wal_acceptors),
|
||||
);
|
||||
node.start().unwrap();
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -52,7 +57,7 @@ fn test_embedded_wal_proposer() {
|
||||
|
||||
#[test]
|
||||
fn test_acceptors_normal_work() {
|
||||
let local_env = local_env::test_env("test_acceptors_normal_work");
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_normal_work");
|
||||
|
||||
const REDUNDANCY: usize = 3;
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
@@ -60,12 +65,7 @@ fn test_acceptors_normal_work() {
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -93,39 +93,28 @@ fn test_many_timelines() {
|
||||
// Initialize a new repository, and set up WAL safekeepers and page server.
|
||||
const REDUNDANCY: usize = 3;
|
||||
const N_TIMELINES: usize = 5;
|
||||
let local_env = local_env::test_env("test_many_timelines");
|
||||
let local_env = integration_tests::create_test_env("test_many_timelines");
|
||||
let storage_cplane = TestStorageControlPlane::fault_tolerant(&local_env, REDUNDANCY);
|
||||
let mut compute_cplane = ComputeControlPlane::local(&local_env, &storage_cplane.pageserver);
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// Create branches
|
||||
let mut timelines: Vec<ZTimelineId> = Vec::new();
|
||||
let maintli = storage_cplane.get_branch_timeline("main"); // main branch
|
||||
timelines.push(maintli);
|
||||
let startpoint = local_env::find_end_of_wal(&local_env, maintli).unwrap();
|
||||
let mut timelines: Vec<String> = vec!["main".to_string()];
|
||||
|
||||
for i in 1..N_TIMELINES {
|
||||
// additional branches
|
||||
let branchname = format!("experimental{}", i);
|
||||
local_env::create_branch(
|
||||
&local_env,
|
||||
&branchname,
|
||||
PointInTime {
|
||||
timelineid: maintli,
|
||||
lsn: startpoint,
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
let tli = storage_cplane.get_branch_timeline(&branchname);
|
||||
timelines.push(tli);
|
||||
storage_cplane
|
||||
.pageserver
|
||||
.branch_create(&branchname, "main")
|
||||
.unwrap();
|
||||
timelines.push(branchname);
|
||||
}
|
||||
|
||||
// start postgres on each timeline
|
||||
let mut nodes = Vec::new();
|
||||
for tli in timelines {
|
||||
let node = compute_cplane.new_test_node(tli);
|
||||
for tli_name in timelines {
|
||||
let node = start_node_with_wal_proposer(&tli_name, &mut compute_cplane, &wal_acceptors);
|
||||
nodes.push(node.clone());
|
||||
node.start().unwrap();
|
||||
node.start_proxy(&wal_acceptors);
|
||||
}
|
||||
|
||||
// create schema
|
||||
@@ -159,7 +148,7 @@ fn test_many_timelines() {
|
||||
// Majority is always alive
|
||||
#[test]
|
||||
fn test_acceptors_restarts() {
|
||||
let local_env = local_env::test_env("test_acceptors_restarts");
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_restarts");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
@@ -171,12 +160,8 @@ fn test_acceptors_restarts() {
|
||||
let mut rng = rand::thread_rng();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let mut failed_node: Option<usize> = None;
|
||||
|
||||
// check basic work with table
|
||||
@@ -222,7 +207,7 @@ fn start_acceptor(cplane: &Arc<TestStorageControlPlane>, no: usize) {
|
||||
// N_CRASHES env var
|
||||
#[test]
|
||||
fn test_acceptors_unavailability() {
|
||||
let local_env = local_env::test_env("test_acceptors_unavailability");
|
||||
let local_env = integration_tests::create_test_env("test_acceptors_unavailability");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 2;
|
||||
@@ -232,12 +217,7 @@ fn test_acceptors_unavailability() {
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
@@ -307,7 +287,7 @@ fn simulate_failures(cplane: Arc<TestStorageControlPlane>) {
|
||||
// Race condition test
|
||||
#[test]
|
||||
fn test_race_conditions() {
|
||||
let local_env = local_env::test_env("test_race_conditions");
|
||||
let local_env = integration_tests::create_test_env("test_race_conditions");
|
||||
|
||||
// Start pageserver that reads WAL directly from that postgres
|
||||
const REDUNDANCY: usize = 3;
|
||||
@@ -319,12 +299,7 @@ fn test_race_conditions() {
|
||||
let wal_acceptors = storage_cplane.get_wal_acceptor_conn_info();
|
||||
|
||||
// start postgres
|
||||
let maintli = storage_cplane.get_branch_timeline("main");
|
||||
let node = compute_cplane.new_test_master_node(maintli);
|
||||
node.start().unwrap();
|
||||
|
||||
// start proxy
|
||||
let _proxy = node.start_proxy(&wal_acceptors);
|
||||
let node = start_node_with_wal_proposer("main", &mut compute_cplane, &wal_acceptors);
|
||||
|
||||
// check basic work with table
|
||||
node.safe_psql(
|
||||
|
||||
23
mgmt-console/.gitignore
vendored
23
mgmt-console/.gitignore
vendored
@@ -1,23 +0,0 @@
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
/.pnp
|
||||
.pnp.js
|
||||
|
||||
# testing
|
||||
/coverage
|
||||
|
||||
# production
|
||||
/build
|
||||
|
||||
# misc
|
||||
.DS_Store
|
||||
.env.local
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
@@ -1,55 +0,0 @@
|
||||
Mock implementation of a management console.
|
||||
|
||||
See demo-howto.txt for usage.
|
||||
|
||||
Building and Installation
|
||||
-------------------------
|
||||
|
||||
To compile Postgres:
|
||||
sudo apt build-dep postgresql
|
||||
sudo apt install bison flex libz-dev libssl-dev
|
||||
sudo apt install ccache
|
||||
sudo apt install libcurl4-openssl-dev libxml2-dev
|
||||
|
||||
For the webapp:
|
||||
# NOTE: This requires at least version 1.1.0 of python3-flask. That's not
|
||||
# available in Debian Buster, need at least Bullseye.
|
||||
|
||||
sudo apt install python3 python3-flask python3-pip npm webpack
|
||||
pip3 install Flask-BasicAuth
|
||||
pip3 install boto3
|
||||
|
||||
git clone and compile and install patched version of Postgres:
|
||||
|
||||
git clone https://github.com/libzenith/postgres.git
|
||||
cd postgres
|
||||
git checkout zenith-experiments
|
||||
./configure --enable-debug --enable-cassert --with-openssl --prefix=/home/heikki/pgsql-install --with-libxml CC="ccache gcc" CFLAGS="-O0"
|
||||
make -j4 -s install
|
||||
|
||||
Get the webapp:
|
||||
cd ~
|
||||
git clone https://github.com/libzenith/zenith-mgmt-console.git
|
||||
cd zenith-mgmt-console
|
||||
mkdir pgdatadirs
|
||||
|
||||
|
||||
openssl req -new -x509 -days 365 -nodes -text -out server.crt \
|
||||
-keyout server.key -subj "/CN=zenith-demo"
|
||||
|
||||
For Mock S3 server (unless you want to test against a real cloud service):
|
||||
sudo apt install python3-tornado
|
||||
|
||||
cd ~/zenith-mgmt-console
|
||||
git clone https://github.com/hlinnaka/ms3.git
|
||||
|
||||
Compile & run it:
|
||||
npm install
|
||||
webpack # compile React app
|
||||
|
||||
BASIC_AUTH_PASSWORD=<password> ./launch-local.sh
|
||||
|
||||
|
||||
You can view the contents of the S3 bucket with browser:
|
||||
|
||||
http://<server>/list_bucket
|
||||
@@ -1,340 +0,0 @@
|
||||
from flask import request
|
||||
from flask_basicauth import BasicAuth
|
||||
from flask import render_template
|
||||
from subprocess import PIPE, STDOUT, run, Popen
|
||||
import html
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import logging
|
||||
import time
|
||||
|
||||
import boto3
|
||||
from boto3.session import Session
|
||||
from botocore.client import Config
|
||||
from botocore.handlers import set_list_objects_encoding_type_url
|
||||
|
||||
from flask import Flask
|
||||
|
||||
import waldump
|
||||
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
app.config['BASIC_AUTH_USERNAME'] = 'zenith'
|
||||
app.config['BASIC_AUTH_PASSWORD'] = os.getenv('BASIC_AUTH_PASSWORD')
|
||||
app.config['BASIC_AUTH_FORCE'] = True
|
||||
|
||||
basic_auth = BasicAuth(app)
|
||||
|
||||
# S3 configuration:
|
||||
|
||||
ENDPOINT = os.getenv('S3_ENDPOINT', 'https://localhost:9000')
|
||||
ACCESS_KEY = os.getenv('S3_ACCESSKEY', 'minioadmin')
|
||||
SECRET = os.getenv('S3_SECRET', '')
|
||||
BUCKET = os.getenv('S3_BUCKET', 'foobucket')
|
||||
|
||||
print("Using bucket at " + ENDPOINT);
|
||||
|
||||
#boto3.set_stream_logger('botocore', logging.DEBUG)
|
||||
|
||||
session = Session(aws_access_key_id=ACCESS_KEY,
|
||||
aws_secret_access_key=SECRET,
|
||||
region_name=os.getenv('S3_REGION', 'auto'))
|
||||
|
||||
# needed for google cloud?
|
||||
session.events.unregister('before-parameter-build.s3.ListObjects',
|
||||
set_list_objects_encoding_type_url)
|
||||
|
||||
s3resource = session.resource('s3',
|
||||
endpoint_url=ENDPOINT,
|
||||
verify=False,
|
||||
config=Config(signature_version='s3v4'))
|
||||
s3bucket = s3resource.Bucket(BUCKET)
|
||||
|
||||
s3_client = boto3.client('s3',
|
||||
endpoint_url=ENDPOINT,
|
||||
verify=False,
|
||||
config=Config(signature_version='s3v4'),
|
||||
aws_access_key_id=ACCESS_KEY,
|
||||
aws_secret_access_key=SECRET)
|
||||
|
||||
|
||||
@app.route("/")
|
||||
def index():
|
||||
return render_template("index.html")
|
||||
|
||||
|
||||
@app.route("/api/waldump")
|
||||
def render_waldump():
|
||||
return render_template("waldump.html")
|
||||
|
||||
@app.route('/api/fetch_wal')
|
||||
def fetch_wal():
|
||||
return waldump.fetch_wal(request, s3bucket);
|
||||
|
||||
@app.route("/api/server_status")
|
||||
def server_status():
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
dirs.sort()
|
||||
|
||||
primary = None
|
||||
standbys = []
|
||||
|
||||
for dirname in dirs:
|
||||
|
||||
result = run("pg_ctl status -D pgdatadirs/" + dirname, stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
srv = {
|
||||
'datadir': dirname,
|
||||
'status': result.stdout,
|
||||
'port': None
|
||||
}
|
||||
|
||||
if dirname == 'primary':
|
||||
primary = srv;
|
||||
primary['port'] = 5432;
|
||||
else:
|
||||
standby_match = re.search('standby_([0-9]+)', dirname)
|
||||
if standby_match:
|
||||
srv['port'] = int(standby_match.group(1))
|
||||
|
||||
standbys.append(srv);
|
||||
|
||||
return {'primary': primary, 'standbys': standbys}
|
||||
|
||||
@app.route('/api/list_bucket')
|
||||
def list_bucket():
|
||||
|
||||
response = 'cloud bucket contents:<br>\n'
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
response = response + html.escape(file.key) + '<br>\n'
|
||||
|
||||
return response
|
||||
|
||||
def walpos_str(walpos):
|
||||
return '{:X}/{:X}'.format(walpos >> 32, walpos & 0xFFFFFFFF)
|
||||
|
||||
@app.route('/api/bucket_summary')
|
||||
def bucket_summary():
|
||||
|
||||
nonrelimages = []
|
||||
minwal = int(0)
|
||||
maxwal = int(0)
|
||||
minseqwal = int(0)
|
||||
maxseqwal = int(0)
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
path = file.key
|
||||
match = re.search('nonreldata/nonrel_([0-9A-F]+).tar', path)
|
||||
if match:
|
||||
walpos = int(match.group(1), 16)
|
||||
nonrelimages.append(walpos_str(walpos))
|
||||
|
||||
match = re.search('nonreldata/nonrel_([0-9A-F]+)-([0-9A-F]+)', path)
|
||||
if match:
|
||||
endwal = int(match.group(2), 16)
|
||||
if endwal > maxwal:
|
||||
maxwal = endwal
|
||||
|
||||
match = re.search('walarchive/([0-9A-F]{8})([0-9A-F]{8})([0-9A-F]{8})', path)
|
||||
if match:
|
||||
tli = int(match.group(1), 16)
|
||||
logno = int(match.group(2), 16)
|
||||
segno = int(match.group(3), 16)
|
||||
# FIXME: this assumes default 16 MB wal segment size
|
||||
logsegno = logno * (0x100000000 / (16*1024*1024)) + segno
|
||||
|
||||
seqwal = int((logsegno + 1) * (16*1024*1024))
|
||||
|
||||
if seqwal > maxseqwal:
|
||||
maxseqwal = seqwal;
|
||||
if minseqwal == 0 or seqwal < minseqwal:
|
||||
minseqwal = seqwal;
|
||||
|
||||
return {
|
||||
'nonrelimages': nonrelimages,
|
||||
'minwal': walpos_str(minwal),
|
||||
'maxwal': walpos_str(maxwal),
|
||||
'minseqwal': walpos_str(minseqwal),
|
||||
'maxseqwal': walpos_str(maxseqwal)
|
||||
}
|
||||
|
||||
def print_cmd_result(cmd_result):
|
||||
return print_cmd_result_ex(cmd_result.args, cmd_result.returncode, cmd_result.stdout)
|
||||
|
||||
def print_cmd_result_ex(cmd, returncode, stdout):
|
||||
res = ''
|
||||
res += 'ran command:\n' + str(cmd) + '\n'
|
||||
res += 'It returned code ' + str(returncode) + '\n'
|
||||
res += '\n'
|
||||
res += 'stdout/stderr:\n'
|
||||
res += stdout
|
||||
|
||||
return res
|
||||
|
||||
@app.route('/api/init_primary', methods=['GET', 'POST'])
|
||||
def init_primary():
|
||||
|
||||
initdb_result = run("initdb -D pgdatadirs/primary --username=zenith --pwfile=pg-password.txt", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
if initdb_result.returncode != 0:
|
||||
return print_cmd_result(initdb_result)
|
||||
|
||||
# Append archive_mode and archive_command and port to postgresql.conf
|
||||
f=open("pgdatadirs/primary/postgresql.conf", "a+")
|
||||
f.write("listen_addresses='*'\n")
|
||||
f.write("archive_mode=on\n")
|
||||
f.write("archive_command='zenith_push --archive-wal-path=%p --archive-wal-fname=%f'\n")
|
||||
f.write("ssl=on\n")
|
||||
f.close()
|
||||
|
||||
f=open("pgdatadirs/primary/pg_hba.conf", "a+")
|
||||
f.write("# allow SSL connections with password from anywhere\n")
|
||||
f.write("hostssl all all 0.0.0.0/0 md5\n")
|
||||
f.write("hostssl all all ::0/0 md5\n")
|
||||
f.close()
|
||||
|
||||
shutil.copyfile("server.crt", "pgdatadirs/primary/server.crt")
|
||||
shutil.copyfile("server.key", "pgdatadirs/primary/server.key")
|
||||
os.chmod("pgdatadirs/primary/server.key", 0o0600)
|
||||
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
|
||||
responsestr = print_cmd_result(initdb_result) + '\n'
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/zenith_push', methods=['GET', 'POST'])
|
||||
def zenith_push():
|
||||
# Stop the primary if it's running
|
||||
stop_result = run(args=["pg_ctl", "stop", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
|
||||
# Call zenith_push
|
||||
push_result = run("zenith_push -D pgdatadirs/primary", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
# Restart the primary
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
|
||||
responsestr = print_cmd_result(stop_result) + '\n'
|
||||
responsestr += print_cmd_result(push_result) + '\n'
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout) + '\n'
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/create_standby', methods=['GET', 'POST'])
|
||||
def create_standby():
|
||||
|
||||
walpos = request.form.get('walpos')
|
||||
if not walpos:
|
||||
return 'no walpos'
|
||||
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
|
||||
last_port = 5432
|
||||
|
||||
for dirname in dirs:
|
||||
|
||||
standby_match = re.search('standby_([0-9]+)', dirname)
|
||||
if standby_match:
|
||||
port = int(standby_match.group(1))
|
||||
if port > last_port:
|
||||
last_port = port
|
||||
|
||||
standby_port = last_port + 1
|
||||
|
||||
standby_dir = "pgdatadirs/standby_" + str(standby_port)
|
||||
|
||||
# Call zenith_restore
|
||||
restore_result = run(["zenith_restore", "--end=" + walpos, "-D", standby_dir], stdout=PIPE, stderr=STDOUT, encoding='latin1')
|
||||
responsestr = print_cmd_result(restore_result)
|
||||
|
||||
if restore_result.returncode == 0:
|
||||
# Append hot_standby and port to postgresql.conf
|
||||
f=open(standby_dir + "/postgresql.conf", "a+")
|
||||
f.write("hot_standby=on\n")
|
||||
f.write("port=" + str(standby_port) + "\n")
|
||||
f.close()
|
||||
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", standby_dir, "-l", standby_dir + "/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
responsestr += '\n\n' + print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/destroy_server', methods=['GET', 'POST'])
|
||||
def destroy_primary():
|
||||
|
||||
datadir = request.form.get('datadir')
|
||||
|
||||
# Check that the datadir parameter doesn't contain anything funny.
|
||||
if not re.match("^[A-Za-z0-9_-]+$", datadir):
|
||||
raise Exception('invalid datadir: ' + datadir)
|
||||
|
||||
# Stop the server if it's running
|
||||
stop_result = run(args=["pg_ctl", "stop", "-m", "immediate", "-D", "pgdatadirs/" + datadir], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
|
||||
shutil.rmtree('pgdatadirs/' + datadir, ignore_errors=True)
|
||||
|
||||
responsestr = print_cmd_result(stop_result) + '\n'
|
||||
responsestr += 'Deleted datadir ' + datadir + '.\n'
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/restore_primary', methods=['GET', 'POST'])
|
||||
def restore_primary():
|
||||
|
||||
# Call zenith_restore
|
||||
restore_result = run(["zenith_restore", "-D", "pgdatadirs/primary"], stdout=PIPE, stderr=STDOUT, encoding='latin1')
|
||||
responsestr = print_cmd_result(restore_result)
|
||||
|
||||
# Append restore_command to postgresql.conf, so that it can find the last raw WAL segments
|
||||
f=open("pgdatadirs/primary/postgresql.conf", "a+")
|
||||
f.write("listen_addresses='*'\n")
|
||||
f.write("restore_command='zenith_restore --archive-wal-path=%p --archive-wal-fname=%f'\n")
|
||||
f.write("ssl=on\n")
|
||||
f.close()
|
||||
|
||||
if restore_result.returncode == 0:
|
||||
start_proc = Popen(args=["pg_ctl", "start", "-D", "pgdatadirs/primary", "-l", "pgdatadirs/primary/log"], stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=False, start_new_session=True, close_fds=True)
|
||||
start_rc = start_proc.wait()
|
||||
start_stdout, start_stderr = start_proc.communicate()
|
||||
responsestr += print_cmd_result_ex(start_proc.args, start_rc, start_stdout)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/slicedice', methods=['GET', 'POST'])
|
||||
def run_slicedice():
|
||||
result = run("zenith_slicedice", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
responsestr = print_cmd_result(result)
|
||||
|
||||
return responsestr
|
||||
|
||||
@app.route('/api/reset_demo', methods=['POST'])
|
||||
def reset_all():
|
||||
result = run("pkill -9 postgres", stdout=PIPE, stderr=STDOUT, universal_newlines=True, shell=True)
|
||||
|
||||
dirs = os.listdir("pgdatadirs")
|
||||
for dirname in dirs:
|
||||
shutil.rmtree('pgdatadirs/' + dirname)
|
||||
|
||||
for file in s3bucket.objects.all():
|
||||
s3_client.delete_object(Bucket = BUCKET, Key = file.key)
|
||||
|
||||
responsestr = print_cmd_result(result) + '\n'
|
||||
responsestr += '''
|
||||
Deleted all Postgres datadirs.
|
||||
Deleted all files in object storage bucket.
|
||||
'''
|
||||
|
||||
return responsestr
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run()
|
||||
@@ -1,3 +0,0 @@
|
||||
module.exports = {
|
||||
presets: ["@babel/preset-env", "@babel/preset-react"],
|
||||
};
|
||||
@@ -1,67 +0,0 @@
|
||||
Mock implementation of a management console.
|
||||
|
||||
This isn't very different from a "normal" PostgreSQL installation with
|
||||
a base backup and WAL archive. The main user-visible difference is
|
||||
that when you create a standby server, we don't restore the whole data
|
||||
directory, but only the "non-relation" files. Relation files are
|
||||
restored on demand, when they're accessed the first time. That makes
|
||||
the "create standby" operation is very fast, but with some delay when
|
||||
you connect and start running queries instead. Most visible if you
|
||||
have a large database. (However, see note below about large databases)
|
||||
|
||||
Note: lots of things are broken/unsafe. Things will fail if a table is
|
||||
larger than 1 GB. Or if there are more than 1000 files in the cloud
|
||||
bucket.
|
||||
|
||||
How to use this demo:
|
||||
|
||||
1. If there are any leftovers from previous runs, reset by clicking
|
||||
the RESET DEMO button. This kills and deletes all Postgres servers,
|
||||
and empties the cloud storage bucket
|
||||
|
||||
2. Create primary server by clicking on the "Init primary" button
|
||||
|
||||
3. Push a base image of the primary to cloud storage, by clicking the
|
||||
"push base image" button. (This takes about 30 seconds, be
|
||||
patient)
|
||||
|
||||
4. Connect to primary with psql, and create a test table with a little data.
|
||||
|
||||
psql postgres -p5432 -U zenith -h<host>
|
||||
|
||||
create table mytable (i int4);
|
||||
|
||||
insert into mytable values (1);
|
||||
select pg_switch_wal();
|
||||
|
||||
The Postgres password is the same as for the management console.
|
||||
|
||||
3. Now that there's a new WAL segment in the arhive, we can "slice &
|
||||
dice" it. Click on the "Slice & dice button".
|
||||
|
||||
4. Perform more updates on the primary, to generate more WAL.
|
||||
|
||||
insert into mytable values (2); select pg_switch_wal();
|
||||
insert into mytable values (3); select pg_switch_wal();
|
||||
insert into mytable values (4); select pg_switch_wal();
|
||||
insert into mytable values (5); select pg_switch_wal();
|
||||
|
||||
5. Slice & Dice the WAL again
|
||||
|
||||
6. Now you can create read-only standby servers at any point in the
|
||||
WAL. Type a WAL position in the text box (or use the slider), and
|
||||
click "Create new standby". The first standby is created at port 5433,
|
||||
the second at port 5434, and so forth.
|
||||
|
||||
7. Connect to the standby with "psql -p 5433". Note that it takes a
|
||||
few seconds until the connection is established. That's because the
|
||||
standby has to restore the basic system catalogs, like pg_database and
|
||||
pg_authid from the backup. After connecting, you can do "\d" to list
|
||||
tables, this will also take a few seconds, as more catalog tables are
|
||||
restored from backup. Subsequent commands will be faster.
|
||||
|
||||
Run queries in the standby:
|
||||
|
||||
select * from mytable;
|
||||
|
||||
the result depends on the LSN that you picked when you created the server.
|
||||
@@ -1,463 +0,0 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import ReactDOM from 'react-dom';
|
||||
import Loader from "react-loader-spinner";
|
||||
import { Router, Route, Link, IndexRoute, hashHistory, browserHistory } from 'react-router';
|
||||
|
||||
function ServerStatus(props) {
|
||||
const datadir = props.server.datadir;
|
||||
const status = props.server.status;
|
||||
const port = props.server.port;
|
||||
|
||||
return (
|
||||
<div>
|
||||
<h2>{ datadir == 'primary' ? 'Primary' : datadir }</h2>
|
||||
status: <div className='status'>{status}</div><br/>
|
||||
to connect: <span className='shellcommand'>psql -h { window.location.hostname } -p { port } -U zenith postgres</span><br/>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function StandbyList(props) {
|
||||
const bucketSummary = props.bucketSummary;
|
||||
const standbys = props.standbys;
|
||||
const maxwalpos = bucketSummary.maxwal ? walpos_to_int(bucketSummary.maxwal) : 0;
|
||||
|
||||
const [walposInput, setWalposInput] = useState({ src: 'text', value: '0/0'});
|
||||
|
||||
// find earliest base image
|
||||
const minwalpos = bucketSummary.nonrelimages ? bucketSummary.nonrelimages.reduce((minpos, imgpos_str, index, array) => {
|
||||
const imgpos = walpos_to_int(imgpos_str);
|
||||
return (minpos == 0 || imgpos < minpos) ? imgpos : minpos;
|
||||
}, 0) : 0;
|
||||
|
||||
const can_create_standby = minwalpos > 0 && maxwalpos > 0 && maxwalpos >= minwalpos;
|
||||
var walpos_valid = true;
|
||||
|
||||
function create_standby() {
|
||||
const formdata = new FormData();
|
||||
formdata.append("walpos", walposStr);
|
||||
|
||||
props.startOperation('Creating new standby at ' + walposStr + '...',
|
||||
fetch("/api/create_standby", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
function destroy_standby(datadir) {
|
||||
const formdata = new FormData();
|
||||
formdata.append("datadir", datadir);
|
||||
props.startOperation('Destroying ' + datadir + '...',
|
||||
fetch("/api/destroy_server", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
const handleSliderChange = (event) => {
|
||||
setWalposInput({ src: 'slider', value: event.target.value });
|
||||
}
|
||||
|
||||
const handleWalposChange = (event) => {
|
||||
setWalposInput({ src: 'text', value: event.target.value });
|
||||
}
|
||||
|
||||
var sliderValue;
|
||||
var walposStr;
|
||||
if (walposInput.src == 'text')
|
||||
{
|
||||
const walpos = walpos_to_int(walposInput.value);
|
||||
|
||||
if (walpos >= minwalpos && walpos <= maxwalpos)
|
||||
walpos_valid = true;
|
||||
else
|
||||
walpos_valid = false;
|
||||
|
||||
sliderValue = Math.round((walpos - minwalpos) / (maxwalpos - minwalpos) * 100);
|
||||
walposStr = walposInput.value;
|
||||
}
|
||||
else
|
||||
{
|
||||
const slider = walposInput.value;
|
||||
const new_walpos = minwalpos + slider / 100 * (maxwalpos - minwalpos);
|
||||
|
||||
console.log('minwalpos: '+ minwalpos);
|
||||
console.log('maxwalpos: '+ maxwalpos);
|
||||
|
||||
walposStr = int_to_walpos(Math.round(new_walpos));
|
||||
walpos_valid = true;
|
||||
console.log(walposStr);
|
||||
}
|
||||
|
||||
var standbystatus = ''
|
||||
if (standbys)
|
||||
{
|
||||
standbystatus =
|
||||
<div>
|
||||
{
|
||||
standbys.length > 0 ?
|
||||
standbys.map((server) =>
|
||||
<>
|
||||
<ServerStatus key={ 'status_' + server.datadir} server={server}/>
|
||||
<button key={ 'destroy_' + server.datadir} onClick={e => destroy_standby(server.datadir)}>Destroy standby</button>
|
||||
</>
|
||||
) : "no standby servers"
|
||||
}
|
||||
</div>
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<h2>Standbys</h2>
|
||||
<button onClick={create_standby} disabled={!can_create_standby || !walpos_valid}>Create new Standby</button> at LSN
|
||||
<input type="text" id="walpos_input" value={ walposStr } onChange={handleWalposChange} disabled={!can_create_standby}/>
|
||||
<input type="range" id="walpos_slider" min="0" max="100" steps="1" value={sliderValue} onChange={handleSliderChange} disabled={!can_create_standby}/>
|
||||
<br/>
|
||||
{ standbystatus }
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ServerList(props) {
|
||||
const primary = props.serverStatus ? props.serverStatus.primary : null;
|
||||
const standbys = props.serverStatus ? props.serverStatus.standbys : [];
|
||||
const bucketSummary = props.bucketSummary;
|
||||
|
||||
var primarystatus = '';
|
||||
|
||||
function destroy_primary() {
|
||||
const formdata = new FormData();
|
||||
formdata.append("datadir", 'primary');
|
||||
props.startOperation('Destroying primary...',
|
||||
fetch("/api/destroy_server", { method: 'POST', body: formdata }));
|
||||
}
|
||||
|
||||
function restore_primary() {
|
||||
props.startOperation('Restoring primary...',
|
||||
fetch("/api/restore_primary", { method: 'POST' }));
|
||||
}
|
||||
|
||||
if (primary)
|
||||
{
|
||||
primarystatus =
|
||||
<div>
|
||||
<ServerStatus server={primary}/>
|
||||
<button onClick={destroy_primary}>Destroy primary</button>
|
||||
</div>
|
||||
}
|
||||
else
|
||||
{
|
||||
primarystatus =
|
||||
<div>
|
||||
no primary server<br/>
|
||||
<button onClick={restore_primary}>Restore primary</button>
|
||||
</div>
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
{ primarystatus }
|
||||
<StandbyList standbys={standbys} startOperation={props.startOperation} bucketSummary={props.bucketSummary}/>
|
||||
<p className="todo">
|
||||
Should we list the WAL safekeeper nodes here? Or are they part of the Storage? Or not visible to users at all?
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
function BucketSummary(props) {
|
||||
const bucketSummary = props.bucketSummary;
|
||||
const startOperation = props.startOperation;
|
||||
|
||||
function slicedice() {
|
||||
startOperation('Slicing sequential WAL to per-relation WAL...',
|
||||
fetch("/api/slicedice", { method: 'POST' }));
|
||||
}
|
||||
|
||||
if (!bucketSummary.nonrelimages)
|
||||
{
|
||||
return <>loading...</>
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<div>Base images at following WAL positions:
|
||||
<ul>
|
||||
{bucketSummary.nonrelimages.map((img) => (
|
||||
<li key={img}>{img}</li>
|
||||
))}
|
||||
</ul>
|
||||
</div>
|
||||
Sliced WAL is available up to { bucketSummary.maxwal }<br/>
|
||||
Raw WAL is available up to { bucketSummary.maxseqwal }<br/>
|
||||
|
||||
<br/>
|
||||
<button onClick={slicedice}>Slice & Dice WAL</button>
|
||||
<p className="todo">
|
||||
Currently, the slicing or "sharding" of the WAL needs to be triggered manually, by clicking the above button.
|
||||
<br/>
|
||||
TODO: make it a continuous process that runs in the WAL safekeepers, or in the Page Servers, or as a standalone service.
|
||||
</p>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function ProgressIndicator()
|
||||
{
|
||||
return (
|
||||
<div>
|
||||
<Loader
|
||||
type="Puff"
|
||||
color="#00BFFF"
|
||||
height={100}
|
||||
width={100}
|
||||
/>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
function walpos_to_int(walpos)
|
||||
{
|
||||
const [hi, lo] = walpos.split('/');
|
||||
|
||||
return parseInt(hi, 16) + parseInt(lo, 16);
|
||||
}
|
||||
|
||||
function int_to_walpos(x)
|
||||
{
|
||||
console.log('converting ' + x);
|
||||
return (Math.floor((x / 0x100000000)).toString(16) + '/' + (x % 0x100000000).toString(16)).toUpperCase();
|
||||
}
|
||||
|
||||
function OperationStatus(props) {
|
||||
const lastOperation = props.lastOperation;
|
||||
const inProgress = props.inProgress;
|
||||
const operationResult = props.operationResult;
|
||||
|
||||
if (lastOperation)
|
||||
{
|
||||
return (
|
||||
<div><h2>Last operation:</h2>
|
||||
<div>{lastOperation} { (!inProgress && lastOperation) ? 'done!' : '' }</div>
|
||||
<div className='result'>
|
||||
{inProgress ? <ProgressIndicator/> : <pre>{operationResult}</pre>}
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
else
|
||||
return '';
|
||||
}
|
||||
|
||||
function ActionButtons(props) {
|
||||
|
||||
const startOperation = props.startOperation;
|
||||
const bucketSummary = props.bucketSummary;
|
||||
|
||||
function reset_demo() {
|
||||
startOperation('resetting everything...',
|
||||
fetch("/api/reset_demo", { method: 'POST' }));
|
||||
}
|
||||
|
||||
function init_primary() {
|
||||
startOperation('Initializing new primary...',
|
||||
fetch("/api/init_primary", { method: 'POST' }));
|
||||
}
|
||||
|
||||
function zenith_push() {
|
||||
startOperation('Pushing new base image...',
|
||||
fetch("/api/zenith_push", { method: 'POST' }));
|
||||
}
|
||||
|
||||
return (
|
||||
<div>
|
||||
<p className="todo">
|
||||
RESET DEMO deletes everything in the storage bucket, and stops and destroys all servers. This resets the whole demo environment to the initial state.
|
||||
</p>
|
||||
<button onClick={reset_demo}>RESET DEMO</button>
|
||||
<p className="todo">
|
||||
Init Primary runs initdb to create a new primary server. Click this after Resetting the demo.
|
||||
</p>
|
||||
|
||||
<button onClick={init_primary}>Init primary</button>
|
||||
|
||||
<p className="todo">
|
||||
Push Base Image stops the primary, copies the current state of the primary to the storage bucket as a new base backup, and restarts the primary.
|
||||
<br/>
|
||||
TODO: This should be handled by a continuous background process, probably running in the storage nodes. And without having to shut down the cluster, of course.
|
||||
</p>
|
||||
|
||||
<button onClick={zenith_push}>Push base image</button>
|
||||
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function Sidenav(props)
|
||||
{
|
||||
const toPage = (page) => (event) => {
|
||||
//event.preventDefault()
|
||||
props.switchPage(page);
|
||||
};
|
||||
return (
|
||||
<div>
|
||||
<h3 className="sidenav-item">Menu</h3>
|
||||
<a href="#servers" onClick={toPage('servers')} className="sidenav-item">Servers</a>
|
||||
<a href="#storage" onClick={toPage('storage')} className="sidenav-item">Storage</a>
|
||||
<a href="#snapshots" onClick={toPage('snapshots')} className="sidenav-item">Snapshots</a>
|
||||
<a href="#demo" onClick={toPage('demo')} className="sidenav-item">Demo</a>
|
||||
<a href="#import" onClick={toPage('import')} className="sidenav-item">Import / Export</a>
|
||||
<a href="#jobs" onClick={toPage('jobs')} className="sidenav-item">Jobs</a>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function App()
|
||||
{
|
||||
const [page, setPage] = useState('servers');
|
||||
const [serverStatus, setServerStatus] = useState({});
|
||||
const [bucketSummary, setBucketSummary] = useState({});
|
||||
const [lastOperation, setLastOperation] = useState('');
|
||||
const [inProgress, setInProgress] = useState('');
|
||||
const [operationResult, setOperationResult] = useState('');
|
||||
|
||||
useEffect(() => {
|
||||
reloadStatus();
|
||||
}, []);
|
||||
|
||||
function startOperation(operation, promise)
|
||||
{
|
||||
promise.then(result => result.text()).then(resultText => {
|
||||
operationFinished(resultText);
|
||||
});
|
||||
|
||||
setLastOperation(operation);
|
||||
setInProgress(true);
|
||||
setOperationResult('');
|
||||
}
|
||||
|
||||
function operationFinished(result)
|
||||
{
|
||||
setInProgress(false);
|
||||
setOperationResult(result);
|
||||
reloadStatus();
|
||||
}
|
||||
|
||||
function clearOperation()
|
||||
{
|
||||
setLastOperation('')
|
||||
setInProgress('');
|
||||
setOperationResult('');
|
||||
console.log("cleared");
|
||||
}
|
||||
|
||||
function reloadStatus()
|
||||
{
|
||||
fetch('/api/server_status').then(res => res.json()).then(data => {
|
||||
setServerStatus(data);
|
||||
});
|
||||
|
||||
fetch('/api/bucket_summary').then(res => res.json()).then(data => {
|
||||
setBucketSummary(data);
|
||||
});
|
||||
}
|
||||
|
||||
const content = () => {
|
||||
console.log(page);
|
||||
if (page === 'servers') {
|
||||
return (
|
||||
<>
|
||||
<h1>Server status</h1>
|
||||
<ServerList startOperation={ startOperation }
|
||||
serverStatus={ serverStatus }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'storage') {
|
||||
return (
|
||||
<>
|
||||
<h1>Storage bucket status</h1>
|
||||
<BucketSummary startOperation={ startOperation }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'snapshots') {
|
||||
return (
|
||||
<>
|
||||
<h1>Snapshots</h1>
|
||||
<p className="todo">
|
||||
In Zenith, snapshots are just specific points (LSNs) in the WAL history, with a label. A snapshot prevents garbage collecting old data that's still needed to reconstruct the database at that LSN.
|
||||
</p>
|
||||
<p className="todo">
|
||||
TODO:
|
||||
<ul>
|
||||
<li>List existing snapshots</li>
|
||||
<li>Create new snapshot manually, from current state or from a given LSN</li>
|
||||
<li>Drill into the WAL stream to see what have happened. Provide tools for e.g. finding point where a table was dropped</li>
|
||||
<li>Create snapshots automatically based on events in the WAL, like if you call pg_create_restore_point(() in the primary</li>
|
||||
<li>Launch new reader instance at a snapshot</li>
|
||||
<li>Export snapshot</li>
|
||||
<li>Rollback cluster to a snapshot</li>
|
||||
</ul>
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'demo') {
|
||||
return (
|
||||
<>
|
||||
<h1>Misc actions</h1>
|
||||
<ActionButtons startOperation={ startOperation }
|
||||
bucketSummary={ bucketSummary }/>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'import') {
|
||||
return (
|
||||
<>
|
||||
<h1>Import & Export tools</h1>
|
||||
<p className="TODO">TODO:
|
||||
<ul>
|
||||
<li>Initialize database from existing backup (pg_basebackup, WAL-G, pgbackrest)</li>
|
||||
<li>Initialize from a pg_dump or other SQL script</li>
|
||||
<li>Launch batch job to import data files from S3</li>
|
||||
<li>Launch batch job to export database with pg_dump to S3</li>
|
||||
</ul>
|
||||
These jobs can be run in against reader processing nodes. We can even
|
||||
spawn a new reader node dedicated to a job, and destry it when the job is done.
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
} else if (page === 'jobs') {
|
||||
return (
|
||||
<>
|
||||
<h1>Batch jobs</h1>
|
||||
<p className="TODO">TODO:
|
||||
<ul>
|
||||
<li>List running jobs launched from Import & Export tools</li>
|
||||
<li>List other batch jobs launched by the user</li>
|
||||
<li>Launch new batch jobs</li>
|
||||
</ul>
|
||||
</p>
|
||||
</>
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
function switchPage(page)
|
||||
{
|
||||
console.log("topage " + page);
|
||||
setPage(page)
|
||||
clearOperation();
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="row">
|
||||
<div className="sidenav">
|
||||
<Sidenav switchPage={switchPage} className="column"/>
|
||||
</div>
|
||||
<div className="column">
|
||||
<div>
|
||||
{ content() }
|
||||
</div>
|
||||
<OperationStatus lastOperation={ lastOperation }
|
||||
inProgress = { inProgress }
|
||||
operationResult = { operationResult }/>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
ReactDOM.render(<App/>, document.getElementById('reactApp'));
|
||||
@@ -1,105 +0,0 @@
|
||||
import React, { useState, useEffect } from 'react';
|
||||
import ReactDOM from 'react-dom';
|
||||
import Loader from "react-loader-spinner";
|
||||
|
||||
function walpos_to_int(walpos)
|
||||
{
|
||||
const [hi, lo] = walpos.split('/');
|
||||
|
||||
return parseInt(hi, 16) + parseInt(lo, 16);
|
||||
}
|
||||
|
||||
const palette = [
|
||||
"#003f5c",
|
||||
"#2f4b7c",
|
||||
"#665191",
|
||||
"#a05195",
|
||||
"#d45087",
|
||||
"#f95d6a",
|
||||
"#ff7c43",
|
||||
"#ffa600"];
|
||||
|
||||
function WalRecord(props)
|
||||
{
|
||||
const firstwalpos = props.firstwalpos;
|
||||
const endwalpos = props.endwalpos;
|
||||
const record = props.record;
|
||||
const index = props.index;
|
||||
const xidmap = props.xidmap;
|
||||
|
||||
const startpos = walpos_to_int(record.start)
|
||||
const endpos = walpos_to_int(record.end)
|
||||
|
||||
const scale = 1000 / (16*1024*1024)
|
||||
const startx = (startpos - firstwalpos) * scale;
|
||||
const endx = (endpos - firstwalpos) * scale;
|
||||
|
||||
const xidindex = xidmap[record.xid];
|
||||
const color = palette[index % palette.length];
|
||||
|
||||
const y = 5 + (xidindex) * 20 + (index % 2) * 2;
|
||||
|
||||
return (
|
||||
<line x1={ startx } y1={y} x2={endx} y2={y} stroke={ color } strokeWidth="5">
|
||||
<title>
|
||||
start: { record.start } end: { record.end }
|
||||
</title>
|
||||
</line>
|
||||
)
|
||||
}
|
||||
|
||||
function WalFile(props)
|
||||
{
|
||||
const walContent = props.walContent;
|
||||
const firstwalpos = props.firstwalpos;
|
||||
const xidmap = props.xidmap;
|
||||
|
||||
return <svg width="1000" height="200">
|
||||
{
|
||||
walContent.records ?
|
||||
walContent.records.map((record, index) =>
|
||||
<WalRecord key={record.start} firstwalpos={firstwalpos} record={record} index={index} xidmap={xidmap}/>
|
||||
) : "no records"
|
||||
}
|
||||
</svg>
|
||||
}
|
||||
|
||||
function WalDumpApp()
|
||||
{
|
||||
const [walContent, setWalContent] = useState({});
|
||||
|
||||
const filename = '00000001000000000000000C';
|
||||
|
||||
useEffect(() => {
|
||||
fetch('/fetch_wal?filename='+filename).then(res => res.json()).then(data => {
|
||||
setWalContent(data);
|
||||
});
|
||||
}, []);
|
||||
|
||||
var firstwalpos = 0;
|
||||
var endwalpos = 0;
|
||||
var numxids = 0;
|
||||
var xidmap = {};
|
||||
if (walContent.records && walContent.records.length > 0)
|
||||
{
|
||||
firstwalpos = walpos_to_int(walContent.records[0].start);
|
||||
endwalpos = firstwalpos + 16*1024*1024;
|
||||
|
||||
walContent.records.forEach(rec => {
|
||||
if (!xidmap[rec.xid])
|
||||
{
|
||||
xidmap[rec.xid] = ++numxids;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return (
|
||||
<>
|
||||
<h2>{filename}</h2>
|
||||
<WalFile walContent={walContent} firstwalpos={firstwalpos} endwalpos={endwalpos} xidmap={xidmap}/>
|
||||
</>
|
||||
);
|
||||
}
|
||||
|
||||
console.log('hey there');
|
||||
ReactDOM.render(<WalDumpApp/>, document.getElementById('waldump'));
|
||||
@@ -1,9 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# NOTE: You must set the following environment variables before running this:
|
||||
# BASIC_AUTH_PASSWORD - basic http auth password
|
||||
# S3_ACCESSKEY
|
||||
# S3_SECRET
|
||||
|
||||
|
||||
S3_ENDPOINT=https://storage.googleapis.com S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql-install/bin:$PATH flask run --host=0.0.0.0
|
||||
@@ -1,8 +0,0 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# NOTE: You should set the BASIC_AUTH_PASSWORD environment variable before calling
|
||||
|
||||
# Launch S3 server
|
||||
(cd ms3 && python3 -m ms3.app --listen-address=localhost) &
|
||||
|
||||
FLASK_ENV=development S3_REGION=auto S3_ENDPOINT=http://localhost:9009 S3_BUCKET=zenith-testbucket PATH=/home/heikki/pgsql.fsmfork/bin:$PATH flask run --host=0.0.0.0
|
||||
6144
mgmt-console/package-lock.json
generated
6144
mgmt-console/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -1,27 +0,0 @@
|
||||
{
|
||||
"name": "starter-kit",
|
||||
"version": "1.1.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1",
|
||||
"build": "webpack",
|
||||
"start": "python app.py"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"react": "^17.0.1",
|
||||
"react-dom": "^17.0.1",
|
||||
"react-loader-spinner": "^4.0.0",
|
||||
"react-router": "^5.2.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/core": "^7.13.1",
|
||||
"@babel/preset-env": "^7.13.5",
|
||||
"@babel/preset-react": "^7.12.13",
|
||||
"babel-loader": "^8.2.2",
|
||||
"webpack": "^5.24.2",
|
||||
"webpack-cli": "^4.5.0"
|
||||
}
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
<head>
|
||||
|
||||
<style>
|
||||
.status {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.shellcommand {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.result {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
padding: 10px;
|
||||
}
|
||||
|
||||
|
||||
.todo {font-style: italic;}
|
||||
|
||||
|
||||
h1 {color: blue;}
|
||||
|
||||
.column {
|
||||
float: left;
|
||||
width: 50%;
|
||||
padding: 10px;
|
||||
}
|
||||
/* Clear floats after the columns */
|
||||
.row:after {
|
||||
content: "";
|
||||
display: table;
|
||||
clear: both;
|
||||
}
|
||||
|
||||
.sidenav {
|
||||
float: left;
|
||||
width: 150px;
|
||||
padding: 10px;
|
||||
background-color: pink;
|
||||
}
|
||||
|
||||
.sidenav-item {
|
||||
padding:10px 0px;
|
||||
border:none;
|
||||
display:block;
|
||||
}
|
||||
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="reactApp"></div>
|
||||
|
||||
<!-- Attach React components -->
|
||||
<script type="text/javascript" src="{{ url_for('static', filename='app_bundle.js') }}"></script>
|
||||
</body>
|
||||
@@ -1,46 +0,0 @@
|
||||
<head>
|
||||
|
||||
<style>
|
||||
.status {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.shellcommand {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
}
|
||||
.result {
|
||||
font-family: monospace;
|
||||
background-color: lightgrey;
|
||||
padding: 10px;
|
||||
}
|
||||
h1 {color: blue;}
|
||||
p {color: red;}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.row {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
/* Create two equal columns that sits next to each other */
|
||||
.column1 {
|
||||
flex: 30%;
|
||||
padding: 10px;
|
||||
}
|
||||
.column2 {
|
||||
flex: 70%;
|
||||
padding: 10px;
|
||||
}
|
||||
</style>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="waldump"></div>
|
||||
|
||||
<!-- Attach React components -->
|
||||
<script type="text/javascript" src="{{ url_for('static', filename='waldump_bundle.js') }}"></script>
|
||||
</body>
|
||||
@@ -1,25 +0,0 @@
|
||||
#
|
||||
# This file contains work-in-progress code to visualize WAL contents.
|
||||
#
|
||||
# This is the API endpoint that calls a 'zenith_wal_to_json' executable,
|
||||
# which is a hacked version of pg_waldump that prints information about the
|
||||
# records in JSON format. The code in js/waldump.js displays it.
|
||||
#
|
||||
|
||||
import os
|
||||
import re
|
||||
from subprocess import PIPE, STDOUT, run, Popen
|
||||
|
||||
def fetch_wal(request, s3bucket):
|
||||
filename = request.args.get('filename')
|
||||
if not re.match("^[A-Za-z0-9_]+$", filename):
|
||||
raise Exception('invalid WAL filename: ' + filename)
|
||||
|
||||
# FIXME: this downloads the WAL file to current dir. Use a temp dir? Pipe?
|
||||
s3bucket.download_file('walarchive/' + filename, filename)
|
||||
|
||||
result = run("zenith_wal_to_json " + filename, stdout=PIPE, universal_newlines=True, shell=True)
|
||||
|
||||
os.unlink(filename);
|
||||
|
||||
return result.stdout
|
||||
@@ -1,27 +0,0 @@
|
||||
var webpack = require('webpack');
|
||||
module.exports = {
|
||||
entry: {
|
||||
app: './js/app.js',
|
||||
waldump: './js/waldump.js'
|
||||
},
|
||||
output: {
|
||||
filename: "[name]_bundle.js",
|
||||
path: __dirname + '/static'
|
||||
},
|
||||
module: {
|
||||
rules: [
|
||||
{
|
||||
test: /\.js?$/,
|
||||
exclude: /node_modules/,
|
||||
use: {
|
||||
loader: 'babel-loader',
|
||||
options: {
|
||||
presets: ['@babel/preset-env']
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
plugins: [
|
||||
]
|
||||
};
|
||||
@@ -1,179 +0,0 @@
|
||||
#zenith.py
|
||||
import click
|
||||
import testgres
|
||||
import os
|
||||
|
||||
from testgres import PostgresNode
|
||||
from tabulate import tabulate
|
||||
|
||||
zenith_base_dir = '/home/anastasia/zenith/basedir'
|
||||
|
||||
@click.group()
|
||||
def main():
|
||||
"""Run the Zenith CLI."""
|
||||
|
||||
@click.group()
|
||||
def pg():
|
||||
"""Db operations
|
||||
|
||||
NOTE: 'database' here means one postgresql node
|
||||
"""
|
||||
|
||||
@click.command(name='create')
|
||||
@click.option('--name', required=True)
|
||||
@click.option('-s', '--storage-name', help='Name of the storage',
|
||||
default='zenith-local',
|
||||
show_default=True)
|
||||
@click.option('--snapshot', help='init from the snapshot. Snap is a name or URL')
|
||||
@click.option('--no-start', is_flag=True, help='Do not start created node',
|
||||
default=False, show_default=True)
|
||||
def pg_create(name, storage_name, snapshot, no_start):
|
||||
"""Initialize the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
# TODO skip init, instead of that link node with storage or upload it from snapshot
|
||||
node.init()
|
||||
if(no_start==False):
|
||||
node.start()
|
||||
|
||||
@click.command(name='start')
|
||||
@click.option('--name', required=True)
|
||||
@click.option('--snapshot')
|
||||
@click.option('--read-only', is_flag=True, help='Start read-only node', show_default=True)
|
||||
def pg_start(name, snapshot, read_only):
|
||||
"""Start the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
# TODO pass snapshot as a parameter
|
||||
node.start()
|
||||
|
||||
@click.command(name='stop')
|
||||
@click.option('--name', required=True)
|
||||
def pg_stop(name):
|
||||
"""Stop the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
node.stop()
|
||||
|
||||
@click.command(name='destroy')
|
||||
@click.option('--name', required=True)
|
||||
def pg_destroy(name):
|
||||
"""Drop the database"""
|
||||
node = PostgresNode()
|
||||
base_dir = os.path.join(zenith_base_dir, 'pg', name)
|
||||
node = testgres.get_new_node(name, base_dir=base_dir)
|
||||
node.cleanup()
|
||||
|
||||
@click.command(name='list')
|
||||
def pg_list():
|
||||
"""List existing databases"""
|
||||
dirs = os.listdir(os.path.join(zenith_base_dir, 'pg'))
|
||||
path={}
|
||||
status={}
|
||||
data=[]
|
||||
|
||||
for dirname in dirs:
|
||||
path[dirname] = os.path.join(zenith_base_dir, 'pg', dirname)
|
||||
fname = os.path.join( path[dirname], 'data/postmaster.pid')
|
||||
try:
|
||||
f = open(fname,'r')
|
||||
status[dirname] = f.readlines()[-1]
|
||||
except OSError as err:
|
||||
status[dirname]='inactive'
|
||||
data.append([dirname , status[dirname], path[dirname]])
|
||||
|
||||
print(tabulate(data, headers=['Name', 'Status', 'Path']))
|
||||
|
||||
pg.add_command(pg_create)
|
||||
pg.add_command(pg_destroy)
|
||||
pg.add_command(pg_start)
|
||||
pg.add_command(pg_stop)
|
||||
pg.add_command(pg_list)
|
||||
|
||||
|
||||
|
||||
@click.group()
|
||||
def storage():
|
||||
"""Storage operations"""
|
||||
|
||||
@click.command(name='attach')
|
||||
@click.option('--name')
|
||||
def storage_attach(name):
|
||||
"""Attach the storage"""
|
||||
|
||||
@click.command(name='detach')
|
||||
@click.option('--name')
|
||||
@click.option('--force', is_flag=True, show_default=True)
|
||||
def storage_detach(name):
|
||||
"""Detach the storage"""
|
||||
|
||||
@click.command(name='list')
|
||||
def storage_list():
|
||||
"""List existing storages"""
|
||||
|
||||
storage.add_command(storage_attach)
|
||||
storage.add_command(storage_detach)
|
||||
storage.add_command(storage_list)
|
||||
|
||||
@click.group()
|
||||
def snapshot():
|
||||
"""Snapshot operations"""
|
||||
|
||||
@click.command(name='create')
|
||||
def snapshot_create():
|
||||
"""Create new snapshot"""
|
||||
|
||||
@click.command(name='destroy')
|
||||
def snapshot_destroy():
|
||||
"""Destroy the snapshot"""
|
||||
|
||||
@click.command(name='pull')
|
||||
def snapshot_pull():
|
||||
"""Pull remote snapshot"""
|
||||
|
||||
@click.command(name='push')
|
||||
def snapshot_push():
|
||||
"""Push snapshot to remote"""
|
||||
|
||||
@click.command(name='import')
|
||||
def snapshot_import():
|
||||
"""Convert given format to zenith snapshot"""
|
||||
|
||||
@click.command(name='export')
|
||||
def snapshot_export():
|
||||
"""Convert zenith snapshot to PostgreSQL compatible format"""
|
||||
|
||||
snapshot.add_command(snapshot_create)
|
||||
snapshot.add_command(snapshot_destroy)
|
||||
snapshot.add_command(snapshot_pull)
|
||||
snapshot.add_command(snapshot_push)
|
||||
snapshot.add_command(snapshot_import)
|
||||
snapshot.add_command(snapshot_export)
|
||||
|
||||
@click.group()
|
||||
def wal():
|
||||
"""WAL operations"""
|
||||
|
||||
@click.command()
|
||||
def wallist(name="list"):
|
||||
"""List WAL files"""
|
||||
|
||||
wal.add_command(wallist)
|
||||
|
||||
|
||||
@click.command()
|
||||
def console():
|
||||
"""Open web console"""
|
||||
|
||||
main.add_command(pg)
|
||||
main.add_command(storage)
|
||||
main.add_command(snapshot)
|
||||
main.add_command(wal)
|
||||
main.add_command(console)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -24,13 +24,12 @@ clap = "2.33.0"
|
||||
termion = "1.5.6"
|
||||
tui = "0.14.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
rocksdb = "0.16.0"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
@@ -38,7 +37,11 @@ walkdir = "2"
|
||||
thiserror = "1.0"
|
||||
hex = "0.4.3"
|
||||
tar = "0.4.33"
|
||||
parse_duration = "*"
|
||||
parse_duration = "2.1.1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
fs_extra = "1.2.0"
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
@@ -1,20 +1,105 @@
|
||||
use crate::ZTimelineId;
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use tar::Builder;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::ZTimelineId;
|
||||
use crate::repository::Timeline;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Generate tarball with non-relational files from repository
|
||||
///
|
||||
pub fn send_tarball_at_lsn(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
_timeline: &Arc<dyn Timeline>,
|
||||
_lsn: Lsn,
|
||||
snapshot_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut ar = Builder::new(write);
|
||||
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshot_lsn.0);
|
||||
let walpath = format!("timelines/{}/wal", timelineid);
|
||||
|
||||
debug!("sending tarball of snapshot in {}", snappath);
|
||||
for entry in WalkDir::new(&snappath) {
|
||||
let entry = entry?;
|
||||
let fullpath = entry.path();
|
||||
let relpath = entry.path().strip_prefix(&snappath).unwrap();
|
||||
|
||||
if relpath.to_str().unwrap() == "" {
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.file_type().is_dir() {
|
||||
trace!(
|
||||
"sending dir {} as {}",
|
||||
fullpath.display(),
|
||||
relpath.display()
|
||||
);
|
||||
ar.append_dir(relpath, fullpath)?;
|
||||
} else if entry.file_type().is_symlink() {
|
||||
error!("ignoring symlink in snapshot dir");
|
||||
} else if entry.file_type().is_file() {
|
||||
// Shared catalogs are exempt
|
||||
if relpath.starts_with("global/") {
|
||||
trace!("sending shared catalog {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else if !is_rel_file_path(relpath.to_str().unwrap()) {
|
||||
trace!("sending {}", relpath.display());
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else {
|
||||
trace!("not sending {}", relpath.display());
|
||||
}
|
||||
} else {
|
||||
error!("unknown file type: {}", fullpath.display());
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Also send all the WAL. The compute node would only need
|
||||
// the WAL that applies to non-relation files, because the page
|
||||
// server handles all the relation files. But we don't have a
|
||||
// mechanism for separating relation and non-relation WAL at the
|
||||
// moment.
|
||||
for entry in std::fs::read_dir(&walpath)? {
|
||||
let entry = entry?;
|
||||
let fullpath = &entry.path();
|
||||
let relpath = fullpath.strip_prefix(&walpath).unwrap();
|
||||
|
||||
if !entry.path().is_file() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let archive_fname = relpath.to_str().unwrap();
|
||||
let archive_fname = archive_fname
|
||||
.strip_suffix(".partial")
|
||||
.unwrap_or(&archive_fname);
|
||||
let archive_path = "pg_wal/".to_owned() + archive_fname;
|
||||
ar.append_path_with_name(fullpath, archive_path)?;
|
||||
}
|
||||
ar.finish()?;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Send a tarball containing a snapshot of all non-relation files in the
|
||||
/// PostgreSQL data directory, at given LSN
|
||||
///
|
||||
/// There must be a snapshot at the given LSN in the snapshots directory, we cannot
|
||||
/// reconstruct the state at an arbitrary LSN at the moment.
|
||||
///
|
||||
pub fn send_snapshot_tarball(
|
||||
write: &mut dyn Write,
|
||||
timelineid: ZTimelineId,
|
||||
snapshotlsn: u64,
|
||||
snapshotlsn: Lsn,
|
||||
) -> Result<(), std::io::Error> {
|
||||
let mut ar = Builder::new(write);
|
||||
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn);
|
||||
let snappath = format!("timelines/{}/snapshots/{:016X}", timelineid, snapshotlsn.0);
|
||||
let walpath = format!("timelines/{}/wal", timelineid);
|
||||
|
||||
debug!("sending tarball of snapshot in {}", snappath);
|
||||
@@ -48,7 +133,14 @@ pub fn send_snapshot_tarball(
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
} else {
|
||||
trace!("not sending {}", relpath.display());
|
||||
// FIXME: send all files for now
|
||||
|
||||
// FIXME: For now, also send all the relation files.
|
||||
// This really shouldn't be necessary, and kind of
|
||||
// defeats the point of having a page server in the
|
||||
// first place. But it is useful at least when
|
||||
// debugging with the DEBUG_COMPARE_LOCAL option (see
|
||||
// vendor/postgres/src/backend/storage/smgr/pagestore_smgr.c)
|
||||
|
||||
ar.append_path_with_name(fullpath, relpath)?;
|
||||
}
|
||||
} else {
|
||||
@@ -56,7 +148,11 @@ pub fn send_snapshot_tarball(
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: also send all the WAL
|
||||
// FIXME: Also send all the WAL. The compute node would only need
|
||||
// the WAL that applies to non-relation files, because the page
|
||||
// server handles all the relation files. But we don't have a
|
||||
// mechanism for separating relation and non-relation WAL at the
|
||||
// moment.
|
||||
for entry in std::fs::read_dir(&walpath)? {
|
||||
let entry = entry?;
|
||||
let fullpath = &entry.path();
|
||||
@@ -79,73 +175,10 @@ pub fn send_snapshot_tarball(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(fname)
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a path, relative to the root of PostgreSQL data directory, as
|
||||
/// a PostgreSQL relation data file.
|
||||
///
|
||||
fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
/*
|
||||
* Relation data files can be in one of the following directories:
|
||||
@@ -165,30 +198,27 @@ fn parse_rel_file_path(path: &str) -> Result<(), FilePathError> {
|
||||
* <oid>.<segment number>
|
||||
*/
|
||||
if let Some(fname) = path.strip_prefix("global/") {
|
||||
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
|
||||
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if let Some(dbpath) = path.strip_prefix("base/") {
|
||||
let mut s = dbpath.split('/');
|
||||
let dbnode_str = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let _dbnode = u32::from_str_radix(dbnode_str, 10)?;
|
||||
let fname = s
|
||||
.next()
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
let dbnode_str = s.next().ok_or(FilePathError::InvalidFileName)?;
|
||||
let _dbnode = dbnode_str.parse::<u32>()?;
|
||||
let fname = s.next().ok_or(FilePathError::InvalidFileName)?;
|
||||
if s.next().is_some() {
|
||||
return Err(FilePathError::new("invalid relation data file name"));
|
||||
return Err(FilePathError::InvalidFileName);
|
||||
};
|
||||
|
||||
let (_relnode, _forknum, _segno) = parse_filename(fname)?;
|
||||
let (_relnode, _forknum, _segno) = parse_relfilename(fname)?;
|
||||
|
||||
Ok(())
|
||||
} else if let Some(_) = path.strip_prefix("pg_tblspc/") {
|
||||
} else if path.strip_prefix("pg_tblspc/").is_some() {
|
||||
// TODO
|
||||
Err(FilePathError::new("tablespaces not supported"))
|
||||
error!("tablespaces not implemented yet");
|
||||
Err(FilePathError::InvalidFileName)
|
||||
} else {
|
||||
Err(FilePathError::new("invalid relation data file name"))
|
||||
Err(FilePathError::InvalidFileName)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -4,19 +4,23 @@
|
||||
|
||||
use log::*;
|
||||
use parse_duration::parse;
|
||||
use std::fs::{self, OpenOptions};
|
||||
use std::io;
|
||||
use std::process::exit;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{env, path::PathBuf};
|
||||
use std::{
|
||||
fs::{File, OpenOptions},
|
||||
net::TcpListener,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use slog::Drain;
|
||||
use slog::{Drain, FnValue};
|
||||
|
||||
use pageserver::{page_service, tui, zenith_repo_dir, PageServerConf};
|
||||
use pageserver::{branches, page_cache, page_service, tui, PageServerConf};
|
||||
|
||||
const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
const DEFAULT_GC_PERIOD_SEC: u64 = 10;
|
||||
@@ -47,6 +51,12 @@ fn main() -> Result<()> {
|
||||
.takes_value(false)
|
||||
.help("Run in the background"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("init")
|
||||
.long("init")
|
||||
.takes_value(false)
|
||||
.help("Initialize pageserver repo"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("gc_horizon")
|
||||
.long("gc_horizon")
|
||||
@@ -59,14 +69,46 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("workdir")
|
||||
.short("D")
|
||||
.long("workdir")
|
||||
.takes_value(true)
|
||||
.help("Working directory for the pageserver"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let workdir = if let Some(workdir_arg) = arg_matches.value_of("workdir") {
|
||||
PathBuf::from(workdir_arg)
|
||||
} else if let Some(workdir_arg) = std::env::var_os("ZENITH_REPO_DIR") {
|
||||
PathBuf::from(workdir_arg.to_str().unwrap())
|
||||
} else {
|
||||
PathBuf::from(".zenith")
|
||||
};
|
||||
|
||||
let pg_distrib_dir: PathBuf = {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
postgres_bin.into()
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
cwd.join("tmp_install")
|
||||
}
|
||||
};
|
||||
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(DEFAULT_GC_PERIOD_SEC),
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
listen_addr: "127.0.0.1:64000".parse().unwrap(),
|
||||
// we will change the current working directory to the repository below,
|
||||
// so always set 'workdir' to '.'
|
||||
workdir: PathBuf::from("."),
|
||||
pg_distrib_dir,
|
||||
};
|
||||
|
||||
if arg_matches.is_present("daemonize") {
|
||||
@@ -94,55 +136,68 @@ fn main() -> Result<()> {
|
||||
conf.gc_period = parse(period)?;
|
||||
}
|
||||
|
||||
start_pageserver(&conf)
|
||||
// The configuration is all set up now. Turn it into a 'static
|
||||
// that can be freely stored in structs and passed across threads
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if arg_matches.is_present("init") {
|
||||
branches::init_repo(conf, &workdir)?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Set CWD to workdir for non-daemon modes
|
||||
env::set_current_dir(&workdir)?;
|
||||
|
||||
start_pageserver(conf)
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
let log_filename = "pageserver.log";
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
|
||||
// Initialize logger
|
||||
let _scope_guard = init_logging(&conf)?;
|
||||
let logger_file = log_file.try_clone().unwrap();
|
||||
let _scope_guard = init_logging(&conf, logger_file)?;
|
||||
let _log_guard = slog_stdlog::init()?;
|
||||
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
|
||||
let tui_thread: Option<thread::JoinHandle<()>>;
|
||||
if conf.interactive {
|
||||
let tui_thread = if conf.interactive {
|
||||
// Initialize the UI
|
||||
tui_thread = Some(
|
||||
Some(
|
||||
thread::Builder::new()
|
||||
.name("UI thread".into())
|
||||
.spawn(|| {
|
||||
let _ = tui::ui_main();
|
||||
})
|
||||
.unwrap(),
|
||||
);
|
||||
//threads.push(tui_thread);
|
||||
)
|
||||
} else {
|
||||
tui_thread = None;
|
||||
}
|
||||
None
|
||||
};
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
let repodir = zenith_repo_dir();
|
||||
|
||||
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let log_filename = repodir.join("pageserver.log");
|
||||
let stdout = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
let stderr = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
let stdout = log_file.try_clone().unwrap();
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file(repodir.join("pageserver.pid"))
|
||||
.working_directory(repodir)
|
||||
.pid_file("pageserver.pid")
|
||||
.working_directory(".")
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
@@ -150,69 +205,42 @@ fn start_pageserver(conf: &PageServerConf) -> Result<()> {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(e) => error!("Error, {}", e),
|
||||
}
|
||||
} else {
|
||||
// change into the repository directory. In daemon mode, Daemonize
|
||||
// does this for us.
|
||||
let repodir = zenith_repo_dir();
|
||||
std::env::set_current_dir(&repodir)?;
|
||||
info!("Changed current directory to repository in {:?}", &repodir);
|
||||
}
|
||||
|
||||
let mut threads = Vec::new();
|
||||
// Check that we can bind to address before further initialization
|
||||
info!("Starting pageserver on {}", conf.listen_addr);
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_addr)?;
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
// Initialize page cache, this will spawn walredo_thread
|
||||
page_cache::init(conf);
|
||||
|
||||
// Create directory for wal-redo datadirs
|
||||
match fs::create_dir("wal-redo") {
|
||||
Ok(_) => {}
|
||||
Err(e) => match e.kind() {
|
||||
io::ErrorKind::AlreadyExists => {}
|
||||
_ => {
|
||||
anyhow::bail!("Failed to create wal-redo data directory: {}", e);
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
// GetPage@LSN requests are served by another thread. (It uses async I/O,
|
||||
// but the code in page_service sets up it own thread pool for that)
|
||||
let conf_copy = conf.clone();
|
||||
let page_server_thread = thread::Builder::new()
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
let page_service_thread = thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
// thread code
|
||||
page_service::thread_main(&conf_copy);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(page_server_thread);
|
||||
.spawn(move || page_service::thread_main(conf, pageserver_listener))?;
|
||||
|
||||
if let Some(tui_thread) = tui_thread {
|
||||
// The TUI thread exits when the user asks to Quit.
|
||||
tui_thread.join().unwrap();
|
||||
} else {
|
||||
// In non-interactive mode, wait forever.
|
||||
for t in threads {
|
||||
t.join().unwrap()
|
||||
}
|
||||
page_service_thread
|
||||
.join()
|
||||
.expect("Page service thread has panicked")?
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
fn init_logging(
|
||||
conf: &PageServerConf,
|
||||
log_file: File,
|
||||
) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
if conf.interactive {
|
||||
Ok(tui::init_logging())
|
||||
} else if conf.daemonize {
|
||||
let log = zenith_repo_dir().join("pageserver.log");
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log)
|
||||
.map_err(|err| {
|
||||
// We failed to initialize logging, so we can't log this message with error!
|
||||
eprintln!("Could not create log file {:?}: {}", log, err);
|
||||
err
|
||||
})?;
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = slog_term::FullFormat::new(decorator).build();
|
||||
let drain = slog::Filter::new(drain, |record: &slog::Record| {
|
||||
if record.level().is_at_least(slog::Level::Info) {
|
||||
return true;
|
||||
@@ -220,7 +248,20 @@ fn init_logging(conf: &PageServerConf) -> Result<slog_scope::GlobalLoggerGuard,
|
||||
false
|
||||
});
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
let logger = slog::Logger::root(drain, slog::o!());
|
||||
let logger = slog::Logger::root(
|
||||
drain,
|
||||
slog::o!(
|
||||
"location" =>
|
||||
FnValue(move |record| {
|
||||
format!("{}, {}:{}",
|
||||
record.module(),
|
||||
record.file(),
|
||||
record.line()
|
||||
)
|
||||
}
|
||||
)
|
||||
),
|
||||
);
|
||||
Ok(slog_scope::set_global_logger(logger))
|
||||
} else {
|
||||
let decorator = slog_term::TermDecorator::new().build();
|
||||
|
||||
460
pageserver/src/branches.rs
Normal file
460
pageserver/src/branches.rs
Normal file
@@ -0,0 +1,460 @@
|
||||
//
|
||||
// Branch management code
|
||||
//
|
||||
// TODO: move all paths construction to conf impl
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use fs::File;
|
||||
use postgres_ffi::{pg_constants, xlog_utils};
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
use std::io::{Read, Write};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fs, io,
|
||||
path::{Path, PathBuf},
|
||||
process::{Command, Stdio},
|
||||
str::FromStr,
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::restore_local_repo;
|
||||
use crate::{repository::Repository, PageServerConf, ZTimelineId};
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct BranchInfo {
|
||||
pub name: String,
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub latest_valid_lsn: Option<Lsn>,
|
||||
pub ancestor_id: Option<String>,
|
||||
pub ancestor_lsn: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PointInTime {
|
||||
pub timelineid: ZTimelineId,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
pub fn init_repo(conf: &'static PageServerConf, repo_dir: &Path) -> Result<()> {
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
fs::create_dir_all(repo_dir)
|
||||
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
|
||||
|
||||
env::set_current_dir(repo_dir)?;
|
||||
|
||||
fs::create_dir(std::path::Path::new("timelines"))?;
|
||||
fs::create_dir(std::path::Path::new("refs"))?;
|
||||
fs::create_dir(std::path::Path::new("refs").join("branches"))?;
|
||||
fs::create_dir(std::path::Path::new("refs").join("tags"))?;
|
||||
|
||||
println!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
// Run initdb
|
||||
//
|
||||
// We create the cluster temporarily in a "tmp" directory inside the repository,
|
||||
// and move it to the right location from there.
|
||||
let tmppath = std::path::Path::new("tmp");
|
||||
|
||||
print!("running initdb... ");
|
||||
io::stdout().flush()?;
|
||||
|
||||
let initdb_path = conf.pg_bin_dir().join("initdb");
|
||||
let initdb_otput = Command::new(initdb_path)
|
||||
.args(&["-D", tmppath.to_str().unwrap()])
|
||||
.arg("--no-instructions")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to execute initdb")?;
|
||||
if !initdb_otput.status.success() {
|
||||
anyhow::bail!("initdb failed");
|
||||
}
|
||||
println!("initdb succeeded");
|
||||
|
||||
// Read control file to extract the LSN and system id
|
||||
let controlfile_path = tmppath.join("global").join("pg_control");
|
||||
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
|
||||
// let systemid = controlfile.system_identifier;
|
||||
let lsn = controlfile.checkPoint;
|
||||
let lsnstr = format!("{:016X}", lsn);
|
||||
|
||||
// Bootstrap the repository by loading the newly-initdb'd cluster into 'main' branch.
|
||||
let tli = create_timeline(conf, None)?;
|
||||
let timelinedir = conf.timeline_path(tli);
|
||||
|
||||
// We don't use page_cache here, because we don't want to spawn the WAL redo thread during
|
||||
// repository initialization.
|
||||
//
|
||||
// FIXME: That caused trouble, because the WAL redo thread launched initdb in the background,
|
||||
// and it kept running even after the "zenith init" had exited. In tests, we started the
|
||||
// page server immediately after that, so that initdb was still running in the background,
|
||||
// and we failed to run initdb again in the same directory. This has been solved for the
|
||||
// rapid init+start case now, but the general race condition remains if you restart the the
|
||||
// server quickly.
|
||||
let repo = crate::repository::rocksdb::RocksRepository::new(
|
||||
conf,
|
||||
std::sync::Arc::new(crate::walredo::DummyRedoManager {}),
|
||||
);
|
||||
let timeline = repo.create_empty_timeline(tli, Lsn(lsn))?;
|
||||
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(&tmppath, &*timeline, Lsn(lsn))?;
|
||||
|
||||
// Move the initial WAL file
|
||||
fs::rename(
|
||||
tmppath.join("pg_wal").join("000000010000000000000001"),
|
||||
timelinedir
|
||||
.join("wal")
|
||||
.join("000000010000000000000001.partial"),
|
||||
)?;
|
||||
println!("created initial timeline {}", tli);
|
||||
|
||||
let data = tli.to_string();
|
||||
fs::write(conf.branch_path("main"), data)?;
|
||||
println!("created main branch");
|
||||
|
||||
// Remove pg_wal
|
||||
fs::remove_dir_all(tmppath.join("pg_wal"))?;
|
||||
|
||||
force_crash_recovery(&tmppath)?;
|
||||
println!("updated pg_control");
|
||||
|
||||
// Move the data directory as an initial base backup.
|
||||
// FIXME: It would be enough to only copy the non-relation files here, the relation
|
||||
// data was already loaded into the repository.
|
||||
let target = timelinedir.join("snapshots").join(&lsnstr);
|
||||
fs::rename(tmppath, &target)?;
|
||||
|
||||
println!(
|
||||
"new zenith repository was created in {}",
|
||||
repo_dir.display()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_branches(
|
||||
conf: &PageServerConf,
|
||||
repository: &dyn Repository,
|
||||
) -> Result<Vec<BranchInfo>> {
|
||||
// Each branch has a corresponding record (text file) in the refs/branches
|
||||
// with timeline_id.
|
||||
let branches_dir = std::path::Path::new("refs").join("branches");
|
||||
|
||||
std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
let name = dir_entry.file_name().to_str().unwrap().to_string();
|
||||
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
|
||||
|
||||
let latest_valid_lsn = repository
|
||||
.get_timeline(timeline_id)
|
||||
.map(|timeline| timeline.get_last_valid_lsn())
|
||||
.ok();
|
||||
|
||||
let ancestor_path = conf.ancestor_path(timeline_id);
|
||||
let mut ancestor_id: Option<String> = None;
|
||||
let mut ancestor_lsn: Option<String> = None;
|
||||
|
||||
if ancestor_path.exists() {
|
||||
let ancestor = std::fs::read_to_string(ancestor_path)?;
|
||||
let mut strings = ancestor.split('@');
|
||||
|
||||
ancestor_id = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
ancestor_lsn = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
timeline_id,
|
||||
latest_valid_lsn,
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub(crate) fn get_system_id(conf: &PageServerConf) -> Result<u64> {
|
||||
// let branches = get_branches();
|
||||
|
||||
let branches_dir = std::path::Path::new("refs").join("branches");
|
||||
let branches = std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
let name = dir_entry.file_name().to_str().unwrap().to_string();
|
||||
let timeline_id = std::fs::read_to_string(dir_entry.path())?.parse::<ZTimelineId>()?;
|
||||
Ok((name, timeline_id))
|
||||
})
|
||||
.collect::<Result<HashMap<String, ZTimelineId>>>()?;
|
||||
|
||||
let main_tli = branches
|
||||
.get("main")
|
||||
.ok_or_else(|| anyhow!("Branch main not found"))?;
|
||||
|
||||
let (_, main_snap_dir) = find_latest_snapshot(conf, *main_tli)?;
|
||||
let controlfile_path = main_snap_dir.join("global").join("pg_control");
|
||||
let controlfile = postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfile_path)?))?;
|
||||
Ok(controlfile.system_identifier)
|
||||
}
|
||||
|
||||
pub(crate) fn create_branch(
|
||||
conf: &PageServerConf,
|
||||
branchname: &str,
|
||||
startpoint_str: &str,
|
||||
) -> Result<BranchInfo> {
|
||||
if conf.branch_path(&branchname).exists() {
|
||||
anyhow::bail!("branch {} already exists", branchname);
|
||||
}
|
||||
|
||||
let mut startpoint = parse_point_in_time(conf, startpoint_str)?;
|
||||
|
||||
if startpoint.lsn == Lsn(0) {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = find_end_of_wal(conf, startpoint.timelineid)?;
|
||||
println!("branching at end of WAL: {}", end_of_wal);
|
||||
startpoint.lsn = end_of_wal;
|
||||
}
|
||||
|
||||
// create a new timeline directory for it
|
||||
let newtli = create_timeline(conf, Some(startpoint))?;
|
||||
let newtimelinedir = conf.timeline_path(newtli);
|
||||
|
||||
// Let the Repository backend do its initialization
|
||||
let repo = page_cache::get_repository();
|
||||
repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
|
||||
|
||||
// Copy the latest snapshot (TODO: before the startpoint) and all WAL
|
||||
// TODO: be smarter and avoid the copying...
|
||||
let (_maxsnapshot, oldsnapshotdir) = find_latest_snapshot(conf, startpoint.timelineid)?;
|
||||
let copy_opts = fs_extra::dir::CopyOptions::new();
|
||||
fs_extra::dir::copy(oldsnapshotdir, newtimelinedir.join("snapshots"), ©_opts)?;
|
||||
|
||||
let oldtimelinedir = conf.timeline_path(startpoint.timelineid);
|
||||
copy_wal(
|
||||
&oldtimelinedir.join("wal"),
|
||||
&newtimelinedir.join("wal"),
|
||||
startpoint.lsn,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
)?;
|
||||
|
||||
// Remember the human-readable branch name for the new timeline.
|
||||
// FIXME: there's a race condition, if you create a branch with the same
|
||||
// name concurrently.
|
||||
let data = newtli.to_string();
|
||||
fs::write(conf.branch_path(&branchname), data)?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
timeline_id: newtli,
|
||||
latest_valid_lsn: Some(startpoint.lsn),
|
||||
ancestor_id: None,
|
||||
ancestor_lsn: None,
|
||||
})
|
||||
}
|
||||
|
||||
//
|
||||
// Parse user-given string that represents a point-in-time.
|
||||
//
|
||||
// We support multiple variants:
|
||||
//
|
||||
// Raw timeline id in hex, meaning the end of that timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d
|
||||
//
|
||||
// A specific LSN on a timeline:
|
||||
// bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
|
||||
//
|
||||
// Same, with a human-friendly branch name:
|
||||
// main
|
||||
// main@2/15D3DD8
|
||||
//
|
||||
// Human-friendly tag name:
|
||||
// mytag
|
||||
//
|
||||
//
|
||||
fn parse_point_in_time(conf: &PageServerConf, s: &str) -> Result<PointInTime> {
|
||||
let mut strings = s.split('@');
|
||||
let name = strings.next().unwrap();
|
||||
|
||||
let lsn: Option<Lsn>;
|
||||
if let Some(lsnstr) = strings.next() {
|
||||
lsn = Some(
|
||||
Lsn::from_str(lsnstr).with_context(|| "invalid LSN in point-in-time specification")?,
|
||||
);
|
||||
} else {
|
||||
lsn = None
|
||||
}
|
||||
|
||||
// Check if it's a tag
|
||||
if lsn.is_none() {
|
||||
let tagpath = conf.tag_path(name);
|
||||
if tagpath.exists() {
|
||||
let pointstr = fs::read_to_string(tagpath)?;
|
||||
|
||||
return parse_point_in_time(conf, &pointstr);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it's a branch
|
||||
// Check if it's branch @ LSN
|
||||
let branchpath = conf.branch_path(name);
|
||||
if branchpath.exists() {
|
||||
let pointstr = fs::read_to_string(branchpath)?;
|
||||
|
||||
let mut result = parse_point_in_time(conf, &pointstr)?;
|
||||
|
||||
result.lsn = lsn.unwrap_or(Lsn(0));
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
// Check if it's a timelineid
|
||||
// Check if it's timelineid @ LSN
|
||||
if let Ok(timelineid) = ZTimelineId::from_str(name) {
|
||||
let tlipath = conf.timeline_path(timelineid);
|
||||
if tlipath.exists() {
|
||||
return Ok(PointInTime {
|
||||
timelineid,
|
||||
lsn: lsn.unwrap_or(Lsn(0)),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
bail!("could not parse point-in-time {}", s);
|
||||
}
|
||||
|
||||
// If control file says the cluster was shut down cleanly, modify it, to mark
|
||||
// it as crashed. That forces crash recovery when you start the cluster.
|
||||
//
|
||||
// FIXME:
|
||||
// We currently do this to the initial snapshot in "zenith init". It would
|
||||
// be more natural to do this when the snapshot is restored instead, but we
|
||||
// currently don't have any code to create new snapshots, so it doesn't matter
|
||||
// Or better yet, use a less hacky way of putting the cluster into recovery.
|
||||
// Perhaps create a backup label file in the data directory when it's restored.
|
||||
fn force_crash_recovery(datadir: &Path) -> Result<()> {
|
||||
// Read in the control file
|
||||
let controlfilepath = datadir.to_path_buf().join("global").join("pg_control");
|
||||
let mut controlfile =
|
||||
postgres_ffi::decode_pg_control(Bytes::from(fs::read(controlfilepath.as_path())?))?;
|
||||
|
||||
controlfile.state = postgres_ffi::DBState_DB_IN_PRODUCTION;
|
||||
|
||||
fs::write(
|
||||
controlfilepath.as_path(),
|
||||
postgres_ffi::encode_pg_control(controlfile),
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_timeline(conf: &PageServerConf, ancestor: Option<PointInTime>) -> Result<ZTimelineId> {
|
||||
// Create initial timeline
|
||||
let mut tli_buf = [0u8; 16];
|
||||
rand::thread_rng().fill(&mut tli_buf);
|
||||
let timelineid = ZTimelineId::from(tli_buf);
|
||||
|
||||
let timelinedir = conf.timeline_path(timelineid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("snapshots"))?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
///
|
||||
/// Copy all WAL segments from one directory to another, up to given LSN.
|
||||
///
|
||||
/// If the given LSN is in the middle of a segment, the last segment containing it
|
||||
/// is written out as .partial, and padded with zeros.
|
||||
///
|
||||
fn copy_wal(src_dir: &Path, dst_dir: &Path, upto: Lsn, wal_seg_size: usize) -> Result<()> {
|
||||
let last_segno = upto.segment_number(wal_seg_size);
|
||||
let last_segoff = upto.segment_offset(wal_seg_size);
|
||||
|
||||
for entry in fs::read_dir(src_dir).unwrap().flatten() {
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
|
||||
// Check if the filename looks like an xlog file, or a .partial file.
|
||||
if !xlog_utils::IsXLogFileName(fname) && !xlog_utils::IsPartialXLogFileName(fname) {
|
||||
continue;
|
||||
}
|
||||
let (segno, _tli) = xlog_utils::XLogFromFileName(fname, wal_seg_size as usize);
|
||||
|
||||
let copylen;
|
||||
let mut dst_fname = PathBuf::from(fname);
|
||||
if segno > last_segno {
|
||||
// future segment, skip
|
||||
continue;
|
||||
} else if segno < last_segno {
|
||||
copylen = wal_seg_size;
|
||||
dst_fname.set_extension("");
|
||||
} else {
|
||||
copylen = last_segoff;
|
||||
dst_fname.set_extension("partial");
|
||||
}
|
||||
|
||||
let src_file = File::open(entry.path())?;
|
||||
let mut dst_file = File::create(dst_dir.join(&dst_fname))?;
|
||||
std::io::copy(&mut src_file.take(copylen as u64), &mut dst_file)?;
|
||||
|
||||
if copylen < wal_seg_size {
|
||||
std::io::copy(
|
||||
&mut std::io::repeat(0).take((wal_seg_size - copylen) as u64),
|
||||
&mut dst_file,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Find the end of valid WAL in a wal directory
|
||||
pub fn find_end_of_wal(conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
|
||||
let waldir = conf.timeline_path(timeline).join("wal");
|
||||
let (lsn, _tli) = xlog_utils::find_end_of_wal(&waldir, pg_constants::WAL_SEGMENT_SIZE, true);
|
||||
Ok(Lsn(lsn))
|
||||
}
|
||||
|
||||
// Find the latest snapshot for a timeline
|
||||
fn find_latest_snapshot(conf: &PageServerConf, timeline: ZTimelineId) -> Result<(Lsn, PathBuf)> {
|
||||
let snapshotsdir = conf.snapshots_path(timeline);
|
||||
let paths = fs::read_dir(&snapshotsdir)?;
|
||||
let mut maxsnapshot = Lsn(0);
|
||||
let mut snapshotdir: Option<PathBuf> = None;
|
||||
for path in paths {
|
||||
let path = path?;
|
||||
let filename = path.file_name().to_str().unwrap().to_owned();
|
||||
if let Ok(lsn) = Lsn::from_hex(&filename) {
|
||||
maxsnapshot = std::cmp::max(lsn, maxsnapshot);
|
||||
snapshotdir = Some(path.path());
|
||||
}
|
||||
}
|
||||
if maxsnapshot == Lsn(0) {
|
||||
// TODO: check ancestor timeline
|
||||
anyhow::bail!("no snapshot found in {}", snapshotsdir.display());
|
||||
}
|
||||
|
||||
Ok((maxsnapshot, snapshotdir.unwrap()))
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use std::fmt;
|
||||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
@@ -5,8 +7,10 @@ use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tui;
|
||||
pub mod tui_event;
|
||||
@@ -22,6 +26,54 @@ pub struct PageServerConf {
|
||||
pub listen_addr: SocketAddr,
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
|
||||
// Repository directory, relative to current working directory.
|
||||
// Normally, the page server changes the current working directory
|
||||
// to the repository, and 'workdir' is always '.'. But we don't do
|
||||
// that during unit testing, because the current directory is global
|
||||
// to the process but different unit tests work on different
|
||||
// repositories.
|
||||
pub workdir: PathBuf,
|
||||
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
//
|
||||
// Repository paths, relative to workdir.
|
||||
//
|
||||
|
||||
fn tag_path(&self, name: &str) -> PathBuf {
|
||||
self.workdir.join("refs").join("tags").join(name)
|
||||
}
|
||||
|
||||
fn branch_path(&self, name: &str) -> PathBuf {
|
||||
self.workdir.join("refs").join("branches").join(name)
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.workdir.join("timelines").join(timelineid.to_string())
|
||||
}
|
||||
|
||||
fn snapshots_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.timeline_path(timelineid).join("snapshots")
|
||||
}
|
||||
|
||||
fn ancestor_path(&self, timelineid: ZTimelineId) -> PathBuf {
|
||||
self.timeline_path(timelineid).join("ancestor")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
}
|
||||
|
||||
/// Zenith Timeline ID is a 128-bit random ID.
|
||||
@@ -48,7 +100,7 @@ pub struct PageServerConf {
|
||||
/// is separate from PostgreSQL timelines, and doesn't have those
|
||||
/// limitations. A zenith timeline is identified by a 128-bit ID, which
|
||||
/// is usually printed out as a hex string.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub struct ZTimelineId([u8; 16]);
|
||||
|
||||
impl FromStr for ZTimelineId {
|
||||
@@ -84,11 +136,3 @@ impl fmt::Display for ZTimelineId {
|
||||
f.write_str(&hex::encode(self.0))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn zenith_repo_dir() -> PathBuf {
|
||||
// Find repository path
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,917 +1,32 @@
|
||||
//
|
||||
// Page Cache holds all the different page versions and WAL records
|
||||
//
|
||||
// Currently, the page cache uses RocksDB to store WAL wal records and
|
||||
// full page images, keyed by the RelFileNode, blocknumber, and the
|
||||
// LSN.
|
||||
//! This module acts as a switchboard to access different repositories managed by this
|
||||
//! page server. Currently, a Page Server can only manage one repository, so there
|
||||
//! isn't much here. If we implement multi-tenancy, this will probably be changed into
|
||||
//! a hash map, keyed by the tenant ID.
|
||||
|
||||
use crate::restore_local_repo::restore_timeline;
|
||||
use crate::waldecoder::Oid;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::ZTimelineId;
|
||||
use crate::{zenith_repo_dir, PageServerConf};
|
||||
use anyhow::{bail, Context};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crate::repository::rocksdb::RocksRepository;
|
||||
use crate::repository::Repository;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::thread;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{convert::TryInto, ops::AddAssign};
|
||||
use zenith_utils::lsn::{AtomicLsn, Lsn};
|
||||
use zenith_utils::seqwait::SeqWait;
|
||||
|
||||
// Timeout when waiting or WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
static TIMEOUT: Duration = Duration::from_secs(60);
|
||||
|
||||
pub struct PageCache {
|
||||
// RocksDB handle
|
||||
db: rocksdb::DB,
|
||||
|
||||
// WAL redo manager
|
||||
walredo_mgr: WalRedoManager,
|
||||
|
||||
// What page versions do we hold in the cache? If we get GetPage with
|
||||
// LSN < first_valid_lsn, that's an error because we (no longer) hold that
|
||||
// page version. If we get a request > last_valid_lsn, we need to wait until
|
||||
// we receive all the WAL up to the request. The SeqWait provides functions
|
||||
// for that.
|
||||
//
|
||||
// last_record_lsn points to the end of last processed WAL record.
|
||||
// It can lag behind last_valid_lsn, if the WAL receiver has received some WAL
|
||||
// after the end of last record, but not the whole next record yet. In the
|
||||
// page cache, we care about last_valid_lsn, but if the WAL receiver needs to
|
||||
// restart the streaming, it needs to restart at the end of last record, so
|
||||
// we track them separately. last_record_lsn should perhaps be in
|
||||
// walreceiver.rs instead of here, but it seems convenient to keep all three
|
||||
// values together.
|
||||
//
|
||||
first_valid_lsn: AtomicLsn,
|
||||
last_valid_lsn: SeqWait<Lsn>,
|
||||
last_record_lsn: AtomicLsn,
|
||||
|
||||
// Counters, for metrics collection.
|
||||
pub num_entries: AtomicU64,
|
||||
pub num_page_images: AtomicU64,
|
||||
pub num_wal_records: AtomicU64,
|
||||
pub num_getpage_requests: AtomicU64,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PageCacheStats {
|
||||
pub num_entries: u64,
|
||||
pub num_page_images: u64,
|
||||
pub num_wal_records: u64,
|
||||
pub num_getpage_requests: u64,
|
||||
}
|
||||
|
||||
impl AddAssign for PageCacheStats {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.num_entries += other.num_entries;
|
||||
self.num_page_images += other.num_page_images;
|
||||
self.num_wal_records += other.num_wal_records;
|
||||
self.num_getpage_requests += other.num_getpage_requests;
|
||||
}
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
pub static ref PAGECACHES: Mutex<HashMap<ZTimelineId, Arc<PageCache>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
pub static ref REPOSITORY: Mutex<Option<Arc<dyn Repository + Send + Sync>>> = Mutex::new(None);
|
||||
}
|
||||
|
||||
// Get Page Cache for given timeline. It is assumed to already exist.
|
||||
pub fn get_pagecache(_conf: &PageServerConf, timelineid: ZTimelineId) -> Option<Arc<PageCache>> {
|
||||
let pcaches = PAGECACHES.lock().unwrap();
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
match pcaches.get(&timelineid) {
|
||||
Some(pcache) => Some(pcache.clone()),
|
||||
None => None,
|
||||
}
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf);
|
||||
|
||||
// we have already changed current dir to the repository.
|
||||
let repo = RocksRepository::new(conf, Arc::new(walredo_mgr));
|
||||
|
||||
*m = Some(Arc::new(repo));
|
||||
}
|
||||
|
||||
pub fn get_or_restore_pagecache(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
) -> anyhow::Result<Arc<PageCache>> {
|
||||
let mut pcaches = PAGECACHES.lock().unwrap();
|
||||
match pcaches.get(&timelineid) {
|
||||
Some(pcache) => Ok(pcache.clone()),
|
||||
None => {
|
||||
let pcache = init_page_cache(conf, timelineid);
|
||||
|
||||
restore_timeline(conf, &pcache, timelineid)?;
|
||||
|
||||
let result = Arc::new(pcache);
|
||||
|
||||
pcaches.insert(timelineid, result.clone());
|
||||
|
||||
if conf.gc_horizon != 0 {
|
||||
let conf_copy = conf.clone();
|
||||
let _gc_thread = thread::Builder::new()
|
||||
.name("Garbage collection thread".into())
|
||||
.spawn(move || {
|
||||
gc_thread_main(&conf_copy, timelineid);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn gc_thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
info!("Garbage collection thread started {}", timelineid);
|
||||
let pcache = get_pagecache(conf, timelineid).unwrap();
|
||||
|
||||
pcache.do_gc(conf).unwrap();
|
||||
}
|
||||
|
||||
fn open_rocksdb(_conf: &PageServerConf, timelineid: ZTimelineId) -> rocksdb::DB {
|
||||
let path = zenith_repo_dir().join(timelineid.to_string());
|
||||
let mut opts = rocksdb::Options::default();
|
||||
opts.create_if_missing(true);
|
||||
opts.set_use_fsync(true);
|
||||
opts.set_compression_type(rocksdb::DBCompressionType::Lz4);
|
||||
opts.set_compaction_filter("ttl", move |_level: u32, _key: &[u8], val: &[u8]| {
|
||||
if (val[0] & UNUSED_VERSION_FLAG) != 0 {
|
||||
rocksdb::compaction_filter::Decision::Remove
|
||||
} else {
|
||||
rocksdb::compaction_filter::Decision::Keep
|
||||
}
|
||||
});
|
||||
rocksdb::DB::open(&opts, &path).unwrap()
|
||||
}
|
||||
|
||||
fn init_page_cache(conf: &PageServerConf, timelineid: ZTimelineId) -> PageCache {
|
||||
PageCache {
|
||||
db: open_rocksdb(&conf, timelineid),
|
||||
|
||||
walredo_mgr: WalRedoManager::new(conf, timelineid),
|
||||
|
||||
first_valid_lsn: AtomicLsn::new(0),
|
||||
last_valid_lsn: SeqWait::new(Lsn(0)),
|
||||
last_record_lsn: AtomicLsn::new(0),
|
||||
|
||||
num_entries: AtomicU64::new(0),
|
||||
num_page_images: AtomicU64::new(0),
|
||||
num_wal_records: AtomicU64::new(0),
|
||||
num_getpage_requests: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// We store two kinds of entries in the page cache:
|
||||
//
|
||||
// 1. Ready-made images of the block
|
||||
// 2. WAL records, to be applied on top of the "previous" entry
|
||||
//
|
||||
// Some WAL records will initialize the page from scratch. For such records,
|
||||
// the 'will_init' flag is set. They don't need the previous page image before
|
||||
// applying. The 'will_init' flag is set for records containing a full-page image,
|
||||
// and for records with the BKPBLOCK_WILL_INIT flag. These differ from PageImages
|
||||
// stored directly in the cache entry in that you still need to run the WAL redo
|
||||
// routine to generate the page image.
|
||||
//
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct CacheKey {
|
||||
pub tag: BufferTag,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl CacheKey {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
self.tag.pack(buf);
|
||||
buf.put_u64(self.lsn.0);
|
||||
}
|
||||
pub fn unpack(buf: &mut BytesMut) -> CacheKey {
|
||||
CacheKey {
|
||||
tag: BufferTag::unpack(buf),
|
||||
lsn: Lsn::from(buf.get_u64()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CacheEntryContent {
|
||||
pub page_image: Option<Bytes>,
|
||||
pub wal_record: Option<WALRecord>,
|
||||
}
|
||||
|
||||
const PAGE_IMAGE_FLAG: u8 = 1u8;
|
||||
const UNUSED_VERSION_FLAG: u8 = 2u8;
|
||||
|
||||
impl CacheEntryContent {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
if let Some(image) = &self.page_image {
|
||||
buf.put_u8(PAGE_IMAGE_FLAG);
|
||||
buf.put_u16(image.len() as u16);
|
||||
buf.put_slice(&image[..]);
|
||||
} else if let Some(rec) = &self.wal_record {
|
||||
buf.put_u8(0);
|
||||
rec.pack(buf);
|
||||
}
|
||||
}
|
||||
pub fn unpack(buf: &mut BytesMut) -> CacheEntryContent {
|
||||
if (buf.get_u8() & PAGE_IMAGE_FLAG) != 0 {
|
||||
let mut dst = vec![0u8; buf.get_u16() as usize];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
CacheEntryContent {
|
||||
page_image: Some(Bytes::from(dst)),
|
||||
wal_record: None,
|
||||
}
|
||||
} else {
|
||||
CacheEntryContent {
|
||||
page_image: None,
|
||||
wal_record: Some(WALRecord::unpack(buf)),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy)]
|
||||
pub struct RelTag {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u8,
|
||||
}
|
||||
|
||||
impl RelTag {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u32(self.spcnode);
|
||||
buf.put_u32(self.dbnode);
|
||||
buf.put_u32(self.relnode);
|
||||
buf.put_u32(self.forknum as u32); // encode forknum as u32 to provide compatibility with wal_redo_postgres
|
||||
}
|
||||
pub fn unpack(buf: &mut BytesMut) -> RelTag {
|
||||
RelTag {
|
||||
spcnode: buf.get_u32(),
|
||||
dbnode: buf.get_u32(),
|
||||
relnode: buf.get_u32(),
|
||||
forknum: buf.get_u32() as u8,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct BufferTag {
|
||||
pub rel: RelTag,
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
impl BufferTag {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
self.rel.pack(buf);
|
||||
buf.put_u32(self.blknum);
|
||||
}
|
||||
pub fn unpack(buf: &mut BytesMut) -> BufferTag {
|
||||
BufferTag {
|
||||
rel: RelTag::unpack(buf),
|
||||
blknum: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub truncate: bool,
|
||||
pub rec: Bytes,
|
||||
// Remember the offset of main_data in rec,
|
||||
// so that we don't have to parse the record again.
|
||||
// If record has no main_data, this offset equals rec.len().
|
||||
pub main_data_offset: u32,
|
||||
}
|
||||
|
||||
impl WALRecord {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u64(self.lsn.0);
|
||||
buf.put_u8(self.will_init as u8);
|
||||
buf.put_u8(self.truncate as u8);
|
||||
buf.put_u32(self.main_data_offset);
|
||||
buf.put_u32(self.rec.len() as u32);
|
||||
buf.put_slice(&self.rec[..]);
|
||||
}
|
||||
pub fn unpack(buf: &mut BytesMut) -> WALRecord {
|
||||
let lsn = Lsn::from(buf.get_u64());
|
||||
let will_init = buf.get_u8() != 0;
|
||||
let truncate = buf.get_u8() != 0;
|
||||
let main_data_offset = buf.get_u32();
|
||||
let mut dst = vec![0u8; buf.get_u32() as usize];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
WALRecord {
|
||||
lsn,
|
||||
will_init,
|
||||
truncate,
|
||||
rec: Bytes::from(dst),
|
||||
main_data_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PageCache {
|
||||
// Public GET interface functions
|
||||
|
||||
///
|
||||
/// GetPage@LSN
|
||||
///
|
||||
/// Returns an 8k page image
|
||||
///
|
||||
pub fn get_page_at_lsn(&self, tag: BufferTag, req_lsn: Lsn) -> anyhow::Result<Bytes> {
|
||||
self.num_getpage_requests.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
let lsn = self.wait_lsn(req_lsn)?;
|
||||
|
||||
// Look up cache entry. If it's a page image, return that. If it's a WAL record,
|
||||
// ask the WAL redo service to reconstruct the page image from the WAL records.
|
||||
let key = CacheKey { tag, lsn };
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
key.pack(&mut buf);
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(&buf[..]);
|
||||
|
||||
if iter.valid() {
|
||||
let k = iter.key().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&k);
|
||||
let key = CacheKey::unpack(&mut buf);
|
||||
if key.tag == tag {
|
||||
let v = iter.value().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&v);
|
||||
let content = CacheEntryContent::unpack(&mut buf);
|
||||
let page_img: Bytes;
|
||||
if let Some(img) = &content.page_image {
|
||||
page_img = img.clone();
|
||||
} else if content.wal_record.is_some() {
|
||||
// Request the WAL redo manager to apply the WAL records for us.
|
||||
let (base_img, records) = self.collect_records_for_apply(tag, lsn);
|
||||
page_img = self.walredo_mgr.request_redo(tag, lsn, base_img, records)?;
|
||||
|
||||
self.put_page_image(tag, lsn, page_img.clone());
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
// FIXME: assumes little-endian. Only used for the debugging log though
|
||||
let page_lsn_hi =
|
||||
u32::from_le_bytes(page_img.get(0..4).unwrap().try_into().unwrap());
|
||||
let page_lsn_lo =
|
||||
u32::from_le_bytes(page_img.get(4..8).unwrap().try_into().unwrap());
|
||||
debug!(
|
||||
"Returning page with LSN {:X}/{:X} for {}/{}/{}.{} blk {}",
|
||||
page_lsn_hi,
|
||||
page_lsn_lo,
|
||||
tag.rel.spcnode,
|
||||
tag.rel.dbnode,
|
||||
tag.rel.relnode,
|
||||
tag.rel.forknum,
|
||||
tag.blknum
|
||||
);
|
||||
return Ok(page_img);
|
||||
}
|
||||
}
|
||||
static ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
debug!("Page {:?} at {}({}) not found", tag, req_lsn, lsn);
|
||||
Ok(Bytes::from_static(&ZERO_PAGE))
|
||||
/* return Err("could not find page image")?; */
|
||||
}
|
||||
|
||||
///
|
||||
/// Get size of relation at given LSN.
|
||||
///
|
||||
pub fn relsize_get(&self, rel: &RelTag, lsn: Lsn) -> anyhow::Result<u32> {
|
||||
self.wait_lsn(lsn)?;
|
||||
self.relsize_get_nowait(rel, lsn)
|
||||
}
|
||||
|
||||
///
|
||||
/// Does relation exist at given LSN?
|
||||
///
|
||||
pub fn relsize_exist(&self, rel: &RelTag, req_lsn: Lsn) -> anyhow::Result<bool> {
|
||||
let lsn = self.wait_lsn(req_lsn)?;
|
||||
|
||||
let key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel: *rel,
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
let mut buf = BytesMut::new();
|
||||
key.pack(&mut buf);
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(&buf[..]);
|
||||
if iter.valid() {
|
||||
let k = iter.key().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&k);
|
||||
let tag = BufferTag::unpack(&mut buf);
|
||||
if tag.rel == *rel {
|
||||
debug!("Relation {:?} exists at {}", rel, lsn);
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
debug!("Relation {:?} doesn't exist at {}", rel, lsn);
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
// Other public functions, for updating the page cache.
|
||||
// These are used by the WAL receiver and WAL redo.
|
||||
|
||||
///
|
||||
/// Collect all the WAL records that are needed to reconstruct a page
|
||||
/// image for the given cache entry.
|
||||
///
|
||||
/// Returns an old page image (if any), and a vector of WAL records to apply
|
||||
/// over it.
|
||||
///
|
||||
pub fn collect_records_for_apply(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
) -> (Option<Bytes>, Vec<WALRecord>) {
|
||||
let mut buf = BytesMut::new();
|
||||
let key = CacheKey { tag, lsn };
|
||||
key.pack(&mut buf);
|
||||
|
||||
let mut base_img: Option<Bytes> = None;
|
||||
let mut records: Vec<WALRecord> = Vec::new();
|
||||
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(&buf[..]);
|
||||
|
||||
// Scan backwards, collecting the WAL records, until we hit an
|
||||
// old page image.
|
||||
while iter.valid() {
|
||||
let k = iter.key().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&k);
|
||||
let key = CacheKey::unpack(&mut buf);
|
||||
if key.tag != tag {
|
||||
break;
|
||||
}
|
||||
let v = iter.value().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&v);
|
||||
let content = CacheEntryContent::unpack(&mut buf);
|
||||
if let Some(img) = &content.page_image {
|
||||
// We have a base image. No need to dig deeper into the list of
|
||||
// records
|
||||
base_img = Some(img.clone());
|
||||
break;
|
||||
} else if let Some(rec) = &content.wal_record {
|
||||
records.push(rec.clone());
|
||||
// If this WAL record initializes the page, no need to dig deeper.
|
||||
if rec.will_init {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
panic!("no base image and no WAL record on cache entry");
|
||||
}
|
||||
iter.prev();
|
||||
}
|
||||
records.reverse();
|
||||
(base_img, records)
|
||||
}
|
||||
|
||||
///
|
||||
/// Adds a WAL record to the page cache
|
||||
///
|
||||
pub fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) {
|
||||
let lsn = rec.lsn;
|
||||
let key = CacheKey { tag, lsn };
|
||||
|
||||
let content = CacheEntryContent {
|
||||
page_image: None,
|
||||
wal_record: Some(rec),
|
||||
};
|
||||
|
||||
let mut key_buf = BytesMut::new();
|
||||
key.pack(&mut key_buf);
|
||||
let mut val_buf = BytesMut::new();
|
||||
content.pack(&mut val_buf);
|
||||
|
||||
let _res = self.db.put(&key_buf[..], &val_buf[..]);
|
||||
//trace!("put_wal_record lsn: {}", lsn);
|
||||
|
||||
self.num_entries.fetch_add(1, Ordering::Relaxed);
|
||||
self.num_wal_records.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
///
|
||||
/// Adds a relation-wide WAL record (like truncate) to the page cache,
|
||||
/// associating it with all pages started with specified block number
|
||||
///
|
||||
pub fn put_rel_wal_record(&self, tag: BufferTag, rec: WALRecord) -> anyhow::Result<()> {
|
||||
let mut key = CacheKey { tag, lsn: rec.lsn };
|
||||
|
||||
// What was the size of the relation before this record?
|
||||
let last_lsn = self.last_valid_lsn.load();
|
||||
let old_rel_size = self.relsize_get_nowait(&tag.rel, last_lsn)?;
|
||||
|
||||
let content = CacheEntryContent {
|
||||
page_image: None,
|
||||
wal_record: Some(rec),
|
||||
};
|
||||
// set new relation size
|
||||
trace!("Truncate relation {:?}", tag);
|
||||
let mut key_buf = BytesMut::new();
|
||||
let mut val_buf = BytesMut::new();
|
||||
content.pack(&mut val_buf);
|
||||
|
||||
for blknum in tag.blknum..old_rel_size {
|
||||
key_buf.clear();
|
||||
key.tag.blknum = blknum;
|
||||
key.pack(&mut key_buf);
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
let _res = self.db.put(&key_buf[..], &val_buf[..]);
|
||||
}
|
||||
let n = (old_rel_size - tag.blknum) as u64;
|
||||
self.num_entries.fetch_add(n, Ordering::Relaxed);
|
||||
self.num_wal_records.fetch_add(n, Ordering::Relaxed);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Memorize a full image of a page version
|
||||
///
|
||||
pub fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes) {
|
||||
let key = CacheKey { tag, lsn };
|
||||
let content = CacheEntryContent {
|
||||
page_image: Some(img),
|
||||
wal_record: None,
|
||||
};
|
||||
|
||||
let mut key_buf = BytesMut::new();
|
||||
key.pack(&mut key_buf);
|
||||
let mut val_buf = BytesMut::new();
|
||||
content.pack(&mut val_buf);
|
||||
|
||||
trace!("put_wal_record lsn: {}", key.lsn);
|
||||
let _res = self.db.put(&key_buf[..], &val_buf[..]);
|
||||
|
||||
//debug!("inserted page image for {}/{}/{}_{} blk {} at {}",
|
||||
// tag.spcnode, tag.dbnode, tag.relnode, tag.forknum, tag.blknum, lsn);
|
||||
self.num_page_images.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn create_database(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
db_id: Oid,
|
||||
tablespace_id: Oid,
|
||||
src_db_id: Oid,
|
||||
src_tablespace_id: Oid,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut buf = BytesMut::new();
|
||||
let key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: src_tablespace_id,
|
||||
dbnode: src_db_id,
|
||||
relnode: 0,
|
||||
forknum: 0u8,
|
||||
},
|
||||
blknum: 0,
|
||||
},
|
||||
lsn: Lsn(0),
|
||||
};
|
||||
key.pack(&mut buf);
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek(&buf[..]);
|
||||
let mut n = 0;
|
||||
while iter.valid() {
|
||||
let k = iter.key().unwrap();
|
||||
let v = iter.value().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&k);
|
||||
let mut key = CacheKey::unpack(&mut buf);
|
||||
if key.tag.rel.spcnode != src_tablespace_id || key.tag.rel.dbnode != src_db_id {
|
||||
break;
|
||||
}
|
||||
key.tag.rel.spcnode = tablespace_id;
|
||||
key.tag.rel.dbnode = db_id;
|
||||
key.lsn = lsn;
|
||||
buf.clear();
|
||||
key.pack(&mut buf);
|
||||
|
||||
self.db.put(&buf[..], v)?;
|
||||
n += 1;
|
||||
iter.next();
|
||||
}
|
||||
info!(
|
||||
"Create database {}/{}, copy {} entries",
|
||||
tablespace_id, db_id, n
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that WAL has been received and added to the page cache up to the given LSN
|
||||
pub fn advance_last_valid_lsn(&self, lsn: Lsn) {
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
|
||||
// Can't move backwards.
|
||||
if lsn < old {
|
||||
warn!(
|
||||
"attempted to move last valid LSN backwards (was {}, new {})",
|
||||
old, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Remember the (end of) last valid WAL record remembered in the page cache.
|
||||
///
|
||||
/// NOTE: this updates last_valid_lsn as well.
|
||||
///
|
||||
pub fn advance_last_record_lsn(&self, lsn: Lsn) {
|
||||
// Can't move backwards.
|
||||
let old = self.last_record_lsn.fetch_max(lsn);
|
||||
assert!(old <= lsn);
|
||||
|
||||
// Also advance last_valid_lsn
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
// Can't move backwards.
|
||||
if lsn < old {
|
||||
warn!(
|
||||
"attempted to move last record LSN backwards (was {}, new {})",
|
||||
old, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Remember the beginning of valid WAL.
|
||||
///
|
||||
/// TODO: This should be called by garbage collection, so that if an older
|
||||
/// page is requested, we will return an error to the requestor.
|
||||
pub fn _advance_first_valid_lsn(&self, lsn: Lsn) {
|
||||
// Can't overtake last_valid_lsn (except when we're
|
||||
// initializing the system and last_valid_lsn hasn't been set yet.
|
||||
let last_valid_lsn = self.last_valid_lsn.load();
|
||||
assert!(last_valid_lsn == Lsn(0) || lsn < last_valid_lsn);
|
||||
|
||||
let old = self.first_valid_lsn.fetch_max(lsn);
|
||||
// Can't move backwards.
|
||||
assert!(lsn >= old);
|
||||
}
|
||||
|
||||
pub fn init_valid_lsn(&self, lsn: Lsn) {
|
||||
let old = self.last_valid_lsn.advance(lsn);
|
||||
assert!(old == Lsn(0));
|
||||
let old = self.last_record_lsn.fetch_max(lsn);
|
||||
assert!(old == Lsn(0));
|
||||
let old = self.first_valid_lsn.fetch_max(lsn);
|
||||
assert!(old == Lsn(0));
|
||||
}
|
||||
|
||||
pub fn get_last_valid_lsn(&self) -> Lsn {
|
||||
self.last_valid_lsn.load()
|
||||
}
|
||||
|
||||
//
|
||||
// Get statistics to be displayed in the user interface.
|
||||
//
|
||||
pub fn get_stats(&self) -> PageCacheStats {
|
||||
PageCacheStats {
|
||||
num_entries: self.num_entries.load(Ordering::Relaxed),
|
||||
num_page_images: self.num_page_images.load(Ordering::Relaxed),
|
||||
num_wal_records: self.num_wal_records.load(Ordering::Relaxed),
|
||||
num_getpage_requests: self.num_getpage_requests.load(Ordering::Relaxed),
|
||||
}
|
||||
}
|
||||
|
||||
// Internal functions
|
||||
|
||||
//
|
||||
// Internal function to get relation size at given LSN.
|
||||
//
|
||||
// The caller must ensure that WAL has been received up to 'lsn'.
|
||||
//
|
||||
fn relsize_get_nowait(&self, rel: &RelTag, lsn: Lsn) -> anyhow::Result<u32> {
|
||||
assert!(lsn <= self.last_valid_lsn.load());
|
||||
|
||||
let mut key = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel: *rel,
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
let mut buf = BytesMut::new();
|
||||
let mut iter = self.db.raw_iterator();
|
||||
|
||||
loop {
|
||||
buf.clear();
|
||||
key.pack(&mut buf);
|
||||
iter.seek_for_prev(&buf[..]);
|
||||
if iter.valid() {
|
||||
let k = iter.key().unwrap();
|
||||
let v = iter.value().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&k);
|
||||
let tag = BufferTag::unpack(&mut buf);
|
||||
if tag.rel == *rel {
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&v);
|
||||
let content = CacheEntryContent::unpack(&mut buf);
|
||||
if let Some(rec) = &content.wal_record {
|
||||
if rec.truncate {
|
||||
if tag.blknum > 0 {
|
||||
key.tag.blknum = tag.blknum - 1;
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
let relsize = tag.blknum + 1;
|
||||
debug!("Size of relation {:?} at {} is {}", rel, lsn, relsize);
|
||||
return Ok(relsize);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
debug!("Size of relation {:?} at {} is zero", rel, lsn);
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
fn do_gc(&self, conf: &PageServerConf) -> anyhow::Result<Bytes> {
|
||||
let mut buf = BytesMut::new();
|
||||
loop {
|
||||
thread::sleep(conf.gc_period);
|
||||
let last_lsn = self.get_last_valid_lsn();
|
||||
|
||||
// checked_sub() returns None on overflow.
|
||||
if let Some(horizon) = last_lsn.checked_sub(conf.gc_horizon) {
|
||||
let mut maxkey = CacheKey {
|
||||
tag: BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: u32::MAX,
|
||||
dbnode: u32::MAX,
|
||||
relnode: u32::MAX,
|
||||
forknum: u8::MAX,
|
||||
},
|
||||
blknum: u32::MAX,
|
||||
},
|
||||
lsn: Lsn::MAX,
|
||||
};
|
||||
let now = Instant::now();
|
||||
let mut reconstructed = 0u64;
|
||||
let mut truncated = 0u64;
|
||||
let mut inspected = 0u64;
|
||||
let mut deleted = 0u64;
|
||||
loop {
|
||||
buf.clear();
|
||||
maxkey.pack(&mut buf);
|
||||
let mut iter = self.db.raw_iterator();
|
||||
iter.seek_for_prev(&buf[..]);
|
||||
if iter.valid() {
|
||||
let k = iter.key().unwrap();
|
||||
let v = iter.value().unwrap();
|
||||
|
||||
inspected += 1;
|
||||
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&k);
|
||||
let key = CacheKey::unpack(&mut buf);
|
||||
|
||||
// Construct boundaries for old records cleanup
|
||||
maxkey.tag = key.tag;
|
||||
let last_lsn = key.lsn;
|
||||
maxkey.lsn = min(horizon, last_lsn); // do not remove last version
|
||||
|
||||
let mut minkey = maxkey.clone();
|
||||
minkey.lsn = Lsn(0); // first version
|
||||
|
||||
// reconstruct most recent page version
|
||||
if (v[0] & PAGE_IMAGE_FLAG) == 0 {
|
||||
trace!("Reconstruct most recent page {:?}", key);
|
||||
// force reconstruction of most recent page version
|
||||
let (base_img, records) =
|
||||
self.collect_records_for_apply(key.tag, key.lsn);
|
||||
let new_img = self
|
||||
.walredo_mgr
|
||||
.request_redo(key.tag, key.lsn, base_img, records)?;
|
||||
|
||||
self.put_page_image(key.tag, key.lsn, new_img.clone());
|
||||
|
||||
reconstructed += 1;
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
maxkey.pack(&mut buf);
|
||||
|
||||
iter.seek_for_prev(&buf[..]);
|
||||
if iter.valid() {
|
||||
// do not remove last version
|
||||
if last_lsn > horizon {
|
||||
// locate most recent record before horizon
|
||||
let k = iter.key().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&k);
|
||||
let key = CacheKey::unpack(&mut buf);
|
||||
if key.tag == maxkey.tag {
|
||||
let v = iter.value().unwrap();
|
||||
if (v[0] & PAGE_IMAGE_FLAG) == 0 {
|
||||
trace!("Reconstruct horizon page {:?}", key);
|
||||
let (base_img, records) =
|
||||
self.collect_records_for_apply(key.tag, key.lsn);
|
||||
let new_img = self
|
||||
.walredo_mgr
|
||||
.request_redo(key.tag, key.lsn, base_img, records)?;
|
||||
self.put_page_image(key.tag, key.lsn, new_img.clone());
|
||||
|
||||
truncated += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// remove records prior to horizon
|
||||
loop {
|
||||
iter.prev();
|
||||
if !iter.valid() {
|
||||
break;
|
||||
}
|
||||
let k = iter.key().unwrap();
|
||||
buf.clear();
|
||||
buf.extend_from_slice(&k);
|
||||
let key = CacheKey::unpack(&mut buf);
|
||||
if key.tag != maxkey.tag {
|
||||
break;
|
||||
}
|
||||
let v = iter.value().unwrap();
|
||||
if (v[0] & UNUSED_VERSION_FLAG) == 0 {
|
||||
let mut v = v.to_owned();
|
||||
v[0] |= UNUSED_VERSION_FLAG;
|
||||
self.db.put(k, &v[..])?;
|
||||
deleted += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
maxkey = minkey;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
info!("Garbage collection completed in {:?}:\n{} version chains inspected, {} pages reconstructed, {} version histories truncated, {} versions deleted",
|
||||
now.elapsed(), inspected, reconstructed, truncated, deleted);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Wait until WAL has been received up to the given LSN.
|
||||
//
|
||||
fn wait_lsn(&self, mut lsn: Lsn) -> anyhow::Result<Lsn> {
|
||||
// When invalid LSN is requested, it means "don't wait, return latest version of the page"
|
||||
// This is necessary for bootstrap.
|
||||
if lsn == Lsn(0) {
|
||||
let last_valid_lsn = self.last_valid_lsn.load();
|
||||
trace!(
|
||||
"walreceiver doesn't work yet last_valid_lsn {}, requested {}",
|
||||
last_valid_lsn,
|
||||
lsn
|
||||
);
|
||||
lsn = last_valid_lsn;
|
||||
}
|
||||
|
||||
self.last_valid_lsn
|
||||
.wait_for_timeout(lsn, TIMEOUT)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive",
|
||||
lsn
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Get statistics to be displayed in the user interface.
|
||||
//
|
||||
// This combines the stats from all PageCache instances
|
||||
//
|
||||
pub fn get_stats() -> PageCacheStats {
|
||||
let pcaches = PAGECACHES.lock().unwrap();
|
||||
|
||||
let mut stats = PageCacheStats {
|
||||
num_entries: 0,
|
||||
num_page_images: 0,
|
||||
num_wal_records: 0,
|
||||
num_getpage_requests: 0,
|
||||
};
|
||||
|
||||
pcaches.iter().for_each(|(_sys_id, pcache)| {
|
||||
stats += pcache.get_stats();
|
||||
});
|
||||
stats
|
||||
pub fn get_repository() -> Arc<dyn Repository + Send + Sync> {
|
||||
let o = &REPOSITORY.lock().unwrap();
|
||||
Arc::clone(o.as_ref().unwrap())
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail};
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt, BE};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
@@ -19,18 +20,17 @@ use std::io::{BufReader, BufWriter, Read, Write};
|
||||
use std::net::{TcpListener, TcpStream};
|
||||
use std::str::FromStr;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::branches;
|
||||
use crate::page_cache;
|
||||
use crate::repository::{BufferTag, RelTag};
|
||||
use crate::restore_local_repo;
|
||||
use crate::walreceiver;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
|
||||
type Result<T> = std::result::Result<T, io::Error>;
|
||||
|
||||
#[derive(Debug)]
|
||||
enum FeMessage {
|
||||
StartupMessage(FeStartupMessage),
|
||||
@@ -42,18 +42,13 @@ enum FeMessage {
|
||||
Close(FeCloseMessage),
|
||||
Sync,
|
||||
Terminate,
|
||||
|
||||
//
|
||||
// All that messages are actually CopyData from libpq point of view.
|
||||
//
|
||||
ZenithExistsRequest(ZenithRequest),
|
||||
ZenithNblocksRequest(ZenithRequest),
|
||||
ZenithReadRequest(ZenithRequest),
|
||||
CopyData(Bytes),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum BeMessage {
|
||||
AuthenticationOk,
|
||||
ParameterStatus,
|
||||
ReadyForQuery,
|
||||
RowDescription,
|
||||
ParseComplete,
|
||||
@@ -61,20 +56,31 @@ enum BeMessage {
|
||||
NoData,
|
||||
BindComplete,
|
||||
CloseComplete,
|
||||
DataRow,
|
||||
DataRow(Bytes),
|
||||
CommandComplete,
|
||||
ControlFile,
|
||||
|
||||
//
|
||||
// All that messages are actually CopyData from libpq point of view.
|
||||
//
|
||||
ZenithStatusResponse(ZenithStatusResponse),
|
||||
ZenithNblocksResponse(ZenithStatusResponse),
|
||||
ZenithReadResponse(ZenithReadResponse),
|
||||
CopyData(Bytes),
|
||||
ErrorResponse(String),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamFeMessage {
|
||||
Exists(PagestreamRequest),
|
||||
Nblocks(PagestreamRequest),
|
||||
Read(PagestreamRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamBeMessage {
|
||||
Status(PagestreamStatusResponse),
|
||||
Nblocks(PagestreamStatusResponse),
|
||||
Read(PagestreamReadResponse),
|
||||
}
|
||||
|
||||
static HELLO_WORLD_ROW: BeMessage = BeMessage::DataRow(Bytes::from_static(b"hello world"));
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ZenithRequest {
|
||||
struct PagestreamRequest {
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
relnode: u32,
|
||||
@@ -84,13 +90,13 @@ struct ZenithRequest {
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ZenithStatusResponse {
|
||||
struct PagestreamStatusResponse {
|
||||
ok: bool,
|
||||
n_blocks: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ZenithReadResponse {
|
||||
struct PagestreamReadResponse {
|
||||
ok: bool,
|
||||
n_blocks: u32,
|
||||
page: Bytes,
|
||||
@@ -111,7 +117,7 @@ enum StartupRequestCode {
|
||||
}
|
||||
|
||||
impl FeStartupMessage {
|
||||
pub fn read(stream: &mut dyn std::io::Read) -> Result<Option<FeMessage>> {
|
||||
pub fn read(stream: &mut dyn std::io::Read) -> anyhow::Result<Option<FeMessage>> {
|
||||
const MAX_STARTUP_PACKET_LENGTH: u32 = 10000;
|
||||
const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678;
|
||||
const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679;
|
||||
@@ -123,19 +129,12 @@ impl FeStartupMessage {
|
||||
// in the log if the client opens connection but closes it immediately.
|
||||
let len = match stream.read_u32::<BE>() {
|
||||
Ok(len) => len,
|
||||
Err(err) => {
|
||||
if err.kind() == std::io::ErrorKind::UnexpectedEof {
|
||||
return Ok(None);
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
if len < 4 || len as u32 > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"invalid message length",
|
||||
));
|
||||
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
bail!("invalid message length");
|
||||
}
|
||||
let bodylen = len - 4;
|
||||
|
||||
@@ -180,15 +179,12 @@ struct FeParseMessage {
|
||||
query_string: Bytes,
|
||||
}
|
||||
|
||||
fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes> {
|
||||
fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result<Bytes> {
|
||||
let mut result = BytesMut::new();
|
||||
|
||||
loop {
|
||||
if !buf.has_remaining() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"no null-terminator in string",
|
||||
));
|
||||
bail!("no null-terminator in string");
|
||||
}
|
||||
|
||||
let byte = buf.get_u8();
|
||||
@@ -202,7 +198,7 @@ fn read_null_terminated(buf: &mut Bytes) -> Result<Bytes> {
|
||||
}
|
||||
|
||||
impl FeParseMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
let query_string = read_null_terminated(&mut buf)?;
|
||||
let nparams = buf.get_i16();
|
||||
@@ -221,10 +217,7 @@ impl FeParseMessage {
|
||||
*/
|
||||
|
||||
if nparams != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"query params not implemented",
|
||||
));
|
||||
bail!("query params not implemented");
|
||||
}
|
||||
|
||||
Ok(FeMessage::Parse(FeParseMessage { query_string }))
|
||||
@@ -238,7 +231,7 @@ struct FeDescribeMessage {
|
||||
}
|
||||
|
||||
impl FeDescribeMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let kind = buf.get_u8();
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
@@ -253,10 +246,7 @@ impl FeDescribeMessage {
|
||||
*/
|
||||
|
||||
if kind != b'S' {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"only prepared statmement Describe is implemented",
|
||||
));
|
||||
bail!("only prepared statmement Describe is implemented");
|
||||
}
|
||||
|
||||
Ok(FeMessage::Describe(FeDescribeMessage { kind }))
|
||||
@@ -271,22 +261,16 @@ struct FeExecuteMessage {
|
||||
}
|
||||
|
||||
impl FeExecuteMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let maxrows = buf.get_i32();
|
||||
|
||||
if !portal_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named portals not implemented",
|
||||
));
|
||||
bail!("named portals not implemented");
|
||||
}
|
||||
|
||||
if maxrows != 0 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"row limit in Execute message not supported",
|
||||
));
|
||||
bail!("row limit in Execute message not supported");
|
||||
}
|
||||
|
||||
Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
|
||||
@@ -298,15 +282,12 @@ impl FeExecuteMessage {
|
||||
struct FeBindMessage {}
|
||||
|
||||
impl FeBindMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
if !portal_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named portals not implemented",
|
||||
));
|
||||
bail!("named portals not implemented");
|
||||
}
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
@@ -328,7 +309,7 @@ impl FeBindMessage {
|
||||
struct FeCloseMessage {}
|
||||
|
||||
impl FeCloseMessage {
|
||||
pub fn parse(mut buf: Bytes) -> Result<FeMessage> {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let _kind = buf.get_u8();
|
||||
let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
@@ -339,28 +320,20 @@ impl FeCloseMessage {
|
||||
}
|
||||
|
||||
impl FeMessage {
|
||||
pub fn read(stream: &mut dyn Read) -> Result<Option<FeMessage>> {
|
||||
pub fn read(stream: &mut dyn Read) -> anyhow::Result<Option<FeMessage>> {
|
||||
// Each libpq message begins with a message type byte, followed by message length
|
||||
// If the client closes the connection, return None. But if the client closes the
|
||||
// connection in the middle of a message, we will return an error.
|
||||
let tag = match stream.read_u8() {
|
||||
Ok(b) => b,
|
||||
Err(err) => {
|
||||
if err.kind() == std::io::ErrorKind::UnexpectedEof {
|
||||
return Ok(None);
|
||||
} else {
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let len = stream.read_u32::<BE>()?;
|
||||
|
||||
// The message length includes itself, so it better be at least 4
|
||||
if len < 4 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"invalid message length: parsing u32",
|
||||
));
|
||||
bail!("invalid message length: parsing u32");
|
||||
}
|
||||
let bodylen = len - 4;
|
||||
|
||||
@@ -368,7 +341,7 @@ impl FeMessage {
|
||||
let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
|
||||
stream.read_exact(&mut body_buf)?;
|
||||
|
||||
let mut body = Bytes::from(body_buf);
|
||||
let body = Bytes::from(body_buf);
|
||||
|
||||
// Parse it
|
||||
match tag {
|
||||
@@ -380,37 +353,70 @@ impl FeMessage {
|
||||
b'C' => Ok(Some(FeCloseMessage::parse(body)?)),
|
||||
b'S' => Ok(Some(FeMessage::Sync)),
|
||||
b'X' => Ok(Some(FeMessage::Terminate)),
|
||||
b'd' => {
|
||||
let smgr_tag = body.get_u8();
|
||||
let zreq = ZenithRequest {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
blkno: body.get_u32(),
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
};
|
||||
b'd' => Ok(Some(FeMessage::CopyData(body))),
|
||||
tag => Err(anyhow!("unknown message tag: {},'{:?}'", tag, body)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
match smgr_tag {
|
||||
0 => Ok(Some(FeMessage::ZenithExistsRequest(zreq))),
|
||||
1 => Ok(Some(FeMessage::ZenithNblocksRequest(zreq))),
|
||||
2 => Ok(Some(FeMessage::ZenithReadRequest(zreq))),
|
||||
_ => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("unknown smgr message tag: {},'{:?}'", smgr_tag, body),
|
||||
)),
|
||||
}
|
||||
}
|
||||
tag => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("unknown message tag: {},'{:?}'", tag, body),
|
||||
impl PagestreamFeMessage {
|
||||
fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// TODO these gets can fail
|
||||
|
||||
let smgr_tag = body.get_u8();
|
||||
let zreq = PagestreamRequest {
|
||||
spcnode: body.get_u32(),
|
||||
dbnode: body.get_u32(),
|
||||
relnode: body.get_u32(),
|
||||
forknum: body.get_u8(),
|
||||
blkno: body.get_u32(),
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
};
|
||||
|
||||
// TODO: consider using protobuf or serde bincode for less error prone
|
||||
// serialization.
|
||||
match smgr_tag {
|
||||
0 => Ok(PagestreamFeMessage::Exists(zreq)),
|
||||
1 => Ok(PagestreamFeMessage::Nblocks(zreq)),
|
||||
2 => Ok(PagestreamFeMessage::Read(zreq)),
|
||||
_ => Err(anyhow!(
|
||||
"unknown smgr message tag: {},'{:?}'",
|
||||
smgr_tag,
|
||||
body
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PagestreamBeMessage {
|
||||
fn serialize(&self) -> Bytes {
|
||||
let mut bytes = BytesMut::new();
|
||||
|
||||
match self {
|
||||
Self::Status(resp) => {
|
||||
bytes.put_u8(100); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.ok as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::Nblocks(resp) => {
|
||||
bytes.put_u8(101); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.ok as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
}
|
||||
|
||||
Self::Read(resp) => {
|
||||
bytes.put_u8(102); /* tag from pagestore_client.h */
|
||||
bytes.put_u8(resp.ok as u8);
|
||||
bytes.put_u32(resp.n_blocks);
|
||||
bytes.put(&resp.page[..]);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
///
|
||||
@@ -418,16 +424,12 @@ impl FeMessage {
|
||||
///
|
||||
/// Listens for connections, and launches a new handler thread for each.
|
||||
///
|
||||
pub fn thread_main(conf: &PageServerConf) {
|
||||
info!("Starting page server on {}", conf.listen_addr);
|
||||
|
||||
let listener = TcpListener::bind(conf.listen_addr).unwrap();
|
||||
|
||||
pub fn thread_main(conf: &'static PageServerConf, listener: TcpListener) -> anyhow::Result<()> {
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().unwrap();
|
||||
let (socket, peer_addr) = listener.accept()?;
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
let mut conn_handler = Connection::new(conf.clone(), socket);
|
||||
let mut conn_handler = Connection::new(conf, socket);
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = conn_handler.run() {
|
||||
@@ -441,17 +443,15 @@ pub fn thread_main(conf: &PageServerConf) {
|
||||
struct Connection {
|
||||
stream_in: BufReader<TcpStream>,
|
||||
stream: BufWriter<TcpStream>,
|
||||
buffer: BytesMut,
|
||||
init_done: bool,
|
||||
conf: PageServerConf,
|
||||
conf: &'static PageServerConf,
|
||||
}
|
||||
|
||||
impl Connection {
|
||||
pub fn new(conf: PageServerConf, socket: TcpStream) -> Connection {
|
||||
pub fn new(conf: &'static PageServerConf, socket: TcpStream) -> Connection {
|
||||
Connection {
|
||||
stream_in: BufReader::new(socket.try_clone().unwrap()),
|
||||
stream: BufWriter::new(socket),
|
||||
buffer: BytesMut::with_capacity(10 * 1024),
|
||||
init_done: false,
|
||||
conf,
|
||||
}
|
||||
@@ -460,7 +460,7 @@ impl Connection {
|
||||
//
|
||||
// Read full message or return None if connection is closed
|
||||
//
|
||||
fn read_message(&mut self) -> Result<Option<FeMessage>> {
|
||||
fn read_message(&mut self) -> anyhow::Result<Option<FeMessage>> {
|
||||
if !self.init_done {
|
||||
FeStartupMessage::read(&mut self.stream_in)
|
||||
} else {
|
||||
@@ -476,6 +476,16 @@ impl Connection {
|
||||
self.stream.write_i32::<BE>(0)?;
|
||||
}
|
||||
|
||||
BeMessage::ParameterStatus => {
|
||||
self.stream.write_u8(b'S')?;
|
||||
// parameter names and values are specified by null terminated strings
|
||||
const PARAM_NAME_VALUE: &[u8] = b"client_encoding\0UTF8\0";
|
||||
// length of this i32 + rest of data in message
|
||||
self.stream
|
||||
.write_i32::<BE>(4 + PARAM_NAME_VALUE.len() as i32)?;
|
||||
self.stream.write_all(PARAM_NAME_VALUE)?;
|
||||
}
|
||||
|
||||
BeMessage::ReadyForQuery => {
|
||||
self.stream.write_u8(b'Z')?;
|
||||
self.stream.write_i32::<BE>(4 + 1)?;
|
||||
@@ -528,10 +538,7 @@ impl Connection {
|
||||
}
|
||||
|
||||
// XXX: accept some text data
|
||||
BeMessage::DataRow => {
|
||||
// XXX
|
||||
let b = Bytes::from("hello world");
|
||||
|
||||
BeMessage::DataRow(b) => {
|
||||
self.stream.write_u8(b'D')?;
|
||||
self.stream.write_i32::<BE>(4 + 2 + 4 + b.len() as i32)?;
|
||||
|
||||
@@ -560,30 +567,42 @@ impl Connection {
|
||||
self.stream.write_all(&b)?;
|
||||
}
|
||||
|
||||
BeMessage::ZenithStatusResponse(resp) => {
|
||||
BeMessage::CopyData(data) => {
|
||||
self.stream.write_u8(b'd')?;
|
||||
self.stream.write_u32::<BE>(4 + 1 + 1 + 4)?;
|
||||
self.stream.write_u8(100)?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8)?;
|
||||
self.stream.write_u32::<BE>(resp.n_blocks)?;
|
||||
self.stream.write_u32::<BE>(4 + data.len() as u32)?;
|
||||
self.stream.write_all(&data)?;
|
||||
}
|
||||
|
||||
BeMessage::ZenithNblocksResponse(resp) => {
|
||||
self.stream.write_u8(b'd')?;
|
||||
self.stream.write_u32::<BE>(4 + 1 + 1 + 4)?;
|
||||
self.stream.write_u8(101)?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8)?;
|
||||
self.stream.write_u32::<BE>(resp.n_blocks)?;
|
||||
}
|
||||
// ErrorResponse is a zero-terminated array of zero-terminated fields.
|
||||
// First byte of each field represents type of this field. Set just enough fields
|
||||
// to satisfy rust-postgres client: 'S' -- severity, 'C' -- error, 'M' -- error
|
||||
// message text.
|
||||
BeMessage::ErrorResponse(error_msg) => {
|
||||
// For all the errors set Severity to Error and error code to
|
||||
// 'internal error'.
|
||||
let severity = Bytes::from("SERROR\0");
|
||||
let code = Bytes::from("CXX000\0");
|
||||
|
||||
BeMessage::ZenithReadResponse(resp) => {
|
||||
self.stream.write_u8(b'd')?;
|
||||
self.stream
|
||||
.write_u32::<BE>(4 + 1 + 1 + 4 + resp.page.len() as u32)?;
|
||||
self.stream.write_u8(102)?; /* tag from pagestore_client.h */
|
||||
self.stream.write_u8(resp.ok as u8)?;
|
||||
self.stream.write_u32::<BE>(resp.n_blocks)?;
|
||||
self.stream.write_all(&resp.page.clone())?;
|
||||
// 'E' signalizes ErrorResponse messages
|
||||
self.stream.write_u8(b'E')?;
|
||||
self.stream.write_u32::<BE>(
|
||||
4 + severity.len() as u32
|
||||
+ code.len() as u32
|
||||
+ (1 + error_msg.len() as u32 + 1)
|
||||
+ 1,
|
||||
)?;
|
||||
|
||||
// Send severity and code fields
|
||||
self.stream.write_all(&severity)?;
|
||||
self.stream.write_all(&code)?;
|
||||
|
||||
// Send error message field
|
||||
self.stream.write_u8(b'M')?;
|
||||
self.stream.write_all(error_msg.as_bytes())?;
|
||||
self.stream.write_u8(0)?;
|
||||
|
||||
// Terminate fields
|
||||
self.stream.write_u8(0)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -595,7 +614,7 @@ impl Connection {
|
||||
self.stream.flush()
|
||||
}
|
||||
|
||||
fn run(&mut self) -> Result<()> {
|
||||
fn run(&mut self) -> anyhow::Result<()> {
|
||||
let mut unnamed_query_string = Bytes::new();
|
||||
loop {
|
||||
let msg = self.read_message()?;
|
||||
@@ -612,6 +631,9 @@ impl Connection {
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
self.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
// psycopg2 will not connect if client_encoding is not
|
||||
// specified by the server
|
||||
self.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
self.init_done = true;
|
||||
}
|
||||
@@ -619,7 +641,11 @@ impl Connection {
|
||||
}
|
||||
}
|
||||
Some(FeMessage::Query(m)) => {
|
||||
self.process_query(m.body)?;
|
||||
if let Err(e) = self.process_query(m.body) {
|
||||
let errmsg = format!("{}", e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(errmsg))?;
|
||||
}
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
Some(FeMessage::Parse(m)) => {
|
||||
unnamed_query_string = m.query_string;
|
||||
@@ -637,6 +663,7 @@ impl Connection {
|
||||
}
|
||||
Some(FeMessage::Execute(_)) => {
|
||||
self.process_query(unnamed_query_string.clone())?;
|
||||
self.stream.flush()?;
|
||||
}
|
||||
Some(FeMessage::Sync) => {
|
||||
self.write_message(&BeMessage::ReadyForQuery)?;
|
||||
@@ -649,8 +676,7 @@ impl Connection {
|
||||
break;
|
||||
}
|
||||
x => {
|
||||
error!("unexpected message type : {:?}", x);
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "unexpected message"));
|
||||
bail!("unexpected message type : {:?}", x);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -658,86 +684,130 @@ impl Connection {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_query(&mut self, query_string: Bytes) -> Result<()> {
|
||||
fn process_query(&mut self, query_string: Bytes) -> anyhow::Result<()> {
|
||||
debug!("process query {:?}", query_string);
|
||||
|
||||
// remove null terminator, if any
|
||||
let mut query_string = query_string.clone();
|
||||
let mut query_string = query_string;
|
||||
if query_string.last() == Some(&0) {
|
||||
query_string.truncate(query_string.len() - 1);
|
||||
}
|
||||
|
||||
if query_string.starts_with(b"controlfile") {
|
||||
self.handle_controlfile()
|
||||
self.handle_controlfile()?;
|
||||
} else if query_string.starts_with(b"pagestream ") {
|
||||
let (_l, r) = query_string.split_at("pagestream ".len());
|
||||
let timelineid_str = String::from_utf8(r.to_vec()).unwrap();
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
|
||||
let timelineid_str = String::from_utf8(r.to_vec())?;
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str)?;
|
||||
|
||||
self.handle_pagerequests(timelineid)
|
||||
self.handle_pagerequests(timelineid)?;
|
||||
} else if query_string.starts_with(b"basebackup ") {
|
||||
let (_l, r) = query_string.split_at("basebackup ".len());
|
||||
let r = r.to_vec();
|
||||
let timelineid_str = String::from(String::from_utf8(r).unwrap().trim_end());
|
||||
let basebackup_args = String::from(String::from_utf8(r)?.trim_end());
|
||||
let args: Vec<&str> = basebackup_args.rsplit(' ').collect();
|
||||
let timelineid_str = args[0];
|
||||
info!("got basebackup command: \"{}\"", timelineid_str);
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str).unwrap();
|
||||
|
||||
let timelineid = ZTimelineId::from_str(&timelineid_str)?;
|
||||
let lsn = if args.len() > 1 {
|
||||
Some(Lsn::from_str(args[1])?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(timelineid)?;
|
||||
self.handle_basebackup_request(timelineid, lsn)?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
} else if query_string.starts_with(b"callmemaybe ") {
|
||||
let query_str = String::from_utf8(query_string.to_vec()).unwrap();
|
||||
let query_str = String::from_utf8(query_string.to_vec())?;
|
||||
|
||||
// callmemaybe <zenith timelineid as hex string> <connstr>
|
||||
// TODO lazy static
|
||||
let re = Regex::new(r"^callmemaybe ([[:xdigit:]]+) (.*)$").unwrap();
|
||||
let caps = re.captures(&query_str);
|
||||
let caps = caps.unwrap();
|
||||
let caps = re
|
||||
.captures(&query_str)
|
||||
.ok_or_else(|| anyhow!("invalid callmemaybe: '{}'", query_str))?;
|
||||
|
||||
let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str()).unwrap();
|
||||
let timelineid = ZTimelineId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let connstr: String = String::from(caps.get(2).unwrap().as_str());
|
||||
|
||||
// Check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid)));
|
||||
let repository = page_cache::get_repository();
|
||||
if repository.get_timeline(timelineid).is_err() {
|
||||
bail!("client requested callmemaybe on timeline {} which does not exist in page server", timelineid);
|
||||
}
|
||||
|
||||
walreceiver::launch_wal_receiver(&self.conf, timelineid, &connstr);
|
||||
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
} else if query_string.starts_with(b"branch_create ") {
|
||||
let query_str = String::from_utf8(query_string.to_vec())?;
|
||||
let err = || anyhow!("invalid branch_create: '{}'", query_str);
|
||||
|
||||
// branch_create <branchname> <startpoint>
|
||||
// TODO lazy static
|
||||
// TOOD: escaping, to allow branch names with spaces
|
||||
let re = Regex::new(r"^branch_create (\S+) ([^\r\n\s;]+)[\r\n\s;]*;?$").unwrap();
|
||||
let caps = re.captures(&query_str).ok_or_else(err)?;
|
||||
|
||||
let branchname: String = String::from(caps.get(1).ok_or_else(err)?.as_str());
|
||||
let startpoint_str: String = String::from(caps.get(2).ok_or_else(err)?.as_str());
|
||||
|
||||
let branch = branches::create_branch(&self.conf, &branchname, &startpoint_str)?;
|
||||
let branch = serde_json::to_vec(&branch)?;
|
||||
|
||||
self.write_message_noflush(&BeMessage::RowDescription)?;
|
||||
self.write_message_noflush(&BeMessage::DataRow(Bytes::from(branch)))?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
} else if query_string.starts_with(b"branch_list") {
|
||||
let branches =
|
||||
crate::branches::get_branches(&self.conf, &*page_cache::get_repository())?;
|
||||
let branches_buf = serde_json::to_vec(&branches)?;
|
||||
|
||||
self.write_message_noflush(&BeMessage::RowDescription)?;
|
||||
self.write_message_noflush(&BeMessage::DataRow(Bytes::from(branches_buf)))?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
} else if query_string.starts_with(b"status") {
|
||||
self.write_message_noflush(&BeMessage::RowDescription)?;
|
||||
self.write_message_noflush(&BeMessage::DataRow)?;
|
||||
self.write_message_noflush(&HELLO_WORLD_ROW)?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
} else {
|
||||
} else if query_string.to_ascii_lowercase().starts_with(b"set ") {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
} else if query_string
|
||||
.to_ascii_lowercase()
|
||||
.starts_with(b"identify_system")
|
||||
{
|
||||
// TODO: match postgres response formarmat for 'identify_system'
|
||||
let system_id = crate::branches::get_system_id(&self.conf)?.to_string();
|
||||
|
||||
self.write_message_noflush(&BeMessage::RowDescription)?;
|
||||
self.write_message_noflush(&BeMessage::DataRow)?;
|
||||
self.write_message_noflush(&BeMessage::DataRow(Bytes::from(system_id)))?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
} else {
|
||||
bail!("unknown command");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_controlfile(&mut self) -> Result<()> {
|
||||
fn handle_controlfile(&mut self) -> io::Result<()> {
|
||||
self.write_message_noflush(&BeMessage::RowDescription)?;
|
||||
self.write_message_noflush(&BeMessage::ControlFile)?;
|
||||
self.write_message_noflush(&BeMessage::CommandComplete)?;
|
||||
self.write_message(&BeMessage::ReadyForQuery)
|
||||
self.write_message(&BeMessage::CommandComplete)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
fn handle_pagerequests(&mut self, timelineid: ZTimelineId) -> anyhow::Result<()> {
|
||||
// Check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested pagestream on timeline {} which does not exist in page server", timelineid)));
|
||||
}
|
||||
let pcache = pcache.unwrap();
|
||||
let repository = page_cache::get_repository();
|
||||
let timeline = repository.get_timeline(timelineid).map_err(|_| {
|
||||
anyhow!(
|
||||
"client requested pagestream on timeline {} which does not exist in page server",
|
||||
timelineid
|
||||
)
|
||||
})?;
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
self.stream.write_u8(b'W')?;
|
||||
@@ -746,52 +816,47 @@ impl Connection {
|
||||
self.stream.write_i16::<BE>(0)?; /* numAttributes */
|
||||
self.stream.flush()?;
|
||||
|
||||
loop {
|
||||
let message = self.read_message()?;
|
||||
while let Some(message) = self.read_message()? {
|
||||
trace!("query({:?}): {:?}", timelineid, message);
|
||||
|
||||
if let Some(m) = &message {
|
||||
trace!("query({:?}): {:?}", timelineid, m);
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
if message.is_none() {
|
||||
// connection was closed
|
||||
return Ok(());
|
||||
}
|
||||
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
|
||||
match message {
|
||||
Some(FeMessage::ZenithExistsRequest(req)) => {
|
||||
let tag = page_cache::RelTag {
|
||||
let response = match zenith_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => {
|
||||
let tag = RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
let exist = pcache.relsize_exist(&tag, req.lsn).unwrap_or(false);
|
||||
let exist = timeline.get_relsize_exists(tag, req.lsn).unwrap_or(false);
|
||||
|
||||
self.write_message(&BeMessage::ZenithStatusResponse(ZenithStatusResponse {
|
||||
PagestreamBeMessage::Status(PagestreamStatusResponse {
|
||||
ok: exist,
|
||||
n_blocks: 0,
|
||||
}))?
|
||||
})
|
||||
}
|
||||
Some(FeMessage::ZenithNblocksRequest(req)) => {
|
||||
let tag = page_cache::RelTag {
|
||||
PagestreamFeMessage::Nblocks(req) => {
|
||||
let tag = RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
forknum: req.forknum,
|
||||
};
|
||||
|
||||
let n_blocks = pcache.relsize_get(&tag, req.lsn).unwrap_or(0);
|
||||
let n_blocks = timeline.get_relsize(tag, req.lsn).unwrap_or(0);
|
||||
|
||||
self.write_message(&BeMessage::ZenithNblocksResponse(ZenithStatusResponse {
|
||||
ok: true,
|
||||
n_blocks,
|
||||
}))?
|
||||
PagestreamBeMessage::Nblocks(PagestreamStatusResponse { ok: true, n_blocks })
|
||||
}
|
||||
Some(FeMessage::ZenithReadRequest(req)) => {
|
||||
let buf_tag = page_cache::BufferTag {
|
||||
rel: page_cache::RelTag {
|
||||
PagestreamFeMessage::Read(req) => {
|
||||
let buf_tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: req.spcnode,
|
||||
dbnode: req.dbnode,
|
||||
relnode: req.relnode,
|
||||
@@ -800,39 +865,47 @@ impl Connection {
|
||||
blknum: req.blkno,
|
||||
};
|
||||
|
||||
let msg = match pcache.get_page_at_lsn(buf_tag, req.lsn) {
|
||||
Ok(p) => BeMessage::ZenithReadResponse(ZenithReadResponse {
|
||||
let read_response = match timeline.get_page_at_lsn(buf_tag, req.lsn) {
|
||||
Ok(p) => PagestreamReadResponse {
|
||||
ok: true,
|
||||
n_blocks: 0,
|
||||
page: p,
|
||||
}),
|
||||
},
|
||||
Err(e) => {
|
||||
const ZERO_PAGE: [u8; 8192] = [0; 8192];
|
||||
error!("get_page_at_lsn: {}", e);
|
||||
BeMessage::ZenithReadResponse(ZenithReadResponse {
|
||||
PagestreamReadResponse {
|
||||
ok: false,
|
||||
n_blocks: 0,
|
||||
page: Bytes::from_static(&ZERO_PAGE),
|
||||
})
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
self.write_message(&msg)?
|
||||
PagestreamBeMessage::Read(read_response)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
};
|
||||
|
||||
self.write_message(&BeMessage::CopyData(response.serialize()))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_basebackup_request(&mut self, timelineid: ZTimelineId) -> Result<()> {
|
||||
fn handle_basebackup_request(
|
||||
&mut self,
|
||||
timelineid: ZTimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
) -> anyhow::Result<()> {
|
||||
// check that the timeline exists
|
||||
let pcache = page_cache::get_or_restore_pagecache(&self.conf, timelineid);
|
||||
if pcache.is_err() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("client requested basebackup on timeline {} which does not exist in page server", timelineid)));
|
||||
}
|
||||
|
||||
let repository = page_cache::get_repository();
|
||||
let timeline = repository.get_timeline(timelineid).map_err(|e| {
|
||||
error!("error fetching timeline: {:?}", e);
|
||||
anyhow!(
|
||||
"client requested basebackup on timeline {} which does not exist in page server",
|
||||
timelineid
|
||||
)
|
||||
})?;
|
||||
/* switch client to COPYOUT */
|
||||
let stream = &mut self.stream;
|
||||
stream.write_u8(b'H')?;
|
||||
@@ -845,21 +918,22 @@ impl Connection {
|
||||
/* Send a tarball of the latest snapshot on the timeline */
|
||||
|
||||
// find latest snapshot
|
||||
let snapshotlsn = restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
|
||||
|
||||
basebackup::send_snapshot_tarball(&mut CopyDataSink { stream }, timelineid, snapshotlsn)?;
|
||||
|
||||
let snapshot_lsn =
|
||||
restore_local_repo::find_latest_snapshot(&self.conf, timelineid).unwrap();
|
||||
let req_lsn = lsn.unwrap_or(snapshot_lsn);
|
||||
basebackup::send_tarball_at_lsn(
|
||||
&mut CopyDataSink { stream },
|
||||
timelineid,
|
||||
&timeline,
|
||||
req_lsn,
|
||||
snapshot_lsn,
|
||||
)?;
|
||||
// CopyDone
|
||||
self.stream.write_u8(b'c')?;
|
||||
self.stream.write_u32::<BE>(4)?;
|
||||
self.stream.flush()?;
|
||||
debug!("CopyDone sent!");
|
||||
|
||||
// FIXME: I'm getting an error from the tokio copyout driver without this.
|
||||
// I think it happens when the CommandComplete, CloseComplete and ReadyForQuery
|
||||
// are sent in the same TCP packet as the CopyDone. I don't understand why.
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -872,8 +946,8 @@ struct CopyDataSink<'a> {
|
||||
stream: &'a mut BufWriter<TcpStream>,
|
||||
}
|
||||
|
||||
impl<'a> std::io::Write for CopyDataSink<'a> {
|
||||
fn write(&mut self, data: &[u8]) -> std::result::Result<usize, std::io::Error> {
|
||||
impl<'a> io::Write for CopyDataSink<'a> {
|
||||
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
|
||||
// CopyData
|
||||
// FIXME: if the input is large, we should split it into multiple messages.
|
||||
// Not sure what the threshold should be, but the ultimate hard limit is that
|
||||
@@ -889,7 +963,7 @@ impl<'a> std::io::Write for CopyDataSink<'a> {
|
||||
|
||||
Ok(data.len())
|
||||
}
|
||||
fn flush(&mut self) -> std::result::Result<(), std::io::Error> {
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
// no-op
|
||||
Ok(())
|
||||
}
|
||||
|
||||
532
pageserver/src/repository.rs
Normal file
532
pageserver/src/repository.rs
Normal file
@@ -0,0 +1,532 @@
|
||||
pub mod rocksdb;
|
||||
|
||||
use crate::waldecoder::{DecodedWALRecord, Oid, XlCreateDatabase, XlSmgrTruncate};
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// A repository corresponds to one .zenith directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
pub trait Repository {
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
start_lsn: Lsn,
|
||||
) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
//fn get_stats(&self) -> RepositoryStats;
|
||||
}
|
||||
|
||||
pub trait Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_at_lsn(&self, tag: BufferTag, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Get size of relation
|
||||
fn get_relsize(&self, tag: RelTag, lsn: Lsn) -> Result<u32>;
|
||||
|
||||
/// Does relation exist?
|
||||
fn get_relsize_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool>;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
//
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, tag: BufferTag, rec: WALRecord) -> Result<()>;
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(&self, tag: BufferTag, lsn: Lsn, img: Bytes) -> Result<()>;
|
||||
|
||||
/// Truncate relation
|
||||
fn put_truncation(&self, rel: RelTag, lsn: Lsn, nblocks: u32) -> Result<()>;
|
||||
|
||||
/// Drop relation or file segment
|
||||
fn put_drop(&self, tag: BufferTag, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Create a new database from a template database
|
||||
///
|
||||
/// In PostgreSQL, CREATE DATABASE works by scanning the data directory and
|
||||
/// copying all relation files from the template database. This is the equivalent
|
||||
/// of that.
|
||||
fn put_create_database(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
db_id: Oid,
|
||||
tablespace_id: Oid,
|
||||
src_db_id: Oid,
|
||||
src_tablespace_id: Oid,
|
||||
) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Helper function to parse a WAL record and call the above functions for all the
|
||||
/// relations/pages that the record affects.
|
||||
///
|
||||
fn save_decoded_record(
|
||||
&self,
|
||||
decoded: DecodedWALRecord,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
// Figure out which blocks the record applies to, and "put" a separate copy
|
||||
// of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
|
||||
if blk.will_drop {
|
||||
self.put_drop(tag, lsn)?;
|
||||
} else {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
self.put_wal_record(tag, rec)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle a few special record types
|
||||
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_SMGR_TRUNCATE
|
||||
{
|
||||
let truncate = XlSmgrTruncate::decode(&decoded);
|
||||
let mut rel = RelTag {
|
||||
spcnode: truncate.rnode.spcnode,
|
||||
dbnode: truncate.rnode.dbnode,
|
||||
relnode: truncate.rnode.relnode,
|
||||
forknum: pg_constants::MAIN_FORKNUM,
|
||||
};
|
||||
if (truncate.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 {
|
||||
self.put_truncation(rel, lsn, truncate.blkno)?;
|
||||
}
|
||||
if (truncate.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 {
|
||||
if truncate.blkno == 0 {
|
||||
rel.forknum = pg_constants::FSM_FORKNUM;
|
||||
self.put_truncation(rel, lsn, truncate.blkno)?;
|
||||
} else {
|
||||
// TODO: handle partial truncation of FSM:
|
||||
// need to map heap block number to FSM block number
|
||||
// and clear bits in the tail block
|
||||
info!("Partial truncation of FSM is not supported");
|
||||
}
|
||||
}
|
||||
if (truncate.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
|
||||
if truncate.blkno == 0 {
|
||||
rel.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
self.put_truncation(rel, lsn, truncate.blkno)?;
|
||||
} else {
|
||||
// TODO: handle partial truncation of VM:
|
||||
// need to map heap block number to VM block number
|
||||
// and clear bits in the tail block
|
||||
info!("Partial truncation of VM is not supported");
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_DBASE_CREATE
|
||||
{
|
||||
let createdb = XlCreateDatabase::decode(&decoded);
|
||||
self.put_create_database(
|
||||
lsn,
|
||||
createdb.db_id,
|
||||
createdb.tablespace_id,
|
||||
createdb.src_db_id,
|
||||
createdb.src_tablespace_id,
|
||||
)?;
|
||||
}
|
||||
// Now that this record has been handled, let the repository know that
|
||||
// it is up-to-date to this LSN
|
||||
self.advance_last_record_lsn(lsn);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember the all WAL before the given LSN has been processed.
|
||||
///
|
||||
/// The WAL receiver calls this after the put_* functions, to indicate that
|
||||
/// all WAL before this point has been digested. Before that, if you call
|
||||
/// GET on an earlier LSN, it will block.
|
||||
fn advance_last_valid_lsn(&self, lsn: Lsn);
|
||||
fn get_last_valid_lsn(&self) -> Lsn;
|
||||
fn init_valid_lsn(&self, lsn: Lsn);
|
||||
|
||||
/// Like `advance_last_valid_lsn`, but this always points to the end of
|
||||
/// a WAL record, not in the middle of one.
|
||||
///
|
||||
/// This must be <= last valid LSN. This is tracked separately from last
|
||||
/// valid LSN, so that the WAL receiver knows where to restart streaming.
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct RepositoryStats {
|
||||
pub num_entries: Lsn,
|
||||
pub num_page_images: Lsn,
|
||||
pub num_wal_records: Lsn,
|
||||
pub num_getpage_requests: Lsn,
|
||||
}
|
||||
|
||||
///
|
||||
/// Relation data file segment id throughout the Postgres cluster.
|
||||
///
|
||||
/// Every data file in Postgres is uniquely identified by 4 numbers:
|
||||
/// - relation id / node (`relnode`)
|
||||
/// - database id (`dbnode`)
|
||||
/// - tablespace id (`spcnode`), in short this is a unique id of a separate
|
||||
/// directory to store data files.
|
||||
/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
|
||||
/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
|
||||
///
|
||||
/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
|
||||
/// are used for the same purpose.
|
||||
/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
|
||||
/// We use additional fork numbers to logically separate relational and
|
||||
/// non-relational data inside pageserver key-value storage.
|
||||
/// See, e.g., `ROCKSDB_SPECIAL_FORKNUM`.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}_{}",
|
||||
self.spcnode, self.dbnode, self.relnode, forkname
|
||||
)
|
||||
} else {
|
||||
write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
/// This is used as a part of the key inside key-value storage (RocksDB currently).
|
||||
///
|
||||
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
|
||||
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct BufferTag {
|
||||
pub rel: RelTag,
|
||||
pub blknum: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub rec: Bytes,
|
||||
// Remember the offset of main_data in rec,
|
||||
// so that we don't have to parse the record again.
|
||||
// If record has no main_data, this offset equals rec.len().
|
||||
pub main_data_offset: u32,
|
||||
}
|
||||
|
||||
impl WALRecord {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u64(self.lsn.0);
|
||||
buf.put_u8(self.will_init as u8);
|
||||
buf.put_u32(self.main_data_offset);
|
||||
buf.put_u32(self.rec.len() as u32);
|
||||
buf.put_slice(&self.rec[..]);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> WALRecord {
|
||||
let lsn = Lsn::from(buf.get_u64());
|
||||
let will_init = buf.get_u8() != 0;
|
||||
let main_data_offset = buf.get_u32();
|
||||
let mut dst = vec![0u8; buf.get_u32() as usize];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
WALRecord {
|
||||
lsn,
|
||||
will_init,
|
||||
rec: Bytes::from(dst),
|
||||
main_data_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Tests that should work the same with any Repository/Timeline implementation.
|
||||
///
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelTag = RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
};
|
||||
|
||||
/// Convenience function to create a BufferTag for testing.
|
||||
/// Helps to keeps the tests shorter.
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_BUF(blknum: u32) -> BufferTag {
|
||||
BufferTag {
|
||||
rel: TESTREL_A,
|
||||
blknum,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PathBuf::from(format!("../tmp_check/test_{}", test_name));
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
|
||||
let conf = PageServerConf {
|
||||
daemonize: false,
|
||||
interactive: false,
|
||||
gc_horizon: 64 * 1024 * 1024,
|
||||
gc_period: Duration::from_secs(10),
|
||||
listen_addr: "127.0.0.1:5430".parse().unwrap(),
|
||||
workdir: repo_dir,
|
||||
pg_distrib_dir: "".into(),
|
||||
};
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
|
||||
let repo = rocksdb::RocksRepository::new(conf, Arc::new(walredo_mgr));
|
||||
|
||||
Ok(Box::new(repo))
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation.
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(2), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TEST_BUF(0), Lsn(3), TEST_IMG("foo blk 0 at 3"))?;
|
||||
tline.put_page_image(TEST_BUF(1), Lsn(4), TEST_IMG("foo blk 1 at 4"))?;
|
||||
tline.put_page_image(TEST_BUF(2), Lsn(5), TEST_IMG("foo blk 2 at 5"))?;
|
||||
|
||||
tline.advance_last_valid_lsn(Lsn(5));
|
||||
|
||||
// FIXME: The rocksdb implementation erroneously returns 'true' here, even
|
||||
// though the relation was created only at a later LSN
|
||||
// rocksdb implementation erroneosly returns 'true' here
|
||||
assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(1))?, true); // CORRECT: false
|
||||
// And this probably should throw an error, becaue the relation doesn't exist at Lsn(1) yet
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(1))?, 0); // CORRECT: throw error
|
||||
|
||||
assert_eq!(tline.get_relsize_exists(TESTREL_A, Lsn(2))?, true);
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(2))?, 1);
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 3);
|
||||
|
||||
// Check page contents at each LSN
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(2))?,
|
||||
TEST_IMG("foo blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(3))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(4))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(4))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(5))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(5))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
tline.put_truncation(TESTREL_A, Lsn(6), 2)?;
|
||||
tline.advance_last_valid_lsn(Lsn(6));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(6))?, 2);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(0), Lsn(6))?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(1), Lsn(6))?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
// should still see the truncated block with older LSN
|
||||
assert_eq!(tline.get_relsize(TESTREL_A, Lsn(5))?, 3);
|
||||
assert_eq!(
|
||||
tline.get_page_at_lsn(TEST_BUF(2), Lsn(5))?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test get_relsize() and truncation with a file larger than 1 GB, so that it's
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
///
|
||||
/// This isn't very interesting with the RocksDb implementation, as we don't pay
|
||||
/// any attention to Postgres segment boundaries there.
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid, Lsn(0))?;
|
||||
|
||||
tline.init_valid_lsn(Lsn(1));
|
||||
|
||||
let mut lsn = 0;
|
||||
for i in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", i, Lsn(lsn)));
|
||||
lsn += 1;
|
||||
tline.put_page_image(TEST_BUF(i as u32), Lsn(lsn), img)?;
|
||||
}
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE + 1
|
||||
);
|
||||
|
||||
// Truncate one block
|
||||
lsn += 1;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE
|
||||
);
|
||||
|
||||
// Truncate another block
|
||||
lsn += 1;
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
tline.advance_last_valid_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relsize(TESTREL_A, Lsn(lsn))?,
|
||||
pg_constants::RELSEG_SIZE - 1
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager {}
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for rel {} blk {} to get to {}, with {} and {} records",
|
||||
tag.rel,
|
||||
tag.blknum,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
1019
pageserver/src/repository/rocksdb.rs
Normal file
1019
pageserver/src/repository/rocksdb.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,21 +1,9 @@
|
||||
//
|
||||
// Restore chunks from local Zenith repository
|
||||
//
|
||||
// This runs once at Page Server startup. It loads all the "snapshots" and all
|
||||
// WAL from all timelines from the local zenith repository into the in-memory page
|
||||
// cache.
|
||||
//
|
||||
// This also initializes the "last valid LSN" in the page cache to the last LSN
|
||||
// seen in the WAL, so that when the WAL receiver is started, it starts
|
||||
// streaming from that LSN.
|
||||
//
|
||||
|
||||
//!
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! zenith repository
|
||||
//!
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::fmt;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::error::Error;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::Read;
|
||||
@@ -26,101 +14,49 @@ use std::path::{Path, PathBuf};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::page_cache::PageCache;
|
||||
use crate::page_cache::RelTag;
|
||||
use crate::waldecoder::{decode_wal_record, WalStreamDecoder};
|
||||
use crate::repository::{BufferTag, RelTag, Timeline};
|
||||
use crate::waldecoder::{decode_wal_record, Oid, WalStreamDecoder};
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
// FIXME: we'll probably need these elsewhere too, move to some common location
|
||||
const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
//
|
||||
// Load it all into the page cache.
|
||||
//
|
||||
pub fn restore_timeline(
|
||||
conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
) -> Result<()> {
|
||||
let timelinepath = PathBuf::from("timelines").join(timeline.to_string());
|
||||
|
||||
if !timelinepath.exists() {
|
||||
anyhow::bail!("timeline {} does not exist in the page server's repository");
|
||||
}
|
||||
|
||||
// Scan .zenith/timelines/<timeline>/snapshots
|
||||
let snapshotspath = PathBuf::from("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots");
|
||||
|
||||
let mut last_snapshot_lsn: Lsn = Lsn(0);
|
||||
///
|
||||
/// Find latest snapshot in a timeline's 'snapshots' directory
|
||||
///
|
||||
pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<Lsn> {
|
||||
let snapshotspath = format!("timelines/{}/snapshots", timeline);
|
||||
|
||||
let mut last_snapshot_lsn = Lsn(0);
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let direntry = direntry?;
|
||||
let filename = direntry.file_name();
|
||||
let lsn = Lsn::from_filename(&filename)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
let filename = direntry.unwrap().file_name();
|
||||
|
||||
// FIXME: pass filename as Path instead of str?
|
||||
let filename_str = filename.into_string().unwrap();
|
||||
restore_snapshot(conf, pcache, timeline, &filename_str)?;
|
||||
info!("restored snapshot at {:?}", filename_str);
|
||||
if let Ok(lsn) = Lsn::from_filename(&filename) {
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
} else {
|
||||
error!("unrecognized file in snapshots directory: {:?}", filename);
|
||||
}
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == Lsn(0) {
|
||||
error!(
|
||||
"could not find valid snapshot in {}",
|
||||
snapshotspath.display()
|
||||
);
|
||||
// TODO return error?
|
||||
}
|
||||
pcache.init_valid_lsn(last_snapshot_lsn);
|
||||
|
||||
restore_wal(conf, pcache, timeline, last_snapshot_lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn find_latest_snapshot(_conf: &PageServerConf, timeline: ZTimelineId) -> Result<u64> {
|
||||
let snapshotspath = format!("timelines/{}/snapshots", timeline);
|
||||
|
||||
let mut last_snapshot_lsn = 0;
|
||||
for direntry in fs::read_dir(&snapshotspath).unwrap() {
|
||||
let filename = direntry.unwrap().file_name().to_str().unwrap().to_owned();
|
||||
|
||||
let lsn = u64::from_str_radix(&filename, 16)?;
|
||||
last_snapshot_lsn = max(lsn, last_snapshot_lsn);
|
||||
}
|
||||
|
||||
if last_snapshot_lsn == 0 {
|
||||
error!("could not find valid snapshot in {}", &snapshotspath);
|
||||
// TODO return error?
|
||||
}
|
||||
Ok(last_snapshot_lsn)
|
||||
}
|
||||
|
||||
fn restore_snapshot(
|
||||
conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
snapshot: &str,
|
||||
///
|
||||
/// Import all relation data pages from local disk into the repository.
|
||||
///
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
let snapshotpath = PathBuf::from("timelines")
|
||||
.join(timeline.to_string())
|
||||
.join("snapshots")
|
||||
.join(snapshot);
|
||||
|
||||
// Scan 'global'
|
||||
for direntry in fs::read_dir(snapshotpath.join("global"))? {
|
||||
for direntry in fs::read_dir(path.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
@@ -130,24 +66,22 @@ fn restore_snapshot(
|
||||
Some("pg_filenode.map") => continue,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
conf,
|
||||
pcache,
|
||||
timeline,
|
||||
snapshot,
|
||||
GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
lsn,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
)?,
|
||||
}
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains database dirs, the database OID is the filename.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
for direntry in fs::read_dir(snapshotpath.join("base"))? {
|
||||
for direntry in fs::read_dir(path.join("base"))? {
|
||||
let direntry = direntry?;
|
||||
|
||||
let dboid = u32::from_str_radix(direntry.file_name().to_str().unwrap(), 10)?;
|
||||
let dboid = direntry.file_name().to_str().unwrap().parse::<u32>()?;
|
||||
|
||||
for direntry in fs::read_dir(direntry.path())? {
|
||||
let direntry = direntry?;
|
||||
@@ -159,45 +93,31 @@ fn restore_snapshot(
|
||||
Some("pg_filenode.map") => continue,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => restore_relfile(
|
||||
conf,
|
||||
pcache,
|
||||
timeline,
|
||||
snapshot,
|
||||
DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
timeline,
|
||||
lsn,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
for entry in fs::read_dir(snapshotpath.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
restore_nonrelfile(
|
||||
conf,
|
||||
pcache,
|
||||
timeline,
|
||||
snapshot,
|
||||
pg_constants::PG_XACT_FORKNUM,
|
||||
&entry.path(),
|
||||
)?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
timeline.checkpoint()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_relfile(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
_timeline: ZTimelineId,
|
||||
snapshot: &str,
|
||||
spcoid: u32,
|
||||
dboid: u32,
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_relfile(
|
||||
path: &Path,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
@@ -210,8 +130,7 @@ fn restore_relfile(
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / 8192);
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
@@ -221,11 +140,11 @@ fn restore_relfile(
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum: forknum as u8,
|
||||
forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
timeline.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf))?;
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
@@ -252,88 +171,22 @@ fn restore_relfile(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore_nonrelfile(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
_timeline: ZTimelineId,
|
||||
snapshot: &str,
|
||||
forknum: u32,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let lsn = Lsn::from_hex(snapshot)?;
|
||||
|
||||
// Does it look like a relation file?
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 0,
|
||||
relnode: 0,
|
||||
forknum: forknum as u8,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
pcache.put_page_image(tag, lsn, Bytes::copy_from_slice(&buf));
|
||||
/*
|
||||
if oldest_lsn == 0 || p.lsn < oldest_lsn {
|
||||
oldest_lsn = p.lsn;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Scan WAL on a timeline, starting from gien LSN, and load all the records
|
||||
// into the page cache.
|
||||
fn restore_wal(
|
||||
_conf: &PageServerConf,
|
||||
pcache: &PageCache,
|
||||
timeline: ZTimelineId,
|
||||
startpoint: Lsn,
|
||||
) -> Result<()> {
|
||||
let walpath = format!("timelines/{}/wal", timeline);
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory, and load all records >= 'startpoint' into
|
||||
/// the repository.
|
||||
pub fn import_timeline_wal(walpath: &Path, timeline: &dyn Timeline, startpoint: Lsn) -> Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
const SEG_SIZE: u64 = 16 * 1024 * 1024;
|
||||
let mut segno = startpoint.segment_number(SEG_SIZE);
|
||||
let mut offset = startpoint.segment_offset(SEG_SIZE);
|
||||
let mut last_lsn = Lsn(0);
|
||||
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
loop {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, 16 * 1024 * 1024);
|
||||
let mut path = walpath.clone() + "/" + &filename;
|
||||
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut path = walpath.join(&filename);
|
||||
|
||||
// It could be as .partial
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path += ".partial";
|
||||
path = walpath.join(filename + ".partial");
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
@@ -351,7 +204,7 @@ fn restore_wal(
|
||||
|
||||
let mut buf = Vec::new();
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != 16 * 1024 * 1024 - offset as usize {
|
||||
if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
|
||||
// Maybe allow this for .partial files?
|
||||
error!("read only {} bytes from WAL file", nread);
|
||||
}
|
||||
@@ -367,33 +220,7 @@ fn restore_wal(
|
||||
}
|
||||
if let Some((lsn, recdata)) = rec.unwrap() {
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
// Put the WAL record to the page cache. We make a separate copy of
|
||||
// it for every block it modifies. (The actual WAL record is kept in
|
||||
// a Bytes, which uses a reference counter for the underlying buffer,
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
truncate: false,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_valid_lsn(lsn);
|
||||
timeline.save_decoded_record(decoded, recdata, lsn)?;
|
||||
last_lsn = lsn;
|
||||
} else {
|
||||
break;
|
||||
@@ -401,95 +228,16 @@ fn restore_wal(
|
||||
nrecords += 1;
|
||||
}
|
||||
|
||||
info!("restored {} records from WAL file {}", nrecords, filename);
|
||||
info!(
|
||||
"imported {} records from WAL file {} up to {}",
|
||||
nrecords,
|
||||
path.display(),
|
||||
last_lsn
|
||||
);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl Error for FilePathError {
|
||||
fn description(&self) -> &str {
|
||||
&self.msg
|
||||
}
|
||||
}
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ParsedBaseImageFileName {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u32,
|
||||
pub segno: u32,
|
||||
|
||||
pub lsn: u64,
|
||||
}
|
||||
|
||||
// formats:
|
||||
// <oid>
|
||||
// <oid>_<fork name>
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_relfilename(fname: &str) -> Result<(u32, u32, u32), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(fname)
|
||||
.ok_or_else(|| FilePathError::new("invalid relation data file name"))?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = u32::from_str_radix(relnode_str, 10)?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = forkname_to_forknum(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
u32::from_str_radix(segno_match.unwrap().as_str(), 10)?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
}
|
||||
|
||||
@@ -23,6 +23,8 @@ use tokio::runtime;
|
||||
use futures::future;
|
||||
|
||||
use crate::{page_cache, PageServerConf};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
|
||||
struct Storage {
|
||||
region: Region,
|
||||
@@ -127,56 +129,12 @@ async fn restore_chunk(conf: &PageServerConf) -> Result<(), S3Error> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
// FIXME: we'll probably need these elsewhere too, move to some common location
|
||||
const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FilePathError {
|
||||
msg: String,
|
||||
}
|
||||
|
||||
impl FilePathError {
|
||||
fn new(msg: &str) -> FilePathError {
|
||||
FilePathError {
|
||||
msg: msg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(e: core::num::ParseIntError) -> Self {
|
||||
return FilePathError {
|
||||
msg: format!("invalid filename: {}", e),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for FilePathError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "invalid filename")
|
||||
}
|
||||
}
|
||||
|
||||
fn forkname_to_forknum(forkname: Option<&str>) -> Result<u32, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(0),
|
||||
Some("fsm") => Ok(1),
|
||||
Some("vm") => Ok(2),
|
||||
Some("init") => Ok(3),
|
||||
Some(_) => Err(FilePathError::new("invalid forkname")),
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ParsedBaseImageFileName {
|
||||
pub spcnode: u32,
|
||||
pub dbnode: u32,
|
||||
pub relnode: u32,
|
||||
pub forknum: u32,
|
||||
pub forknum: u8,
|
||||
pub segno: u32,
|
||||
|
||||
pub lsn: u64,
|
||||
@@ -188,7 +146,7 @@ struct ParsedBaseImageFileName {
|
||||
// <oid>.<segment number>
|
||||
// <oid>_<fork name>.<segment number>
|
||||
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u32, u32, u64), FilePathError> {
|
||||
fn parse_filename(fname: &str) -> Result<(u32, u8, u32, u64), FilePathError> {
|
||||
let re = Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?_(?P<lsnhi>[[:xdigit:]]{8})(?P<lsnlo>[[:xdigit:]]{8})$").unwrap();
|
||||
|
||||
let caps = re
|
||||
@@ -237,7 +195,7 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
Ok(ParsedBaseImageFileName {
|
||||
spcnode: GLOBALTABLESPACE_OID,
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
relnode,
|
||||
forknum,
|
||||
@@ -260,7 +218,7 @@ fn parse_rel_file_path(path: &str) -> Result<ParsedBaseImageFileName, FilePathEr
|
||||
let (relnode, forknum, segno, lsn) = parse_filename(fname)?;
|
||||
|
||||
Ok(ParsedBaseImageFileName {
|
||||
spcnode: DEFAULTTABLESPACE_OID,
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum,
|
||||
@@ -294,8 +252,7 @@ async fn slurp_base_file(
|
||||
|
||||
let mut bytes = BytesMut::from(data.as_slice()).freeze();
|
||||
|
||||
// FIXME: use constants (BLCKSZ)
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / 8192);
|
||||
let mut blknum: u32 = parsed.segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
|
||||
let pcache = page_cache::get_pagecache(conf, sys_id);
|
||||
|
||||
@@ -305,7 +262,7 @@ async fn slurp_base_file(
|
||||
spcnode: parsed.spcnode,
|
||||
dbnode: parsed.dbnode,
|
||||
relnode: parsed.relnode,
|
||||
forknum: parsed.forknum as u8,
|
||||
forknum: parsed.forknum,
|
||||
},
|
||||
blknum,
|
||||
};
|
||||
|
||||
@@ -171,6 +171,11 @@ pub fn ui_main() -> Result<(), Box<dyn Error>> {
|
||||
})?;
|
||||
|
||||
// If ther user presses 'q', quit.
|
||||
|
||||
// silence clippy's suggestion to rewrite this as an if-statement. Match
|
||||
// makes more sense as soon as we get another command than 'q'.
|
||||
#[allow(clippy::single_match)]
|
||||
#[allow(clippy::collapsible_match)]
|
||||
if let Event::Input(key) = events.next()? {
|
||||
match key {
|
||||
Key::Char('q') => {
|
||||
@@ -229,7 +234,7 @@ impl<'a> Widget for LogWidget<'a> {
|
||||
// Render a widget to show some metrics
|
||||
struct MetricsWidget {}
|
||||
|
||||
fn get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
fn _get_metric_u64(title: &str, value: u64) -> Spans {
|
||||
Spans::from(vec![
|
||||
Span::styled(format!("{:<20}", title), Style::default()),
|
||||
Span::raw(": "),
|
||||
@@ -260,9 +265,11 @@ impl tui::widgets::Widget for MetricsWidget {
|
||||
|
||||
block.render(area, buf);
|
||||
|
||||
#[allow(unused_mut)]
|
||||
let mut lines: Vec<Spans> = Vec::new();
|
||||
|
||||
let page_cache_stats = crate::page_cache::get_stats();
|
||||
// FIXME
|
||||
//let page_cache_stats = crate::page_cache::get_stats();
|
||||
|
||||
// This is not used since LSNs were removed from page cache stats.
|
||||
// Maybe it will be used in the future?
|
||||
@@ -275,7 +282,7 @@ impl tui::widgets::Widget for MetricsWidget {
|
||||
lines.push(get_metric_str("Valid LSN range", &lsnrange));
|
||||
lines.push(get_metric_str("Last record LSN", &last_valid_recordlsn_str));
|
||||
*/
|
||||
|
||||
/*
|
||||
lines.push(get_metric_u64(
|
||||
"# of cache entries",
|
||||
page_cache_stats.num_entries,
|
||||
@@ -292,7 +299,7 @@ impl tui::widgets::Widget for MetricsWidget {
|
||||
"# of GetPage@LSN calls",
|
||||
page_cache_stats.num_getpage_requests,
|
||||
));
|
||||
|
||||
*/
|
||||
let text = Text::from(lines);
|
||||
|
||||
Paragraph::new(text).render(inner_area, buf);
|
||||
|
||||
@@ -7,8 +7,13 @@ use std::str;
|
||||
use thiserror::Error;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// FIXME: this is configurable in PostgreSQL, 16 MB is the default
|
||||
const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;
|
||||
pub type Oid = u32;
|
||||
pub type TransactionId = u32;
|
||||
pub type BlockNumber = u32;
|
||||
pub type OffsetNumber = u16;
|
||||
pub type MultiXactId = TransactionId;
|
||||
pub type MultiXactOffset = u32;
|
||||
pub type MultiXactStatus = u32;
|
||||
|
||||
// From PostgreSQL headers
|
||||
|
||||
@@ -92,7 +97,7 @@ impl WalStreamDecoder {
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < SizeOfXLogLongPHD {
|
||||
@@ -185,7 +190,8 @@ impl WalStreamDecoder {
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen = self.lsn.calc_padding(WAL_SEGMENT_SIZE) as u32;
|
||||
self.padlen =
|
||||
self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
@@ -258,7 +264,8 @@ pub struct DecodedBkpBlock {
|
||||
/* Information on full-page image, if any */
|
||||
has_image: bool, /* has image, even for consistency checking */
|
||||
pub apply_image: bool, /* has image that should be restored */
|
||||
pub will_init: bool,
|
||||
pub will_init: bool, /* record intialize page content */
|
||||
pub will_drop: bool, /* record drops relation */
|
||||
//char *bkp_image;
|
||||
hole_offset: u16,
|
||||
hole_length: u16,
|
||||
@@ -283,6 +290,7 @@ impl DecodedBkpBlock {
|
||||
has_image: false,
|
||||
apply_image: false,
|
||||
will_init: false,
|
||||
will_drop: false,
|
||||
hole_offset: 0,
|
||||
hole_length: 0,
|
||||
bimg_len: 0,
|
||||
@@ -306,12 +314,6 @@ pub struct DecodedWALRecord {
|
||||
pub main_data_offset: usize,
|
||||
}
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type BlockNumber = u32;
|
||||
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct RelFileNode {
|
||||
@@ -320,6 +322,24 @@ pub struct RelFileNode {
|
||||
pub relnode: Oid, /* relation */
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlRelmapUpdate {
|
||||
pub dbid: Oid, /* database ID, or 0 for shared map */
|
||||
pub tsid: Oid, /* database's tablespace, or pg_global */
|
||||
pub nbytes: i32, /* size of relmap data */
|
||||
}
|
||||
|
||||
impl XlRelmapUpdate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate {
|
||||
XlRelmapUpdate {
|
||||
dbid: buf.get_u32_le(),
|
||||
tsid: buf.get_u32_le(),
|
||||
nbytes: buf.get_i32_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlSmgrTruncate {
|
||||
@@ -366,6 +386,150 @@ impl XlCreateDatabase {
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapInsert {
|
||||
pub offnum: OffsetNumber,
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
impl XlHeapInsert {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapInsert {
|
||||
XlHeapInsert {
|
||||
offnum: buf.get_u16_le(),
|
||||
flags: buf.get_u8(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapMultiInsert {
|
||||
pub flags: u8,
|
||||
pub ntuples: u16,
|
||||
}
|
||||
|
||||
impl XlHeapMultiInsert {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapMultiInsert {
|
||||
XlHeapMultiInsert {
|
||||
flags: buf.get_u8(),
|
||||
ntuples: buf.get_u16_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapDelete {
|
||||
pub xmax: TransactionId,
|
||||
pub offnum: OffsetNumber,
|
||||
pub infobits_set: u8,
|
||||
pub flags: u8,
|
||||
}
|
||||
|
||||
impl XlHeapDelete {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapDelete {
|
||||
XlHeapDelete {
|
||||
xmax: buf.get_u32_le(),
|
||||
offnum: buf.get_u16_le(),
|
||||
infobits_set: buf.get_u8(),
|
||||
flags: buf.get_u8(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlHeapUpdate {
|
||||
pub old_xmax: TransactionId,
|
||||
pub old_offnum: OffsetNumber,
|
||||
pub old_infobits_set: u8,
|
||||
pub flags: u8,
|
||||
pub new_xmax: TransactionId,
|
||||
pub new_offnum: OffsetNumber,
|
||||
}
|
||||
|
||||
impl XlHeapUpdate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlHeapUpdate {
|
||||
XlHeapUpdate {
|
||||
old_xmax: buf.get_u32_le(),
|
||||
old_offnum: buf.get_u16_le(),
|
||||
old_infobits_set: buf.get_u8(),
|
||||
flags: buf.get_u8(),
|
||||
new_xmax: buf.get_u32_le(),
|
||||
new_offnum: buf.get_u16_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct MultiXactMember {
|
||||
pub xid: TransactionId,
|
||||
pub status: MultiXactStatus,
|
||||
}
|
||||
|
||||
impl MultiXactMember {
|
||||
pub fn decode(buf: &mut Bytes) -> MultiXactMember {
|
||||
MultiXactMember {
|
||||
xid: buf.get_u32_le(),
|
||||
status: buf.get_u32_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlMultiXactCreate {
|
||||
pub mid: MultiXactId, /* new MultiXact's ID */
|
||||
pub moff: MultiXactOffset, /* its starting offset in members file */
|
||||
pub nmembers: u32, /* number of member XIDs */
|
||||
pub members: Vec<MultiXactMember>,
|
||||
}
|
||||
|
||||
impl XlMultiXactCreate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate {
|
||||
let mid = buf.get_u32_le();
|
||||
let moff = buf.get_u32_le();
|
||||
let nmembers = buf.get_u32_le();
|
||||
let mut members = Vec::new();
|
||||
for _ in 0..nmembers {
|
||||
members.push(MultiXactMember::decode(buf));
|
||||
}
|
||||
XlMultiXactCreate {
|
||||
mid,
|
||||
moff,
|
||||
nmembers,
|
||||
members,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlMultiXactTruncate {
|
||||
oldest_multi_db: Oid,
|
||||
/* to-be-truncated range of multixact offsets */
|
||||
start_trunc_off: MultiXactId, /* just for completeness' sake */
|
||||
end_trunc_off: MultiXactId,
|
||||
|
||||
/* to-be-truncated range of multixact members */
|
||||
start_trunc_memb: MultiXactOffset,
|
||||
end_trunc_memb: MultiXactOffset,
|
||||
}
|
||||
|
||||
impl XlMultiXactTruncate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate {
|
||||
XlMultiXactTruncate {
|
||||
oldest_multi_db: buf.get_u32_le(),
|
||||
start_trunc_off: buf.get_u32_le(),
|
||||
end_trunc_off: buf.get_u32_le(),
|
||||
start_trunc_memb: buf.get_u32_le(),
|
||||
end_trunc_memb: buf.get_u32_le(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Routines to decode a WAL record and figure out which blocks are modified
|
||||
//
|
||||
@@ -614,30 +778,10 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
assert_eq!(buf.remaining(), main_data_len as usize);
|
||||
}
|
||||
|
||||
//5. Handle special CLOG and XACT records
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = buf.get_i32_le() as u32;
|
||||
blk.will_init = true;
|
||||
trace!("RM_CLOG_ID updates block {}", blk.blkno);
|
||||
blocks.push(blk);
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
//5. Handle special XACT records
|
||||
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
|
||||
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
|
||||
xlogrec.xl_prev & 0xffffffff,
|
||||
xlogrec.xl_xid,
|
||||
blk.blkno,
|
||||
main_data_len
|
||||
);
|
||||
blocks.push(blk);
|
||||
|
||||
//parse commit record to extract subtrans entries
|
||||
// xl_xact_commit starts with time of commit
|
||||
let _xact_time = buf.get_i64_le();
|
||||
@@ -652,17 +796,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
let mut prev_blkno = u32::MAX;
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if prev_blkno != blkno {
|
||||
prev_blkno = blkno;
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = blkno;
|
||||
blocks.push(blk);
|
||||
}
|
||||
let _subxact = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
@@ -671,7 +806,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO handle this too?
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::MAIN_FORKNUM;
|
||||
blk.rnode_spcnode = spcnode;
|
||||
blk.rnode_dbnode = dbnode;
|
||||
blk.rnode_relnode = relnode;
|
||||
blk.will_drop = true;
|
||||
blocks.push(blk);
|
||||
trace!(
|
||||
"XLOG_XACT_COMMIT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
@@ -693,18 +834,6 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
//TODO handle this to be able to restore pg_twophase on node start
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = xlogrec.xl_xid / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
trace!(
|
||||
"XLOG_XACT_ABORT xl_info {} xl_prev {:X}/{:X} xid {} updates block {} main_data_len {}",
|
||||
xlogrec.xl_info, (xlogrec.xl_prev >> 32),
|
||||
xlogrec.xl_prev & 0xffffffff,
|
||||
xlogrec.xl_xid,
|
||||
blk.blkno,
|
||||
main_data_len
|
||||
);
|
||||
blocks.push(blk);
|
||||
//parse abort record to extract subtrans entries
|
||||
// xl_xact_abort starts with time of commit
|
||||
let _xact_time = buf.get_i64_le();
|
||||
@@ -719,17 +848,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
let mut prev_blkno = u32::MAX;
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
if prev_blkno != blkno {
|
||||
prev_blkno = blkno;
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::PG_XACT_FORKNUM as u8;
|
||||
blk.blkno = blkno;
|
||||
blocks.push(blk);
|
||||
}
|
||||
let _subxact = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
|
||||
@@ -738,7 +858,13 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
let spcnode = buf.get_u32_le();
|
||||
let dbnode = buf.get_u32_le();
|
||||
let relnode = buf.get_u32_le();
|
||||
//TODO save these too
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::MAIN_FORKNUM;
|
||||
blk.rnode_spcnode = spcnode;
|
||||
blk.rnode_dbnode = dbnode;
|
||||
blk.rnode_relnode = relnode;
|
||||
blk.will_drop = true;
|
||||
blocks.push(blk);
|
||||
trace!(
|
||||
"XLOG_XACT_ABORT relfilenode {}/{}/{}",
|
||||
spcnode,
|
||||
@@ -782,6 +908,79 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
} else {
|
||||
trace!("XLOG_TBLSPC_DROP is not handled yet");
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = XlHeapInsert::decode(&mut buf);
|
||||
if (xlrec.flags
|
||||
& (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
|
||||
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
|
||||
!= 0
|
||||
{
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.rnode_spcnode = blocks[0].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[0].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[0].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = XlHeapDelete::decode(&mut buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.rnode_spcnode = blocks[0].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[0].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[0].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = XlHeapUpdate::decode(&mut buf);
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.rnode_spcnode = blocks[0].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[0].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[0].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0
|
||||
&& blocks.len() > 1
|
||||
{
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blocks[1].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
|
||||
blk.rnode_spcnode = blocks[1].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[1].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[1].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = XlHeapMultiInsert::decode(&mut buf);
|
||||
if (xlrec.flags
|
||||
& (pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED
|
||||
| pg_constants::XLH_INSERT_ALL_FROZEN_SET))
|
||||
!= 0
|
||||
{
|
||||
let mut blk = DecodedBkpBlock::new();
|
||||
let blkno = blocks[0].blkno / pg_constants::HEAPBLOCKS_PER_PAGE as u32;
|
||||
blk.forknum = pg_constants::VISIBILITYMAP_FORKNUM;
|
||||
blk.blkno = blkno;
|
||||
blk.rnode_spcnode = blocks[0].rnode_spcnode;
|
||||
blk.rnode_dbnode = blocks[0].rnode_dbnode;
|
||||
blk.rnode_relnode = blocks[0].rnode_relnode;
|
||||
blocks.push(blk);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
DecodedWALRecord {
|
||||
|
||||
@@ -7,17 +7,20 @@
|
||||
//!
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::{BufferTag, RelTag};
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Error;
|
||||
use anyhow::{Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
@@ -27,11 +30,7 @@ use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::time::Duration;
|
||||
use tokio::runtime::Runtime;
|
||||
use tokio_postgres::replication::{PgTimestamp, ReplicationStream};
|
||||
use tokio_postgres::{NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use tokio_stream::StreamExt;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
//
|
||||
@@ -48,7 +47,7 @@ lazy_static! {
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &PageServerConf,
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
) {
|
||||
@@ -65,11 +64,10 @@ pub fn launch_wal_receiver(
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Also launch a new thread to handle this connection
|
||||
let conf_copy = conf.clone();
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
thread_main(&conf_copy, timelineid);
|
||||
thread_main(conf, timelineid);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
@@ -90,22 +88,12 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
|
||||
//
|
||||
// This is the entry point for the WAL receiver thread.
|
||||
//
|
||||
fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId) {
|
||||
info!(
|
||||
"WAL receiver thread started for timeline : '{}'",
|
||||
timelineid
|
||||
);
|
||||
|
||||
// We need a tokio runtime to call the rust-postgres copy_both function.
|
||||
// Most functions in the rust-postgres driver have a blocking wrapper,
|
||||
// but copy_both does not (TODO: the copy_both support is still work-in-progress
|
||||
// as of this writing. Check later if that has changed, or implement the
|
||||
// wrapper ourselves in rust-postgres)
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
@@ -114,7 +102,7 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
let res = walreceiver_main(&runtime, conf, timelineid, &wal_producer_connstr);
|
||||
let res = walreceiver_main(conf, timelineid, &wal_producer_connstr);
|
||||
|
||||
if let Err(e) = res {
|
||||
info!(
|
||||
@@ -127,8 +115,7 @@ fn thread_main(conf: &PageServerConf, timelineid: ZTimelineId) {
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
runtime: &Runtime,
|
||||
conf: &PageServerConf,
|
||||
_conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
) -> Result<(), Error> {
|
||||
@@ -136,154 +123,73 @@ fn walreceiver_main(
|
||||
info!("connecting to {:?}", wal_producer_connstr);
|
||||
let connect_cfg = format!("{} replication=true", wal_producer_connstr);
|
||||
|
||||
let (rclient, connection) = runtime.block_on(tokio_postgres::connect(&connect_cfg, NoTls))?;
|
||||
let mut rclient = Client::connect(&connect_cfg, NoTls)?;
|
||||
info!("connected!");
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
runtime.spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
let identify = identify_system(runtime, &rclient)?;
|
||||
let identify = identify_system(&mut rclient)?;
|
||||
info!("{:?}", identify);
|
||||
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
|
||||
let mut caught_up = false;
|
||||
|
||||
let pcache = page_cache::get_pagecache(&conf, timelineid).unwrap();
|
||||
let repository = page_cache::get_repository();
|
||||
let timeline = repository.get_timeline(timelineid).unwrap();
|
||||
|
||||
//
|
||||
// Start streaming the WAL, from where we left off previously.
|
||||
//
|
||||
let mut startpoint = pcache.get_last_valid_lsn();
|
||||
let last_valid_lsn = pcache.get_last_valid_lsn();
|
||||
// If we had previously received WAL up to some point in the middle of a WAL record, we
|
||||
// better start from the end of last full WAL record, not in the middle of one. Hence,
|
||||
// use 'last_record_lsn' rather than 'last_valid_lsn' here.
|
||||
let mut last_rec_lsn = timeline.get_last_record_lsn();
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
if startpoint == Lsn(0) {
|
||||
// If we start here with identify.xlogpos we will have race condition with
|
||||
// postgres start: insert into postgres may request page that was modified with lsn
|
||||
// smaller than identify.xlogpos.
|
||||
//
|
||||
// Current procedure for starting postgres will anyway be changed to something
|
||||
// different like having 'initdb' method on a pageserver (or importing some shared
|
||||
// empty database snapshot), so for now I just put start of first segment which
|
||||
// seems to be a valid record.
|
||||
pcache.init_valid_lsn(Lsn(0x0100_0000));
|
||||
startpoint = Lsn(0x0100_0000);
|
||||
} else {
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
// FIXME: It probably would be better to always start streaming from the beginning
|
||||
// of the page, or the segment, so that we could check the page/segment headers
|
||||
// too. Just for the sake of paranoia.
|
||||
startpoint += startpoint.calc_padding(8u32);
|
||||
error!("No previous WAL position");
|
||||
}
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
//
|
||||
// FIXME: It probably would be better to always start streaming from the beginning
|
||||
// of the page, or the segment, so that we could check the page/segment headers
|
||||
// too. Just for the sake of paranoia.
|
||||
startpoint += startpoint.calc_padding(8u32);
|
||||
|
||||
debug!(
|
||||
"last_valid_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_valid_lsn, startpoint, timelineid, end_of_wal
|
||||
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, timelineid, end_of_wal
|
||||
);
|
||||
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
|
||||
let copy_stream = runtime.block_on(rclient.copy_both_simple::<bytes::Bytes>(&query))?;
|
||||
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
tokio::pin!(physical_stream);
|
||||
let copy_stream = rclient.copy_both_simple(&query)?;
|
||||
let mut physical_stream = ReplicationIter::new(copy_stream);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
while let Some(replication_message) = runtime.block_on(physical_stream.next()) {
|
||||
match replication_message? {
|
||||
while let Some(replication_message) = physical_stream.next()? {
|
||||
match replication_message {
|
||||
ReplicationMessage::XLogData(xlog_data) => {
|
||||
// Pass the WAL data to the decoder, and see if we can decode
|
||||
// more records as a result.
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
let prev_last_rec_lsn = last_rec_lsn;
|
||||
|
||||
write_wal_file(
|
||||
startlsn,
|
||||
timelineid,
|
||||
16 * 1024 * 1024, // FIXME
|
||||
data,
|
||||
)?;
|
||||
write_wal_file(startlsn, timelineid, pg_constants::WAL_SEGMENT_SIZE, data)?;
|
||||
|
||||
trace!("received XLogData between {} and {}", startlsn, endlsn);
|
||||
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
loop {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
// Put the WAL record to the page cache. We make a separate copy of
|
||||
// it for every block it modifies. (The actual WAL record is kept in
|
||||
// a Bytes, which uses a reference counter for the underlying buffer,
|
||||
// so having multiple copies of it doesn't cost that much)
|
||||
for blk in decoded.blocks.iter() {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: blk.rnode_spcnode,
|
||||
dbnode: blk.rnode_dbnode,
|
||||
relnode: blk.rnode_relnode,
|
||||
forknum: blk.forknum as u8,
|
||||
},
|
||||
blknum: blk.blkno,
|
||||
};
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
timeline.save_decoded_record(decoded, recdata, lsn)?;
|
||||
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
truncate: false,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
pcache.put_wal_record(tag, rec);
|
||||
}
|
||||
// include truncate wal record in all pages
|
||||
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_SMGR_TRUNCATE
|
||||
{
|
||||
let truncate = XlSmgrTruncate::decode(&decoded);
|
||||
if (truncate.flags & SMGR_TRUNCATE_HEAP) != 0 {
|
||||
let tag = BufferTag {
|
||||
rel: RelTag {
|
||||
spcnode: truncate.rnode.spcnode,
|
||||
dbnode: truncate.rnode.dbnode,
|
||||
relnode: truncate.rnode.relnode,
|
||||
forknum: MAIN_FORKNUM,
|
||||
},
|
||||
blknum: truncate.blkno,
|
||||
};
|
||||
let rec = page_cache::WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
truncate: true,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
pcache.put_rel_wal_record(tag, rec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_DBASE_CREATE
|
||||
{
|
||||
let createdb = XlCreateDatabase::decode(&decoded);
|
||||
pcache.create_database(
|
||||
lsn,
|
||||
createdb.db_id,
|
||||
createdb.tablespace_id,
|
||||
createdb.src_db_id,
|
||||
createdb.src_tablespace_id,
|
||||
)?;
|
||||
}
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
pcache.advance_last_record_lsn(lsn);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
// Now that this record has been handled, let the page cache know that
|
||||
// it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
last_rec_lsn = lsn;
|
||||
}
|
||||
|
||||
// Update the last_valid LSN value in the page cache one more time. We updated
|
||||
@@ -292,7 +198,34 @@ fn walreceiver_main(
|
||||
// better reflect that, because GetPage@LSN requests might also point in the
|
||||
// middle of a record, if the request LSN was taken from the server's current
|
||||
// flush ptr.
|
||||
pcache.advance_last_valid_lsn(endlsn);
|
||||
timeline.advance_last_valid_lsn(endlsn);
|
||||
|
||||
// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
|
||||
// "checkpoint" the repository to flush all the changes from WAL we've processed
|
||||
// so far to disk. After this, we don't need the original WAL anymore, and it
|
||||
// can be removed.
|
||||
//
|
||||
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
|
||||
// and we might it for logical decoiding other things in the future. Although
|
||||
// we should also be able to fetch it back from the WAL safekeepers or S3 if
|
||||
// needed.
|
||||
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
{
|
||||
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
|
||||
let (oldest_segno, newest_segno) = find_wal_file_range(
|
||||
timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
last_rec_lsn,
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
timeline.checkpoint()?;
|
||||
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
@@ -306,25 +239,22 @@ fn walreceiver_main(
|
||||
let reply_requested: bool = keepalive.reply() != 0;
|
||||
|
||||
trace!(
|
||||
"received PrimaryKeepAlive(wal_end: {}, timestamp: {} reply: {})",
|
||||
"received PrimaryKeepAlive(wal_end: {}, timestamp: {:?} reply: {})",
|
||||
wal_end,
|
||||
timestamp,
|
||||
reply_requested,
|
||||
);
|
||||
if reply_requested {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(pcache.get_last_valid_lsn()));
|
||||
let last_lsn = PgLsn::from(u64::from(timeline.get_last_valid_lsn()));
|
||||
let write_lsn = last_lsn;
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::INVALID;
|
||||
let ts = PgTimestamp::now()?;
|
||||
let apply_lsn = PgLsn::from(0);
|
||||
let ts = SystemTime::now();
|
||||
const NO_REPLY: u8 = 0u8;
|
||||
|
||||
runtime.block_on(
|
||||
physical_stream
|
||||
.as_mut()
|
||||
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY),
|
||||
)?;
|
||||
physical_stream
|
||||
.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
}
|
||||
}
|
||||
_ => (),
|
||||
@@ -333,6 +263,45 @@ fn walreceiver_main(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_wal_file_range(
|
||||
timeline: ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
written_upto: Lsn,
|
||||
) -> Result<(u64, u64)> {
|
||||
let written_upto_segno = written_upto.segment_number(wal_seg_size);
|
||||
|
||||
let mut oldest_segno = written_upto_segno;
|
||||
let mut newest_segno = written_upto_segno;
|
||||
// Scan the wal directory, and count how many WAL filed we could remove
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
for entry in fs::read_dir(wal_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let filename = path.file_name().unwrap().to_str().unwrap();
|
||||
|
||||
if IsXLogFileName(filename) {
|
||||
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
|
||||
|
||||
if segno > written_upto_segno {
|
||||
// that's strange.
|
||||
warn!("there is a WAL file from future at {}", path.display());
|
||||
continue;
|
||||
}
|
||||
|
||||
oldest_segno = min(oldest_segno, segno);
|
||||
newest_segno = max(newest_segno, segno);
|
||||
}
|
||||
}
|
||||
// FIXME: would be good to assert that there are no gaps in the WAL files
|
||||
|
||||
Ok((oldest_segno, newest_segno))
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
///
|
||||
/// See the [postgres docs] for more details.
|
||||
@@ -353,12 +322,9 @@ pub struct IdentifySystem {
|
||||
pub struct IdentifyError;
|
||||
|
||||
/// Run the postgres `IDENTIFY_SYSTEM` command
|
||||
pub fn identify_system(
|
||||
runtime: &Runtime,
|
||||
client: &tokio_postgres::Client,
|
||||
) -> Result<IdentifySystem, Error> {
|
||||
pub fn identify_system(client: &mut Client) -> Result<IdentifySystem, Error> {
|
||||
let query_str = "IDENTIFY_SYSTEM";
|
||||
let response = runtime.block_on(client.simple_query(query_str))?;
|
||||
let response = client.simple_query(query_str)?;
|
||||
|
||||
// get(N) from row, then parse it as some destination type.
|
||||
fn get_parse<T>(row: &SimpleQueryRow, idx: usize) -> Result<T, IdentifyError>
|
||||
@@ -379,7 +345,7 @@ pub fn identify_system(
|
||||
dbname: get_parse(first_row, 3).ok(),
|
||||
})
|
||||
} else {
|
||||
Err(IdentifyError)?
|
||||
Err(IdentifyError.into())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -398,7 +364,7 @@ fn write_wal_file(
|
||||
let wal_dir = PathBuf::from(format!("timelines/{}/wal", timeline));
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size as u64) as usize;
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size);
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
@@ -414,7 +380,7 @@ fn write_wal_file(
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size as u64);
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
@@ -466,7 +432,7 @@ fn write_wal_file(
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if start_pos.segment_offset(wal_seg_size as u64) == 0 {
|
||||
if start_pos.segment_offset(wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
|
||||
@@ -1,28 +1,27 @@
|
||||
//
|
||||
// WAL redo
|
||||
//
|
||||
// We rely on Postgres to perform WAL redo for us. We launch a
|
||||
// postgres process in special "wal redo" mode that's similar to
|
||||
// single-user mode. We then pass the the previous page image, if any,
|
||||
// and all the WAL records we want to apply, to the postgress
|
||||
// process. Then we get the page image back. Communication with the
|
||||
// postgres process happens via stdin/stdout
|
||||
//
|
||||
// See src/backend/tcop/zenith_wal_redo.c for the other side of
|
||||
// this communication.
|
||||
//
|
||||
// TODO: Even though the postgres code runs in a separate process,
|
||||
// it's not a secure sandbox.
|
||||
//
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
//!
|
||||
//! WAL redo
|
||||
//!
|
||||
//! We rely on Postgres to perform WAL redo for us. We launch a
|
||||
//! postgres process in special "wal redo" mode that's similar to
|
||||
//! single-user mode. We then pass the the previous page image, if any,
|
||||
//! and all the WAL records we want to apply, to the postgres
|
||||
//! process. Then we get the page image back. Communication with the
|
||||
//! postgres process happens via stdin/stdout
|
||||
//!
|
||||
//! See src/backend/tcop/zenith_wal_redo.c for the other side of
|
||||
//! this communication.
|
||||
//!
|
||||
//! TODO: Even though the postgres code runs in a separate process,
|
||||
//! it's not a secure sandbox.
|
||||
//!
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use std::assert;
|
||||
use std::cell::RefCell;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::Error;
|
||||
use std::path::PathBuf;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Stdio;
|
||||
use std::sync::mpsc;
|
||||
use std::sync::Mutex;
|
||||
@@ -32,30 +31,65 @@ use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{ChildStdin, ChildStdout, Command};
|
||||
use tokio::time::timeout;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::page_cache::BufferTag;
|
||||
use crate::page_cache::WALRecord;
|
||||
use crate::repository::BufferTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::ZTimelineId;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::XLogRecord;
|
||||
|
||||
///
|
||||
/// WAL Redo Manager is responsible for replaying WAL records.
|
||||
///
|
||||
/// Callers use the WAL redo manager through this abstract interface,
|
||||
/// which makes it easy to mock it in tests.
|
||||
pub trait WalRedoManager: Send + Sync {
|
||||
/// Apply some WAL records.
|
||||
///
|
||||
/// The caller passes an old page image, and WAL records that should be
|
||||
/// applied over it. The return value is a new page image, after applying
|
||||
/// the reords.
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
}
|
||||
|
||||
///
|
||||
/// A dummy WAL Redo Manager implementation that doesn't allow replaying
|
||||
/// anything. Currently used during bootstrapping (zenith init), to create
|
||||
/// a Repository object without launching the real WAL redo process.
|
||||
///
|
||||
pub struct DummyRedoManager {}
|
||||
impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
_tag: BufferTag,
|
||||
_lsn: Lsn,
|
||||
_base_img: Option<Bytes>,
|
||||
_records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
Err(WalRedoError::InvalidState)
|
||||
}
|
||||
}
|
||||
|
||||
static TIMEOUT: Duration = Duration::from_secs(20);
|
||||
|
||||
///
|
||||
/// A WAL redo manager consists of two parts: WalRedoManager, and
|
||||
/// WalRedoManagerInternal. WalRedoManager is the public struct
|
||||
/// The implementation consists of two parts: PostgresRedoManager, and
|
||||
/// PostgresRedoManagerInternal. PostgresRedoManager is the public struct
|
||||
/// that can be used to send redo requests to the manager.
|
||||
/// WalRedoManagerInternal is used by the manager thread itself.
|
||||
/// PostgresRedoManagerInternal is used by the manager thread itself.
|
||||
///
|
||||
pub struct WalRedoManager {
|
||||
pub struct PostgresRedoManager {
|
||||
request_tx: Mutex<mpsc::Sender<WalRedoRequest>>,
|
||||
}
|
||||
|
||||
struct WalRedoManagerInternal {
|
||||
_conf: PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
struct PostgresRedoManagerInternal {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
request_rx: mpsc::Receiver<WalRedoRequest>,
|
||||
}
|
||||
@@ -76,18 +110,20 @@ struct WalRedoRequest {
|
||||
pub enum WalRedoError {
|
||||
#[error(transparent)]
|
||||
IoError(#[from] std::io::Error),
|
||||
|
||||
#[error("cannot perform WAL redo now")]
|
||||
InvalidState,
|
||||
}
|
||||
|
||||
///
|
||||
/// Public interface of WAL redo manager
|
||||
///
|
||||
impl WalRedoManager {
|
||||
impl PostgresRedoManager {
|
||||
///
|
||||
/// Create a new WalRedoManager.
|
||||
/// Create a new PostgresRedoManager.
|
||||
///
|
||||
/// This only initializes the struct. You need to call WalRedoManager::launch to
|
||||
/// start the thread that processes the requests.
|
||||
pub fn new(conf: &PageServerConf, timelineid: ZTimelineId) -> WalRedoManager {
|
||||
/// This launches a new thread to handle the requests.
|
||||
pub fn new(conf: &'static PageServerConf) -> PostgresRedoManager {
|
||||
let (tx, rx) = mpsc::channel();
|
||||
|
||||
//
|
||||
@@ -96,32 +132,31 @@ impl WalRedoManager {
|
||||
// Get mutable references to the values that we need to pass to the
|
||||
// thread.
|
||||
let request_rx = rx;
|
||||
let conf_copy = conf.clone();
|
||||
|
||||
// Currently, the join handle is not saved anywhere and we
|
||||
// won't try restart the thread if it dies.
|
||||
let _walredo_thread = std::thread::Builder::new()
|
||||
.name("WAL redo thread".into())
|
||||
.spawn(move || {
|
||||
let mut internal = WalRedoManagerInternal {
|
||||
_conf: conf_copy,
|
||||
timelineid,
|
||||
request_rx,
|
||||
};
|
||||
let mut internal = PostgresRedoManagerInternal { conf, request_rx };
|
||||
internal.wal_redo_main();
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
WalRedoManager {
|
||||
PostgresRedoManager {
|
||||
request_tx: Mutex::new(tx),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
/// Request the WAL redo manager to apply WAL records, to reconstruct the page image
|
||||
/// of the given page version.
|
||||
/// Request the WAL redo manager to apply some WAL records
|
||||
///
|
||||
pub fn request_redo(
|
||||
/// The WAL redo is handled by a separate thread, so this just sends a request
|
||||
/// to the thread and waits for response.
|
||||
///
|
||||
fn request_redo(
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
lsn: Lsn,
|
||||
@@ -153,12 +188,12 @@ impl WalRedoManager {
|
||||
///
|
||||
/// WAL redo thread
|
||||
///
|
||||
impl WalRedoManagerInternal {
|
||||
impl PostgresRedoManagerInternal {
|
||||
//
|
||||
// Main entry point for the WAL applicator thread.
|
||||
//
|
||||
fn wal_redo_main(&mut self) {
|
||||
info!("WAL redo thread started {}", self.timelineid);
|
||||
info!("WAL redo thread started");
|
||||
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
@@ -168,17 +203,25 @@ impl WalRedoManagerInternal {
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let process: WalRedoProcess;
|
||||
let datadir = format!("wal-redo/{}", self.timelineid);
|
||||
let process: PostgresRedoProcess;
|
||||
|
||||
info!("launching WAL redo postgres process {}", self.timelineid);
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = self.conf.workdir.join("wal-redo-datadir");
|
||||
|
||||
process = runtime.block_on(WalRedoProcess::launch(&datadir)).unwrap();
|
||||
info!("WAL redo postgres started");
|
||||
info!("launching WAL redo postgres process");
|
||||
|
||||
process = runtime
|
||||
.block_on(PostgresRedoProcess::launch(&datadir))
|
||||
.unwrap();
|
||||
|
||||
// Loop forever, handling requests as they come.
|
||||
loop {
|
||||
let request = self.request_rx.recv().unwrap();
|
||||
let request = self
|
||||
.request_rx
|
||||
.recv()
|
||||
.expect("WAL redo request channel was closed");
|
||||
|
||||
let result = runtime.block_on(self.handle_apply_request(&process, &request));
|
||||
let result_ok = result.is_ok();
|
||||
@@ -187,33 +230,17 @@ impl WalRedoManagerInternal {
|
||||
let _ = request.response_channel.send(result);
|
||||
|
||||
if !result_ok {
|
||||
error!("wal-redo-postgres filed to apply request {:?}", request);
|
||||
error!("wal-redo-postgres failed to apply request {:?}", request);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn transaction_id_set_status_bit(&self, xid: u32, status: u8, page: &mut BytesMut) {
|
||||
trace!(
|
||||
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
|
||||
status
|
||||
);
|
||||
|
||||
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let bshift: u8 = ((xid % pg_constants::CLOG_XACTS_PER_BYTE)
|
||||
* pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
page[byteno] =
|
||||
(page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo.
|
||||
///
|
||||
async fn handle_apply_request(
|
||||
&self,
|
||||
process: &WalRedoProcess,
|
||||
process: &PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let tag = request.tag;
|
||||
@@ -226,102 +253,7 @@ impl WalRedoManagerInternal {
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
if tag.rel.forknum == pg_constants::PG_XACT_FORKNUM as u8 {
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
//move to main data
|
||||
// TODO probably, we should store some records in our special format
|
||||
// to avoid this weird parsing on replay
|
||||
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
|
||||
if buf.remaining() > skip {
|
||||
buf.advance(skip);
|
||||
}
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_CLOG_ID {
|
||||
let info = xlogrec.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
page.clone_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = xlogrec.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let mut status = 0;
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
status = pg_constants::TRANSACTION_STATUS_COMMITTED;
|
||||
self.transaction_id_set_status_bit(xlogrec.xl_xid, status, &mut page);
|
||||
//handle subtrans
|
||||
let _xact_time = buf.get_i64_le();
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag.blknum == blkno {
|
||||
status = pg_constants::TRANSACTION_STATUS_SUB_COMMITTED;
|
||||
self.transaction_id_set_status_bit(subxact, status, &mut page);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
self.transaction_id_set_status_bit(xlogrec.xl_xid, status, &mut page);
|
||||
//handle subtrans
|
||||
let _xact_time = buf.get_i64_le();
|
||||
let mut xinfo = 0;
|
||||
if xlogrec.xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
|
||||
xinfo = buf.get_u32_le();
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
|
||||
let _dbid = buf.get_u32_le();
|
||||
let _tsid = buf.get_u32_le();
|
||||
}
|
||||
}
|
||||
|
||||
if xinfo & pg_constants::XACT_XINFO_HAS_SUBXACTS != 0 {
|
||||
let nsubxacts = buf.get_i32_le();
|
||||
for _i in 0..nsubxacts {
|
||||
let subxact = buf.get_u32_le();
|
||||
let blkno = subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
// only update xids on the requested page
|
||||
if tag.blknum == blkno {
|
||||
status = pg_constants::TRANSACTION_STATUS_ABORTED;
|
||||
self.transaction_id_set_status_bit(subxact, status, &mut page);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
trace!("handle_apply_request for RM_XACT_ID-{} NOT SUPPORTED YET. RETURN. lsn {} main_data_offset {}, rec.len {}",
|
||||
status,
|
||||
record.lsn,
|
||||
record.main_data_offset, record.rec.len());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
} else {
|
||||
apply_result = process.apply_wal_records(tag, base_img, records).await;
|
||||
}
|
||||
apply_result = process.apply_wal_records(tag, base_img, records).await;
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
@@ -348,23 +280,31 @@ impl WalRedoManagerInternal {
|
||||
}
|
||||
}
|
||||
|
||||
struct WalRedoProcess {
|
||||
struct PostgresRedoProcess {
|
||||
stdin: RefCell<ChildStdin>,
|
||||
stdout: RefCell<ChildStdout>,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
// Tests who run pageserver binary are setting proper PG_BIN_DIR
|
||||
// and PG_LIB_DIR so that WalRedo would start right postgres. We may later
|
||||
// and PG_LIB_DIR so that WalRedo would start right postgres.
|
||||
|
||||
// do that: We may later
|
||||
// switch to setting same things in pageserver config file.
|
||||
async fn launch(datadir: &str) -> Result<WalRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres deleting old one.
|
||||
fs::remove_dir_all(datadir).ok();
|
||||
async fn launch(datadir: &Path) -> Result<PostgresRedoProcess, Error> {
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
info!("directory {:?} exists, removing", &datadir);
|
||||
if let Err(e) = fs::remove_dir_all(&datadir) {
|
||||
error!("could not remove old wal-redo-datadir: {:?}", e);
|
||||
}
|
||||
}
|
||||
info!("running initdb in {:?}", datadir.display());
|
||||
let initdb = Command::new("initdb")
|
||||
.args(&["-D", datadir])
|
||||
.args(&["-D", datadir.to_str().unwrap()])
|
||||
.arg("-N")
|
||||
.output()
|
||||
.await
|
||||
@@ -383,6 +323,8 @@ impl WalRedoProcess {
|
||||
.open(PathBuf::from(&datadir).join("postgresql.conf"))?;
|
||||
config.write_all(b"shared_buffers=128kB\n")?;
|
||||
config.write_all(b"fsync=off\n")?;
|
||||
config.write_all(b"shared_preload_libraries=zenith\n")?;
|
||||
config.write_all(b"zenith.wal_redo=on\n")?;
|
||||
}
|
||||
// Start postgres itself
|
||||
let mut child = Command::new("postgres")
|
||||
@@ -394,7 +336,10 @@ impl WalRedoProcess {
|
||||
.spawn()
|
||||
.expect("postgres --wal-redo command failed to start");
|
||||
|
||||
info!("launched WAL redo postgres process on {}", datadir);
|
||||
info!(
|
||||
"launched WAL redo postgres process on {:?}",
|
||||
datadir.display()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().expect("failed to open child's stdin");
|
||||
let stderr = child.stderr.take().expect("failed to open child's stderr");
|
||||
@@ -421,7 +366,7 @@ impl WalRedoProcess {
|
||||
};
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
Ok(WalRedoProcess {
|
||||
Ok(PostgresRedoProcess {
|
||||
stdin: RefCell::new(stdin),
|
||||
stdout: RefCell::new(stdout),
|
||||
})
|
||||
@@ -435,7 +380,7 @@ impl WalRedoProcess {
|
||||
&self,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: &Vec<WALRecord>,
|
||||
records: &[WALRecord],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
let mut stdin = self.stdin.borrow_mut();
|
||||
let mut stdout = self.stdout.borrow_mut();
|
||||
@@ -493,7 +438,7 @@ impl WalRedoProcess {
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
|
||||
let res = futures::try_join!(f_stdout, f_stdin)?;
|
||||
let res = tokio::try_join!(f_stdout, f_stdin)?;
|
||||
|
||||
let buf = res.0;
|
||||
|
||||
@@ -506,14 +451,31 @@ impl WalRedoProcess {
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
// FIXME: this is a temporary hack that should go away when we refactor
|
||||
// the postgres protocol serialization + handlers.
|
||||
//
|
||||
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
|
||||
// not in the std library. To write to a BytesMut from a serde serializer,
|
||||
// we need to either:
|
||||
// - pre-allocate the required buffer space. This is annoying because we
|
||||
// shouldn't care what the exact serialized size is-- that's the
|
||||
// serializer's job.
|
||||
// - Or, we need to create a temporary "writer" (which implements the
|
||||
// `Write` trait). It's a bit awkward, because the writer consumes the
|
||||
// underlying BytesMut, and we need to extract it later with
|
||||
// `into_inner`.
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
@@ -521,15 +483,18 @@ fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
let len = 4 + 5 * 4 + base_img.len();
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let mut buf = writer.into_inner();
|
||||
buf.put(base_img);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
@@ -543,20 +508,23 @@ fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put(rec);
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 5 * 4;
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
tag.pack(&mut buf);
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
assert!(buf.len() == 1 + len);
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
@@ -9,12 +9,16 @@ edition = "2018"
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
lazy_static = "1.4"
|
||||
log = "0.4.14"
|
||||
thiserror = "1.0"
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.57"
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod xlog_utils;
|
||||
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
|
||||
32
postgres_ffi/src/nonrelfile_utils.rs
Normal file
32
postgres_ffi/src/nonrelfile_utils.rs
Normal file
@@ -0,0 +1,32 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL non-relation files.
|
||||
//!
|
||||
use crate::pg_constants;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
|
||||
pub fn transaction_id_set_status(xid: u32, status: u8, page: &mut BytesMut) {
|
||||
trace!(
|
||||
"handle_apply_request for RM_XACT_ID-{} (1-commit, 2-abort, 3-sub_commit)",
|
||||
status
|
||||
);
|
||||
|
||||
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let bshift: u8 =
|
||||
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
page[byteno] =
|
||||
(page[byteno] & !(pg_constants::CLOG_XACT_BITMASK << bshift)) | (status << bshift);
|
||||
}
|
||||
|
||||
pub fn transaction_id_get_status(xid: u32, page: &[u8]) -> u8 {
|
||||
let byteno: usize = ((xid as u32 % pg_constants::CLOG_XACTS_PER_PAGE as u32)
|
||||
/ pg_constants::CLOG_XACTS_PER_BYTE) as usize;
|
||||
|
||||
let bshift: u8 =
|
||||
((xid % pg_constants::CLOG_XACTS_PER_BYTE) * pg_constants::CLOG_BITS_PER_XACT as u32) as u8;
|
||||
|
||||
((page[byteno] >> bshift) & pg_constants::CLOG_XACT_BITMASK) as u8
|
||||
}
|
||||
@@ -1,37 +1,44 @@
|
||||
//!
|
||||
//! Misc constants, copied from PostgreSQL headers.
|
||||
//!
|
||||
|
||||
//
|
||||
// From pg_tablespace_d.h
|
||||
//
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
//Special values for non-rel files' tags
|
||||
//TODO maybe use enum?
|
||||
pub const PG_CONTROLFILE_FORKNUM: u32 = 42;
|
||||
pub const PG_FILENODEMAP_FORKNUM: u32 = 43;
|
||||
pub const PG_XACT_FORKNUM: u32 = 44;
|
||||
pub const PG_MXACT_OFFSETS_FORKNUM: u32 = 45;
|
||||
pub const PG_MXACT_MEMBERS_FORKNUM: u32 = 46;
|
||||
|
||||
//
|
||||
// constants from clog.h
|
||||
// Fork numbers, from relpath.h
|
||||
//
|
||||
pub const CLOG_XACTS_PER_BYTE: u32 = 4;
|
||||
pub const CLOG_XACTS_PER_PAGE: u32 = 8192 * CLOG_XACTS_PER_BYTE;
|
||||
pub const CLOG_BITS_PER_XACT: u8 = 2;
|
||||
pub const CLOG_XACT_BITMASK: u8 = (1 << CLOG_BITS_PER_XACT) - 1;
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
pub const ROCKSDB_SPECIAL_FORKNUM: u8 = 50;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
|
||||
pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
|
||||
|
||||
//
|
||||
// Constants from visbilitymap.h
|
||||
//
|
||||
pub const SIZE_OF_PAGE_HEADER: u16 = 24;
|
||||
pub const BITS_PER_HEAPBLOCK: u16 = 2;
|
||||
pub const HEAPBLOCKS_PER_PAGE: u16 = (BLCKSZ - SIZE_OF_PAGE_HEADER) * 8 / BITS_PER_HEAPBLOCK;
|
||||
|
||||
pub const TRANSACTION_STATUS_COMMITTED: u8 = 0x01;
|
||||
pub const TRANSACTION_STATUS_ABORTED: u8 = 0x02;
|
||||
pub const TRANSACTION_STATUS_SUB_COMMITTED: u8 = 0x03;
|
||||
|
||||
pub const CLOG_ZEROPAGE: u8 = 0x00;
|
||||
pub const CLOG_TRUNCATE: u8 = 0x10;
|
||||
|
||||
// From xact.h
|
||||
pub const XLOG_XACT_COMMIT: u8 = 0x00;
|
||||
pub const XLOG_XACT_PREPARE: u8 = 0x10;
|
||||
pub const XLOG_XACT_ABORT: u8 = 0x20;
|
||||
|
||||
// From srlu.h
|
||||
pub const SLRU_PAGES_PER_SEGMENT: u32 = 32;
|
||||
|
||||
/* mask for filtering opcodes out of xl_info */
|
||||
pub const XLOG_XACT_OPMASK: u8 = 0x70;
|
||||
/* does this record have a 'xinfo' field or not */
|
||||
@@ -53,13 +60,31 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
|
||||
// From heapam_xlog.h
|
||||
pub const XLOG_HEAP_INSERT: u8 = 0x00;
|
||||
pub const XLOG_HEAP_DELETE: u8 = 0x10;
|
||||
pub const XLOG_HEAP_UPDATE: u8 = 0x20;
|
||||
pub const XLOG_HEAP_HOT_UPDATE: u8 = 0x40;
|
||||
pub const XLOG_HEAP2_VISIBLE: u8 = 0x40;
|
||||
pub const XLOG_HEAP2_MULTI_INSERT: u8 = 0x50;
|
||||
pub const XLH_INSERT_ALL_FROZEN_SET: u8 = (1 << 5) as u8;
|
||||
pub const XLH_INSERT_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
|
||||
pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
|
||||
|
||||
pub const RM_XLOG_ID: u8 = 0;
|
||||
pub const RM_XACT_ID: u8 = 1;
|
||||
pub const RM_SMGR_ID: u8 = 2;
|
||||
pub const RM_CLOG_ID: u8 = 3;
|
||||
pub const RM_DBASE_ID: u8 = 4;
|
||||
pub const RM_TBLSPC_ID: u8 = 5;
|
||||
// pub const RM_MULTIXACT_ID:u8 = 6;
|
||||
pub const RM_MULTIXACT_ID: u8 = 6;
|
||||
pub const RM_RELMAP_ID: u8 = 7;
|
||||
pub const RM_STANDBY_ID: u8 = 8;
|
||||
pub const RM_HEAP2_ID: u8 = 9;
|
||||
pub const RM_HEAP_ID: u8 = 10;
|
||||
|
||||
// from xlogreader.h
|
||||
pub const XLR_INFO_MASK: u8 = 0x0F;
|
||||
@@ -74,8 +99,10 @@ pub const XLOG_TBLSPC_DROP: u8 = 0x10;
|
||||
|
||||
pub const SIZEOF_XLOGRECORD: u32 = 24;
|
||||
|
||||
// FIXME:
|
||||
// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
|
||||
// --with-segsize=SEGSIZE, but assume the defaults for now.
|
||||
pub const BLCKSZ: u16 = 8192;
|
||||
pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
|
||||
|
||||
//
|
||||
// from xlogrecord.h
|
||||
@@ -98,3 +125,6 @@ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous
|
||||
pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
|
||||
/* FIXME: pageserver should request wal_seg_size from compute node */
|
||||
pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
|
||||
|
||||
141
postgres_ffi/src/relfile_utils.rs
Normal file
141
postgres_ffi/src/relfile_utils.rs
Normal file
@@ -0,0 +1,141 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL relation files.
|
||||
//!
|
||||
use crate::pg_constants;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
|
||||
pub enum FilePathError {
|
||||
#[error("invalid relation fork name")]
|
||||
InvalidForkName,
|
||||
#[error("invalid relation data file name")]
|
||||
InvalidFileName,
|
||||
}
|
||||
|
||||
impl From<core::num::ParseIntError> for FilePathError {
|
||||
fn from(_e: core::num::ParseIntError) -> Self {
|
||||
FilePathError::InvalidFileName
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres relation file's fork suffix to fork number.
|
||||
pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(pg_constants::MAIN_FORKNUM),
|
||||
Some("fsm") => Ok(pg_constants::FSM_FORKNUM),
|
||||
Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM),
|
||||
Some("init") => Ok(pg_constants::INIT_FORKNUM),
|
||||
Some(_) => Err(FilePathError::InvalidForkName),
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert Postgres fork number to the right suffix of the relation data file.
|
||||
pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
match forknum {
|
||||
pg_constants::MAIN_FORKNUM => None,
|
||||
pg_constants::FSM_FORKNUM => Some("fsm"),
|
||||
pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"),
|
||||
pg_constants::INIT_FORKNUM => Some("init"),
|
||||
_ => Some("UNKNOWN FORKNUM"),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Parse a filename of a relation file. Returns (relfilenode, forknum, segno) tuple.
|
||||
///
|
||||
/// Formats:
|
||||
/// <oid>
|
||||
/// <oid>_<fork name>
|
||||
/// <oid>.<segment number>
|
||||
/// <oid>_<fork name>.<segment number>
|
||||
///
|
||||
/// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
|
||||
///
|
||||
pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
|
||||
lazy_static! {
|
||||
static ref RELFILE_RE: Regex =
|
||||
Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
|
||||
}
|
||||
let caps = RELFILE_RE
|
||||
.captures(fname)
|
||||
.ok_or(FilePathError::InvalidFileName)?;
|
||||
|
||||
let relnode_str = caps.name("relnode").unwrap().as_str();
|
||||
let relnode = relnode_str.parse::<u32>()?;
|
||||
|
||||
let forkname = caps.name("forkname").map(|f| f.as_str());
|
||||
let forknum = forkname_to_number(forkname)?;
|
||||
|
||||
let segno_match = caps.name("segno");
|
||||
let segno = if segno_match.is_none() {
|
||||
0
|
||||
} else {
|
||||
segno_match.unwrap().as_str().parse::<u32>()?
|
||||
};
|
||||
|
||||
Ok((relnode, forknum, segno))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_parse_valid_relfilenames() {
|
||||
assert_eq!(parse_relfilename("1234"), Ok((1234, 0, 0)));
|
||||
assert_eq!(parse_relfilename("1234_fsm"), Ok((1234, 1, 0)));
|
||||
assert_eq!(parse_relfilename("1234_vm"), Ok((1234, 2, 0)));
|
||||
assert_eq!(parse_relfilename("1234_init"), Ok((1234, 3, 0)));
|
||||
|
||||
assert_eq!(parse_relfilename("1234.12"), Ok((1234, 0, 12)));
|
||||
assert_eq!(parse_relfilename("1234_fsm.12"), Ok((1234, 1, 12)));
|
||||
assert_eq!(parse_relfilename("1234_vm.12"), Ok((1234, 2, 12)));
|
||||
assert_eq!(parse_relfilename("1234_init.12"), Ok((1234, 3, 12)));
|
||||
|
||||
// relfilenode is unsigned, so it can go up to 2^32-1
|
||||
assert_eq!(parse_relfilename("3147483648"), Ok((3147483648, 0, 0)));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_invalid_relfilenames() {
|
||||
assert_eq!(
|
||||
parse_relfilename("foo"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1.2.3"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1234_invalid"),
|
||||
Err(FilePathError::InvalidForkName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("1234_"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
|
||||
// too large for u32
|
||||
assert_eq!(
|
||||
parse_relfilename("12345678901"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_relfilename("-1234"),
|
||||
Err(FilePathError::InvalidFileName)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_weird_relfilenames() {
|
||||
// we accept 0 for the relfilenode, but PostgreSQL should never do that.
|
||||
assert_eq!(parse_relfilename("0"), Ok((0, 0, 0)));
|
||||
|
||||
// PostgreSQL has a limit of 2^32-2 blocks in a table. With 8k block size and
|
||||
// 1 GB segments, the max segment number is 32767. But we accept larger values
|
||||
// currently.
|
||||
assert_eq!(parse_relfilename("1.123456"), Ok((1, 0, 123456)));
|
||||
}
|
||||
}
|
||||
@@ -27,26 +27,18 @@ pub const XLOG_SIZE_OF_XLOG_SHORT_PHD: usize = XLP_REM_LEN_OFFS + 4 + 4;
|
||||
pub const XLOG_SIZE_OF_XLOG_LONG_PHD: usize = XLOG_SIZE_OF_XLOG_SHORT_PHD + 8 + 4 + 4;
|
||||
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
|
||||
pub const XLOG_SIZE_OF_XLOG_RECORD: usize = XLOG_RECORD_CRC_OFFS + 4;
|
||||
pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = u64;
|
||||
pub type XLogSegNo = u64;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentOffset(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> u32 {
|
||||
(xlogptr as u32) & (wal_segsz_bytes as u32 - 1)
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLByteToSeg(xlogptr: XLogRecPtr, wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
xlogptr / wal_segsz_bytes as u64
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegNoOffsetToRecPtr(
|
||||
segno: XLogSegNo,
|
||||
@@ -207,33 +199,31 @@ pub fn find_end_of_wal(
|
||||
let mut high_tli: TimeLineID = 0;
|
||||
let mut high_ispartial = false;
|
||||
|
||||
for entry in fs::read_dir(data_dir).unwrap() {
|
||||
if let Ok(entry) = entry {
|
||||
let ispartial: bool;
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
/*
|
||||
* Check if the filename looks like an xlog file, or a .partial file.
|
||||
*/
|
||||
if IsXLogFileName(fname) {
|
||||
ispartial = false;
|
||||
} else if IsPartialXLogFileName(fname) {
|
||||
ispartial = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
|
||||
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
|
||||
continue;
|
||||
}
|
||||
if segno > high_segno
|
||||
|| (segno == high_segno && tli > high_tli)
|
||||
|| (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
|
||||
{
|
||||
high_segno = segno;
|
||||
high_tli = tli;
|
||||
high_ispartial = ispartial;
|
||||
}
|
||||
for entry in fs::read_dir(data_dir).unwrap().flatten() {
|
||||
let ispartial: bool;
|
||||
let entry_name = entry.file_name();
|
||||
let fname = entry_name.to_str().unwrap();
|
||||
/*
|
||||
* Check if the filename looks like an xlog file, or a .partial file.
|
||||
*/
|
||||
if IsXLogFileName(fname) {
|
||||
ispartial = false;
|
||||
} else if IsPartialXLogFileName(fname) {
|
||||
ispartial = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
|
||||
if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
|
||||
continue;
|
||||
}
|
||||
if segno > high_segno
|
||||
|| (segno == high_segno && tli > high_tli)
|
||||
|| (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
|
||||
{
|
||||
high_segno = segno;
|
||||
high_tli = tli;
|
||||
high_ispartial = ispartial;
|
||||
}
|
||||
}
|
||||
if high_segno > 0 {
|
||||
|
||||
92
test_runner/README.md
Normal file
92
test_runner/README.md
Normal file
@@ -0,0 +1,92 @@
|
||||
## Zenith test runner
|
||||
|
||||
This directory contains integration tests.
|
||||
|
||||
Prerequisites:
|
||||
- Python 3.6 or later
|
||||
- Python packages: pytest, psycopg2
|
||||
- pytest 6.0 is required.
|
||||
- __NOTE: `apt install` on Debian/Ubuntu won't work.__
|
||||
They ship a much older version of pytest (and sometimes rename it to
|
||||
`pytest-3`.)
|
||||
- Install using something like this:
|
||||
- `pip3 install pytest psycopg2` (Debian or Ubuntu)
|
||||
- Zenith and Postgres binaries
|
||||
- See the root README.md for build directions
|
||||
- Tests can be run from the git tree; or see the environment variables
|
||||
below to run from other directories.
|
||||
- The zenith git repo, including the postgres submodule
|
||||
(for some tests, e.g. pg_regress)
|
||||
|
||||
### Test Organization
|
||||
|
||||
The tests are divided into a few batches, such that each batch takes roughly
|
||||
the same amount of time. The batches can be run in parallel, to minimize total
|
||||
runtime. Currently, there are only two batches:
|
||||
|
||||
- test_batch_pg_regress: Runs PostgreSQL regression tests
|
||||
- test_others: All other tests
|
||||
|
||||
### Running the tests
|
||||
|
||||
Because pytest will search all subdirectories for tests, it's easiest to
|
||||
run the tests from within the `test_runner` directory.
|
||||
|
||||
Test state (postgres data, pageserver state, and log files) will
|
||||
be stored under a directory `test_output`.
|
||||
|
||||
You can run all the tests with:
|
||||
|
||||
`pytest`
|
||||
|
||||
If you want to run all the tests in a particular file:
|
||||
|
||||
`pytest test_pgbench.py`
|
||||
|
||||
If you want to run all tests that have the string "bench" in their names:
|
||||
|
||||
`pytest -k bench`
|
||||
|
||||
Useful environment variables:
|
||||
|
||||
`ZENITH_BIN`: The directory where zenith binaries can be found.
|
||||
`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
|
||||
`TEST_OUTPUT`: Set the directory where test state and test output files
|
||||
should go.
|
||||
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
|
||||
|
||||
Let stdout and stderr go to the terminal instead of capturing them:
|
||||
`pytest -s ...`
|
||||
(Note many tests capture subprocess outputs separately, so this may not
|
||||
show much.)
|
||||
|
||||
Exit after the first test failure:
|
||||
`pytest -x ...`
|
||||
(there are many more pytest options; run `pytest -h` to see them.)
|
||||
|
||||
|
||||
### Building new tests
|
||||
|
||||
The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
|
||||
|
||||
Essentially, this means that each time you see a fixture named as an input parameter, the function with that name will be run and passed as a parameter to the function.
|
||||
|
||||
So this code:
|
||||
```
|
||||
def test_something(zenith_cli, pg_bin):
|
||||
pass
|
||||
```
|
||||
|
||||
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
|
||||
|
||||
Fixtures can't be imported using the normal python syntax. Instead, use this:
|
||||
```
|
||||
pytest_plugins = ("fixtures.something")
|
||||
```
|
||||
That will make all the fixtures in the `fixtures/something.py` file available.
|
||||
|
||||
Anything that's likely to be used in multiple tests should be built into a fixture.
|
||||
|
||||
Note that fixtures can clean up after themselves if they use the `yield` syntax.
|
||||
Cleanup will happen even if the test fails (raises an unhandled exception).
|
||||
Python destructors, e.g. `__del__()` aren't recommended for cleanup.
|
||||
67
test_runner/batch_others/test_branch_behind.py
Normal file
67
test_runner/batch_others/test_branch_behind.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import pytest
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
# Create a couple of branches off the main branch, at a historical point in time.
|
||||
#
|
||||
def test_branch_behind(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"]);
|
||||
|
||||
pgmain = postgres.create_start('test_branch_behind')
|
||||
print("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
main_pg_conn = psycopg2.connect(pgmain.connstr());
|
||||
main_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
main_cur.execute('CREATE TABLE foo (t text)');
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
print('LSN after 100100 rows: ' + lsn_b)
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@"+lsn_a]);
|
||||
|
||||
# Insert many more rows. This generates enough WAL to fill a few segments.
|
||||
main_cur.execute("INSERT INTO foo SELECT 'long string to consume some space' || g FROM generate_series(1, 100000) g");
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
print('LSN after 200100 rows: ' + lsn_c)
|
||||
|
||||
# Branch at the point where only 200 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@"+lsn_b]);
|
||||
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
pg_more = postgres.create_start("test_branch_behind_more")
|
||||
|
||||
# On the 'hundred' branch, we should see only 100 rows
|
||||
hundred_pg_conn = psycopg2.connect(pg_hundred.connstr())
|
||||
hundred_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
hundred_cur = hundred_pg_conn.cursor()
|
||||
hundred_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(hundred_cur.fetchone()[0] == 100);
|
||||
|
||||
# On the 'more' branch, we should see 100200 rows
|
||||
more_pg_conn = psycopg2.connect(pg_more.connstr())
|
||||
more_pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(more_cur.fetchone()[0] == 100100);
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo');
|
||||
assert(main_cur.fetchone()[0] == 200100);
|
||||
30
test_runner/batch_others/test_config.py
Normal file
30
test_runner/batch_others/test_config.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import pytest
|
||||
import os
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test starting Postgres with custom options
|
||||
#
|
||||
def test_config(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_config", "empty"]);
|
||||
|
||||
# change config
|
||||
pg = postgres.create_start('test_config', ['log_min_messages=debug1'])
|
||||
print('postgres is running on test_config branch')
|
||||
|
||||
pg_conn = psycopg2.connect(pg.connstr())
|
||||
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
#check that config change was applied
|
||||
cur.execute('SELECT name, setting from pg_settings WHERE source!=%s and source!=%s', ("default","override",))
|
||||
for record in cur:
|
||||
if record[0] == 'log_min_messages':
|
||||
assert(record[1] == 'debug1')
|
||||
|
||||
pg_conn.close()
|
||||
37
test_runner/batch_others/test_createdb.py
Normal file
37
test_runner/batch_others/test_createdb.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import pytest
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
# Test CREATE DATABASE when there have been relmapper changes
|
||||
#
|
||||
def test_createdb(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"]);
|
||||
|
||||
pg = postgres.create_start('test_createdb')
|
||||
print("postgres is running on 'test_createdb' branch")
|
||||
|
||||
conn = psycopg2.connect(pg.connstr());
|
||||
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Cause a 'relmapper' change in the original branch
|
||||
cur.execute('VACUUM FULL pg_class');
|
||||
|
||||
cur.execute('CREATE DATABASE foodb');
|
||||
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()');
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
conn.close();
|
||||
|
||||
# Create a branch
|
||||
zenith_cli.run(["branch", "test_createdb2", "test_createdb@"+lsn]);
|
||||
|
||||
pg2 = postgres.create_start('test_createdb2')
|
||||
|
||||
# Test that you can connect to the new database on both branches
|
||||
conn = psycopg2.connect(pg.connstr('foodb'));
|
||||
conn2 = psycopg2.connect(pg2.connstr('foodb'));
|
||||
71
test_runner/batch_others/test_multixact.py
Normal file
71
test_runner/batch_others/test_multixact.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import pytest
|
||||
import os
|
||||
import psycopg2
|
||||
import multiprocessing
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
#
|
||||
# Test multixact state after branching
|
||||
# Now this test is very minimalistic -
|
||||
# it only checks next_multixact_id field in restored pg_control,
|
||||
# since we don't have functions to check multixact internals.
|
||||
#
|
||||
|
||||
def runQuery(connstr):
|
||||
con = psycopg2.connect(connstr)
|
||||
con.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = con.cursor()
|
||||
cur.execute('select * from t1 for key share;')
|
||||
|
||||
|
||||
def test_multixact(pageserver, postgres, pg_bin, zenith_cli, base_dir):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_multixact", "empty"])
|
||||
pg = postgres.create_start('test_multixact')
|
||||
|
||||
print("postgres is running on 'test_multixact' branch")
|
||||
pg_conn = psycopg2.connect(pg.connstr())
|
||||
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = pg_conn.cursor()
|
||||
|
||||
cur.execute('CREATE TABLE t1(i int primary key);'
|
||||
'INSERT INTO t1 select * from generate_series(1,100);')
|
||||
|
||||
cur.execute('SELECT next_multixact_id FROM pg_control_checkpoint();')
|
||||
next_multixact_id_old = cur.fetchone()[0]
|
||||
|
||||
# Lock entries in parallel connections to set multixact
|
||||
nclients = 3
|
||||
pool = multiprocessing.Pool(nclients)
|
||||
args = [pg.connstr()] * nclients
|
||||
pool.map(runQuery, args)
|
||||
pool.close()
|
||||
pool.join()
|
||||
|
||||
# force wal flush
|
||||
cur.execute('checkpoint')
|
||||
|
||||
cur.execute('SELECT next_multixact_id, pg_current_wal_flush_lsn() FROM pg_control_checkpoint();')
|
||||
res = cur.fetchone()
|
||||
next_multixact_id = res[0]
|
||||
lsn = res[1]
|
||||
|
||||
# Ensure that we did lock some tuples
|
||||
assert(int(next_multixact_id) > int(next_multixact_id_old))
|
||||
|
||||
# Branch at this point
|
||||
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@"+lsn]);
|
||||
pg_new = postgres.create_start('test_multixact_new')
|
||||
|
||||
print("postgres is running on 'test_multixact_new' branch")
|
||||
pg_new_conn = psycopg2.connect(pg_new.connstr())
|
||||
pg_new_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur_new = pg_new_conn.cursor()
|
||||
|
||||
cur_new.execute('SELECT next_multixact_id FROM pg_control_checkpoint();')
|
||||
next_multixact_id_new = cur_new.fetchone()[0]
|
||||
|
||||
# Check that we restored pg_controlfile correctly
|
||||
assert(next_multixact_id_new == next_multixact_id)
|
||||
54
test_runner/batch_others/test_pageserver_api.py
Normal file
54
test_runner/batch_others/test_pageserver_api.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import pytest
|
||||
import psycopg2
|
||||
import getpass
|
||||
import json
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def test_status(pageserver):
|
||||
pg_conn = psycopg2.connect(pageserver.connstr())
|
||||
pg_conn.autocommit = True
|
||||
cur = pg_conn.cursor()
|
||||
cur.execute('status;')
|
||||
assert cur.fetchone() == ('hello world',)
|
||||
pg_conn.close()
|
||||
|
||||
def test_branch_list(pageserver, zenith_cli):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_branch_list_main", "empty"]);
|
||||
|
||||
page_server_conn = psycopg2.connect(pageserver.connstr())
|
||||
page_server_conn.autocommit = True
|
||||
page_server_cur = page_server_conn.cursor()
|
||||
|
||||
page_server_cur.execute('branch_list;')
|
||||
branches = json.loads(page_server_cur.fetchone()[0])
|
||||
# Filter out branches created by other tests
|
||||
branches = [x for x in branches if x['name'].startswith('test_branch_list')]
|
||||
|
||||
assert len(branches) == 1
|
||||
assert branches[0]['name'] == 'test_branch_list_main'
|
||||
assert 'timeline_id' in branches[0]
|
||||
assert 'latest_valid_lsn' in branches[0]
|
||||
assert 'ancestor_id' in branches[0]
|
||||
assert 'ancestor_lsn' in branches[0]
|
||||
|
||||
# Create another branch, and start Postgres on it
|
||||
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
|
||||
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
|
||||
|
||||
page_server_cur.execute('branch_list;')
|
||||
new_branches = json.loads(page_server_cur.fetchone()[0])
|
||||
# Filter out branches created by other tests
|
||||
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
|
||||
assert len(new_branches) == 2
|
||||
new_branches.sort(key=lambda k: k['name'])
|
||||
|
||||
assert new_branches[0]['name'] == 'test_branch_list_experimental'
|
||||
assert new_branches[0]['timeline_id'] != branches[0]['timeline_id']
|
||||
|
||||
# TODO: do the LSNs have to match here?
|
||||
assert new_branches[1] == branches[0]
|
||||
|
||||
page_server_conn.close()
|
||||
17
test_runner/batch_others/test_pgbench.py
Normal file
17
test_runner/batch_others/test_pgbench.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import pytest
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def test_pgbench(pageserver, postgres, pg_bin, zenith_cli):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_pgbench", "empty"]);
|
||||
|
||||
pg = postgres.create_start('test_pgbench')
|
||||
print("postgres is running on 'test_pgbench' branch")
|
||||
|
||||
connstr = pg.connstr();
|
||||
|
||||
pg_bin.run_capture(['pgbench', '-i', connstr])
|
||||
pg_bin.run_capture(['pgbench'] + '-c 10 -T 5 -P 1 -M prepared'.split() + [connstr])
|
||||
50
test_runner/batch_others/test_twophase.py
Normal file
50
test_runner/batch_others/test_twophase.py
Normal file
@@ -0,0 +1,50 @@
|
||||
#
|
||||
# Test branching, when a transaction is in prepared state
|
||||
#
|
||||
import pytest
|
||||
import getpass
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def test_twophase(zenith_cli, pageserver, postgres, pg_bin):
|
||||
zenith_cli.run(["branch", "test_twophase", "empty"]);
|
||||
|
||||
pg = postgres.create_start('test_twophase', ['max_prepared_transactions=5'])
|
||||
print("postgres is running on 'test_twophase' branch")
|
||||
|
||||
conn = psycopg2.connect(pg.connstr());
|
||||
conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute('CREATE TABLE foo (t text)');
|
||||
|
||||
# Prepare a transaction that will insert a row
|
||||
cur.execute('BEGIN');
|
||||
cur.execute("INSERT INTO foo VALUES ('one')");
|
||||
cur.execute("PREPARE TRANSACTION 'insert_one'");
|
||||
|
||||
# Prepare another transaction that will insert a row
|
||||
cur.execute('BEGIN');
|
||||
cur.execute("INSERT INTO foo VALUES ('two')");
|
||||
cur.execute("PREPARE TRANSACTION 'insert_two'");
|
||||
|
||||
# Create a branch with the transaction in prepared state
|
||||
zenith_cli.run(["branch", "test_twophase_prepared", "test_twophase"]);
|
||||
|
||||
pg2 = postgres.create_start('test_twophase_prepared', ['max_prepared_transactions=5'])
|
||||
conn2 = psycopg2.connect(pg2.connstr());
|
||||
conn2.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur2 = conn2.cursor()
|
||||
|
||||
# On the new branch, commit one of the prepared transactions, abort the other one.
|
||||
cur2.execute("COMMIT PREPARED 'insert_one'");
|
||||
cur2.execute("ROLLBACK PREPARED 'insert_two'");
|
||||
|
||||
cur2.execute('SELECT * FROM foo');
|
||||
assert(cur2.fetchall() == [('one',)]);
|
||||
|
||||
# Neither insert is visible on the original branch, the transactions are still
|
||||
# in prepared state there.
|
||||
cur.execute('SELECT * FROM foo');
|
||||
assert(cur.fetchall() == []);
|
||||
49
test_runner/batch_others/test_zenith_cli.py
Normal file
49
test_runner/batch_others/test_zenith_cli.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import pytest
|
||||
import psycopg2
|
||||
import json
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
def helper_compare_branch_list(page_server_cur, zenith_cli):
|
||||
"""
|
||||
Compare branches list returned by CLI and directly via API.
|
||||
Filters out branches created by other tests.
|
||||
"""
|
||||
|
||||
page_server_cur.execute('branch_list;')
|
||||
branches_api = sorted(map(lambda b: b['name'], json.loads(page_server_cur.fetchone()[0])))
|
||||
branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')]
|
||||
|
||||
res = zenith_cli.run(["branch"]);
|
||||
assert(res.stderr == '')
|
||||
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
|
||||
branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')]
|
||||
|
||||
assert(branches_api == branches_cli)
|
||||
|
||||
def test_cli_branch_list(pageserver, zenith_cli):
|
||||
|
||||
page_server_conn = psycopg2.connect(pageserver.connstr())
|
||||
page_server_conn.autocommit = True
|
||||
page_server_cur = page_server_conn.cursor()
|
||||
|
||||
# Initial sanity check
|
||||
helper_compare_branch_list(page_server_cur, zenith_cli)
|
||||
|
||||
# Create a branch for us
|
||||
res = zenith_cli.run(["branch", "test_cli_branch_list_main", "main"]);
|
||||
assert(res.stderr == '')
|
||||
helper_compare_branch_list(page_server_cur, zenith_cli)
|
||||
|
||||
# Create a nested branch
|
||||
res = zenith_cli.run(["branch", "test_cli_branch_list_nested", "test_cli_branch_list_main"]);
|
||||
assert(res.stderr == '')
|
||||
helper_compare_branch_list(page_server_cur, zenith_cli)
|
||||
|
||||
# Check that all new branches are visible via CLI
|
||||
res = zenith_cli.run(["branch"]);
|
||||
assert(res.stderr == '')
|
||||
branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n")))
|
||||
|
||||
assert('test_cli_branch_list_main' in branches_cli)
|
||||
assert('test_cli_branch_list_nested' in branches_cli)
|
||||
61
test_runner/batch_pg_regress/test_pg_regress.py
Normal file
61
test_runner/batch_pg_regress/test_pg_regress.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import pytest
|
||||
from fixtures.utils import mkdir_if_needed
|
||||
import getpass
|
||||
import os
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
# FIXME: put host + port in a fixture
|
||||
HOST = 'localhost'
|
||||
PORT = 55432
|
||||
|
||||
|
||||
def test_pg_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_pg_regress", "empty"]);
|
||||
|
||||
# Connect to postgres and create a database called "regression".
|
||||
pg = postgres.create_start('test_pg_regress')
|
||||
pg_conn = psycopg2.connect(pg.connstr())
|
||||
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = pg_conn.cursor()
|
||||
cur.execute('CREATE DATABASE regression')
|
||||
pg_conn.close()
|
||||
|
||||
# Create some local directories for pg_regress to run in.
|
||||
runpath = os.path.join(test_output_dir, 'regress')
|
||||
mkdir_if_needed(runpath)
|
||||
mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
|
||||
|
||||
# Compute all the file locations that pg_regress will need.
|
||||
build_path = os.path.join(
|
||||
pg_distrib_dir, 'build/src/test/regress')
|
||||
src_path = os.path.join(
|
||||
base_dir, 'vendor/postgres/src/test/regress')
|
||||
bindir = os.path.join(pg_distrib_dir, 'bin')
|
||||
schedule = os.path.join(src_path, 'parallel_schedule')
|
||||
pg_regress = os.path.join(build_path, 'pg_regress')
|
||||
|
||||
pg_regress_command = [
|
||||
pg_regress,
|
||||
'--bindir=""',
|
||||
'--use-existing',
|
||||
'--bindir={}'.format(bindir),
|
||||
'--dlpath={}'.format(build_path),
|
||||
'--schedule={}'.format(schedule),
|
||||
'--inputdir={}'.format(src_path),
|
||||
]
|
||||
|
||||
env = {
|
||||
'PGPORT': str(pg.port),
|
||||
'PGUSER': pg.username,
|
||||
'PGHOST': pg.host,
|
||||
}
|
||||
|
||||
# Run the command.
|
||||
# We don't capture the output. It's not too chatty, and it always
|
||||
# logs the exact same data to `regression.out` anyway.
|
||||
with capsys.disabled():
|
||||
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
|
||||
62
test_runner/batch_pg_regress/test_zehith_regress.py
Normal file
62
test_runner/batch_pg_regress/test_zehith_regress.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import pytest
|
||||
from fixtures.utils import mkdir_if_needed
|
||||
import getpass
|
||||
import os
|
||||
import psycopg2
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
# FIXME: put host + port in a fixture
|
||||
HOST = 'localhost'
|
||||
PORT = 55432
|
||||
|
||||
|
||||
def test_zenith_regress(pageserver, postgres, pg_bin, zenith_cli, test_output_dir, pg_distrib_dir, base_dir, capsys):
|
||||
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_zenith_regress", "empty"]);
|
||||
|
||||
# Connect to postgres and create a database called "regression".
|
||||
pg = postgres.create_start('test_zenith_regress')
|
||||
pg_conn = psycopg2.connect(pg.connstr())
|
||||
pg_conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
|
||||
cur = pg_conn.cursor()
|
||||
cur.execute('CREATE DATABASE regression')
|
||||
pg_conn.close()
|
||||
|
||||
# Create some local directories for pg_regress to run in.
|
||||
runpath = os.path.join(test_output_dir, 'regress')
|
||||
mkdir_if_needed(runpath)
|
||||
mkdir_if_needed(os.path.join(runpath, 'testtablespace'))
|
||||
|
||||
# Compute all the file locations that pg_regress will need.
|
||||
# This test runs zenith specific tests
|
||||
build_path = os.path.join(
|
||||
pg_distrib_dir, 'build/src/test/regress')
|
||||
src_path = os.path.join(
|
||||
base_dir, 'test_runner/zenith_regress')
|
||||
bindir = os.path.join(pg_distrib_dir, 'bin')
|
||||
schedule = os.path.join(src_path, 'parallel_schedule')
|
||||
pg_regress = os.path.join(build_path, 'pg_regress')
|
||||
|
||||
pg_regress_command = [
|
||||
pg_regress,
|
||||
'--use-existing',
|
||||
'--bindir={}'.format(bindir),
|
||||
'--dlpath={}'.format(build_path),
|
||||
'--schedule={}'.format(schedule),
|
||||
'--inputdir={}'.format(src_path),
|
||||
]
|
||||
|
||||
print(pg_regress_command)
|
||||
env = {
|
||||
'PGPORT': str(pg.port),
|
||||
'PGUSER': pg.username,
|
||||
'PGHOST': pg.host,
|
||||
}
|
||||
|
||||
# Run the command.
|
||||
# We don't capture the output. It's not too chatty, and it always
|
||||
# logs the exact same data to `regression.out` anyway.
|
||||
with capsys.disabled():
|
||||
pg_bin.run(pg_regress_command, env=env, cwd=runpath)
|
||||
1
test_runner/conftest.py
Normal file
1
test_runner/conftest.py
Normal file
@@ -0,0 +1 @@
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
0
test_runner/fixtures/__init__.py
Normal file
0
test_runner/fixtures/__init__.py
Normal file
53
test_runner/fixtures/utils.py
Normal file
53
test_runner/fixtures/utils.py
Normal file
@@ -0,0 +1,53 @@
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
def get_self_dir():
|
||||
""" Get the path to the directory where this script lives. """
|
||||
return os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
def mkdir_if_needed(path):
|
||||
""" Create a directory if it doesn't already exist
|
||||
|
||||
Note this won't try to create intermediate directories.
|
||||
"""
|
||||
if os.path.exists(path):
|
||||
assert os.path.isdir(path)
|
||||
return
|
||||
os.mkdir(path)
|
||||
|
||||
|
||||
def subprocess_capture(capture_dir, cmd, **kwargs):
|
||||
""" Run a process and capture its output
|
||||
|
||||
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
|
||||
where "cmd" is the name of the program and NNN is an incrementing
|
||||
counter.
|
||||
|
||||
If those files already exist, we will overwrite them.
|
||||
"""
|
||||
assert type(cmd) is list
|
||||
base = os.path.basename(cmd[0]) + '_{}'.format(global_counter())
|
||||
basepath = os.path.join(capture_dir, base)
|
||||
stdout_filename = basepath + '.stdout'
|
||||
stderr_filename = basepath + '.stderr'
|
||||
|
||||
with open(stdout_filename, 'w') as stdout_f:
|
||||
with open(stderr_filename, 'w') as stderr_f:
|
||||
print('(capturing output to "{}.stdout")'.format(base))
|
||||
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
|
||||
|
||||
|
||||
_global_counter = 0
|
||||
|
||||
|
||||
def global_counter():
|
||||
""" A really dumb global counter.
|
||||
|
||||
This is useful for giving output files a unique number, so if we run the
|
||||
same command multiple times we can keep their output separate.
|
||||
"""
|
||||
global _global_counter
|
||||
_global_counter += 1
|
||||
return _global_counter
|
||||
357
test_runner/fixtures/zenith_fixtures.py
Normal file
357
test_runner/fixtures/zenith_fixtures.py
Normal file
@@ -0,0 +1,357 @@
|
||||
import getpass
|
||||
import os
|
||||
import psycopg2
|
||||
import pytest
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from .utils import (get_self_dir, mkdir_if_needed,
|
||||
subprocess_capture, global_counter)
|
||||
|
||||
"""
|
||||
This file contains pytest fixtures. A fixture is a test resource that can be
|
||||
summoned by placing its name in the test's arguments.
|
||||
|
||||
A fixture is created with the decorator @zenfixture, which is a wrapper around
|
||||
the standard pytest.fixture with some extra behavior.
|
||||
|
||||
There are several environment variables that can control the running of tests:
|
||||
ZENITH_BIN, POSTGRES_DISTRIB_DIR, etc. See README.md for more information.
|
||||
|
||||
To use fixtures in a test file, add this line of code:
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
Don't import functions from this file, or pytest will emit warnings. Instead
|
||||
put directly-importable functions into utils.py or another separate file.
|
||||
"""
|
||||
|
||||
DEFAULT_OUTPUT_DIR = 'test_output'
|
||||
DEFAULT_POSTGRES_DIR = 'tmp_install'
|
||||
|
||||
|
||||
def determine_scope(fixture_name, config):
|
||||
return 'session'
|
||||
|
||||
|
||||
def zenfixture(func):
|
||||
""" This is a python decorator for fixtures with a flexible scope.
|
||||
|
||||
By default every test function will set up and tear down a new
|
||||
database. In pytest, this is called fixtures "function" scope.
|
||||
|
||||
If the environment variable TEST_SHARED_FIXTURES is set, then all
|
||||
tests will share the same database. State, logs, etc. will be
|
||||
stored in a directory called "shared".
|
||||
|
||||
"""
|
||||
if os.environ.get('TEST_SHARED_FIXTURES') is None:
|
||||
scope = 'function'
|
||||
else:
|
||||
scope = 'session'
|
||||
return pytest.fixture(func, scope=scope)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True, scope='session')
|
||||
def safety_check():
|
||||
""" Ensure that no unwanted daemons are running before we start testing. """
|
||||
# does not use -c as it is not supported on macOS
|
||||
cmd = ['pgrep', 'pageserver|postgres|wal_acceptor']
|
||||
result = subprocess.run(cmd, stdout=subprocess.DEVNULL)
|
||||
if result.returncode == 0:
|
||||
# returncode of 0 means it found something.
|
||||
# This is bad; we don't want any of those processes polluting the
|
||||
# result of the test.
|
||||
raise Exception('found interfering processes running')
|
||||
|
||||
|
||||
class ZenithCli:
|
||||
""" An object representing the CLI binary named "zenith".
|
||||
|
||||
We also store an environment that will tell the CLI to operate
|
||||
on a particular ZENITH_REPO_DIR.
|
||||
"""
|
||||
|
||||
def __init__(self, binpath, repo_dir, pg_distrib_dir):
|
||||
assert os.path.isdir(binpath)
|
||||
self.binpath = binpath
|
||||
self.bin_zenith = os.path.join(binpath, 'zenith')
|
||||
self.env = os.environ.copy()
|
||||
self.env['ZENITH_REPO_DIR'] = repo_dir
|
||||
self.env['POSTGRES_DISTRIB_DIR'] = pg_distrib_dir
|
||||
|
||||
def run(self, arguments):
|
||||
""" Run "zenith" with the specified arguments.
|
||||
|
||||
arguments must be in list form, e.g. ['pg', 'create']
|
||||
|
||||
Return both stdout and stderr, which can be accessed as
|
||||
|
||||
result = zenith_cli.run(...)
|
||||
assert(result.stderr == "")
|
||||
print(result.stdout)
|
||||
|
||||
"""
|
||||
assert type(arguments) == list
|
||||
args = [self.bin_zenith] + arguments
|
||||
print('Running command "{}"'.format(' '.join(args)))
|
||||
return subprocess.run(args, env=self.env, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
|
||||
|
||||
@zenfixture
|
||||
def zenith_cli(zenith_binpath, repo_dir, pg_distrib_dir):
|
||||
return ZenithCli(zenith_binpath, repo_dir, pg_distrib_dir)
|
||||
|
||||
|
||||
class ZenithPageserver:
|
||||
""" An object representing a running pageserver. """
|
||||
|
||||
def __init__(self, zenith_cli):
|
||||
self.zenith_cli = zenith_cli
|
||||
self.running = False
|
||||
|
||||
# Initialize the repository, i.e. run "zenith init"
|
||||
def init(self):
|
||||
self.zenith_cli.run(['init'])
|
||||
|
||||
# Start the page server
|
||||
def start(self):
|
||||
self.zenith_cli.run(['start'])
|
||||
self.running = True
|
||||
|
||||
# Stop the page server
|
||||
def stop(self):
|
||||
self.zenith_cli.run(['stop'])
|
||||
self.running = True
|
||||
|
||||
# The page server speaks the Postgres FE/BE protocol, so you can connect
|
||||
# to it with any Postgres client, and run special commands. This function
|
||||
# returns a libpq connection string for connecting to it.
|
||||
def connstr(self):
|
||||
username = getpass.getuser()
|
||||
conn_str = 'host={} port={} dbname=postgres user={}'.format(
|
||||
'localhost', 64000, username)
|
||||
return conn_str
|
||||
|
||||
# The 'pageserver' fixture provides a Page Server that's up and running.
|
||||
#
|
||||
# If TEST_SHARED_FIXTURES is set, the Page Server instance is shared by all
|
||||
# the tests. To avoid clashing with other tests, don't use the 'main' branch in
|
||||
# the tests directly. Instead, create a branch off the 'empty' branch and use
|
||||
# that.
|
||||
#
|
||||
# By convention, the test branches are named after the tests. For example,
|
||||
# test called 'test_foo' would create and use branches with the 'test_foo' prefix.
|
||||
@zenfixture
|
||||
def pageserver(zenith_cli):
|
||||
ps = ZenithPageserver(zenith_cli)
|
||||
ps.init()
|
||||
ps.start()
|
||||
# For convenience in tests, create a branch from the freshly-initialized cluster.
|
||||
zenith_cli.run(["branch", "empty", "main"]);
|
||||
yield ps
|
||||
# After the yield comes any cleanup code we need.
|
||||
print('Starting pageserver cleanup')
|
||||
ps.stop()
|
||||
|
||||
class Postgres:
|
||||
""" An object representing a running postgres daemon. """
|
||||
|
||||
def __init__(self, zenith_cli, repo_dir, instance_num):
|
||||
self.zenith_cli = zenith_cli
|
||||
self.instance_num = instance_num
|
||||
self.running = False
|
||||
self.username = getpass.getuser()
|
||||
self.host = 'localhost'
|
||||
self.port = 55431 + instance_num
|
||||
self.repo_dir = repo_dir
|
||||
self.branch = None
|
||||
# path to conf is <repo_dir>/pgdatadirs/<branch_name>/postgresql.conf
|
||||
|
||||
def create_start(self, branch, config_lines=None):
|
||||
""" create the pg data directory, and start the server """
|
||||
self.zenith_cli.run(['pg', 'create', branch])
|
||||
self.branch = branch
|
||||
if config_lines is None:
|
||||
config_lines = []
|
||||
self.config(config_lines)
|
||||
self.zenith_cli.run(['pg', 'start', branch])
|
||||
self.running = True
|
||||
return
|
||||
|
||||
#lines should be an array of valid postgresql.conf rows
|
||||
def config(self, lines):
|
||||
filename = 'pgdatadirs/{}/postgresql.conf'.format(self.branch)
|
||||
config_name = os.path.join(self.repo_dir, filename)
|
||||
with open(config_name, 'a') as conf:
|
||||
for line in lines:
|
||||
conf.write(line)
|
||||
conf.write('\n')
|
||||
|
||||
def stop(self):
|
||||
if self.running:
|
||||
self.zenith_cli.run(['pg', 'stop', self.branch])
|
||||
|
||||
# Return a libpq connection string to connect to the Postgres instance
|
||||
def connstr(self, dbname='postgres'):
|
||||
conn_str = 'host={} port={} dbname={} user={}'.format(
|
||||
self.host, self.port, dbname, self.username)
|
||||
return conn_str
|
||||
|
||||
class PostgresFactory:
|
||||
""" An object representing multiple running postgres daemons. """
|
||||
def __init__(self, zenith_cli, repo_dir):
|
||||
self.zenith_cli = zenith_cli
|
||||
self.host = 'localhost'
|
||||
self.repo_dir = repo_dir
|
||||
self.num_instances = 0
|
||||
self.instances = []
|
||||
|
||||
def create_start(self, branch="main", config_lines=None):
|
||||
pg = Postgres(self.zenith_cli, self.repo_dir, self.num_instances + 1)
|
||||
self.num_instances += 1
|
||||
self.instances.append(pg)
|
||||
pg.create_start(branch, config_lines)
|
||||
return pg
|
||||
|
||||
def stop_all(self):
|
||||
for pg in self.instances:
|
||||
pg.stop()
|
||||
|
||||
@zenfixture
|
||||
def postgres(zenith_cli, repo_dir):
|
||||
pgfactory = PostgresFactory(zenith_cli, repo_dir)
|
||||
yield pgfactory
|
||||
# After the yield comes any cleanup code we need.
|
||||
print('Starting postgres cleanup')
|
||||
pgfactory.stop_all()
|
||||
|
||||
|
||||
class PgBin:
|
||||
""" A helper class for executing postgres binaries """
|
||||
|
||||
def __init__(self, log_dir, pg_distrib_dir):
|
||||
self.log_dir = log_dir
|
||||
self.pg_install_path = pg_distrib_dir
|
||||
self.pg_bin_path = os.path.join(self.pg_install_path, 'bin')
|
||||
self.env = os.environ.copy()
|
||||
self.env['LD_LIBRARY_PATH'] = os.path.join(self.pg_install_path, 'lib')
|
||||
|
||||
def _fixpath(self, command):
|
||||
if not '/' in command[0]:
|
||||
command[0] = os.path.join(self.pg_bin_path, command[0])
|
||||
|
||||
def _build_env(self, env_add):
|
||||
if env_add is None:
|
||||
return self.env
|
||||
env = self.env.copy()
|
||||
env.update(env_add)
|
||||
return env
|
||||
|
||||
def run(self, command, env=None, cwd=None):
|
||||
""" Run one of the postgres binaries.
|
||||
|
||||
The command should be in list form, e.g. ['pgbench', '-p', '55432']
|
||||
|
||||
All the necessary environment variables will be set.
|
||||
|
||||
If the first argument (the command name) doesn't include a path (no '/'
|
||||
characters present), then it will be edited to include the correct path.
|
||||
|
||||
If you want stdout/stderr captured to files, use `run_capture` instead.
|
||||
|
||||
"""
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess.run(command, env=env, cwd=cwd, check=True)
|
||||
|
||||
def run_capture(self, command, env=None, cwd=None):
|
||||
""" Run one of the postgres binaries, with stderr and stdout redirected to a file.
|
||||
|
||||
This is just like `run`, but for chatty programs.
|
||||
"""
|
||||
self._fixpath(command)
|
||||
print('Running command "{}"'.format(' '.join(command)))
|
||||
env = self._build_env(env)
|
||||
subprocess_capture(self.log_dir, command, env=env, cwd=cwd, check=True)
|
||||
|
||||
|
||||
@zenfixture
|
||||
def pg_bin(test_output_dir, pg_distrib_dir):
|
||||
return PgBin(test_output_dir, pg_distrib_dir)
|
||||
|
||||
|
||||
@zenfixture
|
||||
def base_dir():
|
||||
""" find the base directory (currently this is the git root) """
|
||||
base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..'))
|
||||
print('base_dir is', base_dir)
|
||||
return base_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def top_output_dir(base_dir):
|
||||
""" Compute the top-level directory for all tests. """
|
||||
env_test_output = os.environ.get('TEST_OUTPUT')
|
||||
if env_test_output is not None:
|
||||
output_dir = env_test_output
|
||||
else:
|
||||
output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
|
||||
mkdir_if_needed(output_dir)
|
||||
return output_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def test_output_dir(request, top_output_dir):
|
||||
""" Compute the working directory for an individual test. """
|
||||
if os.environ.get('TEST_SHARED_FIXTURES') is None:
|
||||
# one directory per test
|
||||
test_name = request.node.name
|
||||
else:
|
||||
# We're running shared fixtures. Share a single directory.
|
||||
test_name = 'shared'
|
||||
|
||||
test_output_dir = os.path.join(top_output_dir, test_name)
|
||||
print('test_output_dir is', test_output_dir)
|
||||
shutil.rmtree(test_output_dir, ignore_errors=True)
|
||||
mkdir_if_needed(test_output_dir)
|
||||
return test_output_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def repo_dir(request, test_output_dir):
|
||||
""" Compute the test repo_dir
|
||||
|
||||
"repo_dir" is the place where all of the pageserver files will go.
|
||||
It doesn't have anything to do with the git repo.
|
||||
"""
|
||||
repo_dir = os.path.join(test_output_dir, 'repo')
|
||||
return repo_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def zenith_binpath(base_dir):
|
||||
""" find the zenith binaries """
|
||||
env_zenith_bin = os.environ.get('ZENITH_BIN')
|
||||
if env_zenith_bin:
|
||||
zenith_dir = env_zenith_bin
|
||||
else:
|
||||
zenith_dir = os.path.join(base_dir, 'target/debug')
|
||||
if not os.path.exists(os.path.join(zenith_dir, 'pageserver')):
|
||||
raise Exception('zenith binaries not found at "{}"'.format(zenith_dir))
|
||||
return zenith_dir
|
||||
|
||||
|
||||
@zenfixture
|
||||
def pg_distrib_dir(base_dir):
|
||||
""" find the postgress install """
|
||||
env_postgres_bin = os.environ.get('POSTGRES_DISTRIB_DIR')
|
||||
if env_postgres_bin:
|
||||
pg_dir = env_postgres_bin
|
||||
else:
|
||||
pg_dir = os.path.normpath(os.path.join(base_dir, DEFAULT_POSTGRES_DIR))
|
||||
print('postgres dir is', pg_dir)
|
||||
if not os.path.exists(os.path.join(pg_dir, 'bin/postgres')):
|
||||
raise Exception('postgres not found at "{}"'.format(pg_dir))
|
||||
return pg_dir
|
||||
2
test_runner/pytest.ini
Normal file
2
test_runner/pytest.ini
Normal file
@@ -0,0 +1,2 @@
|
||||
[pytest]
|
||||
minversion = 6.0
|
||||
33
test_runner/test_broken.py
Normal file
33
test_runner/test_broken.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import pytest
|
||||
import os
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
"""
|
||||
|
||||
Use this test to see what happens when tests fail.
|
||||
|
||||
We should be able to clean up after ourselves, including stopping any
|
||||
postgres or pageserver processes.
|
||||
|
||||
Set the environment variable RUN_BROKEN to see this test run (and fail,
|
||||
and hopefully not leave any server processes behind).
|
||||
|
||||
"""
|
||||
|
||||
|
||||
run_broken = pytest.mark.skipif(
|
||||
os.environ.get('RUN_BROKEN') == None,
|
||||
reason="only used for testing the fixtures"
|
||||
)
|
||||
|
||||
@run_broken
|
||||
def test_broken(zenith_cli, pageserver, postgres, pg_bin):
|
||||
# Create a branch for us
|
||||
zenith_cli.run(["branch", "test_broken", "empty"]);
|
||||
|
||||
pg = postgres.create_start("test_broken")
|
||||
print('postgres is running')
|
||||
|
||||
print('THIS NEXT COMMAND WILL FAIL:')
|
||||
pg_bin.run('pgbench -i_am_a_broken_test'.split())
|
||||
11
test_runner/zenith_regress/.gitignore
vendored
Normal file
11
test_runner/zenith_regress/.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
# Local binaries
|
||||
/pg_regress
|
||||
|
||||
# Generated subdirectories
|
||||
/tmp_check/
|
||||
/results/
|
||||
/log/
|
||||
|
||||
# Note: regression.* are only left behind on a failure; that's why they're not ignored
|
||||
#/regression.diffs
|
||||
#/regression.out
|
||||
11
test_runner/zenith_regress/README.md
Normal file
11
test_runner/zenith_regress/README.md
Normal file
@@ -0,0 +1,11 @@
|
||||
To add a new SQL test
|
||||
|
||||
- add sql script to run to zenith_regress/sql/testname.sql
|
||||
- add expected output to zenith/regress/expected/testname.out
|
||||
- add testname to both parallel_schedule and serial_schedule files*
|
||||
|
||||
That's it.
|
||||
For more complex tests see PostgreSQL regression tests. These works basically the same.
|
||||
|
||||
*it was changed recently in PostgreSQL upstream - no more separate serial_schedule.
|
||||
Someday we'll catch up with these changes.
|
||||
9
test_runner/zenith_regress/expected/.gitignore
vendored
Normal file
9
test_runner/zenith_regress/expected/.gitignore
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
/constraints.out
|
||||
/copy.out
|
||||
/create_function_1.out
|
||||
/create_function_2.out
|
||||
/largeobject.out
|
||||
/largeobject_1.out
|
||||
/misc.out
|
||||
/security_label.out
|
||||
/tablespace.out
|
||||
34
test_runner/zenith_regress/expected/zenith-cid.out
Normal file
34
test_runner/zenith_regress/expected/zenith-cid.out
Normal file
@@ -0,0 +1,34 @@
|
||||
BEGIN;
|
||||
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
|
||||
CREATE TABLE cursor (a int);
|
||||
INSERT INTO cursor VALUES (1);
|
||||
DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
|
||||
UPDATE cursor SET a = 2;
|
||||
FETCH ALL FROM c1;
|
||||
a
|
||||
---
|
||||
(0 rows)
|
||||
|
||||
COMMIT;
|
||||
DROP TABLE cursor;
|
||||
create table to_be_evicted(x bigint);
|
||||
begin;
|
||||
insert into to_be_evicted values (1);
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
select sum(x) from to_be_evicted;
|
||||
sum
|
||||
-------------
|
||||
25937424601
|
||||
(1 row)
|
||||
|
||||
end;
|
||||
drop table to_be_evicted;
|
||||
15
test_runner/zenith_regress/expected/zenith-clog.out
Normal file
15
test_runner/zenith_regress/expected/zenith-clog.out
Normal file
@@ -0,0 +1,15 @@
|
||||
create or replace procedure do_commits() as $$
|
||||
declare
|
||||
xid xid8;
|
||||
i integer;
|
||||
begin
|
||||
for i in 1..1000000 loop
|
||||
xid = txid_current();
|
||||
commit;
|
||||
if (pg_xact_status(xid) <> 'committed') then
|
||||
raise exception 'CLOG corruption';
|
||||
end if;
|
||||
end loop;
|
||||
end;
|
||||
$$ language plpgsql;
|
||||
call do_commits();
|
||||
19
test_runner/zenith_regress/expected/zenith-rel-truncate.out
Normal file
19
test_runner/zenith_regress/expected/zenith-rel-truncate.out
Normal file
@@ -0,0 +1,19 @@
|
||||
--
|
||||
-- Test that when a relation is truncated by VACUUM, the next smgrnblocks()
|
||||
-- query to get the relation's size returns the new size.
|
||||
-- (This isn't related to the TRUNCATE command, which works differently,
|
||||
-- by creating a new relation file)
|
||||
--
|
||||
CREATE TABLE truncatetest (i int);
|
||||
INSERT INTO truncatetest SELECT g FROM generate_series(1, 10000) g;
|
||||
-- Remove all the rows, and run VACUUM to remove the dead tuples and
|
||||
-- truncate the physical relation to 0 blocks.
|
||||
DELETE FROM truncatetest;
|
||||
VACUUM truncatetest;
|
||||
-- Check that a SeqScan sees correct relation size (which is now 0)
|
||||
SELECT * FROM truncatetest;
|
||||
i
|
||||
---
|
||||
(0 rows)
|
||||
|
||||
DROP TABLE truncatetest;
|
||||
9
test_runner/zenith_regress/expected/zenith-truncate.out
Normal file
9
test_runner/zenith_regress/expected/zenith-truncate.out
Normal file
@@ -0,0 +1,9 @@
|
||||
create table tt(x integer);
|
||||
insert into tt values (generate_series(1,10000));
|
||||
delete from tt;
|
||||
vacuum tt;
|
||||
insert into tt values (generate_series(1,10000));
|
||||
delete from tt;
|
||||
vacuum tt;
|
||||
insert into tt values (generate_series(1,10000));
|
||||
drop table tt;
|
||||
304
test_runner/zenith_regress/expected/zenith-vacuum-full.out
Normal file
304
test_runner/zenith_regress/expected/zenith-vacuum-full.out
Normal file
@@ -0,0 +1,304 @@
|
||||
create table foo(a int primary key, b int, c int);
|
||||
insert into foo values (generate_series(1,10000), generate_series(1,10000), generate_series(1,10000));
|
||||
create index concurrently on foo(b);
|
||||
create index concurrently on foo(c);
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
Table "public.foo"
|
||||
Column | Type | Collation | Nullable | Default
|
||||
--------+---------+-----------+----------+---------
|
||||
a | integer | | not null |
|
||||
b | integer | | |
|
||||
c | integer | | |
|
||||
Indexes:
|
||||
"foo_pkey" PRIMARY KEY, btree (a)
|
||||
"foo_b_idx" btree (b)
|
||||
"foo_c_idx" btree (c)
|
||||
|
||||
drop table foo;
|
||||
12
test_runner/zenith_regress/parallel_schedule
Normal file
12
test_runner/zenith_regress/parallel_schedule
Normal file
@@ -0,0 +1,12 @@
|
||||
# ----------
|
||||
# src/test/regress/parallel_schedule
|
||||
#
|
||||
# By convention, we put no more than twenty tests in any one parallel group;
|
||||
# this limits the number of connections needed to run the tests.
|
||||
# ----------
|
||||
|
||||
test: zenith-cid
|
||||
test: zenith-rel-truncate
|
||||
test: zenith-clog
|
||||
test: zenith-vacuum-full
|
||||
test: zenith-truncate
|
||||
7
test_runner/zenith_regress/serial_schedule
Normal file
7
test_runner/zenith_regress/serial_schedule
Normal file
@@ -0,0 +1,7 @@
|
||||
# src/test/regress/serial_schedule
|
||||
# This should probably be in an order similar to parallel_schedule.
|
||||
test: zenith-cid
|
||||
test: zenith-rel-truncate
|
||||
test: zenith-clog
|
||||
test: zenith-vacuum-full
|
||||
test: zenith-truncate
|
||||
8
test_runner/zenith_regress/sql/.gitignore
vendored
Normal file
8
test_runner/zenith_regress/sql/.gitignore
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/constraints.sql
|
||||
/copy.sql
|
||||
/create_function_1.sql
|
||||
/create_function_2.sql
|
||||
/largeobject.sql
|
||||
/misc.sql
|
||||
/security_label.sql
|
||||
/tablespace.sql
|
||||
26
test_runner/zenith_regress/sql/zenith-cid.sql
Normal file
26
test_runner/zenith_regress/sql/zenith-cid.sql
Normal file
@@ -0,0 +1,26 @@
|
||||
BEGIN;
|
||||
SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;
|
||||
CREATE TABLE cursor (a int);
|
||||
INSERT INTO cursor VALUES (1);
|
||||
DECLARE c1 NO SCROLL CURSOR FOR SELECT * FROM cursor FOR UPDATE;
|
||||
UPDATE cursor SET a = 2;
|
||||
FETCH ALL FROM c1;
|
||||
COMMIT;
|
||||
DROP TABLE cursor;
|
||||
|
||||
create table to_be_evicted(x bigint);
|
||||
begin;
|
||||
insert into to_be_evicted values (1);
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
insert into to_be_evicted select x*10 from to_be_evicted;
|
||||
select sum(x) from to_be_evicted;
|
||||
end;
|
||||
drop table to_be_evicted;
|
||||
16
test_runner/zenith_regress/sql/zenith-clog.sql
Normal file
16
test_runner/zenith_regress/sql/zenith-clog.sql
Normal file
@@ -0,0 +1,16 @@
|
||||
create or replace procedure do_commits() as $$
|
||||
declare
|
||||
xid xid8;
|
||||
i integer;
|
||||
begin
|
||||
for i in 1..1000000 loop
|
||||
xid = txid_current();
|
||||
commit;
|
||||
if (pg_xact_status(xid) <> 'committed') then
|
||||
raise exception 'CLOG corruption';
|
||||
end if;
|
||||
end loop;
|
||||
end;
|
||||
$$ language plpgsql;
|
||||
|
||||
call do_commits();
|
||||
18
test_runner/zenith_regress/sql/zenith-rel-truncate.sql
Normal file
18
test_runner/zenith_regress/sql/zenith-rel-truncate.sql
Normal file
@@ -0,0 +1,18 @@
|
||||
--
|
||||
-- Test that when a relation is truncated by VACUUM, the next smgrnblocks()
|
||||
-- query to get the relation's size returns the new size.
|
||||
-- (This isn't related to the TRUNCATE command, which works differently,
|
||||
-- by creating a new relation file)
|
||||
--
|
||||
CREATE TABLE truncatetest (i int);
|
||||
INSERT INTO truncatetest SELECT g FROM generate_series(1, 10000) g;
|
||||
|
||||
-- Remove all the rows, and run VACUUM to remove the dead tuples and
|
||||
-- truncate the physical relation to 0 blocks.
|
||||
DELETE FROM truncatetest;
|
||||
VACUUM truncatetest;
|
||||
|
||||
-- Check that a SeqScan sees correct relation size (which is now 0)
|
||||
SELECT * FROM truncatetest;
|
||||
|
||||
DROP TABLE truncatetest;
|
||||
9
test_runner/zenith_regress/sql/zenith-truncate.sql
Normal file
9
test_runner/zenith_regress/sql/zenith-truncate.sql
Normal file
@@ -0,0 +1,9 @@
|
||||
create table tt(x integer);
|
||||
insert into tt values (generate_series(1,10000));
|
||||
delete from tt;
|
||||
vacuum tt;
|
||||
insert into tt values (generate_series(1,10000));
|
||||
delete from tt;
|
||||
vacuum tt;
|
||||
insert into tt values (generate_series(1,10000));
|
||||
drop table tt;
|
||||
51
test_runner/zenith_regress/sql/zenith-vacuum-full.sql
Normal file
51
test_runner/zenith_regress/sql/zenith-vacuum-full.sql
Normal file
@@ -0,0 +1,51 @@
|
||||
create table foo(a int primary key, b int, c int);
|
||||
insert into foo values (generate_series(1,10000), generate_series(1,10000), generate_series(1,10000));
|
||||
create index concurrently on foo(b);
|
||||
create index concurrently on foo(c);
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
vacuum full foo;
|
||||
\d foo
|
||||
drop table foo;
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: c2409039d6...62852ad64e
@@ -20,17 +20,20 @@ slog = "2.7.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
rust-s3 = { git = "https://github.com/hlinnaka/rust-s3", rev="7f15a24ec7daa0a5d9516da706212745f9042818", features = ["no-verify-ssl"] }
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
tokio = { version = "1.3.0", features = ["full"] }
|
||||
tokio-stream = { version = "0.1.4" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="a0d067b66447951d1276a53fb09886539c3fa094" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
parse_duration = "*"
|
||||
parse_duration = "2.1.1"
|
||||
walkdir = "2"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
hex = "0.4.3"
|
||||
|
||||
# FIXME: 'pageserver' is needed for ZTimelineId. Refactor
|
||||
pageserver = { path = "../pageserver" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
282
walkeeper/README.md
Normal file
282
walkeeper/README.md
Normal file
@@ -0,0 +1,282 @@
|
||||
# Proxy-safekeeper communication consensus protocol.
|
||||
|
||||
## General requirements and architecture
|
||||
|
||||
There is single stateless master and several safekeepers. Number of safekeepers is determined by redundancy level.
|
||||
To minimize number of changes in Postgres core, we are using standard streaming replication from master (through WAL sender).
|
||||
This replication stream is initiated by `safekeeper_proxy` which receives data from the master and broadcasts it to safekeepers.
|
||||
To provide durability we use synchronous replication at master (response to the commit statement is sent to the client
|
||||
only when acknowledged by WAL receiver). `safekeeper_proxy` sends this acknowledgment only when LSN of commit record is confirmed by quorum of safekeepers.
|
||||
|
||||
`Safekeeper_proxy` tries to establish connections with safekeepers.
|
||||
At any moment of time each safekeeper can serve exactly once proxy, but it can accept new connections.
|
||||
|
||||
Any of safekeepers can be used as WAL server, producing replication stream. So both `Pagers` and `Replicas`
|
||||
(read-only computation nodes) can connect to safekeeper to receive WAL stream. Safekeepers is streaming WAL until
|
||||
it reaches min(`commitLSN`,`flushLSN`). Then replication is suspended until new data arrives from master.
|
||||
|
||||
|
||||
## Handshake
|
||||
The goal of handshake is to collect quorum (to be able to perform recovery)
|
||||
and avoid split-brains caused by simultaneous presence of old and new master.
|
||||
Procedure of handshake consists of the following steps:
|
||||
|
||||
1. Broadcast information about server to all safekeepers (wal segment size, system_id,...)
|
||||
2. Receive responses with information about safekeepers.
|
||||
3. Once quorum of handshake responses are received, propose new `NodeId(max(term)+1, server.uuid)`
|
||||
to all of them.
|
||||
4. On receiving proposed nodeId, safekeeper compares it with locally stored nodeId and if it is greater or equals
|
||||
then accepts proposed nodeId and persists this choice in the local control file.
|
||||
5. If quorum of safekeepers approve proposed nodeId, then server assumes that handshake is successfully completed and switch to recovery stage.
|
||||
|
||||
## Recovery
|
||||
Proxy computes max(`restartLSN`) and max(`flushLSN`) from quorum of attached safekeepers.
|
||||
`RestartLSN` - is position in WAL which is known to be delivered to all safekeepers.
|
||||
In other words: `restartLSN` can be also considered as cut-off horizon (all preceding WAL segments can be removed).
|
||||
`FlushLSN` is position flushed by safekeeper to the local persistent storage.
|
||||
|
||||
If max(`restartLSN`) != max(`flushLSN`), then recovery has to be performed.
|
||||
Proxy creates replication channel with most advanced safekeeper (safekeeper with the largest `flushLSN`).
|
||||
Then it downloads all WAL messages between max(`restartLSN`)..max(`flushLSN`).
|
||||
Messages are inserted in L1-list (ordered by LSN). Then we locate position of each safekeeper in this list according
|
||||
to their `flushLSN`s. Safekeepers that are not yet connected (out of quorum) should start from the beginning of the list
|
||||
(corresponding to `restartLSN`).
|
||||
|
||||
We need to choose max(`flushLSN`) because voting quorum may be different from quorum committed the last message.
|
||||
So we do not know whether records with max(`flushLSN`) was committed by quorum or not. So we have to consider it committed
|
||||
to avoid loose of committed data.
|
||||
|
||||
Calculated max(`flushLSN`) is called `VCL` (Volume Complete LSN). As far as it is chosen among quorum, there may be some other offline safekeeper with larger
|
||||
`VCL`. Once it becomes online, we need to overwrite its WAL beyond `VCL`. To support it, each safekeeper maintains
|
||||
`epoch` number. `Epoch` plays almost the same role as `term`, but algorithm of `epoch` bumping is different.
|
||||
`VCL` and new epoch are received by safekeeper from proxy during voting.
|
||||
But safekeeper doesn't switch to new epoch immediately after voting.
|
||||
Instead of it, safekeepers waits record with LSN > Max(`flushLSN`,`VCL`) is received.
|
||||
It means that we restore all records from old generation and switch to new generation.
|
||||
When proxy calculates max(`FlushLSN`), it first compares `Epoch`. So actually we compare (`Epoch`,`FlushLSN`) pairs.
|
||||
|
||||
Let's looks at the examples. Consider that we have three safekeepers: S1, S2, S3. Si(N) means that i-th safekeeper has epoch=N.
|
||||
Ri(x) - WAL record for resource X with LSN=i. Assume that we have the following state:
|
||||
|
||||
```
|
||||
S1(1): R1(a)
|
||||
S2(1): R1(a),R2(b)
|
||||
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
|
||||
```
|
||||
|
||||
Proxy choose quorum (S1,S2). VCL for them is 2. We download S2 to proxy and schedule its write to S1.
|
||||
After receiving record R5 the picture can be:
|
||||
|
||||
```
|
||||
S1(2): R1(a),R2(b),R3(e)
|
||||
S2(2): R1(a),R2(b),R3(e)
|
||||
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
|
||||
```
|
||||
|
||||
Now if server is crashed or restarted, we perform new voting and
|
||||
doesn't matter which quorum we choose: (S1,S2), (S2,S3)...
|
||||
in any case VCL=3, because S3 has smaller epoch.
|
||||
R3(c) will be overwritten with R3(e):
|
||||
|
||||
```
|
||||
S1(3): R1(a),R2(b),R3(e)
|
||||
S2(3): R1(a),R2(b),R3(e)
|
||||
S3(1): R1(a),R2(b),R3(e),R4(d)
|
||||
```
|
||||
|
||||
Epoch of S3 will be adjusted once it overwrites R4:
|
||||
|
||||
```
|
||||
S1(3): R1(a),R2(b),R3(e),R4(f)
|
||||
S2(3): R1(a),R2(b),R3(e),R4(f)
|
||||
S3(3): R1(a),R2(b),R3(e),R4(f)
|
||||
```
|
||||
|
||||
Crash can happen before epoch was bumped. Let's return back to the initial position:
|
||||
|
||||
```
|
||||
S1(1): R1(a)
|
||||
S2(1): R1(a),R2(b)
|
||||
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
|
||||
```
|
||||
|
||||
Assume that we start recovery:
|
||||
|
||||
```
|
||||
S1(1): R1(a),R2(b)
|
||||
S2(1): R1(a),R2(b)
|
||||
S3(1): R1(a),R2(b),R3(c),R4(d) - offline
|
||||
```
|
||||
|
||||
and then crash happens. During voting we choose quorum (S3,S3).
|
||||
Now them belong to the same epoch and S3 is most advanced among them.
|
||||
So VCL is set to 4 and we recover S1 and S2 from S3:
|
||||
|
||||
```
|
||||
S1(1): R1(a),R2(b),R3(c),R4(d)
|
||||
S2(1): R1(a),R2(b),R3(c),R4(d)
|
||||
S3(1): R1(a),R2(b),R3(c),R4(d)
|
||||
```
|
||||
|
||||
## Main loop
|
||||
Once recovery is completed, proxy switches to normal processing loop: it receives WAL stream from master and appends WAL
|
||||
messages to the list. At the same time it tries to push messages to safekeepers. Each safekeeper is associated
|
||||
with some element in message list and once it acknowledged receiving of the message, position is moved forward.
|
||||
Each queue element contains acknowledgment mask, which bits corresponds to safekeepers.
|
||||
Once all safekeepers acknowledged receiving of this message (by setting correspondent bit),
|
||||
then element can be removed from queue and `restartLSN` is advanced forward.
|
||||
|
||||
Proxy maintains `restartLSN` and `commitLSN` based on the responses received by safekeepers.
|
||||
`RestartLSN` equals to the LSN of head message in the list. `CommitLSN` is `flushLSN[nSafekeepers-Quorum]` element
|
||||
in ordered array with `flushLSN`s of safekeepers. `CommitLSN` and `RestartLSN` are included in requests
|
||||
sent from proxy to safekeepers and stored in safekeepers control file.
|
||||
To avoid overhead of extra fsync, this control file is not fsynced on each request. Flushing this file is performed
|
||||
periodically, which means that `restartLSN`/`commitLSN` stored by safekeeper may be slightly deteriorated.
|
||||
It is not critical because may only cause redundant processing of some WAL record.
|
||||
And `FlushLSN` is recalculated after node restart by scanning local WAL files.
|
||||
|
||||
## Fault tolerance
|
||||
Once `safekeeper_proxy` looses connection to safekeeper it tries to reestablish this connection using the same nodeId.
|
||||
If `safekeeper_proxy` looses connection with master, it is terminated. Right now safekeeper is standalone process,
|
||||
which can be launched at any node, but it can be also spawned as master's background worker, so that it is automatically
|
||||
restarted in case of Postgres instance restart.
|
||||
|
||||
Restart of `safekeeper_proxy` initiates new round of voting and switching new epoch.
|
||||
|
||||
## Limitations
|
||||
Right now message queue is maintained in main memory and is not spilled to the disk.
|
||||
It can cause memory overflow in case of presence of lagging safekeepers.
|
||||
It is assumed that in case of loosing local data by some safekeepers, it should be recovered using some external mechanism.
|
||||
|
||||
|
||||
## Glossary
|
||||
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
|
||||
* `RestartLSN`: position in WAL confirmed by all safekeepers.
|
||||
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
|
||||
* `NodeID`: pair (term,UUID)
|
||||
* `Pager`: Zenith component restoring pages from WAL stream
|
||||
* `Replica`: read-only computatio node
|
||||
* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records.
|
||||
|
||||
## Algorithm
|
||||
|
||||
```python
|
||||
process SafekeeperProxy(safekeepers,server,curr_epoch,restart_lsn=0,message_queue={},feedbacks={})
|
||||
function do_recovery(epoch,restart_lsn,VCL)
|
||||
leader = i:safekeepers[i].state.epoch=epoch and safekeepers[i].state.flushLsn=VCL
|
||||
wal_stream = safekeepers[leader].start_replication(restart_lsn,VCL)
|
||||
do
|
||||
message = wal_stream.read()
|
||||
message_queue.append(message)
|
||||
while message.startPos < VCL
|
||||
|
||||
for i in 1..safekeepers.size()
|
||||
for message in message_queue
|
||||
if message.endLsn < safekeepers[i].state.flushLsn
|
||||
message.delivered += i
|
||||
else
|
||||
send_message(i, message)
|
||||
break
|
||||
end function
|
||||
|
||||
function send_message(i,msg)
|
||||
msg.restartLsn = restart_lsn
|
||||
msg.commitLsn = get_commit_lsn()
|
||||
safekeepers[i].send(msg, response_handler)
|
||||
end function
|
||||
|
||||
function do_broadcast(message)
|
||||
for i in 1..safekeepers.size()
|
||||
if not safekeepers[i].sending()
|
||||
send_message(i, message)
|
||||
end function
|
||||
|
||||
function get_commit_lsn()
|
||||
sorted_feedbacks = feedbacks.sort()
|
||||
return sorted_feedbacks[safekeepers.size() - quorum]
|
||||
end function
|
||||
|
||||
function response_handler(i,message,response)
|
||||
feedbacks[i] = if response.epoch=curr_epoch then response.flushLsn else VCL
|
||||
server.write(get_commit_lsn())
|
||||
|
||||
message.delivered += i
|
||||
next_message = message_queue.next(message)
|
||||
if next_message
|
||||
send_message(i, next_message)
|
||||
|
||||
while message_queue.head.delivered.size() = safekeepers.size()
|
||||
if restart_lsn < message_queue.head.beginLsn
|
||||
restart_lsn = message_queue.head.endLsn
|
||||
message_queue.pop_head()
|
||||
end function
|
||||
|
||||
server_info = server.read()
|
||||
|
||||
safekeepers.write(server_info)
|
||||
safekeepers.state = safekeepers.read()
|
||||
next_term = max(safekeepers.state.nodeId.term)+1
|
||||
restart_lsn = max(safekeepers.state.restartLsn)
|
||||
epoch,VCL = max(safekeepers.state.epoch,safekeepers.state.flushLsn)
|
||||
curr_epoch = epoch + 1
|
||||
|
||||
proposal = Proposal(NodeId(next_term,server.id),curr_epoch,VCL)
|
||||
safekeepers.send(proposal)
|
||||
responses = safekeepers.read()
|
||||
if any responses.is_rejected()
|
||||
exit()
|
||||
|
||||
for i in 1..safekeepers.size()
|
||||
feedbacks[i].flushLsn = if epoch=safekeepers[i].state.epoch then safekeepers[i].state.flushLsn else restart_lsn
|
||||
|
||||
if restart_lsn != VCL
|
||||
do_recovery(epoch,restart_lsn,VCL)
|
||||
|
||||
wal_stream = server.start_replication(VCL)
|
||||
for ever
|
||||
message = wal_stream.read()
|
||||
message_queue.append(message)
|
||||
do_broadcast(message)
|
||||
end process
|
||||
|
||||
process safekeeper(gateway,state)
|
||||
function handshake()
|
||||
proxy = gateway.accept()
|
||||
server_info = proxy.read()
|
||||
proxy.write(state)
|
||||
proposal = proxy.read()
|
||||
if proposal.nodeId < state.nodeId
|
||||
proxy.write(rejected)
|
||||
return null
|
||||
else
|
||||
state.nodeId = proposal.nodeId
|
||||
state.proposed_epoch = proposal.epoch
|
||||
state.VCL = proposal.VCL
|
||||
write_control_file(state)
|
||||
proxy.write(accepted)
|
||||
return proxy
|
||||
end function
|
||||
|
||||
state = read_control_file()
|
||||
state.flushLsn = locate_end_of_wal()
|
||||
|
||||
for ever
|
||||
proxy = handshake()
|
||||
if not proxy
|
||||
continue
|
||||
for ever
|
||||
req = proxy.read()
|
||||
if req.nodeId != state.nodeId
|
||||
break
|
||||
save_wal_file(req.data)
|
||||
state.restartLsn = req.restartLsn
|
||||
if state.epoch < state.proposed_epoch and req.endPos > max(state.flushLsn,state.VCL)
|
||||
state.epoch = state.proposed_epoch
|
||||
if req.endPos > state.flushLsn
|
||||
state.flushLsn = req.endPos
|
||||
save_control_file(state)
|
||||
resp = Response(state.epoch,req.endPos)
|
||||
proxy.write(resp)
|
||||
notify_wal_sender(Min(req.commitLsn,req.endPos))
|
||||
end process
|
||||
```
|
||||
@@ -1,20 +1,18 @@
|
||||
//
|
||||
// Main entry point for the wal_acceptor executable
|
||||
//
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
use log::*;
|
||||
use parse_duration::parse;
|
||||
use slog::Drain;
|
||||
use std::io;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use std::{fs::File, fs::OpenOptions};
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
|
||||
use slog::Drain;
|
||||
|
||||
use walkeeper::s3_offload;
|
||||
use walkeeper::wal_service;
|
||||
use walkeeper::WalAcceptorConf;
|
||||
@@ -115,8 +113,18 @@ fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
let log_filename = conf.data_dir.join("wal_acceptor.log");
|
||||
// Don't open the same file for output multiple times;
|
||||
// the different fds could overwrite each other's output.
|
||||
let log_file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&log_filename)
|
||||
.with_context(|| format!("failed to open {:?}", &log_filename))?;
|
||||
|
||||
// Initialize logger
|
||||
let _scope_guard = init_logging(&conf)?;
|
||||
let logger_file = log_file.try_clone().unwrap();
|
||||
let _scope_guard = init_logging(&conf, logger_file)?;
|
||||
let _log_guard = slog_stdlog::init().unwrap();
|
||||
// Note: this `info!(...)` macro comes from `log` crate
|
||||
info!("standard logging redirected to slog");
|
||||
@@ -126,16 +134,8 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
|
||||
// There should'n be any logging to stdin/stdout. Redirect it to the main log so
|
||||
// that we will see any accidental manual fprintf's or backtraces.
|
||||
let stdout = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open("wal_acceptor.log")
|
||||
.unwrap();
|
||||
let stderr = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open("wal_acceptor.log")
|
||||
.unwrap();
|
||||
let stdout = log_file.try_clone().unwrap();
|
||||
let stderr = log_file;
|
||||
|
||||
let daemonize = Daemonize::new()
|
||||
.pid_file("wal_acceptor.pid")
|
||||
@@ -167,7 +167,10 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
.name("WAL acceptor thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
wal_service::thread_main(conf);
|
||||
let thread_result = wal_service::thread_main(conf);
|
||||
if let Err(e) = thread_result {
|
||||
info!("wal_service thread terminated: {}", e);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(wal_acceptor_thread);
|
||||
@@ -178,14 +181,11 @@ fn start_wal_acceptor(conf: WalAcceptorConf) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn init_logging(conf: &WalAcceptorConf) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
fn init_logging(
|
||||
conf: &WalAcceptorConf,
|
||||
log_file: File,
|
||||
) -> Result<slog_scope::GlobalLoggerGuard, io::Error> {
|
||||
if conf.daemonize {
|
||||
let log = conf.data_dir.join("wal_acceptor.log");
|
||||
let log_file = File::create(&log).map_err(|err| {
|
||||
// We failed to initialize logging, so we can't log this message with error!
|
||||
eprintln!("Could not create log file {:?}: {}", log, err);
|
||||
err
|
||||
})?;
|
||||
let decorator = slog_term::PlainSyncDecorator::new(log_file);
|
||||
let drain = slog_term::CompactFormat::new(decorator).build();
|
||||
let drain = std::sync::Mutex::new(drain).fuse();
|
||||
|
||||
@@ -3,8 +3,12 @@ use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
mod pq_protocol;
|
||||
pub mod pq_protocol;
|
||||
pub mod receive_wal;
|
||||
pub mod replication;
|
||||
pub mod s3_offload;
|
||||
pub mod send_wal;
|
||||
pub mod timeline;
|
||||
pub mod wal_service;
|
||||
|
||||
use crate::pq_protocol::SystemId;
|
||||
|
||||
@@ -1,17 +1,15 @@
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use pageserver::ZTimelineId;
|
||||
use std::io;
|
||||
use std::io::{self, Read};
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
|
||||
pub type Oid = u32;
|
||||
pub type SystemId = u64;
|
||||
pub type Result<T> = std::result::Result<T, io::Error>;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum FeMessage {
|
||||
StartupMessage(FeStartupMessage),
|
||||
Query(FeQueryMessage),
|
||||
Terminate,
|
||||
CopyData(FeCopyData),
|
||||
@@ -52,28 +50,22 @@ pub enum StartupRequestCode {
|
||||
}
|
||||
|
||||
impl FeStartupMessage {
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
|
||||
pub fn read_from(reader: &mut impl Read) -> io::Result<Self> {
|
||||
const MAX_STARTUP_PACKET_LENGTH: usize = 10000;
|
||||
const CANCEL_REQUEST_CODE: u32 = (1234 << 16) | 5678;
|
||||
const NEGOTIATE_SSL_CODE: u32 = (1234 << 16) | 5679;
|
||||
const NEGOTIATE_GSS_CODE: u32 = (1234 << 16) | 5680;
|
||||
|
||||
if buf.len() < 4 {
|
||||
return Ok(None);
|
||||
}
|
||||
let len = BigEndian::read_u32(&buf[0..4]) as usize;
|
||||
let len = reader.read_u32::<BigEndian>()? as usize;
|
||||
|
||||
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"invalid message length",
|
||||
"FeStartupMessage: invalid message length",
|
||||
));
|
||||
}
|
||||
if buf.len() < len {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let version = BigEndian::read_u32(&buf[4..8]);
|
||||
let version = reader.read_u32::<BigEndian>()?;
|
||||
|
||||
let kind = match version {
|
||||
CANCEL_REQUEST_CODE => StartupRequestCode::Cancel,
|
||||
@@ -82,7 +74,10 @@ impl FeStartupMessage {
|
||||
_ => StartupRequestCode::Normal,
|
||||
};
|
||||
|
||||
let params_bytes = &buf[8..len];
|
||||
let params_len = len - 8;
|
||||
let mut params_bytes = vec![0u8; params_len];
|
||||
reader.read_exact(params_bytes.as_mut())?;
|
||||
|
||||
let params_str = str::from_utf8(¶ms_bytes).unwrap();
|
||||
let params = params_str.split('\0');
|
||||
let mut options = false;
|
||||
@@ -110,13 +105,12 @@ impl FeStartupMessage {
|
||||
));
|
||||
}
|
||||
|
||||
buf.advance(len as usize);
|
||||
Ok(Some(FeMessage::StartupMessage(FeStartupMessage {
|
||||
Ok(FeStartupMessage {
|
||||
version,
|
||||
kind,
|
||||
appname,
|
||||
timelineid: timelineid.unwrap(),
|
||||
})))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -202,44 +196,28 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
|
||||
impl FeMessage {
|
||||
pub fn parse(buf: &mut BytesMut) -> Result<Option<FeMessage>> {
|
||||
if buf.len() < 5 {
|
||||
let to_read = 5 - buf.len();
|
||||
buf.reserve(to_read);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let tag = buf[0];
|
||||
let len = BigEndian::read_u32(&buf[1..5]);
|
||||
pub fn read_from(reader: &mut impl Read) -> io::Result<FeMessage> {
|
||||
let tag = reader.read_u8()?;
|
||||
let len = reader.read_u32::<BigEndian>()?;
|
||||
|
||||
if len < 4 {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"invalid message length: parsing u32",
|
||||
"FeMessage: invalid message length",
|
||||
));
|
||||
}
|
||||
|
||||
let total_len = len as usize + 1;
|
||||
if buf.len() < total_len {
|
||||
let to_read = total_len - buf.len();
|
||||
buf.reserve(to_read);
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut body = buf.split_to(total_len);
|
||||
body.advance(5);
|
||||
let body_len = (len - 4) as usize;
|
||||
let mut body = vec![0u8; body_len];
|
||||
reader.read_exact(&mut body)?;
|
||||
|
||||
match tag {
|
||||
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage {
|
||||
body: body.freeze(),
|
||||
}))),
|
||||
b'd' => Ok(Some(FeMessage::CopyData(FeCopyData {
|
||||
body: body.freeze(),
|
||||
}))),
|
||||
b'X' => Ok(Some(FeMessage::Terminate)),
|
||||
b'Q' => Ok(FeMessage::Query(FeQueryMessage { body: body.into() })),
|
||||
b'd' => Ok(FeMessage::CopyData(FeCopyData { body: body.into() })),
|
||||
b'X' => Ok(FeMessage::Terminate),
|
||||
tag => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
format!("unknown message tag: {},'{:?}'", tag, buf),
|
||||
format!("unknown message tag: {},'{:?}'", tag, body),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
447
walkeeper/src/receive_wal.rs
Normal file
447
walkeeper/src/receive_wal.rs
Normal file
@@ -0,0 +1,447 @@
|
||||
//! This implements the Safekeeper protocol.
|
||||
//!
|
||||
//! FIXME: better description needed here
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use log::*;
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::{max, min};
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::pq_protocol::*;
|
||||
use crate::replication::HotStandbyFeedback;
|
||||
use crate::timeline::{Timeline, TimelineTools};
|
||||
use crate::WalAcceptorConf;
|
||||
use pageserver::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils::{TimeLineID, XLogFileName, MAX_SEND_SIZE, XLOG_BLCKSZ};
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 1;
|
||||
const SK_PROTOCOL_VERSION: u32 = 1;
|
||||
const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||
const END_OF_STREAM: Lsn = Lsn(0);
|
||||
pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
||||
|
||||
/// Unique node identifier used by Paxos
|
||||
#[derive(Debug, Clone, Copy, Ord, PartialOrd, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct NodeId {
|
||||
term: u64,
|
||||
uuid: [u8; 16],
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
|
||||
pub struct ServerInfo {
|
||||
/// proxy-safekeeper protocol version
|
||||
pub protocol_version: u32,
|
||||
/// Postgres server version
|
||||
pub pg_version: u32,
|
||||
pub node_id: NodeId,
|
||||
pub system_id: SystemId,
|
||||
/// Zenith timelineid
|
||||
pub timeline_id: ZTimelineId,
|
||||
pub wal_end: Lsn,
|
||||
pub timeline: TimeLineID,
|
||||
pub wal_seg_size: u32,
|
||||
}
|
||||
|
||||
/// Vote request sent from proxy to safekeepers
|
||||
#[derive(Debug, PartialEq, Serialize, Deserialize)]
|
||||
struct RequestVote {
|
||||
node_id: NodeId,
|
||||
/// volume commit LSN
|
||||
vcl: Lsn,
|
||||
/// new epoch when safekeeper reaches vcl
|
||||
epoch: u64,
|
||||
}
|
||||
|
||||
/// Information of about storage node
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub struct SafeKeeperInfo {
|
||||
/// magic for verifying content the control file
|
||||
pub magic: u32,
|
||||
/// safekeeper format version
|
||||
pub format_version: u32,
|
||||
/// safekeeper's epoch
|
||||
pub epoch: u64,
|
||||
/// information about server
|
||||
pub server: ServerInfo,
|
||||
/// part of WAL acknowledged by quorum
|
||||
pub commit_lsn: Lsn,
|
||||
/// locally flushed part of WAL
|
||||
pub flush_lsn: Lsn,
|
||||
/// minimal LSN which may be needed for recovery of some safekeeper: min(commit_lsn) for all safekeepers
|
||||
pub restart_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl SafeKeeperInfo {
|
||||
pub fn new() -> SafeKeeperInfo {
|
||||
SafeKeeperInfo {
|
||||
magic: SK_MAGIC,
|
||||
format_version: SK_FORMAT_VERSION,
|
||||
epoch: 0,
|
||||
server: ServerInfo {
|
||||
protocol_version: SK_PROTOCOL_VERSION, /* proxy-safekeeper protocol version */
|
||||
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
|
||||
node_id: NodeId {
|
||||
term: 0,
|
||||
uuid: [0; 16],
|
||||
},
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
timeline_id: ZTimelineId::from([0u8; 16]),
|
||||
wal_end: Lsn(0),
|
||||
timeline: 0,
|
||||
wal_seg_size: 0,
|
||||
},
|
||||
commit_lsn: Lsn(0), /* part of WAL acknowledged by quorum */
|
||||
flush_lsn: Lsn(0), /* locally flushed part of WAL */
|
||||
restart_lsn: Lsn(0), /* minimal LSN which may be needed for recovery of some safekeeper */
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Request with WAL message sent from proxy to safekeeper.
|
||||
#[derive(Debug, PartialEq, Serialize, Deserialize)]
|
||||
struct SafeKeeperRequest {
|
||||
/// Sender's node identifier (looks like we do not need it for TCP streaming connection)
|
||||
sender_id: NodeId,
|
||||
/// start position of message in WAL
|
||||
begin_lsn: Lsn,
|
||||
/// end position of message in WAL
|
||||
end_lsn: Lsn,
|
||||
/// restart LSN position (minimal LSN which may be needed by proxy to perform recovery)
|
||||
restart_lsn: Lsn,
|
||||
/// LSN committed by quorum of safekeepers
|
||||
commit_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Report safekeeper state to proxy
|
||||
#[derive(Debug, PartialEq, Serialize, Deserialize)]
|
||||
struct SafeKeeperResponse {
|
||||
epoch: u64,
|
||||
flush_lsn: Lsn,
|
||||
hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ReceiveWalConn {
|
||||
pub timeline: Option<Arc<Timeline>>,
|
||||
/// Postgres connection, buffered input
|
||||
pub stream_in: BufReader<TcpStream>,
|
||||
/// Postgres connection, output
|
||||
pub stream_out: TcpStream,
|
||||
/// The cached result of socket.peer_addr()
|
||||
pub peer_addr: SocketAddr,
|
||||
/// wal acceptor configuration
|
||||
pub conf: WalAcceptorConf,
|
||||
}
|
||||
|
||||
impl ReceiveWalConn {
|
||||
pub fn new(socket: TcpStream, conf: WalAcceptorConf) -> Result<ReceiveWalConn> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
let conn = ReceiveWalConn {
|
||||
timeline: None,
|
||||
stream_in: BufReader::new(socket.try_clone()?),
|
||||
stream_out: socket,
|
||||
peer_addr,
|
||||
conf,
|
||||
};
|
||||
Ok(conn)
|
||||
}
|
||||
|
||||
fn read_req<T: LeSer>(&mut self) -> Result<T> {
|
||||
// As the trait bound implies, this always encodes little-endian.
|
||||
Ok(T::des_from(&mut self.stream_in)?)
|
||||
}
|
||||
|
||||
fn request_callback(&self) -> std::result::Result<(), postgres::error::Error> {
|
||||
if let Some(addr) = self.conf.pageserver_addr {
|
||||
let ps_connstr = format!(
|
||||
"host={} port={} dbname={} user={}",
|
||||
addr.ip(),
|
||||
addr.port(),
|
||||
"no_db",
|
||||
"no_user",
|
||||
);
|
||||
let callme = format!(
|
||||
"callmemaybe {} host={} port={} options='-c ztimelineid={}'",
|
||||
self.timeline.get().timelineid,
|
||||
self.conf.listen_addr.ip(),
|
||||
self.conf.listen_addr.port(),
|
||||
self.timeline.get().timelineid
|
||||
);
|
||||
info!(
|
||||
"requesting page server to connect to us: start {} {}",
|
||||
ps_connstr, callme
|
||||
);
|
||||
let mut client = Client::connect(&ps_connstr, NoTls)?;
|
||||
client.simple_query(&callme)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Receive WAL from wal_proposer
|
||||
pub fn run(&mut self) -> Result<()> {
|
||||
// Receive information about server
|
||||
let server_info = self.read_req::<ServerInfo>()?;
|
||||
info!(
|
||||
"Start handshake with wal_proposer {} sysid {} timeline {}",
|
||||
self.peer_addr, server_info.system_id, server_info.timeline_id,
|
||||
);
|
||||
// FIXME: also check that the system identifier matches
|
||||
self.timeline.set(server_info.timeline_id)?;
|
||||
self.timeline.get().load_control_file(&self.conf)?;
|
||||
|
||||
let mut my_info = self.timeline.get().get_info();
|
||||
|
||||
/* Check protocol compatibility */
|
||||
if server_info.protocol_version != SK_PROTOCOL_VERSION {
|
||||
bail!(
|
||||
"Incompatible protocol version {}, expected {}",
|
||||
server_info.protocol_version,
|
||||
SK_PROTOCOL_VERSION
|
||||
);
|
||||
}
|
||||
/* Postgres upgrade is not treated as fatal error */
|
||||
if server_info.pg_version != my_info.server.pg_version
|
||||
&& my_info.server.pg_version != UNKNOWN_SERVER_VERSION
|
||||
{
|
||||
info!(
|
||||
"Incompatible server version {}, expected {}",
|
||||
server_info.pg_version, my_info.server.pg_version
|
||||
);
|
||||
}
|
||||
|
||||
/* Update information about server, but preserve locally stored node_id */
|
||||
let node_id = my_info.server.node_id;
|
||||
my_info.server = server_info;
|
||||
my_info.server.node_id = node_id;
|
||||
|
||||
/* Calculate WAL end based on local data */
|
||||
let (flush_lsn, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, true);
|
||||
my_info.flush_lsn = flush_lsn;
|
||||
my_info.server.timeline = timeline;
|
||||
|
||||
/* Report my identifier to proxy */
|
||||
my_info.ser_into(&mut self.stream_out)?;
|
||||
|
||||
/* Wait for vote request */
|
||||
let prop = self.read_req::<RequestVote>()?;
|
||||
/* This is Paxos check which should ensure that only one master can perform commits */
|
||||
if prop.node_id < my_info.server.node_id {
|
||||
/* Send my node-id to inform proxy that it's candidate was rejected */
|
||||
my_info.server.node_id.ser_into(&mut self.stream_out)?;
|
||||
bail!(
|
||||
"Reject connection attempt with term {} because my term is {}",
|
||||
prop.node_id.term,
|
||||
my_info.server.node_id.term,
|
||||
);
|
||||
}
|
||||
my_info.server.node_id = prop.node_id;
|
||||
self.timeline.get().set_info(&my_info);
|
||||
/* Need to persist our vote first */
|
||||
self.timeline.get().save_control_file(true)?;
|
||||
|
||||
let mut flushed_restart_lsn = Lsn(0);
|
||||
let wal_seg_size = server_info.wal_seg_size as usize;
|
||||
|
||||
/* Acknowledge the proposed candidate by returning it to the proxy */
|
||||
prop.node_id.ser_into(&mut self.stream_out)?;
|
||||
|
||||
// Need to establish replication channel with page server.
|
||||
// Add far as replication in postgres is initiated by receiver, we should use callme mechanism
|
||||
if let Err(e) = self.request_callback() {
|
||||
// Do not treate it as fatal error and continue work
|
||||
// FIXME: we should retry after a while...
|
||||
error!("Failed to send callme request to pageserver: {}", e);
|
||||
}
|
||||
|
||||
info!(
|
||||
"Start streaming from timeline {} address {:?}",
|
||||
server_info.timeline_id, self.peer_addr,
|
||||
);
|
||||
|
||||
// Main loop
|
||||
loop {
|
||||
let mut sync_control_file = false;
|
||||
|
||||
/* Receive message header */
|
||||
let req = self.read_req::<SafeKeeperRequest>()?;
|
||||
if req.sender_id != my_info.server.node_id {
|
||||
bail!("Sender NodeId is changed");
|
||||
}
|
||||
if req.begin_lsn == END_OF_STREAM {
|
||||
info!("Server stops streaming");
|
||||
break;
|
||||
}
|
||||
let start_pos = req.begin_lsn;
|
||||
let end_pos = req.end_lsn;
|
||||
let rec_size = end_pos.checked_sub(start_pos).unwrap().0 as usize;
|
||||
assert!(rec_size <= MAX_SEND_SIZE);
|
||||
|
||||
debug!(
|
||||
"received for {} bytes between {} and {}",
|
||||
rec_size, start_pos, end_pos,
|
||||
);
|
||||
|
||||
/* Receive message body */
|
||||
let mut inbuf = vec![0u8; rec_size];
|
||||
self.stream_in.read_exact(&mut inbuf)?;
|
||||
|
||||
/* Save message in file */
|
||||
self.write_wal_file(start_pos, timeline, wal_seg_size, &inbuf)?;
|
||||
|
||||
my_info.restart_lsn = req.restart_lsn;
|
||||
my_info.commit_lsn = req.commit_lsn;
|
||||
|
||||
/*
|
||||
* Epoch switch happen when written WAL record cross the boundary.
|
||||
* The boundary is maximum of last WAL position at this node (FlushLSN) and global
|
||||
* maximum (vcl) determined by safekeeper_proxy during handshake.
|
||||
* Switching epoch means that node completes recovery and start writing in the WAL new data.
|
||||
*/
|
||||
if my_info.epoch < prop.epoch && end_pos > max(my_info.flush_lsn, prop.vcl) {
|
||||
info!("Switch to new epoch {}", prop.epoch);
|
||||
my_info.epoch = prop.epoch; /* bump epoch */
|
||||
sync_control_file = true;
|
||||
}
|
||||
if end_pos > my_info.flush_lsn {
|
||||
my_info.flush_lsn = end_pos;
|
||||
}
|
||||
/*
|
||||
* Update restart LSN in control file.
|
||||
* To avoid negative impact on performance of extra fsync, do it only
|
||||
* when restart_lsn delta exceeds WAL segment size.
|
||||
*/
|
||||
sync_control_file |= flushed_restart_lsn + (wal_seg_size as u64) < my_info.restart_lsn;
|
||||
self.timeline.get().save_control_file(sync_control_file)?;
|
||||
|
||||
if sync_control_file {
|
||||
flushed_restart_lsn = my_info.restart_lsn;
|
||||
}
|
||||
|
||||
/* Report flush position */
|
||||
//info!("Confirm LSN: {:X}/{:>08X}", (end_pos>>32) as u32, end_pos as u32);
|
||||
let resp = SafeKeeperResponse {
|
||||
epoch: my_info.epoch,
|
||||
flush_lsn: end_pos,
|
||||
hs_feedback: self.timeline.get().get_hs_feedback(),
|
||||
};
|
||||
resp.ser_into(&mut self.stream_out)?;
|
||||
|
||||
/*
|
||||
* Ping wal sender that new data is available.
|
||||
* FlushLSN (end_pos) can be smaller than commitLSN in case we are at catching-up safekeeper.
|
||||
*/
|
||||
self.timeline
|
||||
.get()
|
||||
.notify_wal_senders(min(req.commit_lsn, end_pos));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_wal_file(
|
||||
&self,
|
||||
startpos: Lsn,
|
||||
timeline: TimeLineID,
|
||||
wal_seg_size: usize,
|
||||
buf: &[u8],
|
||||
) -> Result<()> {
|
||||
let mut bytes_left: usize = buf.len();
|
||||
let mut bytes_written: usize = 0;
|
||||
let mut partial;
|
||||
let mut start_pos = startpos;
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
/* Extract WAL location for this block */
|
||||
let mut xlogoff = start_pos.segment_offset(wal_seg_size) as usize;
|
||||
|
||||
while bytes_left != 0 {
|
||||
let bytes_to_write;
|
||||
|
||||
/*
|
||||
* If crossing a WAL boundary, only write up until we reach wal
|
||||
* segment size.
|
||||
*/
|
||||
if xlogoff + bytes_left > wal_seg_size {
|
||||
bytes_to_write = wal_seg_size - xlogoff;
|
||||
} else {
|
||||
bytes_to_write = bytes_left;
|
||||
}
|
||||
|
||||
/* Open file */
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
|
||||
let wal_file_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline.get().timelineid.to_string())
|
||||
.join(wal_file_name.clone());
|
||||
let wal_file_partial_path = self
|
||||
.conf
|
||||
.data_dir
|
||||
.join(self.timeline.get().timelineid.to_string())
|
||||
.join(wal_file_name.clone() + ".partial");
|
||||
|
||||
{
|
||||
let mut wal_file: File;
|
||||
/* Try to open already completed segment */
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
|
||||
wal_file = file;
|
||||
partial = false;
|
||||
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path)
|
||||
{
|
||||
/* Try to open existed partial file */
|
||||
wal_file = file;
|
||||
partial = true;
|
||||
} else {
|
||||
/* Create and fill new partial file */
|
||||
partial = true;
|
||||
match OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
{
|
||||
Ok(mut file) => {
|
||||
for _ in 0..(wal_seg_size / XLOG_BLCKSZ) {
|
||||
file.write_all(&ZERO_BLOCK)?;
|
||||
}
|
||||
wal_file = file;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
wal_file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
wal_file.write_all(&buf[bytes_written..(bytes_written + bytes_to_write)])?;
|
||||
|
||||
// Flush file is not prohibited
|
||||
if !self.conf.no_sync {
|
||||
wal_file.sync_all()?;
|
||||
}
|
||||
}
|
||||
/* Write was successful, advance our position */
|
||||
bytes_written += bytes_to_write;
|
||||
bytes_left -= bytes_to_write;
|
||||
start_pos += bytes_to_write as u64;
|
||||
xlogoff += bytes_to_write;
|
||||
|
||||
/* Did we reach the end of a WAL segment? */
|
||||
if start_pos.segment_offset(wal_seg_size) == 0 {
|
||||
xlogoff = 0;
|
||||
if partial {
|
||||
fs::rename(&wal_file_partial_path, &wal_file_path)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
238
walkeeper/src/replication.rs
Normal file
238
walkeeper/src/replication.rs
Normal file
@@ -0,0 +1,238 @@
|
||||
//! This module implements the replication protocol, starting with the
|
||||
//! "START REPLICATION" message.
|
||||
|
||||
use crate::pq_protocol::{BeMessage, FeMessage};
|
||||
use crate::send_wal::SendWalConn;
|
||||
use crate::timeline::{Timeline, TimelineTools};
|
||||
use crate::WalAcceptorConf;
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use log::*;
|
||||
use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, XLogFileName, MAX_SEND_SIZE};
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::min;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Read, Seek, SeekFrom, Write};
|
||||
use std::net::TcpStream;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::{str, thread};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
const XLOG_HDR_SIZE: usize = 1 + 8 * 3; /* 'w' + startPos + walEnd + timestamp */
|
||||
const LIBPQ_HDR_SIZE: usize = 5; /* 1 byte with message type + 4 bytes length */
|
||||
const LIBPQ_MSG_SIZE_OFFS: usize = 1;
|
||||
pub const END_REPLICATION_MARKER: Lsn = Lsn::MAX;
|
||||
|
||||
type FullTransactionId = u64;
|
||||
|
||||
/// Hot standby feedback received from replica
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub struct HotStandbyFeedback {
|
||||
pub ts: TimestampTz,
|
||||
pub xmin: FullTransactionId,
|
||||
pub catalog_xmin: FullTransactionId,
|
||||
}
|
||||
|
||||
/// A network connection that's speaking the replication protocol.
|
||||
pub struct ReplicationConn {
|
||||
timeline: Option<Arc<Timeline>>,
|
||||
/// Postgres connection, buffered input
|
||||
///
|
||||
/// This is an `Option` because we will spawn a background thread that will
|
||||
/// `take` it from us.
|
||||
stream_in: Option<BufReader<TcpStream>>,
|
||||
/// Postgres connection, output
|
||||
stream_out: TcpStream,
|
||||
/// wal acceptor configuration
|
||||
conf: WalAcceptorConf,
|
||||
/// assigned application name
|
||||
appname: Option<String>,
|
||||
}
|
||||
|
||||
impl ReplicationConn {
|
||||
/// Create a new `SendWal`, consuming the `Connection`.
|
||||
pub fn new(conn: SendWalConn) -> Self {
|
||||
Self {
|
||||
timeline: conn.timeline,
|
||||
stream_in: Some(conn.stream_in),
|
||||
stream_out: conn.stream_out,
|
||||
conf: conn.conf,
|
||||
appname: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle incoming messages from the network.
|
||||
///
|
||||
/// This is spawned into the background by `handle_start_replication`.
|
||||
///
|
||||
fn background_thread(mut stream_in: impl Read, timeline: Arc<Timeline>) -> Result<()> {
|
||||
// Wait for replica's feedback.
|
||||
// We only handle `CopyData` messages. Anything else is ignored.
|
||||
loop {
|
||||
match FeMessage::read_from(&mut stream_in)? {
|
||||
FeMessage::CopyData(m) => {
|
||||
let feedback = HotStandbyFeedback::des(&m.body)?;
|
||||
timeline.add_hs_feedback(feedback)
|
||||
}
|
||||
msg => {
|
||||
info!("unexpected message {:?}", msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function that parses a pair of LSNs.
|
||||
fn parse_start_stop(cmd: &[u8]) -> Result<(Lsn, Lsn)> {
|
||||
let re = Regex::new(r"([[:xdigit:]]+/[[:xdigit:]]+)").unwrap();
|
||||
let caps = re.captures_iter(str::from_utf8(cmd)?);
|
||||
let mut lsns = caps.map(|cap| cap[1].parse::<Lsn>());
|
||||
let start_pos = lsns
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("failed to find start LSN"))??;
|
||||
let stop_pos = lsns.next().transpose()?.unwrap_or(Lsn(0));
|
||||
Ok((start_pos, stop_pos))
|
||||
}
|
||||
|
||||
/// Helper function for opening a wal file.
|
||||
fn open_wal_file(wal_file_path: &Path) -> Result<File> {
|
||||
// First try to open the .partial file.
|
||||
let mut partial_path = wal_file_path.to_owned();
|
||||
partial_path.set_extension("partial");
|
||||
if let Ok(opened_file) = File::open(&partial_path) {
|
||||
return Ok(opened_file);
|
||||
}
|
||||
|
||||
// If that failed, try it without the .partial extension.
|
||||
match File::open(&wal_file_path) {
|
||||
Ok(opened_file) => Ok(opened_file),
|
||||
Err(e) => {
|
||||
error!("Failed to open log file {:?}: {}", &wal_file_path, e);
|
||||
Err(e.into())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Handle START_REPLICATION replication command
|
||||
///
|
||||
pub fn run(&mut self, cmd: &Bytes) -> Result<()> {
|
||||
// spawn the background thread which receives HotStandbyFeedback messages.
|
||||
let bg_timeline = Arc::clone(self.timeline.get());
|
||||
let bg_stream_in = self.stream_in.take().unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = Self::background_thread(bg_stream_in, bg_timeline) {
|
||||
error!("socket error: {}", err);
|
||||
}
|
||||
});
|
||||
|
||||
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(&cmd)?;
|
||||
|
||||
let wal_seg_size = self.timeline.get().get_info().server.wal_seg_size as usize;
|
||||
if wal_seg_size == 0 {
|
||||
bail!("Can not start replication before connecting to wal_proposer");
|
||||
}
|
||||
let (wal_end, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, false);
|
||||
if start_pos == Lsn(0) {
|
||||
start_pos = wal_end;
|
||||
}
|
||||
if stop_pos == Lsn(0) && self.appname == Some("wal_proposer_recovery".to_string()) {
|
||||
stop_pos = wal_end;
|
||||
}
|
||||
info!("Start replication from {} till {}", start_pos, stop_pos);
|
||||
|
||||
let mut outbuf = BytesMut::new();
|
||||
BeMessage::write(&mut outbuf, &BeMessage::Copy);
|
||||
self.send(&outbuf)?;
|
||||
outbuf.clear();
|
||||
|
||||
let mut end_pos: Lsn;
|
||||
let mut wal_file: Option<File> = None;
|
||||
|
||||
loop {
|
||||
/* Wait until we have some data to stream */
|
||||
if stop_pos != Lsn(0) {
|
||||
/* recovery mode: stream up to the specified LSN (VCL) */
|
||||
if start_pos >= stop_pos {
|
||||
/* recovery finished */
|
||||
break;
|
||||
}
|
||||
end_pos = stop_pos;
|
||||
} else {
|
||||
/* normal mode */
|
||||
let timeline = self.timeline.get();
|
||||
end_pos = timeline.wait_for_lsn(start_pos);
|
||||
}
|
||||
if end_pos == END_REPLICATION_MARKER {
|
||||
break;
|
||||
}
|
||||
|
||||
// Take the `File` from `wal_file`, or open a new file.
|
||||
let mut file = match wal_file.take() {
|
||||
Some(file) => file,
|
||||
None => {
|
||||
// Open a new file.
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
|
||||
let timeline_id = self.timeline.get().timelineid.to_string();
|
||||
let wal_file_path = self.conf.data_dir.join(timeline_id).join(wal_file_name);
|
||||
Self::open_wal_file(&wal_file_path)?
|
||||
}
|
||||
};
|
||||
|
||||
let xlogoff = start_pos.segment_offset(wal_seg_size) as usize;
|
||||
|
||||
// How much to read and send in message? We cannot cross the WAL file
|
||||
// boundary, and we don't want send more than MAX_SEND_SIZE.
|
||||
let send_size = end_pos.checked_sub(start_pos).unwrap().0 as usize;
|
||||
let send_size = min(send_size, wal_seg_size - xlogoff);
|
||||
let send_size = min(send_size, MAX_SEND_SIZE);
|
||||
|
||||
let msg_size = LIBPQ_HDR_SIZE + XLOG_HDR_SIZE + send_size;
|
||||
|
||||
// Read some data from the file.
|
||||
let mut file_buf = vec![0u8; send_size];
|
||||
file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
file.read_exact(&mut file_buf)?;
|
||||
|
||||
// Write some data to the network socket.
|
||||
// FIXME: turn these into structs.
|
||||
// 'd' is CopyData;
|
||||
// 'w' is "WAL records"
|
||||
// https://www.postgresql.org/docs/9.1/protocol-message-formats.html
|
||||
// src/backend/replication/walreceiver.c
|
||||
outbuf.clear();
|
||||
outbuf.put_u8(b'd');
|
||||
outbuf.put_u32((msg_size - LIBPQ_MSG_SIZE_OFFS) as u32);
|
||||
outbuf.put_u8(b'w');
|
||||
outbuf.put_u64(start_pos.0);
|
||||
outbuf.put_u64(end_pos.0);
|
||||
outbuf.put_u64(get_current_timestamp());
|
||||
|
||||
assert!(outbuf.len() + file_buf.len() == msg_size);
|
||||
// This thread has exclusive access to the TcpStream, so it's fine
|
||||
// to do this as two separate calls.
|
||||
self.send(&outbuf)?;
|
||||
self.send(&file_buf)?;
|
||||
start_pos += send_size as u64;
|
||||
|
||||
debug!("Sent WAL to page server up to {}", end_pos);
|
||||
|
||||
// Decide whether to reuse this file. If we don't set wal_file here
|
||||
// a new file will be opened next time.
|
||||
if start_pos.segment_offset(wal_seg_size) != 0 {
|
||||
wal_file = Some(file);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Send messages on the network.
|
||||
fn send(&mut self, buf: &[u8]) -> Result<()> {
|
||||
self.stream_out.write_all(buf.as_ref())?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -12,8 +12,7 @@ use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::fs::{self, File};
|
||||
use std::io::prelude::*;
|
||||
use std::iter::FromIterator;
|
||||
use std::path::PathBuf;
|
||||
use std::path::Path;
|
||||
use std::time::SystemTime;
|
||||
use tokio::runtime;
|
||||
use tokio::time::sleep;
|
||||
@@ -42,7 +41,7 @@ pub fn thread_main(conf: WalAcceptorConf) {
|
||||
async fn offload_files(
|
||||
bucket: &Bucket,
|
||||
listing: &HashSet<String>,
|
||||
dir_path: &PathBuf,
|
||||
dir_path: &Path,
|
||||
conf: &WalAcceptorConf,
|
||||
) -> Result<u64> {
|
||||
let horizon = SystemTime::now() - conf.ttl.unwrap();
|
||||
@@ -93,11 +92,10 @@ async fn main_loop(conf: &WalAcceptorConf) -> Result<()> {
|
||||
let results = bucket
|
||||
.list("walarchive/".to_string(), Some("".to_string()))
|
||||
.await?;
|
||||
let listing = HashSet::from_iter(
|
||||
results
|
||||
.iter()
|
||||
.flat_map(|b| b.contents.iter().map(|o| o.key.clone())),
|
||||
);
|
||||
let listing = results
|
||||
.iter()
|
||||
.flat_map(|b| b.contents.iter().map(|o| o.key.clone()))
|
||||
.collect();
|
||||
|
||||
let n = offload_files(&bucket, &listing, &conf.data_dir, conf).await?;
|
||||
info!("Offload {} files to S3", n);
|
||||
|
||||
155
walkeeper/src/send_wal.rs
Normal file
155
walkeeper/src/send_wal.rs
Normal file
@@ -0,0 +1,155 @@
|
||||
//! This implements the libpq replication protocol between wal_acceptor
|
||||
//! and replicas/pagers
|
||||
//!
|
||||
|
||||
use crate::pq_protocol::{
|
||||
BeMessage, FeMessage, FeStartupMessage, RowDescriptor, StartupRequestCode,
|
||||
};
|
||||
use crate::replication::ReplicationConn;
|
||||
use crate::timeline::{Timeline, TimelineTools};
|
||||
use crate::WalAcceptorConf;
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
use std::io::{BufReader, Write};
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::sync::Arc;
|
||||
|
||||
/// A network connection that's speaking the libpq replication protocol.
|
||||
pub struct SendWalConn {
|
||||
pub timeline: Option<Arc<Timeline>>,
|
||||
/// Postgres connection, buffered input
|
||||
pub stream_in: BufReader<TcpStream>,
|
||||
/// Postgres connection, output
|
||||
pub stream_out: TcpStream,
|
||||
/// The cached result of socket.peer_addr()
|
||||
pub peer_addr: SocketAddr,
|
||||
/// wal acceptor configuration
|
||||
pub conf: WalAcceptorConf,
|
||||
/// assigned application name
|
||||
appname: Option<String>,
|
||||
}
|
||||
|
||||
impl SendWalConn {
|
||||
/// Create a new `SendWal`, consuming the `Connection`.
|
||||
pub fn new(socket: TcpStream, conf: WalAcceptorConf) -> Result<Self> {
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
let conn = SendWalConn {
|
||||
timeline: None,
|
||||
stream_in: BufReader::new(socket.try_clone()?),
|
||||
stream_out: socket,
|
||||
peer_addr,
|
||||
conf,
|
||||
appname: None,
|
||||
};
|
||||
Ok(conn)
|
||||
}
|
||||
|
||||
///
|
||||
/// Send WAL to replica or WAL receiver using standard libpq replication protocol
|
||||
///
|
||||
pub fn run(mut self) -> Result<()> {
|
||||
let peer_addr = self.peer_addr;
|
||||
info!("WAL sender to {:?} is started", peer_addr);
|
||||
|
||||
// Handle the startup message first.
|
||||
|
||||
let m = FeStartupMessage::read_from(&mut self.stream_in)?;
|
||||
trace!("got startup message {:?}", m);
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss | StartupRequestCode::NegotiateSsl => {
|
||||
let mut buf = BytesMut::new();
|
||||
BeMessage::write(&mut buf, &BeMessage::Negotiate);
|
||||
info!("SSL requested");
|
||||
self.stream_out.write_all(&buf)?;
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
let mut buf = BytesMut::new();
|
||||
BeMessage::write(&mut buf, &BeMessage::AuthenticationOk);
|
||||
BeMessage::write(&mut buf, &BeMessage::ReadyForQuery);
|
||||
self.stream_out.write_all(&buf)?;
|
||||
self.timeline.set(m.timelineid)?;
|
||||
self.appname = m.appname;
|
||||
}
|
||||
StartupRequestCode::Cancel => return Ok(()),
|
||||
}
|
||||
|
||||
loop {
|
||||
let msg = FeMessage::read_from(&mut self.stream_in)?;
|
||||
match msg {
|
||||
FeMessage::Query(q) => {
|
||||
trace!("got query {:?}", q.body);
|
||||
|
||||
if q.body.starts_with(b"IDENTIFY_SYSTEM") {
|
||||
self.handle_identify_system()?;
|
||||
} else if q.body.starts_with(b"START_REPLICATION") {
|
||||
// Create a new replication object, consuming `self`.
|
||||
ReplicationConn::new(self).run(&q.body)?;
|
||||
break;
|
||||
} else {
|
||||
bail!("Unexpected command {:?}", q.body);
|
||||
}
|
||||
}
|
||||
FeMessage::Terminate => {
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
bail!("unexpected message");
|
||||
}
|
||||
}
|
||||
}
|
||||
info!("WAL sender to {:?} is finished", peer_addr);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Handle IDENTIFY_SYSTEM replication command
|
||||
///
|
||||
fn handle_identify_system(&mut self) -> Result<()> {
|
||||
let (start_pos, timeline) = self.timeline.find_end_of_wal(&self.conf.data_dir, false);
|
||||
let lsn = start_pos.to_string();
|
||||
let tli = timeline.to_string();
|
||||
let sysid = self.timeline.get().get_info().server.system_id.to_string();
|
||||
let lsn_bytes = lsn.as_bytes();
|
||||
let tli_bytes = tli.as_bytes();
|
||||
let sysid_bytes = sysid.as_bytes();
|
||||
|
||||
let mut outbuf = BytesMut::new();
|
||||
BeMessage::write(
|
||||
&mut outbuf,
|
||||
&BeMessage::RowDescription(&[
|
||||
RowDescriptor {
|
||||
name: b"systemid\0",
|
||||
typoid: 25,
|
||||
typlen: -1,
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"timeline\0",
|
||||
typoid: 23,
|
||||
typlen: 4,
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"xlogpos\0",
|
||||
typoid: 25,
|
||||
typlen: -1,
|
||||
},
|
||||
RowDescriptor {
|
||||
name: b"dbname\0",
|
||||
typoid: 25,
|
||||
typlen: -1,
|
||||
},
|
||||
]),
|
||||
);
|
||||
BeMessage::write(
|
||||
&mut outbuf,
|
||||
&BeMessage::DataRow(&[Some(sysid_bytes), Some(tli_bytes), Some(lsn_bytes), None]),
|
||||
);
|
||||
BeMessage::write(
|
||||
&mut outbuf,
|
||||
&BeMessage::CommandComplete(b"IDENTIFY_SYSTEM\0"),
|
||||
);
|
||||
BeMessage::write(&mut outbuf, &BeMessage::ReadyForQuery);
|
||||
self.stream_out.write_all(&outbuf)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
267
walkeeper/src/timeline.rs
Normal file
267
walkeeper/src/timeline.rs
Normal file
@@ -0,0 +1,267 @@
|
||||
//! This module contains tools for managing timelines.
|
||||
//!
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use fs2::FileExt;
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use pageserver::ZTimelineId;
|
||||
use postgres_ffi::xlog_utils::{find_end_of_wal, TimeLineID};
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, Condvar, Mutex};
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::receive_wal::{SafeKeeperInfo, CONTROL_FILE_NAME, SK_FORMAT_VERSION, SK_MAGIC};
|
||||
use crate::replication::{HotStandbyFeedback, END_REPLICATION_MARKER};
|
||||
use crate::WalAcceptorConf;
|
||||
|
||||
/// Shared state associated with database instance (tenant)
|
||||
#[derive(Debug)]
|
||||
struct SharedState {
|
||||
/// quorum commit LSN
|
||||
commit_lsn: Lsn,
|
||||
/// information about this safekeeper
|
||||
info: SafeKeeperInfo,
|
||||
/// opened file control file handle (needed to hold exlusive file lock
|
||||
control_file: Option<File>,
|
||||
/// combined hot standby feedback from all replicas
|
||||
hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
|
||||
impl SharedState {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
commit_lsn: Lsn(0),
|
||||
info: SafeKeeperInfo::new(),
|
||||
control_file: None,
|
||||
hs_feedback: HotStandbyFeedback {
|
||||
ts: 0,
|
||||
xmin: u64::MAX,
|
||||
catalog_xmin: u64::MAX,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Accumulate hot standby feedbacks from replicas
|
||||
pub fn add_hs_feedback(&mut self, feedback: HotStandbyFeedback) {
|
||||
self.hs_feedback.xmin = min(self.hs_feedback.xmin, feedback.xmin);
|
||||
self.hs_feedback.catalog_xmin = min(self.hs_feedback.catalog_xmin, feedback.catalog_xmin);
|
||||
self.hs_feedback.ts = max(self.hs_feedback.ts, feedback.ts);
|
||||
}
|
||||
|
||||
/// Load and lock control file (prevent running more than one instance of safekeeper)
|
||||
pub fn load_control_file(
|
||||
&mut self,
|
||||
conf: &WalAcceptorConf,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<()> {
|
||||
if self.control_file.is_some() {
|
||||
info!("control file for timeline {} is already open", timelineid);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let control_file_path = conf
|
||||
.data_dir
|
||||
.join(timelineid.to_string())
|
||||
.join(CONTROL_FILE_NAME);
|
||||
info!("loading control file {}", control_file_path.display());
|
||||
match OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&control_file_path)
|
||||
{
|
||||
Ok(file) => {
|
||||
// Lock file to prevent two or more active wal_acceptors
|
||||
match file.try_lock_exclusive() {
|
||||
Ok(()) => {}
|
||||
Err(e) => {
|
||||
bail!(
|
||||
"Control file {:?} is locked by some other process: {}",
|
||||
&control_file_path,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
self.control_file = Some(file);
|
||||
|
||||
let cfile_ref = self.control_file.as_mut().unwrap();
|
||||
match SafeKeeperInfo::des_from(cfile_ref) {
|
||||
Err(e) => {
|
||||
warn!("read from {:?} failed: {}", control_file_path, e);
|
||||
}
|
||||
Ok(info) => {
|
||||
if info.magic != SK_MAGIC {
|
||||
bail!("Invalid control file magic: {}", info.magic);
|
||||
}
|
||||
if info.format_version != SK_FORMAT_VERSION {
|
||||
bail!(
|
||||
"Incompatible format version: {} vs. {}",
|
||||
info.format_version,
|
||||
SK_FORMAT_VERSION
|
||||
);
|
||||
}
|
||||
self.info = info;
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
panic!(
|
||||
"Failed to open control file {:?}: {}",
|
||||
&control_file_path, e
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn save_control_file(&mut self, sync: bool) -> Result<()> {
|
||||
let file = self.control_file.as_mut().unwrap();
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
self.info.ser_into(file)?;
|
||||
if sync {
|
||||
file.sync_all()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Database instance (tenant)
|
||||
#[derive(Debug)]
|
||||
pub struct Timeline {
|
||||
pub timelineid: ZTimelineId,
|
||||
mutex: Mutex<SharedState>,
|
||||
/// conditional variable used to notify wal senders
|
||||
cond: Condvar,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
fn new(timelineid: ZTimelineId, shared_state: SharedState) -> Timeline {
|
||||
Timeline {
|
||||
timelineid,
|
||||
mutex: Mutex::new(shared_state),
|
||||
cond: Condvar::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait for an LSN to be committed.
|
||||
///
|
||||
/// Returns the last committed LSN, which will be at least
|
||||
/// as high as the LSN waited for.
|
||||
///
|
||||
pub fn wait_for_lsn(&self, lsn: Lsn) -> Lsn {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
loop {
|
||||
let commit_lsn = shared_state.commit_lsn;
|
||||
// This must be `>`, not `>=`.
|
||||
if commit_lsn > lsn {
|
||||
return commit_lsn;
|
||||
}
|
||||
shared_state = self.cond.wait(shared_state).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
// Notify caught-up WAL senders about new WAL data received
|
||||
pub fn notify_wal_senders(&self, commit_lsn: Lsn) {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
if shared_state.commit_lsn < commit_lsn {
|
||||
shared_state.commit_lsn = commit_lsn;
|
||||
self.cond.notify_all();
|
||||
}
|
||||
}
|
||||
|
||||
fn _stop_wal_senders(&self) {
|
||||
self.notify_wal_senders(END_REPLICATION_MARKER);
|
||||
}
|
||||
|
||||
pub fn get_info(&self) -> SafeKeeperInfo {
|
||||
return self.mutex.lock().unwrap().info.clone();
|
||||
}
|
||||
|
||||
pub fn set_info(&self, info: &SafeKeeperInfo) {
|
||||
self.mutex.lock().unwrap().info = info.clone();
|
||||
}
|
||||
|
||||
// Accumulate hot standby feedbacks from replicas
|
||||
pub fn add_hs_feedback(&self, feedback: HotStandbyFeedback) {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.add_hs_feedback(feedback);
|
||||
}
|
||||
|
||||
pub fn get_hs_feedback(&self) -> HotStandbyFeedback {
|
||||
let shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.hs_feedback.clone()
|
||||
}
|
||||
|
||||
pub fn load_control_file(&self, conf: &WalAcceptorConf) -> Result<()> {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.load_control_file(conf, self.timelineid)
|
||||
}
|
||||
|
||||
pub fn save_control_file(&self, sync: bool) -> Result<()> {
|
||||
let mut shared_state = self.mutex.lock().unwrap();
|
||||
shared_state.save_control_file(sync)
|
||||
}
|
||||
}
|
||||
|
||||
// Utilities needed by various Connection-like objects
|
||||
pub trait TimelineTools {
|
||||
fn set(&mut self, timeline_id: ZTimelineId) -> Result<()>;
|
||||
fn get(&self) -> &Arc<Timeline>;
|
||||
fn find_end_of_wal(&self, data_dir: &Path, precise: bool) -> (Lsn, TimeLineID);
|
||||
}
|
||||
|
||||
impl TimelineTools for Option<Arc<Timeline>> {
|
||||
fn set(&mut self, timeline_id: ZTimelineId) -> Result<()> {
|
||||
// We will only set the timeline once. If it were to ever change,
|
||||
// anyone who cloned the Arc would be out of date.
|
||||
assert!(self.is_none());
|
||||
*self = Some(GlobalTimelines::store(timeline_id)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get(&self) -> &Arc<Timeline> {
|
||||
self.as_ref().unwrap()
|
||||
}
|
||||
|
||||
/// Find last WAL record. If "precise" is false then just locate last partial segment
|
||||
fn find_end_of_wal(&self, data_dir: &Path, precise: bool) -> (Lsn, TimeLineID) {
|
||||
let seg_size = self.get().get_info().server.wal_seg_size as usize;
|
||||
let (lsn, timeline) = find_end_of_wal(data_dir, seg_size, precise);
|
||||
(Lsn(lsn), timeline)
|
||||
}
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
pub static ref TIMELINES: Mutex<HashMap<ZTimelineId, Arc<Timeline>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
/// A zero-sized struct used to manage access to the global timelines map.
|
||||
struct GlobalTimelines;
|
||||
|
||||
impl GlobalTimelines {
|
||||
/// Store a new timeline into the global TIMELINES map.
|
||||
fn store(timeline_id: ZTimelineId) -> Result<Arc<Timeline>> {
|
||||
let mut timelines = TIMELINES.lock().unwrap();
|
||||
|
||||
match timelines.get(&timeline_id) {
|
||||
Some(result) => Ok(Arc::clone(result)),
|
||||
None => {
|
||||
info!("creating timeline dir {}", timeline_id);
|
||||
fs::create_dir_all(timeline_id.to_string())?;
|
||||
|
||||
let shared_state = SharedState::new();
|
||||
|
||||
let new_tid = Arc::new(Timeline::new(timeline_id, shared_state));
|
||||
timelines.insert(timeline_id, Arc::clone(&new_tid));
|
||||
Ok(new_tid)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
22
workspace_hack/Cargo.toml
Normal file
22
workspace_hack/Cargo.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[package]
|
||||
name = "workspace_hack"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
|
||||
[target.'cfg(all())'.dependencies]
|
||||
libc = { version = "0.2", features = ["default", "extra_traits", "std"] }
|
||||
memchr = { version = "2", features = ["default", "std", "use_std"] }
|
||||
num-integer = { version = "0.1", default-features = false, features = ["std"] }
|
||||
num-traits = { version = "0.2", default-features = false, features = ["std"] }
|
||||
regex = { version = "1", features = ["aho-corasick", "default", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
|
||||
regex-syntax = { version = "0.6", features = ["default", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
|
||||
serde = { version = "1", features = ["default", "derive", "serde_derive", "std"] }
|
||||
|
||||
[target.'cfg(all())'.build-dependencies]
|
||||
libc = { version = "0.2", features = ["default", "extra_traits", "std"] }
|
||||
memchr = { version = "2", features = ["default", "std", "use_std"] }
|
||||
proc-macro2 = { version = "1", features = ["default", "proc-macro"] }
|
||||
quote = { version = "1", features = ["default", "proc-macro"] }
|
||||
regex = { version = "1", features = ["aho-corasick", "default", "memchr", "perf", "perf-cache", "perf-dfa", "perf-inline", "perf-literal", "std", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
|
||||
regex-syntax = { version = "0.6", features = ["default", "unicode", "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment"] }
|
||||
syn = { version = "1", features = ["clone-impls", "default", "derive", "full", "parsing", "printing", "proc-macro", "quote", "visit", "visit-mut"] }
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user